# Data preparation

In [None]:
import os
import json
import pandas as pd

directories = ['2018','2019','2020','2021','2022','2023']

extracted_data = []

for directory in directories:

    for filename in os.listdir(directory):
        if filename.endswith('.json'):  
            filepath = os.path.join(directory, filename)
            

            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
          
                response = data.get('abstracts-retrieval-response', {})
                
     
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]


      
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

     
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  
                        combined_keywords = ",".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  

                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })


df = pd.DataFrame(extracted_data)
def change(x):
    x = set(x)
    result = ""
    for area in x:
        result += area + ","
    
    return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

FileNotFoundError: [Errno 2] No such file or directory: '2018'

In [2]:
import pandas as pd
df = pd.read_csv('data.csv')

In [3]:
df.shape

(20216, 6)

In [5]:
df.head(2)

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018


In [None]:
df.dropna(subset=['title','publicationName'], inplace=True)

In [7]:
df.shape

(20215, 6)

# Subject Area prediction

In [None]:

df['subjectArea'] = df['subjectArea'].apply(lambda x: x.split(','))

In [9]:
df.head(2)

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","[MATE, PHYS]",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","[BUSI, ECON]",01/01/2018


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report



text_data = df['title'] + ' ' + df['publicationName']  
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(text_data)


mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['subjectArea']) 


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = OneVsRestClassifier(XGBClassifier(eval_metric="logloss"))
model.fit(X_train, y_train)

y_pred_prob = model.predict_proba(X_test)

thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold = 0.5
best_f1 = 0
best_report = None

for threshold in thresholds:
    y_pred_adjusted = (y_pred_prob >= threshold).astype(int)
    
    report = classification_report(y_test, y_pred_adjusted, target_names=mlb.classes_, zero_division=0, output_dict=True)
    f1_score = report["samples avg"]["f1-score"]
    
    if f1_score > best_f1:
        best_f1 = f1_score
        best_threshold = threshold
        best_report = report

y_pred_best = (y_pred_prob >= best_threshold).astype(int)

In [11]:
print(classification_report(y_test, y_pred_best, target_names=mlb.classes_))

              precision    recall  f1-score   support

        AGRI       0.78      0.81      0.79       383
        ARTS       0.70      0.70      0.70        93
        BIOC       0.70      0.73      0.71       482
        BUSI       0.61      0.64      0.62       125
        CENG       0.78      0.80      0.79       313
        CHEM       0.80      0.90      0.85       438
        COMP       0.86      0.89      0.87       360
        DECI       0.74      0.69      0.72        75
        DENT       0.77      0.91      0.83        86
        EART       0.77      0.76      0.76       142
        ECON       0.62      0.63      0.62        79
        ENER       0.83      0.86      0.84       220
        ENGI       0.71      0.83      0.77       593
        ENVI       0.83      0.82      0.82       344
        HEAL       0.61      0.62      0.62        53
        IMMU       0.77      0.77      0.77       229
        MATE       0.86      0.87      0.86       403
        MATH       0.74    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.797590351770416


In [15]:
y_pred_labels = mlb.inverse_transform(y_pred_best)
print('predicted labels')
y_pred_labels[:10]

predicted labels


[('MEDI',),
 ('NURS', 'SOCI'),
 ('COMP', 'ENGI', 'MATE', 'MATH'),
 ('MEDI',),
 ('ENGI', 'MATE'),
 ('ARTS', 'COMP', 'DECI', 'ENGI', 'SOCI'),
 ('CHEM', 'MATE'),
 ('CHEM', 'MATH', 'PHYS'),
 ('MEDI', 'NEUR'),
 ('MEDI',)]

In [16]:
y_test_labels = mlb.inverse_transform(y_test)
print('test labels')
y_test_labels[:10]

test labels


[('MEDI',),
 ('NURS',),
 ('COMP', 'ENGI', 'MATE', 'MATH'),
 ('MEDI',),
 ('ENGI', 'MATE'),
 ('ARTS', 'COMP', 'DECI', 'ENGI', 'MEDI', 'SOCI'),
 ('MATE', 'PHYS'),
 ('CHEM', 'COMP', 'MATH', 'PHYS'),
 ('MEDI', 'NEUR'),
 ('BIOC', 'MEDI')]

In [67]:
y_test[2], y_test_labels[2]

(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0]),
 ('COMP', 'ENGI', 'MATE', 'MATH'))

In [None]:
df = pd.DataFrame({
    'Predicted Labels': y_pred_labels,
    'True Labels': y_test_labels
})

df

Unnamed: 0,Predicted Labels,True Labels
0,"(MEDI,)","(MEDI,)"
1,"(NURS, SOCI)","(NURS,)"
2,"(COMP, ENGI, MATE, MATH)","(COMP, ENGI, MATE, MATH)"
3,"(MEDI,)","(MEDI,)"
4,"(ENGI, MATE)","(ENGI, MATE)"
...,...,...
4038,"(CHEM, PHAR)","(PHAR,)"
4039,"(MEDI,)","(MEDI,)"
4040,"(MEDI,)","(MEDI,)"
4041,"(ARTS, SOCI)","(ARTS, SOCI)"


In [None]:
# df.loc[df['Predicted Labels'] != df['True Labels']]

# Scopus uploading

In [21]:
import os
import json
import pandas as pd

scopus_df = pd.read_csv('scopus_data.csv')

In [22]:
scopus_df.head()

Unnamed: 0,dc:title,prism:publicationName,prism:coverDate
0,Social Progress for Resilient Regions,Region,31/12/2018
1,Response: Fleischhauer and Czardybon evade the...,Studies in Language,31/12/2018
2,Fashion language and translatology,Babel,31/12/2018
3,A pragmatic framework to note-taking in consec...,Babel,31/12/2018
4,An Analytic Approximation to the Density of Tw...,Recoletos Multidisciplinary Research Journal,31/12/2018


In [23]:
scopus_df.shape

(3600, 3)

# Start Subject Area Prediction

In [24]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.797590351770416


In [None]:
scopus_text_data = scopus_df['dc:title'] + ' ' + scopus_df['prism:publicationName']
scopus_X = vectorizer.transform(scopus_text_data)  # use the trained vectorizer

# predict probabilities for scopus_df
scopus_pred_prob = model.predict_proba(scopus_X)

# apply the best threshold to get multi-label predictions (if needed)
scopus_pred_labels_multi = (scopus_pred_prob >= best_threshold).astype(int)

# single-label predictions
scopus_pred_single = np.zeros_like(scopus_pred_prob)
scopus_pred_single[np.arange(scopus_pred_prob.shape[0]), scopus_pred_prob.argmax(axis=1)] = 1

In [None]:

scopus_labels_multi = mlb.inverse_transform(scopus_pred_labels_multi)  
scopus_labels_single = mlb.inverse_transform(scopus_pred_single)    

In [None]:

scopus_df['Predicted Subject Area (Multi)'] = scopus_labels_multi
scopus_df['Predicted Subject Area (Single)'] = scopus_labels_single

In [None]:

df_with_pred = scopus_df[['dc:title', 'prism:publicationName', 'Predicted Subject Area (Multi)', 'Predicted Subject Area (Single)','prism:coverDate']]

pd.set_option('display.max_colwidth', None)
df_with_pred

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
0,Social Progress for Resilient Regions,Region,"(SOCI,)","(SOCI,)",31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS,)","(ARTS,)",31/12/2018
2,Fashion language and translatology,Babel,"(ARTS,)","(ARTS,)",31/12/2018
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)",31/12/2018
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)",31/12/2018
...,...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)","(ENGI,)",31/12/2023
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023


In [62]:
df_with_pred['Predicted Subject Area (Multi)']

0            (SOCI,)
1            (ARTS,)
2            (ARTS,)
3                 ()
4                 ()
            ...     
3595         (ENGI,)
3596    (ENGI, MEDI)
3597         (ENGI,)
3598         (ENGI,)
3599         (ENGI,)
Name: Predicted Subject Area (Multi), Length: 3600, dtype: object

In [31]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] == ()]

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)",31/12/2018
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)",31/12/2018
5,DANCE MOTIFS ON PREHISTORIC POTTERY FROM EASTERN CROATIA,Vjesnik Arheoloskog Muzeja u Zagrebu,(),"(MEDI,)",31/12/2018
8,Geographic Information System-Based Suitability Analysis for Potential Shallow Tube-Well Irrigation Development,Recoletos Multidisciplinary Research Journal,(),"(COMP,)",31/12/2018
11,ZBOROVANJE AMERIŠKEGA ZDRUŽENJA GEOGRAFOV 2018,Dela,(),"(MEDI,)",31/12/2018
...,...,...,...,...,...
3537,Terminology in the wild: Enactive meaning-making in the Roman surveyors,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3538,Coming to terms with aristotle: Technical terminology in the Poetics and beyond,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3542,The rise of botanical terminology in the sixteenth and seventeenth centuries,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3544,Coming to terms: Approaches to (ancient) terminologies,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023


In [32]:
filtered_df = df_with_pred[
    (df_with_pred['Predicted Subject Area (Multi)'].apply(lambda x: len(x) == 0)) &
    (df_with_pred['Predicted Subject Area (Single)'].apply(lambda x: 'MEDI' not in x))
]

filtered_df.sample(5)

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
2462,Between economy of effort and speech accuracy in hypokinetic dysarthria,Studi AISV,(),"(SOCI,)",31/12/2022
1932,Innovative approaches to the ancient timbre of anatolian kabak kemane,Online Journal of Music Sciences,(),"(ARTS,)",31/12/2021
1332,"A research on the determination of the period when (Nim) zirgule tone began to be used as a tone while finishing of the maqams of buselik, dugah, humayun and hisar",Online Journal of Music Sciences,(),"(SOCI,)",31/12/2020
960,NousSommes: Collectivity and the digital in French thought and culture,NousSommes: Collectivity and the Digital in French Thought and Culture,(),"(ARTS,)",31/12/2019
921,A new approach for prevention the oxidations and mutations: Zinc borate,Journal of Boron,(),"(ARTS,)",31/12/2019


In [33]:
# df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()]

# Clean up

In [34]:
def change(x):
    x = set(x)
    result = ""
    for area in x:
        result += area + ","
    return result.strip(',')

In [35]:
df_final = df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()].copy()

df_final.drop(columns='Predicted Subject Area (Single)', inplace=True)
df_final.rename(
    columns={
        'dc:title': 'title', 
        'prism:publicationName': 'publicationName', 
        'Predicted Subject Area (Multi)': 'subjectArea',
        'prism:coverDate': 'publication_date'
    }, 
    inplace=True
)
df_final

Unnamed: 0,title,publicationName,subjectArea,publication_date
0,Social Progress for Resilient Regions,Region,"(SOCI,)",31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS,)",31/12/2018
2,Fashion language and translatology,Babel,"(ARTS,)",31/12/2018
6,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,"(ENGI,)",31/12/2018
7,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,"(BUSI, ECON)",31/12/2018
...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)",31/12/2023
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)",31/12/2023
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,"(ENGI,)",31/12/2023
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,"(ENGI,)",31/12/2023


In [36]:
df_final['subjectArea'] = df_final['subjectArea'].apply(list).apply(change)

In [37]:
df_final.loc[df_final['subjectArea'].isna()]

Unnamed: 0,title,publicationName,subjectArea,publication_date


# Merge 2 DataFrames

In [39]:
df = pd.read_csv('data.csv')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216 entries, 0 to 20215
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             20215 non-null  object
 1   publicationName   20216 non-null  object
 2   abstract          19551 non-null  object
 3   keywords          16443 non-null  object
 4   subjectArea       20216 non-null  object
 5   publication_date  20216 non-null  object
dtypes: object(6)
memory usage: 947.8+ KB


In [None]:
df.reset_index(drop=True, inplace=True)
df_columns = ['title','publicationName','abstract','keywords','subjectArea','publication_date']
df_final_columns = ['title','publicationName','subjectArea','publication_date']
df.shape

(20216, 6)

In [42]:
df_final.reset_index(drop=True, inplace=True)

In [None]:

df_final = df_final.reindex(columns=df.columns, fill_value=None)
df_final.head(5)

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Social Progress for Resilient Regions,Region,,,SOCI,31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,,,ARTS,31/12/2018
2,Fashion language and translatology,Babel,,,ARTS,31/12/2018
3,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,,,ENGI,31/12/2018
4,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,,,"ECON,BUSI",31/12/2018


In [None]:

df_all2 = pd.concat([df, df_final], axis=0, ignore_index=True)

In [None]:

df_all2.shape

(22606, 6)

In [51]:
df_all2.sample(1)

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
13552,Biodegradation of 4-nitroaniline by novel isolate Bacillus sp. strain AVPP64 in the presence of pesticides,Environmental Pollution,"© 2022 Elsevier LtdIn this study, Bacillus sp. strain AVPP64 was isolated from diuron-contaminated soil. It showed 4-nitroaniline (4-NA) degradation, pesticide tolerance, and self-nutrient integration via nitrogen (N)-fixation and phosphate (P)-solubilization. The rate constant (k) and half-life period (t1/2) of 4-NA degradation in the aqueous medium inoculated with strain AVPP64 were observed to be 0.445 d−1 and 1.55 d, respectively. Nevertheless, in the presence of chlorpyrifos, profenofos, atrazine and diuron pesticides, strain AVPP64 degraded 4-NA with t1/2 values of 2.55 d, 2.26 d, 2.31 d and 3.54 d, respectively. The strain AVPP64 fixed 140 μg mL−1 of N and solubilized 103 μg mL−1 of P during the presence of 4-NA. In addition, strain AVPP64 produced significant amounts of plant growth-promoting metabolites like indole 3-acetic acid, siderophores, exo-polysaccharides and ammonia. In the presence of 4-NA and various pesticides, strain AVPP64 greatly increased the growth and biomass of Vigna radiata and Crotalaria juncea plants. These results revealed that Bacillus sp. strain AVPP64 can be used as an inoculum for bioremediation of 4-NA contaminated soil and sustainable crop production even when pesticides are present.","4-nitroaniline,Bacillus sp. strain AVPP64,Biodegradation,Nitrogen fixation,Pesticides tolerance,Phosphate solubilization","ENVI,PHAR",01/08/2022


In [52]:
df_all2.tail()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
22601,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,,,ENGI,31/12/2023
22602,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,,,"MEDI,ENGI",31/12/2023
22603,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,,,ENGI,31/12/2023
22604,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,,,ENGI,31/12/2023
22605,High-Precision Modeling and Online Validation of a 200kW-Class Series Hybrid Power System in Aviation,SAE Technical Papers,,,ENGI,31/12/2023


In [118]:
df_all2.to_csv('merged_data.csv', index=False, encoding='utf-8')

print("Data saved to data.csv successfully!")

Data saved to data.csv successfully!
