# Data preparation

In [2]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018','2019','2020','2021','2022','2023']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]


                # publication date
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = ",".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)
def change(x):
    x=  set(x)
    result = ""
    for area in x:
        result += area + ","
    
    return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

In [3]:
df.dropna(inplace=True)

# Subject Area prediction

In [4]:
# make list
df['subjectArea'] = df['subjectArea'].apply(lambda x: x.split(','))

In [5]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","[MATE, PHYS]",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","[ECON, BUSI]",01/01/2018
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"Cobb angle,Dowager's hump,Round back,Spine",[HEAL],01/12/2018
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","[CHEM, PHAR]",01/01/2018
5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,"© 2017, NATCO. All rights reserved.Introductio...","Choice,Consumer wellness,Decision-making,Neigh...",[MEDI],01/03/2018


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


# Text preprocessing and vectorization
text_data = df['title'] + ' ' + df['publicationName']  # You can also add 'keywords'
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(text_data)

# MultiLabelBinarizer to handle multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['subjectArea'])  # Convert subjectArea to multi-hot encoded labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = OneVsRestClassifier(XGBClassifier(eval_metric="logloss"))
model.fit(X_train, y_train)

# Get prediction probabilities
y_pred_prob = model.predict_proba(X_test)

# Tune Threshold
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold = 0.5
best_f1 = 0
best_report = None

for threshold in thresholds:
    # Apply threshold
    y_pred_adjusted = (y_pred_prob >= threshold).astype(int)
    
    # Evaluate with classification report and suppress warnings
    report = classification_report(y_test, y_pred_adjusted, target_names=mlb.classes_, zero_division=0, output_dict=True)
    f1_score = report["samples avg"]["f1-score"]
    
    if f1_score > best_f1:
        best_f1 = f1_score
        best_threshold = threshold
        best_report = report

# Final Evaluation
y_pred_best = (y_pred_prob >= best_threshold).astype(int)

In [7]:
print(classification_report(y_test, y_pred_best, target_names=mlb.classes_))

              precision    recall  f1-score   support

        AGRI       0.79      0.86      0.82       324
        ARTS       0.86      0.76      0.81        82
        BIOC       0.71      0.78      0.74       405
        BUSI       0.74      0.83      0.78       106
        CENG       0.78      0.77      0.78       270
        CHEM       0.80      0.87      0.83       360
        COMP       0.85      0.91      0.88       334
        DECI       0.76      0.74      0.75        69
        DENT       0.83      0.81      0.82        77
        EART       0.71      0.71      0.71        91
        ECON       0.74      0.76      0.75        72
        ENER       0.80      0.84      0.82       197
        ENGI       0.77      0.87      0.82       545
        ENVI       0.75      0.80      0.77       300
        HEAL       0.65      0.59      0.62        51
        IMMU       0.80      0.86      0.83       199
        MATE       0.86      0.84      0.85       362
        MATH       0.75    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.7953808038366863


# Scopus uploading

In [11]:
import os
import json
import pandas as pd

scopus_df = pd.read_csv('scopus_data.csv')

In [12]:
scopus_df.head()

Unnamed: 0,dc:title,prism:publicationName,prism:coverDate
0,Social Progress for Resilient Regions,Region,2018-12-31
1,Response: Fleischhauer and Czardybon evade the...,Studies in Language,2018-12-31
2,Fashion language and translatology,Babel,2018-12-31
3,A pragmatic framework to note-taking in consec...,Babel,2018-12-31
4,An Analytic Approximation to the Density of Tw...,Recoletos Multidisciplinary Research Journal,2018-12-31


# Start Subject Area Prediction

In [25]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.7953808038366863


In [13]:
# Preprocess scopus_df
scopus_text_data = scopus_df['dc:title'] + ' ' + scopus_df['prism:publicationName']
scopus_X = vectorizer.transform(scopus_text_data)  # Use the trained vectorizer

# Predict probabilities for scopus_df
scopus_pred_prob = model.predict_proba(scopus_X)

# Apply the best threshold to get multi-label predictions (if needed)
scopus_pred_labels_multi = (scopus_pred_prob >= best_threshold).astype(int)

# Convert multi-label predictions to single-label predictions (choose the highest probability)
scopus_pred_single = np.zeros_like(scopus_pred_prob)
scopus_pred_single[np.arange(scopus_pred_prob.shape[0]), scopus_pred_prob.argmax(axis=1)] = 1

In [14]:
# Convert predictions to readable labels
scopus_labels_multi = mlb.inverse_transform(scopus_pred_labels_multi)  # Multi-label predictions
scopus_labels_single = mlb.inverse_transform(scopus_pred_single)      # Single-label predictions

In [15]:
# Display results
scopus_df['Predicted Subject Area (Multi)'] = scopus_labels_multi
scopus_df['Predicted Subject Area (Single)'] = scopus_labels_single

In [16]:
# Print a sample of the predictions
df_with_pred = scopus_df[['dc:title', 'prism:publicationName', 'Predicted Subject Area (Multi)', 'Predicted Subject Area (Single)']]

pd.set_option('display.max_colwidth', None)
df_with_pred

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single)
0,Social Progress for Resilient Regions,Region,"(SOCI,)","(SOCI,)"
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS, SOCI)","(ARTS,)"
2,Fashion language and translatology,Babel,(),"(ARTS,)"
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)"
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)"
...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)","(ENGI,)"
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)","(ENGI,)"
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,(),"(BIOC,)"
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,(),"(MEDI,)"


In [17]:
df_with_pred['Predicted Subject Area (Multi)']

0            (SOCI,)
1       (ARTS, SOCI)
2                 ()
3                 ()
4                 ()
            ...     
3595         (ENGI,)
3596    (ENGI, MEDI)
3597              ()
3598              ()
3599              ()
Name: Predicted Subject Area (Multi), Length: 3600, dtype: object

In [18]:
(df_with_pred['Predicted Subject Area (Multi)'] == ()).sum()

np.int64(1243)

In [19]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] == ()]

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single)
2,Fashion language and translatology,Babel,(),"(ARTS,)"
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)"
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)"
5,DANCE MOTIFS ON PREHISTORIC POTTERY FROM EASTERN CROATIA,Vjesnik Arheoloskog Muzeja u Zagrebu,(),"(MEDI,)"
11,ZBOROVANJE AMERIŠKEGA ZDRUŽENJA GEOGRAFOV 2018,Dela,(),"(MEDI,)"
...,...,...,...,...
3590,TD3 Tuned PID Controller for Autonomous Vehicle Platooning,SAE Technical Papers,(),"(MEDI,)"
3594,Machine Learning Based Flight State Prediction for Improving UAV Resistance to Uncertainty,SAE Technical Papers,(),"(BIOC,)"
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,(),"(BIOC,)"
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,(),"(MEDI,)"


In [20]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] == ()].groupby('Predicted Subject Area (Single)')['Predicted Subject Area (Multi)'].count()

Predicted Subject Area (Single)
(AGRI,)     12
(ARTS,)     31
(BIOC,)     52
(BUSI,)      4
(CENG,)      1
(CHEM,)      4
(COMP,)     21
(EART,)      1
(ECON,)      1
(ENGI,)     55
(ENVI,)      4
(IMMU,)      1
(MATE,)      3
(MATH,)      4
(MEDI,)    963
(MULT,)      1
(NEUR,)      2
(NURS,)      1
(PHAR,)      1
(PHYS,)      4
(PSYC,)      2
(SOCI,)     75
Name: Predicted Subject Area (Multi), dtype: int64

In [21]:
filtered_df = df_with_pred[
    (df_with_pred['Predicted Subject Area (Multi)'].apply(lambda x: len(x) == 0)) &
    (df_with_pred['Predicted Subject Area (Single)'].apply(lambda x: 'MEDI' not in x))
]

filtered_df

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single)
2,Fashion language and translatology,Babel,(),"(ARTS,)"
87,Influence of soil macrofauna on soil organic carbon content,Ochrona Srodowiska i Zasobow Naturalnych,(),"(CHEM,)"
115,"Human, not too Human: Technology, Rites, and Identity",Open Information Science,(),"(COMP,)"
142,Bioremediation of textile dyes: Appraisal of conventional and biological approaches,Phytobiont and Ecosystem Restitution,(),"(CENG,)"
155,Rhizoremediation of azodyes by constructed wetland technology using Typha latifolia,Phytobiont and Ecosystem Restitution,(),"(ENGI,)"
...,...,...,...,...
3581,Design and Evaluation of Electric Propulsion System for Electric VTOL,SAE Technical Papers,(),"(ENGI,)"
3584,Improved Prandini Conflict Detection Algorithm Based on Trajectory Prediction,SAE Technical Papers,(),"(BIOC,)"
3594,Machine Learning Based Flight State Prediction for Improving UAV Resistance to Uncertainty,SAE Technical Papers,(),"(BIOC,)"
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,(),"(BIOC,)"


In [22]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()]

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single)
0,Social Progress for Resilient Regions,Region,"(SOCI,)","(SOCI,)"
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS, SOCI)","(ARTS,)"
6,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,"(SOCI,)","(SOCI,)"
7,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,"(BUSI, ECON, PSYC)","(BUSI,)"
8,Geographic Information System-Based Suitability Analysis for Potential Shallow Tube-Well Irrigation Development,Recoletos Multidisciplinary Research Journal,"(COMP, DECI)","(DECI,)"
...,...,...,...,...
3591,Implementation of Predictive Adaptive Cruise Control Strategy Based on ADAS Map,SAE Technical Papers,"(COMP, ENGI)","(COMP,)"
3592,Research on Gear Vibration Evaluation Approach of E-Drive System Based on Order Analysis,SAE Technical Papers,"(BIOC,)","(BIOC,)"
3593,Research on Switchable Energy-Regenerative Suspension System,SAE Technical Papers,"(ENER,)","(ENER,)"
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)","(ENGI,)"


# Clean up

In [45]:
# df_final[df_final['Predicted Subject Area (Multi)'].apply(lambda x: len(x) > 2)]

In [44]:
df_final = df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()].copy()

df_final.rename(
    columns={
        'dc:title': 'title', 
        'prism:publicationName': 'publicationName', 
        'Predicted Subject Area (Multi)': 'subjectArea',
        'Predicted Subject Area (Single)': 'subjectArea_single'
    }, 
    inplace=True
)
df_final['subjectArea'] = df_final['subjectArea'].apply(list)
df_final['subjectArea_single'] = df_final['subjectArea_single'].apply(list)
df_final

Unnamed: 0,title,publicationName,subjectArea,subjectArea_single
0,Social Progress for Resilient Regions,Region,[SOCI],[SOCI]
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"[ARTS, SOCI]",[ARTS]
6,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,[SOCI],[SOCI]
7,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,"[BUSI, ECON, PSYC]",[BUSI]
8,Geographic Information System-Based Suitability Analysis for Potential Shallow Tube-Well Irrigation Development,Recoletos Multidisciplinary Research Journal,"[COMP, DECI]",[DECI]
...,...,...,...,...
3591,Implementation of Predictive Adaptive Cruise Control Strategy Based on ADAS Map,SAE Technical Papers,"[COMP, ENGI]",[COMP]
3592,Research on Gear Vibration Evaluation Approach of E-Drive System Based on Order Analysis,SAE Technical Papers,[BIOC],[BIOC]
3593,Research on Switchable Energy-Regenerative Suspension System,SAE Technical Papers,[ENER],[ENER]
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,[ENGI],[ENGI]


# Try Single prediction with y_test

In [23]:
# Step 1: Predict probabilities for X_test
y_pred_prob_test = model.predict_proba(X_test)

# Step 2: Convert to single-label predictions
y_pred_single_test = np.zeros_like(y_pred_prob_test)
y_pred_single_test[np.arange(y_pred_prob_test.shape[0]), y_pred_prob_test.argmax(axis=1)] = 1

# Step 3: Evaluate against y_test
print("Classification Report (Single-Label Predictions vs y_test):")
print(classification_report(y_test, y_pred_single_test, target_names=mlb.classes_, zero_division=0))

# Step 4: Convert predictions to readable labels
y_test_labels = mlb.inverse_transform(y_test)                # Convert y_test to label lists
y_pred_labels_single_test = mlb.inverse_transform(y_pred_single_test)  # Convert predictions to label lists

# Step 5: Display a few comparisons
comparison_df = pd.DataFrame({
    "True Labels": y_test_labels,
    "Predicted Labels (Single)": y_pred_labels_single_test
})


Classification Report (Single-Label Predictions vs y_test):
              precision    recall  f1-score   support

        AGRI       0.86      0.47      0.61       324
        ARTS       0.93      0.46      0.62        82
        BIOC       0.84      0.34      0.48       405
        BUSI       0.88      0.43      0.58       106
        CENG       0.89      0.30      0.45       270
        CHEM       0.94      0.41      0.57       360
        COMP       0.96      0.60      0.74       334
        DECI       1.00      0.13      0.23        69
        DENT       0.95      0.69      0.80        77
        EART       0.93      0.44      0.60        91
        ECON       0.95      0.49      0.64        72
        ENER       0.91      0.46      0.61       197
        ENGI       0.90      0.35      0.50       545
        ENVI       0.87      0.44      0.59       300
        HEAL       0.93      0.27      0.42        51
        IMMU       0.98      0.41      0.57       199
        MATE       0.

In [24]:
print("Comparison of True and Predicted Labels (First 10 Samples):")
comparison_df.head(10)

Comparison of True and Predicted Labels (First 10 Samples):


Unnamed: 0,True Labels,Predicted Labels (Single)
0,"(ENGI,)","(ENGI,)"
1,"(BUSI, COMP, DECI, ENGI, SOCI)","(COMP,)"
2,"(MATE,)","(MATE,)"
3,"(DENT,)","(DENT,)"
4,"(ARTS, SOCI)","(SOCI,)"
5,"(MEDI, PHAR)","(PHAR,)"
6,"(MEDI,)","(MEDI,)"
7,"(BIOC, MEDI)","(BIOC,)"
8,"(COMP,)","(COMP,)"
9,"(CHEM,)","(CHEM,)"
