# Data preparation

In [30]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018','2019','2020','2021','2022','2023']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]


                # publication date
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = ",".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)

def change(x):
    x=  set(x)
    result = ""
    for area in x:
        result += area + ","
    
    return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

In [31]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"Cobb angle,Dowager's hump,Round back,Spine",HEAL,01/12/2018
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","PHAR,CHEM",01/01/2018
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI,01/06/2018


In [None]:
df.to_csv('data.csv', index=False, encoding='utf-8')
print("Data saved to data.csv successfully!")

Data saved to data.csv successfully!


In [34]:
df_new = pd.read_csv('data.csv')
df_new.head()
df_new['keywords'][0].split(',')
#for x in df_new['keywords'][0]

['EIS',
 'Microstructure',
 'Pitting corrosion',
 'Polarization',
 'Titanium alloy']

In [19]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"[EIS, Microstructure, Pitting corrosion, Polar...","[MATE, PHYS]",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"[Critical factors, Innovation creation, Public...","[BUSI, ECON]",01/01/2018
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"[Cobb angle, Dowager's hump, Round back, Spine]",[HEAL],01/12/2018
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"[Agriculture land management, Conserved area, ...","[CHEM, PHAR, CHEM, CHEM]",01/01/2018
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,[],"[MEDI, MEDI]",01/06/2018


In [14]:
df['keywords']

0        anxiety,cultural beliefs,fear of committing er...
1        anxiety,cultural beliefs,fear of committing er...
2        anxiety,cultural beliefs,fear of committing er...
3        anxiety,cultural beliefs,fear of committing er...
4                                                     None
                               ...                        
20211                                                 None
20212                                                 None
20213                                                 None
20214                                                 None
20215                                                 None
Name: keywords, Length: 20216, dtype: object

In [8]:
df.dropna(inplace=True)

# Subject Area prediction

In [9]:
# make list
#df['subjectArea'] = df['subjectArea'].apply(lambda x: x.split(','))

In [10]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","[MATE, PHYS]",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","[BUSI, ECON]",01/01/2018
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"Cobb angle,Dowager's hump,Round back,Spine",[HEAL],01/12/2018
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","[CHEM, PHAR, CHEM, CHEM]",01/01/2018
5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,"© 2017, NATCO. All rights reserved.Introductio...","Choice,Consumer wellness,Decision-making,Neigh...",[MEDI],01/03/2018


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


# Text preprocessing and vectorization
text_data = df['title'] + ' ' + df['publicationName']  # You can also add 'keywords'
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(text_data)

# MultiLabelBinarizer to handle multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['subjectArea'])  # Convert subjectArea to multi-hot encoded labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = OneVsRestClassifier(XGBClassifier(eval_metric="logloss"))
model.fit(X_train, y_train)

# Get prediction probabilities
y_pred_prob = model.predict_proba(X_test)

# Tune Threshold
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold = 0.5
best_f1 = 0
best_report = None

for threshold in thresholds:
    # Apply threshold
    y_pred_adjusted = (y_pred_prob >= threshold).astype(int)
    
    # Evaluate with classification report and suppress warnings
    report = classification_report(y_test, y_pred_adjusted, target_names=mlb.classes_, zero_division=0, output_dict=True)
    f1_score = report["samples avg"]["f1-score"]
    
    if f1_score > best_f1:
        best_f1 = f1_score
        best_threshold = threshold
        best_report = report

# Final Evaluation
y_pred_best = (y_pred_prob >= best_threshold).astype(int)

In [13]:
print(classification_report(y_test, y_pred_best, target_names=mlb.classes_))

              precision    recall  f1-score   support

        AGRI       0.79      0.86      0.82       324
        ARTS       0.86      0.76      0.81        82
        BIOC       0.71      0.78      0.74       405
        BUSI       0.74      0.83      0.78       106
        CENG       0.78      0.77      0.78       270
        CHEM       0.80      0.87      0.83       360
        COMP       0.85      0.91      0.88       334
        DECI       0.76      0.74      0.75        69
        DENT       0.83      0.81      0.82        77
        EART       0.71      0.71      0.71        91
        ECON       0.74      0.76      0.75        72
        ENER       0.80      0.84      0.82       197
        ENGI       0.77      0.87      0.82       545
        ENVI       0.75      0.80      0.77       300
        HEAL       0.65      0.59      0.62        51
        IMMU       0.80      0.86      0.83       199
        MATE       0.86      0.84      0.85       362
        MATH       0.75    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.7953808038366863


In [15]:
y_pred_labels = mlb.inverse_transform(y_pred_best)
print('predicted labels')
y_pred_labels[:10]

predicted labels


[('ENGI',),
 ('BUSI', 'COMP', 'DECI', 'ENGI', 'SOCI'),
 ('CHEM', 'ENGI', 'MATE', 'PHYS'),
 ('BIOC', 'DENT', 'MEDI'),
 ('ARTS', 'SOCI'),
 ('MEDI', 'PHAR'),
 ('AGRI', 'MEDI'),
 ('AGRI', 'BIOC', 'MEDI'),
 ('COMP', 'ENGI', 'MATH'),
 ('CENG', 'CHEM')]

In [16]:
y_test_labels = mlb.inverse_transform(y_test)
print('test labels')
y_test_labels[:10]

test labels


[('ENGI',),
 ('BUSI', 'COMP', 'DECI', 'ENGI', 'SOCI'),
 ('MATE',),
 ('DENT',),
 ('ARTS', 'SOCI'),
 ('MEDI', 'PHAR'),
 ('MEDI',),
 ('BIOC', 'MEDI'),
 ('COMP',),
 ('CHEM',)]