In [None]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018','2019','2020','2021','2022','2023']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]


                # publication date
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = ",".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)
def change(x):
    x=  set(x)
    result = ""
    for area in x:
        result += area + ","
    
    return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

In [6]:
df

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Public health and international epidemiology f...,"Radiology in Global Health: Strategies, Implem...",,,MEDI,31/12/2018
1,Flexible Printed Active Antenna for Digital Te...,Progress in Electromagnetics Research Symposium,"© 2018 The Institute of Electronics, Informati...",,ENGI,31/12/2018
2,Parametric study of hydrogen production via so...,Chemical Engineering Science,© 2018 Elsevier LtdComputational fluid dynamic...,Circulating fluidized bed Computational fluid ...,CHEM,31/12/2018
3,Superhydrophobic coating from fluoroalkylsilan...,Applied Surface Science,© 2018 Elsevier B.V. A superhydrophobic/supero...,Encapsulation Fluoroalkylsilane Natural rubber...,CHEM,31/12/2018
4,Electrochemical impedance-based DNA sensor usi...,Analytica Chimica Acta,© 2018 Elsevier B.V. A label-free electrochemi...,acpcPNA Electrochemical impedance spectroscopy...,CHEM,31/12/2018
...,...,...,...,...,...,...
16818,Long-chain bio-olefins production via oxidativ...,Catalysis Today,© 2021 Elsevier B.V.Long-chain α-olefins (≥ C1...,Long-chain olefins Mesoporous KIT-6 Oleic acid...,CENG,01/01/2023
16819,Recent Developments and Applications of Microf...,Critical Reviews in Analytical Chemistry,"© 2021 Taylor & Francis Group, LLC.Nowadays, f...",Biological hazards chemical hazards food conta...,CHEM,01/01/2023
16820,"Social justice, education and peacebuilding: c...",Compare,© 2021 The Author(s). Published by Informa UK ...,conflict Education peacebuilding social justic...,SOCI,01/01/2023
16821,Effects of black soldier fly (Hermetia illucen...,Journal of Applied Aquaculture,© 2021 Taylor & Francis.The effects of replaci...,Anabas testudineus Black soldier fly fish meal...,ENVI,01/01/2023


In [12]:
df.dropna(inplace=True) #axis=0 

In [4]:
df.shape

(13572, 6)

In [8]:
df.groupby('subjectArea').describe()

Unnamed: 0_level_0,title,title,title,title,publicationName,publicationName,publicationName,publicationName,abstract,abstract,abstract,abstract,keywords,keywords,keywords,keywords,abstract_cleaned,abstract_cleaned,abstract_cleaned,abstract_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
subjectArea,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
AGRI,899,899,"Penicillium Imranianum, a new species from the...",1,899,289,Nutrients,39,899,899,pakistan botanical society rights reserved pen...,1,899,897,African swine fever virus Compound disinfectan...,2,899,899,pakistan botanical society rights reserved pen...,1
ARTS,171,171,The long reach of English law: A case of incid...,1,171,63,Nakhara: Journal of Environmental Design and P...,26,171,171,informa uk limited trading taylor francis gr...,1,171,171,Comparative law Employers’ liability Legal tra...,1,171,171,informa uk limited trading taylor francis gr...,1
BIOC,934,933,"The effects of beliefs, knowledge, and attitud...",2,934,323,International Journal of Biological Macromolec...,37,934,934,© author(s 2017.taste perception influenced fa...,1,934,933,bioenergy biomolecule production biorefineries...,2,934,934,© author(s 2017.taste perception influenced fa...,1
BUSI,280,280,The critical factors of research and innovatio...,1,280,146,International Journal of Supply Chain Management,23,280,280,copyright inderscience enterprises ltd. univ...,1,280,279,Green supply chain Sustainability Thailand,2,280,280,copyright inderscience enterprises ltd. univ...,1
CENG,581,581,Fibrous platelet carbon nanofibers-silica fibe...,1,581,101,International Journal of Molecular Sciences,85,581,581,elsevier b.v.a novel fibrous composite plate...,1,581,578,epigenetics Ezh2 lipopolysaccharide macrophage...,2,581,581,elsevier b.v.a novel fibrous composite plate...,1
CHEM,825,825,Comparison of soil composition between farmlan...,1,825,159,Molecules,82,825,825,society innovative research rights reserved ...,1,825,825,Agriculture land management Conserved area Soi...,1,825,825,society innovative research rights reserved ...,1
COMP,870,869,Deep multispectral painting reproduction via m...,2,870,277,ACM International Conference Proceeding Series,96,870,870,power system security assessment operational...,1,870,868,machine learning Neural networks recommender s...,2,870,870,power system security assessment operational...,1
DECI,17,17,BEstream: Batch Capturing with Elliptic Functi...,1,17,12,2021 IEEE 8th International Conference on Indu...,6,17,17,elsevier b.v.tremendous data generated forms...,1,17,17,Data stream clustering Elliptic-micro-cluster ...,1,17,17,elsevier b.v.tremendous data generated forms...,1
DENT,233,233,Pediatric cleft palate patients show a 3- to 5...,1,233,70,BMC Oral Health,20,233,233,springer verlag gmbh germany springer nature o...,1,233,233,Cleft palate Pediatric dentistry Radiation dos...,1,233,233,springer verlag gmbh germany springer nature o...,1
EART,183,183,Acid volatile sulphide estimation using spatia...,1,183,79,Frontiers in Earth Science,13,183,183,institute oceanology polish academy sciences...,1,183,182,active fault Mae Hong Son Basin Mae Hong Son F...,2,183,183,institute oceanology polish academy sciences...,1


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import compute_class_weight
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Assuming you have a DataFrame 'df' with columns: title, publicationName, abstract, keywords, subjectArea

# Text preprocessing and vectorization
text_data = df['title'] + ' ' + df['publicationName'] + ' ' + df['abstract'] + ' ' + df['keywords']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
y = df['subjectArea']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create and train XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.72      0.70       184
           1       0.83      0.48      0.61        31
           2       0.54      0.44      0.48       201
           3       0.62      0.47      0.53        49
           4       0.79      0.63      0.70       125
           5       0.60      0.81      0.69       144
           6       0.78      0.85      0.81       185
           7       0.00      0.00      0.00         1
           8       0.79      0.66      0.72        50
           9       0.82      0.50      0.62        28
          10       0.69      0.42      0.52        26
          11       0.63      0.60      0.62        73
          12       0.59      0.58      0.58       149
          13       0.61      0.60      0.61       112
          14       0.57      0.20      0.30        20
          15       0.71      0.52      0.60        90
          16       0.67      0.67      0.67       145
          17       0.85    