In [40]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018','2019','2020','2021','2022','2023']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]

                # publication date
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = " ".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)
def change(x):
        x=  set(x)
        result = ""
        for area in x:
            result += area + ","

        return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

In [39]:
df

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,EIS Microstructure Pitting corrosion Polarizat...,"PHYS,MATE",01/10/2018
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,Critical factors Innovation creation Public un...,"BUSI,ECON",01/01/2018
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,Cobb angle Dowager's hump Round back Spine,HEAL,01/12/2018
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,Agriculture land management Conserved area Soi...,"CHEM,PHAR",01/01/2018
5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,"© 2017, NATCO. All rights reserved.Introductio...",Choice Consumer wellness Decision-making Neigh...,MEDI,01/03/2018
...,...,...,...,...,...,...
20211,A Techno-Economic Assessment of a Second-Life ...,Sustainability (Switzerland),© 2023 by the authors.This study discusses the...,battery degradation electric vehicle charging ...,"ENER,ENVI,COMP,ENGI,SOCI",01/04/2023
20212,Encouraging green product purchase: Green valu...,Business Strategy and the Environment,© 2022 ERP Environment and John Wiley & Sons L...,attitude–behavior gap environmental knowledge ...,"BUSI,ENVI,SOCI",01/01/2023
20213,Does leukocytosis remain a predictive factor f...,Hematology (United Kingdom),© 2023 The Author(s). Published by Informa UK ...,Acute promyelocytic leukemia APL Thailand,MEDI,01/01/2023
20214,Administration of ketoprofen in postpartum sow...,Animal Bioscience,© 2023 by Animal Bioscience.Objective: Inflamm...,Colostrum Inflammation Ketoprofen Lactational Pig,"VETE,BIOC,AGRI",01/08/2023


In [31]:
df.dropna(inplace=True) #axis=0 

In [32]:
df.shape

(16319, 6)

Clean Abstract

In [33]:
import spacy
print(spacy.__version__)

3.8.2


In [34]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to clean abstract text
def clean_abstract(abstract):
    # Remove copyright symbols and years (like "© 2019")
    abstract = re.sub(r'©\s*\d{4}', '', abstract)
    doc = nlp(abstract)
    cleaned_text = " ".join([token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_digit])

    return cleaned_text

# Example abstract
abstract = "© 2019 2019 Aungsuroch et al., published by Sciendo. There is no single study that has examined nursing research priorities in Vietnam..."

# Clean the abstract
cleaned_abstract = clean_abstract(abstract)

# Output cleaned abstract
print(cleaned_abstract)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [13]:
df['abstract_cleaned'] = [clean_abstract(a) for a in df['abstract'].to_list()]

In [14]:
df['abstract_cleaned'] = df['abstract_cleaned'].str.replace(r'\bieee.\b', '', regex=True)
df['abstract'] = df['abstract_cleaned']
df.drop(columns=['abstract_cleaned'], inplace=True)
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
2,Parametric study of hydrogen production via so...,Chemical Engineering Science,elsevier ltdcomputational fluid dynamics app...,Circulating fluidized bed Computational fluid ...,CHEM,31/12/2018
3,Superhydrophobic coating from fluoroalkylsilan...,Applied Surface Science,elsevier b.v. superhydrophobic superoleophil...,Encapsulation Fluoroalkylsilane Natural rubber...,CHEM,31/12/2018
4,Electrochemical impedance-based DNA sensor usi...,Analytica Chimica Acta,elsevier b.v. label free electrochemical dna...,acpcPNA Electrochemical impedance spectroscopy...,CHEM,31/12/2018
5,Evaluation of outsourcing transportation contr...,Polish Journal of Management Studies,czestochowa university technology rights reser...,Design of experiment Optimal fleet size Outsou...,BUSI,30/12/2018
6,The phenotypic and mutational spectrum of Thai...,Gene,elsevier b.v.ornithine transcarbamylase defi...,Female Hyperammonemia Novel mutations Ornithin...,BIOC,30/12/2018


In [15]:
# Save DataFrame as a CSV file
df.to_csv('output.csv', index=True)

In [8]:
df.groupby('subjectArea').describe()

Unnamed: 0_level_0,title,title,title,title,publicationName,publicationName,publicationName,publicationName,abstract,abstract,abstract,abstract,keywords,keywords,keywords,keywords,abstract_cleaned,abstract_cleaned,abstract_cleaned,abstract_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
subjectArea,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
AGRI,899,899,"Penicillium Imranianum, a new species from the...",1,899,289,Nutrients,39,899,899,pakistan botanical society rights reserved pen...,1,899,897,African swine fever virus Compound disinfectan...,2,899,899,pakistan botanical society rights reserved pen...,1
ARTS,171,171,The long reach of English law: A case of incid...,1,171,63,Nakhara: Journal of Environmental Design and P...,26,171,171,informa uk limited trading taylor francis gr...,1,171,171,Comparative law Employers’ liability Legal tra...,1,171,171,informa uk limited trading taylor francis gr...,1
BIOC,934,933,"The effects of beliefs, knowledge, and attitud...",2,934,323,International Journal of Biological Macromolec...,37,934,934,© author(s 2017.taste perception influenced fa...,1,934,933,bioenergy biomolecule production biorefineries...,2,934,934,© author(s 2017.taste perception influenced fa...,1
BUSI,280,280,The critical factors of research and innovatio...,1,280,146,International Journal of Supply Chain Management,23,280,280,copyright inderscience enterprises ltd. univ...,1,280,279,Green supply chain Sustainability Thailand,2,280,280,copyright inderscience enterprises ltd. univ...,1
CENG,581,581,Fibrous platelet carbon nanofibers-silica fibe...,1,581,101,International Journal of Molecular Sciences,85,581,581,elsevier b.v.a novel fibrous composite plate...,1,581,578,epigenetics Ezh2 lipopolysaccharide macrophage...,2,581,581,elsevier b.v.a novel fibrous composite plate...,1
CHEM,825,825,Comparison of soil composition between farmlan...,1,825,159,Molecules,82,825,825,society innovative research rights reserved ...,1,825,825,Agriculture land management Conserved area Soi...,1,825,825,society innovative research rights reserved ...,1
COMP,870,869,Deep multispectral painting reproduction via m...,2,870,277,ACM International Conference Proceeding Series,96,870,870,power system security assessment operational...,1,870,868,machine learning Neural networks recommender s...,2,870,870,power system security assessment operational...,1
DECI,17,17,BEstream: Batch Capturing with Elliptic Functi...,1,17,12,2021 IEEE 8th International Conference on Indu...,6,17,17,elsevier b.v.tremendous data generated forms...,1,17,17,Data stream clustering Elliptic-micro-cluster ...,1,17,17,elsevier b.v.tremendous data generated forms...,1
DENT,233,233,Pediatric cleft palate patients show a 3- to 5...,1,233,70,BMC Oral Health,20,233,233,springer verlag gmbh germany springer nature o...,1,233,233,Cleft palate Pediatric dentistry Radiation dos...,1,233,233,springer verlag gmbh germany springer nature o...,1
EART,183,183,Acid volatile sulphide estimation using spatia...,1,183,79,Frontiers in Earth Science,13,183,183,institute oceanology polish academy sciences...,1,183,182,active fault Mae Hong Son Basin Mae Hong Son F...,2,183,183,institute oceanology polish academy sciences...,1


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import compute_class_weight
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Assuming you have a DataFrame 'df' with columns: title, publicationName, abstract, keywords, subjectArea

# Text preprocessing and vectorization
text_data = df['title'] + ' ' + df['publicationName'] + ' ' + df['abstract'] + ' ' + df['keywords']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
y = df['subjectArea']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create and train XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.72      0.70       184
           1       0.83      0.48      0.61        31
           2       0.54      0.44      0.48       201
           3       0.62      0.47      0.53        49
           4       0.79      0.63      0.70       125
           5       0.60      0.81      0.69       144
           6       0.78      0.85      0.81       185
           7       0.00      0.00      0.00         1
           8       0.79      0.66      0.72        50
           9       0.82      0.50      0.62        28
          10       0.69      0.42      0.52        26
          11       0.63      0.60      0.62        73
          12       0.59      0.58      0.58       149
          13       0.61      0.60      0.61       112
          14       0.57      0.20      0.30        20
          15       0.71      0.52      0.60        90
          16       0.67      0.67      0.67       145
          17       0.85    

In [None]:
# Decode numeric predictions back to original labels
y_pred_labels = le.inverse_transform(y_pred)

# Example: Print the first 10 decoded predictions
print(y_pred_labels[:10])
