# Data preparation

In [1]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018','2019','2020','2021','2022','2023']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r', encoding="utf-8") as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = [item.get('@abbrev', None) for item in subject_area_list if '@abbrev' in item]


                # publication date
                date = response.get('item', {}).get("ait:process-info", {}).get("ait:date-sort",{})
                day = date.get("@day")
                year = date.get("@year")
                month = date.get('@month')
                format_date = f"{day}/{month}/{year}"

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = ",".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea,
                    'publication_date': format_date
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)
def change(x):
    x=  set(x)
    result = ""
    for area in x:
        result += area + ","
    
    return result[:-1]
df["subjectArea"] = df["subjectArea"].apply(change)

In [41]:
df.shape

(20216, 6)

In [42]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure and corrosion behavior of Ti-30Zr-5Al-3V-xFe alloys,Materials Chemistry and Physics,"© 2018The microstructure and corrosion behavior of the hot-rolled Ti-30Zr-5Al-3V-xFe alloys were investigated. All the alloys are composed of α and β phases. With the increase of Fe content, α laths gradually decreases and the relative content of β phase increases. Potentiodynamic polarization curves recorded in 3.5% NaCl and 5% HCl solutions show that all the alloys exhibit a passivation behavior at potential in the range of 0.25–1.25 V (SCE). An evident change in corrosion current density is presented for all of the alloys in both solutions, which can be attributed to the passive film breakdown caused by pitting corrosion. All the alloys exhibit more positive pitting potential compared with the typical 304 stainless steel. The impedance spectra were fitted using equivalent circuit with two time constants. The alloys with different Fe contents exhibit capacitive behavior with phase angles closed to −80° and high impedance values at low frequency. Containing Fe alloys exhibit commendable combination of mechanical properties and corrosion resistance, which is markedly superior to the typical 304 stainless steel although the corrosion resistance decreases by the addition of Fe. The relative content of α and β phases and the segregations of Fe and V elements are identified as the main factors that affect the corrosion resistance of the hot-rolled alloys.","EIS,Microstructure,Pitting corrosion,Polarization,Titanium alloy","MATE,PHYS",01/10/2018
1,The critical factors of research and innovation creation in public universities in Thailand,International Journal of Trade and Global Markets,"Copyright © 2018 Inderscience Enterprises Ltd.A university, which is the main form of higher education institution (HEI), aims to develop and promote education, generate academic excellence and transfer academic knowledge to solve social problems. To enhance the innovation creation in universities, the relevant factors that affect such creation should be investigated. Hence, this study aims to investigate the critical factors that affect research and innovation creation in public universities in Thailand. The relevant literature was reviewed and research was conducted using a qualitative research approach. Data were collected from in-depth interviews of 11 executives purposely selected from 8 frontier public universities in Thailand. The data were analysed using NVivo qualitative research software. Results revealed 15 critical factors in the following descending order: resource, goal, database and information technology, working environment, managerial process, policy, network, organisational structure, human resource management, government support, strategy, compensation, organisation culture, vision and leader factor. Moreover, results identified the issues that should be addressed.","Critical factors,Innovation creation,Public university,Research","BUSI,ECON",01/01/2018
2,Is the occiput-wall distance valid and reliable to determine the presence of thoracic hyperkyphosis?,Musculoskeletal Science and Practice,"© 2018Background: Hyperkyphosis may be frequently found nowadays due to the change in current lifestyles of sustained flexion postures and age-related system decline. The occiput-wall distance (OWD) is a practical measure that is commonly used to screen and monitor thoracic hyperkyphosis in epidemiologic studies. However, there was no clear evidence to support the clinical utility of the tool as compared to the data from direct standard measures. Objectives: To investigate psychometric properties—including validity, reliability, and appropriate cut-off point—of the OWD to determine the presence of thoracic hyperkyphosis, as compared to a standard Cobb's method. Design: Observational study. Methods: This study was conducted in ninety-nine participants, aged 10 years and above who had different degrees of thoracic hyperkyphosis from several communities. All participants were assessed for their severity of thoracic hyperkyphosis using the OWD, and 14 participants were involved in a reliability study. Within 7 days later, all participants were at a hospital to complete a radiographic examination. Results: Outcomes from OWD had good concurrent validity with the Cobb angles (r = 0.683, P < 0.001) and excellent rater reliability when assessed by well-trained health professionals (ICCs > 0.9, P < 0.001). The OWD of at least 6.5 cm had the best diagnostic properties to determine the presence of thoracic hyperkyphosis (sensitivity = 71.4%, specificity = 76.6%, and area under the curve = 0.846). Conclusion: The findings support validity and reliability of OWD, and offer a clear cut-off point to determine the presence of thoracic hyperkyphosis for clinical utility in various settings.","Cobb angle,Dowager's hump,Round back,Spine",HEAL,01/12/2018
3,Comparison of soil composition between farmlands and conserved area,Eurasian Journal of Analytical Chemistry,"© 2018 Society for Innovative Research. All rights reserved.Thai farmers usually have low formal education and lack of knowledge on soil quality improvement and proper use of fertilizers. After a few years of farming, they try to trespass in conserved forest areas because of soil deterioration in their own limited expanses of farmland and they believe that soils in the conserved area are more fertile than soils in their own farms. Consequently, most of them are arrested, creating individual and family problems. This project will compare the physical and chemical properties of soils from farmlands and soils from the conserved area. The results showed that soil nutrients from both farmlands and soils from the conserved area were not significantly different in nearly all parameters of analysis except soils from the conserved area have more organic matters and nitrogen content. However, both soils from farmlands and conserved area were sandy loam which has low water content and low cation exchange capacity. The analysis data were informed to the farmers and suggested them to improve their farmlands using appropriate organic matters and suitable plants have to be chosen to match the sandy loam soil in order to get more productivity. Water supply management also has to be improved. However, mixed farming is another good planting method. Some kinds of plants can fix nitrogen in soil such as legumes. They can enrich soil nutrients. The most important issue is that farmers have to be acknowledged that soils in the conserved areas are not more fertile than soils in their farmlands. Therefore, in the future, farmers will not trespass conserve areas.","Agriculture land management,Conserved area,Soil conservation","PHAR,CHEM",01/01/2018
4,The impact of wire caliber on ERCP outcomes: a multicenter randomized controlled trial of 0.025-inch and 0.035-inch guidewires,Gastrointestinal Endoscopy,"© 2018Background and Aims: Wire-guided biliary cannulation has been demonstrated to improve cannulation rates and reduce post-ERCP pancreatitis (PEP), but the impact of wire caliber has not been studied. This study compares successful cannulation rates and ERCP adverse events by using a 0.025-inch and 0.035-inch guidewire. Methods: A randomized, single blinded, prospective, multicenter trial at 9 high-volume tertiary-care referral centers in the Asia-Pacific region was performed. Patients with an intact papilla and conventional anatomy who did not have malignancy in the head of the pancreas or ampulla and were undergoing ERCP were recruited. ERCP was performed by using a standardized cannulation algorithm, and patients were randomized to either a 0.025-inch or 0.035-inch guidewire. The primary outcomes of the study were successful wire-guided cannulation and the incidence of PEP. Overall successful cannulation and ERCP adverse events also were studied. Results: A total of 710 patients were enrolled in the study. The primary wire-guided biliary cannulation rate was similar in 0.025-inch and 0.035-inch wire groups (80.7% vs 80.3%; P =.90). The rate of PEP between the 0.025-inch and the 0.035-inch wire groups did not differ significantly (7.8% vs 9.3%; P =.51). No differences were noted in secondary outcomes. Conclusion: Similar rates of successful cannulation and PEP were demonstrated in the use of 0.025-inch and 0.035-inch guidewires. (Clinical trial registration number: NCT01408264.)",,MEDI,01/06/2018


In [43]:
df.dropna(subset=['title','publicationName'], inplace=True)

In [44]:
df.shape

(20215, 6)

# Subject Area prediction

In [45]:
# make list
df['subjectArea'] = df['subjectArea'].apply(lambda x: x.split(','))

In [46]:
df.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure and corrosion behavior of Ti-30Zr-5Al-3V-xFe alloys,Materials Chemistry and Physics,"© 2018The microstructure and corrosion behavior of the hot-rolled Ti-30Zr-5Al-3V-xFe alloys were investigated. All the alloys are composed of α and β phases. With the increase of Fe content, α laths gradually decreases and the relative content of β phase increases. Potentiodynamic polarization curves recorded in 3.5% NaCl and 5% HCl solutions show that all the alloys exhibit a passivation behavior at potential in the range of 0.25–1.25 V (SCE). An evident change in corrosion current density is presented for all of the alloys in both solutions, which can be attributed to the passive film breakdown caused by pitting corrosion. All the alloys exhibit more positive pitting potential compared with the typical 304 stainless steel. The impedance spectra were fitted using equivalent circuit with two time constants. The alloys with different Fe contents exhibit capacitive behavior with phase angles closed to −80° and high impedance values at low frequency. Containing Fe alloys exhibit commendable combination of mechanical properties and corrosion resistance, which is markedly superior to the typical 304 stainless steel although the corrosion resistance decreases by the addition of Fe. The relative content of α and β phases and the segregations of Fe and V elements are identified as the main factors that affect the corrosion resistance of the hot-rolled alloys.","EIS,Microstructure,Pitting corrosion,Polarization,Titanium alloy","[MATE, PHYS]",01/10/2018
1,The critical factors of research and innovation creation in public universities in Thailand,International Journal of Trade and Global Markets,"Copyright © 2018 Inderscience Enterprises Ltd.A university, which is the main form of higher education institution (HEI), aims to develop and promote education, generate academic excellence and transfer academic knowledge to solve social problems. To enhance the innovation creation in universities, the relevant factors that affect such creation should be investigated. Hence, this study aims to investigate the critical factors that affect research and innovation creation in public universities in Thailand. The relevant literature was reviewed and research was conducted using a qualitative research approach. Data were collected from in-depth interviews of 11 executives purposely selected from 8 frontier public universities in Thailand. The data were analysed using NVivo qualitative research software. Results revealed 15 critical factors in the following descending order: resource, goal, database and information technology, working environment, managerial process, policy, network, organisational structure, human resource management, government support, strategy, compensation, organisation culture, vision and leader factor. Moreover, results identified the issues that should be addressed.","Critical factors,Innovation creation,Public university,Research","[BUSI, ECON]",01/01/2018
2,Is the occiput-wall distance valid and reliable to determine the presence of thoracic hyperkyphosis?,Musculoskeletal Science and Practice,"© 2018Background: Hyperkyphosis may be frequently found nowadays due to the change in current lifestyles of sustained flexion postures and age-related system decline. The occiput-wall distance (OWD) is a practical measure that is commonly used to screen and monitor thoracic hyperkyphosis in epidemiologic studies. However, there was no clear evidence to support the clinical utility of the tool as compared to the data from direct standard measures. Objectives: To investigate psychometric properties—including validity, reliability, and appropriate cut-off point—of the OWD to determine the presence of thoracic hyperkyphosis, as compared to a standard Cobb's method. Design: Observational study. Methods: This study was conducted in ninety-nine participants, aged 10 years and above who had different degrees of thoracic hyperkyphosis from several communities. All participants were assessed for their severity of thoracic hyperkyphosis using the OWD, and 14 participants were involved in a reliability study. Within 7 days later, all participants were at a hospital to complete a radiographic examination. Results: Outcomes from OWD had good concurrent validity with the Cobb angles (r = 0.683, P < 0.001) and excellent rater reliability when assessed by well-trained health professionals (ICCs > 0.9, P < 0.001). The OWD of at least 6.5 cm had the best diagnostic properties to determine the presence of thoracic hyperkyphosis (sensitivity = 71.4%, specificity = 76.6%, and area under the curve = 0.846). Conclusion: The findings support validity and reliability of OWD, and offer a clear cut-off point to determine the presence of thoracic hyperkyphosis for clinical utility in various settings.","Cobb angle,Dowager's hump,Round back,Spine",[HEAL],01/12/2018
3,Comparison of soil composition between farmlands and conserved area,Eurasian Journal of Analytical Chemistry,"© 2018 Society for Innovative Research. All rights reserved.Thai farmers usually have low formal education and lack of knowledge on soil quality improvement and proper use of fertilizers. After a few years of farming, they try to trespass in conserved forest areas because of soil deterioration in their own limited expanses of farmland and they believe that soils in the conserved area are more fertile than soils in their own farms. Consequently, most of them are arrested, creating individual and family problems. This project will compare the physical and chemical properties of soils from farmlands and soils from the conserved area. The results showed that soil nutrients from both farmlands and soils from the conserved area were not significantly different in nearly all parameters of analysis except soils from the conserved area have more organic matters and nitrogen content. However, both soils from farmlands and conserved area were sandy loam which has low water content and low cation exchange capacity. The analysis data were informed to the farmers and suggested them to improve their farmlands using appropriate organic matters and suitable plants have to be chosen to match the sandy loam soil in order to get more productivity. Water supply management also has to be improved. However, mixed farming is another good planting method. Some kinds of plants can fix nitrogen in soil such as legumes. They can enrich soil nutrients. The most important issue is that farmers have to be acknowledged that soils in the conserved areas are not more fertile than soils in their farmlands. Therefore, in the future, farmers will not trespass conserve areas.","Agriculture land management,Conserved area,Soil conservation","[PHAR, CHEM]",01/01/2018
4,The impact of wire caliber on ERCP outcomes: a multicenter randomized controlled trial of 0.025-inch and 0.035-inch guidewires,Gastrointestinal Endoscopy,"© 2018Background and Aims: Wire-guided biliary cannulation has been demonstrated to improve cannulation rates and reduce post-ERCP pancreatitis (PEP), but the impact of wire caliber has not been studied. This study compares successful cannulation rates and ERCP adverse events by using a 0.025-inch and 0.035-inch guidewire. Methods: A randomized, single blinded, prospective, multicenter trial at 9 high-volume tertiary-care referral centers in the Asia-Pacific region was performed. Patients with an intact papilla and conventional anatomy who did not have malignancy in the head of the pancreas or ampulla and were undergoing ERCP were recruited. ERCP was performed by using a standardized cannulation algorithm, and patients were randomized to either a 0.025-inch or 0.035-inch guidewire. The primary outcomes of the study were successful wire-guided cannulation and the incidence of PEP. Overall successful cannulation and ERCP adverse events also were studied. Results: A total of 710 patients were enrolled in the study. The primary wire-guided biliary cannulation rate was similar in 0.025-inch and 0.035-inch wire groups (80.7% vs 80.3%; P =.90). The rate of PEP between the 0.025-inch and the 0.035-inch wire groups did not differ significantly (7.8% vs 9.3%; P =.51). No differences were noted in secondary outcomes. Conclusion: Similar rates of successful cannulation and PEP were demonstrated in the use of 0.025-inch and 0.035-inch guidewires. (Clinical trial registration number: NCT01408264.)",,[MEDI],01/06/2018


In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


# Text preprocessing and vectorization
text_data = df['title'] + ' ' + df['publicationName']  # You can also add 'keywords'
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(text_data)

# MultiLabelBinarizer to handle multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['subjectArea'])  # Convert subjectArea to multi-hot encoded labels

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = OneVsRestClassifier(XGBClassifier(eval_metric="logloss"))
model.fit(X_train, y_train)

# Get prediction probabilities
y_pred_prob = model.predict_proba(X_test)

# Tune Threshold
thresholds = np.arange(0.1, 0.9, 0.1)
best_threshold = 0.5
best_f1 = 0
best_report = None

for threshold in thresholds:
    # Apply threshold
    y_pred_adjusted = (y_pred_prob >= threshold).astype(int)
    
    # Evaluate with classification report and suppress warnings
    report = classification_report(y_test, y_pred_adjusted, target_names=mlb.classes_, zero_division=0, output_dict=True)
    f1_score = report["samples avg"]["f1-score"]
    
    if f1_score > best_f1:
        best_f1 = f1_score
        best_threshold = threshold
        best_report = report

# Final Evaluation
y_pred_best = (y_pred_prob >= best_threshold).astype(int)

In [6]:
print(classification_report(y_test, y_pred_best, target_names=mlb.classes_))

              precision    recall  f1-score   support

        AGRI       0.78      0.81      0.79       383
        ARTS       0.70      0.70      0.70        93
        BIOC       0.70      0.73      0.71       482
        BUSI       0.61      0.64      0.62       125
        CENG       0.78      0.80      0.79       313
        CHEM       0.80      0.90      0.85       438
        COMP       0.86      0.89      0.87       360
        DECI       0.74      0.69      0.72        75
        DENT       0.77      0.91      0.83        86
        EART       0.77      0.76      0.76       142
        ECON       0.62      0.63      0.62        79
        ENER       0.83      0.86      0.84       220
        ENGI       0.71      0.83      0.77       593
        ENVI       0.83      0.82      0.82       344
        HEAL       0.61      0.62      0.62        53
        IMMU       0.77      0.77      0.77       229
        MATE       0.86      0.87      0.86       403
        MATH       0.74    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.797590351770416


# Scopus uploading

In [49]:
import os
import json
import pandas as pd

scopus_df = pd.read_csv('scopus_data.csv')

In [50]:
scopus_df.head()

Unnamed: 0,dc:title,prism:publicationName,prism:coverDate
0,Social Progress for Resilient Regions,Region,31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,31/12/2018
2,Fashion language and translatology,Babel,31/12/2018
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,31/12/2018
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,31/12/2018


In [10]:
'''
def set_date(date_str):
    d = date_str.split('-')[2]
    m = date_str.split('-')[1]
    y = date_str.split('-')[0]
    formatted_date = d+'/'+m+'/'+y
    return formatted_date
'''

"\ndef set_date(date_str):\n    d = date_str.split('-')[2]\n    m = date_str.split('-')[1]\n    y = date_str.split('-')[0]\n    formatted_date = d+'/'+m+'/'+y\n    return formatted_date\n"

In [11]:
#scopus_df['prism:coverDate'] = scopus_df['prism:coverDate'].apply(set_date)

In [51]:
scopus_df.shape

(3600, 3)

In [13]:
#scopus_df.head()

In [14]:
#scopus_df.to_csv('scopus_data.csv', index=False, encoding='utf-8')

# Start Subject Area Prediction

In [66]:
print(f"Best Threshold: {best_threshold}")
print(f"Best F1-Score: {best_f1}")

Best Threshold: 0.2
Best F1-Score: 0.797590351770416


In [142]:
# Preprocess scopus_df
scopus_text_data = scopus_df['dc:title'] + ' ' + scopus_df['prism:publicationName']
scopus_X = vectorizer.transform(scopus_text_data)  # Use the trained vectorizer

# Predict probabilities for scopus_df
scopus_pred_prob = model.predict_proba(scopus_X)

# Apply the best threshold to get multi-label predictions (if needed)
scopus_pred_labels_multi = (scopus_pred_prob >= best_threshold).astype(int)

# Convert multi-label predictions to single-label predictions (choose the highest probability)
scopus_pred_single = np.zeros_like(scopus_pred_prob)
scopus_pred_single[np.arange(scopus_pred_prob.shape[0]), scopus_pred_prob.argmax(axis=1)] = 1

In [143]:
# Convert predictions to readable labels
scopus_labels_multi = mlb.inverse_transform(scopus_pred_labels_multi)  # Multi-label predictions
scopus_labels_single = mlb.inverse_transform(scopus_pred_single)      # Single-label predictions

In [144]:
# Display results
scopus_df['Predicted Subject Area (Multi)'] = scopus_labels_multi
scopus_df['Predicted Subject Area (Single)'] = scopus_labels_single

In [145]:
# Print a sample of the predictions
df_with_pred = scopus_df[['dc:title', 'prism:publicationName', 'Predicted Subject Area (Multi)', 'Predicted Subject Area (Single)','prism:coverDate']]

pd.set_option('display.max_colwidth', None)
df_with_pred

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
0,Social Progress for Resilient Regions,Region,"(SOCI,)","(SOCI,)",31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS,)","(ARTS,)",31/12/2018
2,Fashion language and translatology,Babel,"(ARTS,)","(ARTS,)",31/12/2018
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)",31/12/2018
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)",31/12/2018
...,...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)","(ENGI,)",31/12/2023
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023


In [71]:
df_with_pred['Predicted Subject Area (Multi)']

0            (SOCI,)
1            (ARTS,)
2            (ARTS,)
3                 ()
4                 ()
            ...     
3595         (ENGI,)
3596    (ENGI, MEDI)
3597         (ENGI,)
3598         (ENGI,)
3599         (ENGI,)
Name: Predicted Subject Area (Multi), Length: 3600, dtype: object

In [146]:
(df_with_pred['Predicted Subject Area (Multi)'] == ()).sum()

np.int64(1210)

In [148]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] == ()]

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
3,A pragmatic framework to note-taking in consecutive interpretation,Babel,(),"(MEDI,)",31/12/2018
4,An Analytic Approximation to the Density of Twin Primes,Recoletos Multidisciplinary Research Journal,(),"(MEDI,)",31/12/2018
5,DANCE MOTIFS ON PREHISTORIC POTTERY FROM EASTERN CROATIA,Vjesnik Arheoloskog Muzeja u Zagrebu,(),"(MEDI,)",31/12/2018
8,Geographic Information System-Based Suitability Analysis for Potential Shallow Tube-Well Irrigation Development,Recoletos Multidisciplinary Research Journal,(),"(COMP,)",31/12/2018
11,ZBOROVANJE AMERIŠKEGA ZDRUŽENJA GEOGRAFOV 2018,Dela,(),"(MEDI,)",31/12/2018
...,...,...,...,...,...
3537,Terminology in the wild: Enactive meaning-making in the Roman surveyors,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3538,Coming to terms with aristotle: Technical terminology in the Poetics and beyond,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3542,The rise of botanical terminology in the sixteenth and seventeenth centuries,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023
3544,Coming to terms: Approaches to (ancient) terminologies,Coming to Terms: Approaches to (Ancient) Terminologies,(),"(MEDI,)",31/12/2023


In [60]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] == ()].groupby('Predicted Subject Area (Single)')['Predicted Subject Area (Multi)'].count()

Predicted Subject Area (Single)
(AGRI,)       7
(ARTS,)      20
(BIOC,)      14
(BUSI,)       4
(CENG,)       5
(CHEM,)       5
(COMP,)      13
(DENT,)       1
(EART,)       3
(ECON,)       1
(ENER,)       1
(ENGI,)      15
(ENVI,)       6
(MATE,)       4
(MATH,)       5
(MEDI,)    1023
(MULT,)       6
(NEUR,)       2
(PHYS,)       1
(SOCI,)      72
(VETE,)       2
Name: Predicted Subject Area (Multi), dtype: int64

In [149]:
filtered_df = df_with_pred[
    (df_with_pred['Predicted Subject Area (Multi)'].apply(lambda x: len(x) == 0)) &
    (df_with_pred['Predicted Subject Area (Single)'].apply(lambda x: 'MEDI' not in x))
]

filtered_df

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
8,Geographic Information System-Based Suitability Analysis for Potential Shallow Tube-Well Irrigation Development,Recoletos Multidisciplinary Research Journal,(),"(COMP,)",31/12/2018
115,"Human, not too Human: Technology, Rites, and Identity",Open Information Science,(),"(SOCI,)",31/12/2018
155,Rhizoremediation of azodyes by constructed wetland technology using Typha latifolia,Phytobiont and Ecosystem Restitution,(),"(ENGI,)",31/12/2018
161,Using PLS-SEM to model Family Business Behavior when addressing the protocol,European Journal of Family Business,(),"(BUSI,)",31/12/2018
205,Thermodynamic speed of sound for multiphase multi-reactive equilibrium systems,Freeze-Out and HYSYS Implementation,(),"(COMP,)",31/12/2018
...,...,...,...,...,...
3508,Catalytic role of ionic liquids in the synthesis of bioactive O-heterocycles under solvent-free conditions,Solvent-Free Synthesis: Bioactive Heterocycles,(),"(CENG,)",31/12/2023
3524,Synthesis of nitrogen-containing heterocyclic rings using grinding approach,Solvent-Free Synthesis: Bioactive Heterocycles,(),"(CENG,)",31/12/2023
3526,Solvent-free microwave-assisted green synthesis of heterocyclic compounds,Solvent-Free Synthesis: Bioactive Heterocycles,(),"(ENVI,)",31/12/2023
3527,Microwave-assisted solvent-free synthesis of benzazoles,Solvent-Free Synthesis: Bioactive Heterocycles,(),"(CENG,)",31/12/2023


In [62]:
df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()]

Unnamed: 0,dc:title,prism:publicationName,Predicted Subject Area (Multi),Predicted Subject Area (Single),prism:coverDate
0,Social Progress for Resilient Regions,Region,"(SOCI,)","(SOCI,)",31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS,)","(ARTS,)",31/12/2018
2,Fashion language and translatology,Babel,"(ARTS,)","(ARTS,)",31/12/2018
6,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,"(ENGI,)","(ENGI,)",31/12/2018
7,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,"(BUSI, ECON)","(BUSI,)",31/12/2018
...,...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)","(ENGI,)",31/12/2023
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,"(ENGI,)","(ENGI,)",31/12/2023


# Clean up

In [26]:
# df_final[df_final['Predicted Subject Area (Multi)'].apply(lambda x: len(x) > 2)]

In [102]:
def change(x):
    x = set(x)
    result = ""
    for area in x:
        result += area + ","
    return result.strip(',')

In [99]:
df_final = df_with_pred[df_with_pred['Predicted Subject Area (Multi)'] != ()].copy()

df_final.drop(columns='Predicted Subject Area (Single)', inplace=True)
df_final.rename(
    columns={
        'dc:title': 'title', 
        'prism:publicationName': 'publicationName', 
        'Predicted Subject Area (Multi)': 'subjectArea',
        'prism:coverDate': 'publication_date'
    }, 
    inplace=True
)
df_final

Unnamed: 0,title,publicationName,subjectArea,publication_date
0,Social Progress for Resilient Regions,Region,"(SOCI,)",31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,"(ARTS,)",31/12/2018
2,Fashion language and translatology,Babel,"(ARTS,)",31/12/2018
6,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,"(ENGI,)",31/12/2018
7,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,"(BUSI, ECON)",31/12/2018
...,...,...,...,...
3595,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,"(ENGI,)",31/12/2023
3596,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,"(ENGI, MEDI)",31/12/2023
3597,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,"(ENGI,)",31/12/2023
3598,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,"(ENGI,)",31/12/2023


In [103]:
df_final['subjectArea'] = df_final['subjectArea'].apply(list).apply(change)

In [109]:
df_final.loc[df_final['subjectArea'].isna()]

Unnamed: 0,title,publicationName,subjectArea,publication_date


# Merge 2 DataFrames

In [110]:
df = pd.read_csv('data.csv')

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216 entries, 0 to 20215
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             20215 non-null  object
 1   publicationName   20216 non-null  object
 2   abstract          19551 non-null  object
 3   keywords          16443 non-null  object
 4   subjectArea       20216 non-null  object
 5   publication_date  20216 non-null  object
dtypes: object(6)
memory usage: 947.8+ KB


In [113]:
df.reset_index(drop=True, inplace=True)
df_columns = ['title','publicationName','abstract','keywords','subjectArea','publication_date']
df_final_columns = ['title','publicationName','subjectArea','publication_date']
# I want to concat or merge these two dataframes.The columns acording to df if not have in df_final just make it None
df.shape

(20216, 6)

In [114]:
df_final.reset_index(drop=True, inplace=True)

In [115]:
# Ensure that df_final has all the columns from df with NaN where the column is missing
df_final = df_final.reindex(columns=df.columns, fill_value=None)
df_final

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Social Progress for Resilient Regions,Region,,,SOCI,31/12/2018
1,Response: Fleischhauer and Czardybon evade the burden of proof,Studies in Language,,,ARTS,31/12/2018
2,Fashion language and translatology,Babel,,,ARTS,31/12/2018
3,Ricoeur’s Hermeneutics: Transforming Political Structures into Just Institutions through the Critical Appropriation of Political Power,Recoletos Multidisciplinary Research Journal,,,ENGI,31/12/2018
4,Socio-Economic Indicators of Coastal Resource Management Participation: The Bataan Case,Recoletos Multidisciplinary Research Journal,,,"ECON,BUSI",31/12/2018
...,...,...,...,...,...,...
2385,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,,,ENGI,31/12/2023
2386,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023
2387,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,,,ENGI,31/12/2023
2388,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,,,ENGI,31/12/2023


In [116]:
# Concatenate the two DataFrames
df_all2 = pd.concat([df, df_final], axis=0, ignore_index=True)

In [117]:
# Check the result
df_all2.shape

(22606, 6)

In [36]:
df_all2.head()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
0,Effects of iron content on the microstructure and corrosion behavior of Ti-30Zr-5Al-3V-xFe alloys,Materials Chemistry and Physics,"© 2018The microstructure and corrosion behavior of the hot-rolled Ti-30Zr-5Al-3V-xFe alloys were investigated. All the alloys are composed of α and β phases. With the increase of Fe content, α laths gradually decreases and the relative content of β phase increases. Potentiodynamic polarization curves recorded in 3.5% NaCl and 5% HCl solutions show that all the alloys exhibit a passivation behavior at potential in the range of 0.25–1.25 V (SCE). An evident change in corrosion current density is presented for all of the alloys in both solutions, which can be attributed to the passive film breakdown caused by pitting corrosion. All the alloys exhibit more positive pitting potential compared with the typical 304 stainless steel. The impedance spectra were fitted using equivalent circuit with two time constants. The alloys with different Fe contents exhibit capacitive behavior with phase angles closed to −80° and high impedance values at low frequency. Containing Fe alloys exhibit commendable combination of mechanical properties and corrosion resistance, which is markedly superior to the typical 304 stainless steel although the corrosion resistance decreases by the addition of Fe. The relative content of α and β phases and the segregations of Fe and V elements are identified as the main factors that affect the corrosion resistance of the hot-rolled alloys.","EIS,Microstructure,Pitting corrosion,Polarization,Titanium alloy","MATE,PHYS",01/10/2018
1,The critical factors of research and innovation creation in public universities in Thailand,International Journal of Trade and Global Markets,"Copyright © 2018 Inderscience Enterprises Ltd.A university, which is the main form of higher education institution (HEI), aims to develop and promote education, generate academic excellence and transfer academic knowledge to solve social problems. To enhance the innovation creation in universities, the relevant factors that affect such creation should be investigated. Hence, this study aims to investigate the critical factors that affect research and innovation creation in public universities in Thailand. The relevant literature was reviewed and research was conducted using a qualitative research approach. Data were collected from in-depth interviews of 11 executives purposely selected from 8 frontier public universities in Thailand. The data were analysed using NVivo qualitative research software. Results revealed 15 critical factors in the following descending order: resource, goal, database and information technology, working environment, managerial process, policy, network, organisational structure, human resource management, government support, strategy, compensation, organisation culture, vision and leader factor. Moreover, results identified the issues that should be addressed.","Critical factors,Innovation creation,Public university,Research","BUSI,ECON",01/01/2018
2,Is the occiput-wall distance valid and reliable to determine the presence of thoracic hyperkyphosis?,Musculoskeletal Science and Practice,"© 2018Background: Hyperkyphosis may be frequently found nowadays due to the change in current lifestyles of sustained flexion postures and age-related system decline. The occiput-wall distance (OWD) is a practical measure that is commonly used to screen and monitor thoracic hyperkyphosis in epidemiologic studies. However, there was no clear evidence to support the clinical utility of the tool as compared to the data from direct standard measures. Objectives: To investigate psychometric properties—including validity, reliability, and appropriate cut-off point—of the OWD to determine the presence of thoracic hyperkyphosis, as compared to a standard Cobb's method. Design: Observational study. Methods: This study was conducted in ninety-nine participants, aged 10 years and above who had different degrees of thoracic hyperkyphosis from several communities. All participants were assessed for their severity of thoracic hyperkyphosis using the OWD, and 14 participants were involved in a reliability study. Within 7 days later, all participants were at a hospital to complete a radiographic examination. Results: Outcomes from OWD had good concurrent validity with the Cobb angles (r = 0.683, P < 0.001) and excellent rater reliability when assessed by well-trained health professionals (ICCs > 0.9, P < 0.001). The OWD of at least 6.5 cm had the best diagnostic properties to determine the presence of thoracic hyperkyphosis (sensitivity = 71.4%, specificity = 76.6%, and area under the curve = 0.846). Conclusion: The findings support validity and reliability of OWD, and offer a clear cut-off point to determine the presence of thoracic hyperkyphosis for clinical utility in various settings.","Cobb angle,Dowager's hump,Round back,Spine",HEAL,01/12/2018
3,Comparison of soil composition between farmlands and conserved area,Eurasian Journal of Analytical Chemistry,"© 2018 Society for Innovative Research. All rights reserved.Thai farmers usually have low formal education and lack of knowledge on soil quality improvement and proper use of fertilizers. After a few years of farming, they try to trespass in conserved forest areas because of soil deterioration in their own limited expanses of farmland and they believe that soils in the conserved area are more fertile than soils in their own farms. Consequently, most of them are arrested, creating individual and family problems. This project will compare the physical and chemical properties of soils from farmlands and soils from the conserved area. The results showed that soil nutrients from both farmlands and soils from the conserved area were not significantly different in nearly all parameters of analysis except soils from the conserved area have more organic matters and nitrogen content. However, both soils from farmlands and conserved area were sandy loam which has low water content and low cation exchange capacity. The analysis data were informed to the farmers and suggested them to improve their farmlands using appropriate organic matters and suitable plants have to be chosen to match the sandy loam soil in order to get more productivity. Water supply management also has to be improved. However, mixed farming is another good planting method. Some kinds of plants can fix nitrogen in soil such as legumes. They can enrich soil nutrients. The most important issue is that farmers have to be acknowledged that soils in the conserved areas are not more fertile than soils in their farmlands. Therefore, in the future, farmers will not trespass conserve areas.","Agriculture land management,Conserved area,Soil conservation","PHAR,CHEM",01/01/2018
4,The impact of wire caliber on ERCP outcomes: a multicenter randomized controlled trial of 0.025-inch and 0.035-inch guidewires,Gastrointestinal Endoscopy,"© 2018Background and Aims: Wire-guided biliary cannulation has been demonstrated to improve cannulation rates and reduce post-ERCP pancreatitis (PEP), but the impact of wire caliber has not been studied. This study compares successful cannulation rates and ERCP adverse events by using a 0.025-inch and 0.035-inch guidewire. Methods: A randomized, single blinded, prospective, multicenter trial at 9 high-volume tertiary-care referral centers in the Asia-Pacific region was performed. Patients with an intact papilla and conventional anatomy who did not have malignancy in the head of the pancreas or ampulla and were undergoing ERCP were recruited. ERCP was performed by using a standardized cannulation algorithm, and patients were randomized to either a 0.025-inch or 0.035-inch guidewire. The primary outcomes of the study were successful wire-guided cannulation and the incidence of PEP. Overall successful cannulation and ERCP adverse events also were studied. Results: A total of 710 patients were enrolled in the study. The primary wire-guided biliary cannulation rate was similar in 0.025-inch and 0.035-inch wire groups (80.7% vs 80.3%; P =.90). The rate of PEP between the 0.025-inch and the 0.035-inch wire groups did not differ significantly (7.8% vs 9.3%; P =.51). No differences were noted in secondary outcomes. Conclusion: Similar rates of successful cannulation and PEP were demonstrated in the use of 0.025-inch and 0.035-inch guidewires. (Clinical trial registration number: NCT01408264.)",,MEDI,01/06/2018


In [119]:
df_all2.tail()

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date
22601,A SOM-Based Trajectory Planning Analysis Method for Intelligent Groups System,SAE Technical Papers,,,ENGI,31/12/2023
22602,Overview and Research on Airworthiness and Safety of Electrical Propulsion and Battery Technologies in eVTOL,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023
22603,Aeroengine Gas Path Parameter Trend Prediction Based on LSTM,SAE Technical Papers,,,ENGI,31/12/2023
22604,A Wind Tunnel Investigation on the Aerodynamics of the Propulsion Wing for a Novel eVTOL Vehicle,SAE Technical Papers,,,ENGI,31/12/2023
22605,High-Precision Modeling and Online Validation of a 200kW-Class Series Hybrid Power System in Aviation,SAE Technical Papers,,,ENGI,31/12/2023


In [118]:
df_all2.to_csv('merged_data.csv', index=False, encoding='utf-8')

print("Data saved to data.csv successfully!")

Data saved to data.csv successfully!


In [121]:
def get_first_subject_area(value):
    if isinstance(value, str):  # Check if the value is a string
        return value.split(',')[0]
    return "Unknown"  # Use a placeholder for missing or invalid data

In [129]:
df['subjectArea_first'] = df['subjectArea'].apply(get_first_subject_area)

In [134]:
arr1 = df['subjectArea_first'].unique()
arr1.sort()
arr1

array(['AGRI', 'ARTS', 'BIOC', 'BUSI', 'CENG', 'CHEM', 'COMP', 'DECI',
       'DENT', 'EART', 'ECON', 'ENER', 'ENGI', 'ENVI', 'HEAL', 'IMMU',
       'MATE', 'MATH', 'MEDI', 'MULT', 'NEUR', 'NURS', 'PHAR', 'PHYS',
       'PSYC', 'SOCI', 'VETE'], dtype=object)

In [122]:
df_final['subjectArea_first'] = df_final['subjectArea'].apply(get_first_subject_area)

In [135]:
arr2 = df_final['subjectArea_first'].unique()
arr2.sort()
arr2

array(['AGRI', 'ARTS', 'BIOC', 'BUSI', 'CENG', 'CHEM', 'COMP', 'DENT',
       'EART', 'ECON', 'ENER', 'ENGI', 'ENVI', 'HEAL', 'IMMU', 'MATE',
       'MATH', 'MEDI', 'MULT', 'NEUR', 'NURS', 'PHAR', 'PHYS', 'PSYC',
       'SOCI', 'VETE'], dtype=object)

In [124]:
df_final['subjectArea_first'].value_counts()

subjectArea_first
SOCI    587
MEDI    471
ENGI    287
COMP    193
ARTS    142
BIOC    117
PHYS     96
AGRI     66
ECON     59
ENVI     54
BUSI     44
IMMU     33
HEAL     32
MATH     32
NURS     31
NEUR     27
EART     22
VETE     20
CENG     13
CHEM     13
MULT     13
ENER     11
DENT     10
PHAR      7
MATE      5
PSYC      5
Name: count, dtype: int64

# Try Single prediction with y_test

In [39]:
# Step 1: Predict probabilities for X_test
y_pred_prob_test = model.predict_proba(X_test)

# Step 2: Convert to single-label predictions
y_pred_single_test = np.zeros_like(y_pred_prob_test)
y_pred_single_test[np.arange(y_pred_prob_test.shape[0]), y_pred_prob_test.argmax(axis=1)] = 1

# Step 3: Evaluate against y_test
print("Classification Report (Single-Label Predictions vs y_test):")
print(classification_report(y_test, y_pred_single_test, target_names=mlb.classes_, zero_division=0))

# Step 4: Convert predictions to readable labels
y_test_labels = mlb.inverse_transform(y_test)                # Convert y_test to label lists
y_pred_labels_single_test = mlb.inverse_transform(y_pred_single_test)  # Convert predictions to label lists

# Step 5: Display a few comparisons
comparison_df = pd.DataFrame({
    "True Labels": y_test_labels,
    "Predicted Labels (Single)": y_pred_labels_single_test
})


Classification Report (Single-Label Predictions vs y_test):
              precision    recall  f1-score   support

        AGRI       0.88      0.48      0.62       383
        ARTS       0.87      0.43      0.58        93
        BIOC       0.86      0.31      0.45       482
        BUSI       0.80      0.39      0.53       125
        CENG       0.91      0.34      0.50       313
        CHEM       0.90      0.36      0.51       438
        COMP       0.95      0.60      0.74       360
        DECI       0.75      0.08      0.14        75
        DENT       0.88      0.81      0.84        86
        EART       0.91      0.42      0.58       142
        ECON       0.75      0.38      0.50        79
        ENER       0.91      0.55      0.69       220
        ENGI       0.88      0.35      0.50       593
        ENVI       0.94      0.47      0.63       344
        HEAL       0.69      0.21      0.32        53
        IMMU       0.97      0.37      0.53       229
        MATE       0.

In [40]:
print("Comparison of True and Predicted Labels (First 10 Samples):")
comparison_df.head(10)

Comparison of True and Predicted Labels (First 10 Samples):


Unnamed: 0,True Labels,Predicted Labels (Single)
0,"(MEDI,)","(MEDI,)"
1,"(NURS,)","(NURS,)"
2,"(COMP, ENGI, MATE, MATH)","(MATE,)"
3,"(MEDI,)","(MEDI,)"
4,"(ENGI, MATE)","(ENGI,)"
5,"(ARTS, COMP, DECI, ENGI, MEDI, SOCI)","(ENGI,)"
6,"(MATE, PHYS)","(MATE,)"
7,"(CHEM, COMP, MATH, PHYS)","(PHYS,)"
8,"(MEDI, NEUR)","(MEDI,)"
9,"(BIOC, MEDI)","(MEDI,)"
