In [None]:
import pandas as pd

## Label Data

I manually labeled this dataset of publication information, which was downloaded from SciVal. The objective here is to use selected columns to predict whether a publication is related to "Climate Innovation."

## Read in Data

In [None]:
# Load the manually labelled publications workbook into a DataFrame
labeled_publications_path = '/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/CI_publications.xlsx'
df = pd.read_excel(labeled_publications_path)

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded
df.head(n=5)

Unnamed: 0,Index,CI related?,Title,Abstract,DOI,Scopus Source title,All Science Journal Classification (ASJC) field name,Quacquarelli Symonds (QS) Subject area field name,Quacquarelli Symonds (QS) Subject field name,Times Higher Education (THE) field name,ANZSRC FoR (2020) parent name,Sustainable Development Goals (2023),Topic Cluster name,Topic Cluster number,Topic name,Topic number
0,1,N,CARD 2020: Antibiotic resistome surveillance w...,https://www.scopus.com/record/display.url?eid=...,10.1093/nar/gkz935,Nucleic Acids Research,Genetics,Life Sciences & Medicine,Biological Sciences,Life Sciences,Biological Sciences| Environmental Sciences| C...,-,Nucleic Acid Hybridization; DNA; RNA Sequencing,40,Bacterial Genome; DNA Sequence; Pseudomonas,7289
1,2,N,"CARD 2023: expanded curation, support for mach...",https://www.scopus.com/record/display.url?eid=...,10.1093/nar/gkac920,Nucleic Acids Research,Genetics,Life Sciences & Medicine,Biological Sciences,Life Sciences,Biological Sciences| Chemical Sciences| Enviro...,SDG 3,Multiple Drug Resistance; Carbapenem; Antibiotics,21,Antibiotic Resistance; Bacterial Gene; Metagen...,680
2,3,N,Extended reality in spatial sciences: A review...,https://www.scopus.com/record/display.url?eid=...,10.3390/ijgi9070439,ISPRS International Journal of Geo-Information,Earth and Planetary Sciences (miscellaneous)| ...,Natural Sciences| Social Sciences & Management...,Geophysics| Earth & Marine Sciences| Engineeri...,Social Sciences| Physical Sciences,Engineering| Earth Sciences,-,Augmented Reality; Gesture Recognition; Image ...,362,Virtual Reality; Engineering Education; Comput...,2107
3,4,N,"Decreasing antibiotic use, the gut microbiota,...",https://www.scopus.com/record/display.url?eid=...,10.1016/S2213-2600(20)30052-7,The Lancet Respiratory Medicine,Pulmonary and Respiratory Medicine,Life Sciences & Medicine,Medicine,Clinical and Health,Biomedical And Clinical Sciences,SDG 3,Quality of Life; Severe Asthma; Magnesium,11,Asthma; Antiinfective Agent; Cohort Study,83790
4,5,N,Metagenome-assembled genome binning methods wi...,https://www.scopus.com/record/display.url?eid=...,10.1099/mgen.0.000436,Microbial Genomics,Genetics| Molecular Biology| Microbiology| Epi...,Life Sciences & Medicine,Medicine| Biological Sciences,Clinical and Health| Life Sciences,Biological Sciences,-,Gut Microbiota; Metagenomics; DNA,80,Metagenome; DNA Sequence; Next Generation Sequ...,9066


In [None]:
# Code the target variable: 'Y' -> 1, 'N' -> 0.
# Series.map with an explicit validity check is used instead of Series.replace:
# replace() leaves any unexpected label (typo, stray whitespace, blank cell) in place,
# which yields a mixed-type column that only fails later inside scikit-learn, and it
# relies on implicit object -> int downcasting that recent pandas releases deprecate.
label_to_target = {'Y': 1, 'N': 0}

# Fail fast, and name the offending values, if a row carries anything other than 'Y' / 'N'
unmapped_rows = ~df['CI related?'].isin(list(label_to_target))
if unmapped_rows.any():
    raise ValueError(
        "Unexpected values in 'CI related?': "
        f"{sorted(df.loc[unmapped_rows, 'CI related?'].astype(str).unique())}"
    )

df['Target'] = df['CI related?'].map(label_to_target).astype('int64')

In [None]:
# Class balance check: share of each Target value, expressed as a percentage.
# The output shows that the data set is fairly balanced.
target_share = df['Target'].value_counts(normalize=True)
target_share.mul(100)

Target
0    58.780488
1    41.219512
Name: proportion, dtype: float64

In [None]:
# Separate the data into train, validation and test data sets
from sklearn.model_selection import train_test_split

# Share of the full data set that each split should receive
train_size, validation_size, test_size = 0.6, 0.2, 0.2

# Stage 1: carve off the training rows; everything else is held out for validation + test.
# NOTE(review): neither stage is stratified on 'Target'; consider passing stratify=... so
# the class ratio is the same in every split.
holdout_fraction = 1 - train_size
train_df, temp_df = train_test_split(df, test_size=holdout_fraction, random_state=42)

# Stage 2: divide the held-out rows between validation and test
test_share_of_holdout = test_size / (test_size + validation_size)
validation_df, test_df = train_test_split(temp_df, test_size=test_share_of_holdout, random_state=42)

# Report how many rows ended up in each split
n_train, n_validation, n_test = len(train_df), len(validation_df), len(test_df)
print("Training set size:", n_train)
print("Validation set size:", n_validation)
print("Test set size:", n_test)

Training set size: 246
Validation set size: 82
Test set size: 82


In [None]:
# Write each split to its own CSV file (DataFrame index omitted) so later cells can reload it
split_destinations = (
    (train_df, '/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv'),
    (validation_df, '/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv'),
    (test_df, '/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/test_data.csv'),
)
for split_frame, csv_path in split_destinations:
    split_frame.to_csv(csv_path, index=False)

## Establish baseline

I noticed that the SDG column in the original data set is a good predictor of the target variable. Specifically, if a publication is related to an SDG, then it's likely to be CI related. So I will use this as the baseline.

In [None]:
# Inspect the column names, non-null counts and dtypes of the training split
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246 entries, 362 to 102
Data columns (total 17 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   Index                                                 246 non-null    int64 
 1   CI related?                                           246 non-null    object
 2   Title                                                 246 non-null    object
 3   Abstract                                              246 non-null    object
 4   DOI                                                   246 non-null    object
 5   Scopus Source title                                   246 non-null    object
 6   All Science Journal Classification (ASJC) field name  246 non-null    object
 7   Quacquarelli Symonds (QS) Subject area field name     246 non-null    object
 8   Quacquarelli Symonds (QS) Subject field name          246 non-null    obj

In [None]:
# Rows whose SDG field is exactly '-' are counted as "not related"; every other row as "related"
has_no_sdg = train_df['Sustainable Development Goals (2023)'] == '-'
print("The number of publications NOT related to SDG: " + str(has_no_sdg.sum()))
print("The number of publications related to SDG: " + str((~has_no_sdg).sum()))

The number of publications NOT related to SDG: 102
The number of publications related to SDG: 144


In [None]:
import re

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# SDG labels that make the baseline predict 1 (CI related) for a publication
values_predicting_1 = ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15']

# One regex matching any listed label as a whole token. The word boundaries keep a label
# from matching inside a longer one (e.g. a future 'SDG 1' entry would otherwise hit 'SDG 13').
sdg_pattern = r'\b(?:' + '|'.join(re.escape(label) for label in values_predicting_1) + r')\b'

# Generate baseline predictions.
# - astype(str) makes blank (NaN) cells count as "no match" instead of raising TypeError.
# - assign() builds a new frame rather than writing a column into the object returned by
#   train_test_split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(
    baseline_prediction=train_df['Sustainable Development Goals (2023)']
    .astype(str)
    .str.contains(sdg_pattern, regex=True)
    .astype(int)
)

# Calculate metrics
accuracy = accuracy_score(train_df['Target'], train_df['baseline_prediction'])
precision = precision_score(train_df['Target'], train_df['baseline_prediction'])
recall = recall_score(train_df['Target'], train_df['baseline_prediction'])
f1 = f1_score(train_df['Target'], train_df['baseline_prediction'])

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.68
Precision: 0.58
Recall: 0.65
F1 Score: 0.61


In [None]:
import re

# Generate baseline predictions for the validation split: predict 1 when the SDG field
# mentions any label listed in values_predicting_1 (matched as a whole token).
# - astype(str) makes blank (NaN) cells count as "no match" instead of raising TypeError.
# - assign() builds a new frame rather than writing a column into the object returned by
#   train_test_split, which can trigger pandas' SettingWithCopyWarning.
sdg_pattern = r'\b(?:' + '|'.join(re.escape(label) for label in values_predicting_1) + r')\b'
validation_df = validation_df.assign(
    baseline_prediction=validation_df['Sustainable Development Goals (2023)']
    .astype(str)
    .str.contains(sdg_pattern, regex=True)
    .astype(int)
)

# Calculate metrics
accuracy = accuracy_score(validation_df['Target'], validation_df['baseline_prediction'])
precision = precision_score(validation_df['Target'], validation_df['baseline_prediction'])
recall = recall_score(validation_df['Target'], validation_df['baseline_prediction'])
f1 = f1_score(validation_df['Target'], validation_df['baseline_prediction'])

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.66
Precision: 0.54
Recall: 0.71
F1 Score: 0.61


Train a simple logistic regression model using the Topic and topic cluster number columns.

In [None]:
# prompt: a logistic regression model using the train_df and the validation_df from above and using the "Topic Cluster number" and the "Topic number" as features. First convert these two columns from strings to float. Use the above four metrics for assessing the results. If one of the two columns's value is '-', use the other column to predict. If both column values are '-' then predict as 0.
# NOTE(review): the per-row fallback described in the prompt is not implemented below;
# a '-' entry is simply encoded as -1 and fed to the model like any other value.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read in the train and validation data sets
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Convert the "Topic Cluster number" and "Topic number" columns to float ('-' becomes -1)
topic_id_columns = ['Topic Cluster number', 'Topic number']
for split_df in (train_df, validation_df):
    for id_col in topic_id_columns:
        split_df[id_col] = split_df[id_col].apply(lambda x: float(x) if x != '-' else -1)

# Define the features and target variables
X_train = train_df[topic_id_columns]
y_train = train_df['Target']
X_val = validation_df[topic_id_columns]
y_val = validation_df['Target']

# Standardise the features before fitting. The raw topic numbers reach the tens of
# thousands, and LogisticRegression's default lbfgs solver converges poorly on features
# of that magnitude. The scaler is fitted on the training split only, so no validation
# statistics leak into the model.
# NOTE(review): these columns are identifiers; treating them as numeric follows the
# original design, but a categorical encoding may be more appropriate.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train the logistic regression model (max_iter raised from the default 100 so the solver
# has room to converge)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_scaled)

# Calculate the metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.62
Precision: 0.50
Recall: 0.06
F1 Score: 0.11


Using these two columns and this simple logistic regression model, we can see that our recall and F1 scores are low. Next, we will encode the "Title" column and use it in our model.

In [None]:
# prompt: Encode the Title column and add it into the logistic regression model. Use count vectorizer. import the relevant packages first.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read in the train and validation data sets
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Convert the "Topic Cluster number" and "Topic number" columns to float ('-' becomes -1)
topic_id_columns = ['Topic Cluster number', 'Topic number']
for split_df in (train_df, validation_df):
    for id_col in topic_id_columns:
        split_df[id_col] = split_df[id_col].apply(lambda x: float(x) if x != '-' else -1)

# Define the features and target variables
X_train = train_df[['Topic Cluster number', 'Topic number', 'Title']]
y_train = train_df['Target']
X_val = validation_df[['Topic Cluster number', 'Topic number', 'Title']]
y_val = validation_df['Target']

# Encode the Title column using CountVectorizer (vocabulary learned on train only)
vectorizer = CountVectorizer()
X_train_encoded = vectorizer.fit_transform(X_train['Title'])
X_val_encoded = vectorizer.transform(X_val['Title'])

# Standardise the topic-ID columns before combining them with the word counts.
# The raw IDs reach the tens of thousands while word counts are small integers; left
# unscaled, the ID columns dominate the optimisation and the title features end up with
# little to no influence on the fitted model. The scaler is fitted on train only.
scaler = StandardScaler()
X_train_ids = pd.DataFrame(scaler.fit_transform(X_train[topic_id_columns]), columns=topic_id_columns)
X_val_ids = pd.DataFrame(scaler.transform(X_val[topic_id_columns]), columns=topic_id_columns)

# Combine the encoded Title column with the other features
X_train_combined = pd.concat([X_train_ids, pd.DataFrame(X_train_encoded.toarray())], axis=1)
X_val_combined = pd.concat([X_val_ids, pd.DataFrame(X_val_encoded.toarray())], axis=1)

# Ensure all column names are strings (the word-count columns are integer positions)
X_train_combined.columns = X_train_combined.columns.astype(str)
X_val_combined.columns = X_val_combined.columns.astype(str)

# Train the logistic regression model (max_iter raised from the default 100 so the solver
# has room to converge on the wide feature matrix)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_combined)

# Calculate the metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.62
Precision: 0.50
Recall: 0.06
F1 Score: 0.11


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read in the train and validation data sets
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Convert the "Topic Cluster number" and "Topic number" columns to float ('-' becomes -1)
topic_id_columns = ['Topic Cluster number', 'Topic number']
for split_df in (train_df, validation_df):
    for id_col in topic_id_columns:
        split_df[id_col] = split_df[id_col].apply(lambda x: float(x) if x != '-' else -1)

# Define the features and target variables
X_train = train_df[['Topic Cluster number', 'Topic number', 'Title']]
y_train = train_df['Target']
X_val = validation_df[['Topic Cluster number', 'Topic number', 'Title']]
y_val = validation_df['Target']

# Encode the Title column using TfidfVectorizer (vocabulary and IDF learned on train only)
tfidf_vectorizer = TfidfVectorizer()
X_train_encoded = tfidf_vectorizer.fit_transform(X_train['Title'])
X_val_encoded = tfidf_vectorizer.transform(X_val['Title'])
title_terms = tfidf_vectorizer.get_feature_names_out()

# Standardise the topic-ID columns before combining them with the TF-IDF weights.
# The raw IDs reach the tens of thousands while TF-IDF weights lie between 0 and 1; left
# unscaled, the ID columns dominate the optimisation and the title features end up with
# little to no influence on the fitted model. The scaler is fitted on train only.
scaler = StandardScaler()
X_train_ids = pd.DataFrame(scaler.fit_transform(X_train[topic_id_columns]), columns=topic_id_columns)
X_val_ids = pd.DataFrame(scaler.transform(X_val[topic_id_columns]), columns=topic_id_columns)

# Combine the encoded Title column with the other features
X_train_combined = pd.concat([X_train_ids, pd.DataFrame(X_train_encoded.toarray(), columns=title_terms)], axis=1)
X_val_combined = pd.concat([X_val_ids, pd.DataFrame(X_val_encoded.toarray(), columns=title_terms)], axis=1)

# Ensure all column names are strings
X_train_combined.columns = X_train_combined.columns.astype(str)
X_val_combined.columns = X_val_combined.columns.astype(str)

# Train the logistic regression model (max_iter raised from the default 100 so the solver
# has room to converge on the wide feature matrix)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_combined)

# Calculate the metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.62
Precision: 0.50
Recall: 0.06
F1 Score: 0.11


In [None]:
# prompt: Generate a new feature based on the 'Sustainable Development Goals (2023)' column. If any value in ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15'] exists in the column value, then the new feature value is "1" else it is "0" and then add this new feature to the model above and product the results.

import pandas as pd
# Imported here as well so the cell does not depend on names left behind by earlier cells
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read in the train and validation data sets
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Convert the "Topic Cluster number" and "Topic number" columns to float ('-' becomes -1)
topic_id_columns = ['Topic Cluster number', 'Topic number']
for split_df in (train_df, validation_df):
    for id_col in topic_id_columns:
        split_df[id_col] = split_df[id_col].apply(lambda x: float(x) if x != '-' else -1)

# Define the features and target variables
X_train = train_df[['Topic Cluster number', 'Topic number', 'Title']]
y_train = train_df['Target']
X_val = validation_df[['Topic Cluster number', 'Topic number', 'Title']]
y_val = validation_df['Target']

# Generate the new feature: 1 when the SDG field mentions any of the selected goals, else 0
train_df['SDG_related'] = train_df['Sustainable Development Goals (2023)'].apply(
    lambda x: 1 if any (value in x for value in  ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15'])  else 0)
validation_df['SDG_related'] = validation_df['Sustainable Development Goals (2023)'].apply(
    lambda x: 1 if any(value in x for value in ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15']) else 0)

# Encode the Title column using TfidfVectorizer (vocabulary and IDF learned on train only)
tfidf_vectorizer = TfidfVectorizer()
X_train_encoded = tfidf_vectorizer.fit_transform(X_train['Title'])
X_val_encoded = tfidf_vectorizer.transform(X_val['Title'])
title_terms = tfidf_vectorizer.get_feature_names_out()

# Standardise the topic-ID columns before combining them with the other features.
# The raw IDs reach the tens of thousands while TF-IDF weights and the SDG flag lie
# between 0 and 1; left unscaled, the ID columns dominate the optimisation and the other
# features end up with little to no influence. The scaler is fitted on train only.
scaler = StandardScaler()
X_train_ids = pd.DataFrame(scaler.fit_transform(X_train[topic_id_columns]), columns=topic_id_columns)
X_val_ids = pd.DataFrame(scaler.transform(X_val[topic_id_columns]), columns=topic_id_columns)

# Combine the encoded Title column, the new feature, and the other features
X_train_combined = pd.concat([X_train_ids, pd.DataFrame(X_train_encoded.toarray(), columns=title_terms), train_df['SDG_related'].reset_index(drop=True)], axis=1)
X_val_combined = pd.concat([X_val_ids, pd.DataFrame(X_val_encoded.toarray(), columns=title_terms), validation_df['SDG_related'].reset_index(drop=True)], axis=1)

# Ensure all column names are strings
X_train_combined.columns = X_train_combined.columns.astype(str)
X_val_combined.columns = X_val_combined.columns.astype(str)

# Train the logistic regression model (max_iter raised from the default 100 so the solver
# has room to converge on the wide feature matrix)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_combined)

# Calculate the metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.62
Precision: 0.50
Recall: 0.06
F1 Score: 0.11


In [None]:
# prompt: use the same features but use the XGBoost model instead.

import pandas as pd
from xgboost import XGBClassifier

# Reload the saved train / validation splits so the cell starts from the files on disk
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Topic identifiers -> float, with a '-' entry encoded as -1
for split_df in (train_df, validation_df):
    for id_col in ('Topic Cluster number', 'Topic number'):
        split_df[id_col] = split_df[id_col].apply(lambda raw: -1 if raw == '-' else float(raw))


def _mentions_selected_sdg(sdg_text):
    """Return 1 if any of the selected SDG labels occurs in sdg_text, otherwise 0."""
    for label in ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15']:
        if label in sdg_text:
            return 1
    return 0


# Binary SDG flag for both splits
train_df['SDG_related'] = train_df['Sustainable Development Goals (2023)'].apply(_mentions_selected_sdg)
validation_df['SDG_related'] = validation_df['Sustainable Development Goals (2023)'].apply(_mentions_selected_sdg)

# Feature columns and target for each split
y_train = train_df['Target']
y_val = validation_df['Target']
X_train = train_df[['Topic Cluster number', 'Topic number', 'Title']]
X_val = validation_df[['Topic Cluster number', 'Topic number', 'Title']]

# TF-IDF encoding of the titles; the vocabulary is learned on the training titles only
tfidf_vectorizer = TfidfVectorizer()
X_train_encoded = tfidf_vectorizer.fit_transform(X_train['Title'])
X_val_encoded = tfidf_vectorizer.transform(X_val['Title'])
title_terms = tfidf_vectorizer.get_feature_names_out()

# Assemble the model input: topic numbers, then title TF-IDF weights, then the SDG flag
train_parts = [
    X_train[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
    pd.DataFrame(X_train_encoded.toarray(), columns=title_terms),
    train_df['SDG_related'].reset_index(drop=True),
]
val_parts = [
    X_val[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
    pd.DataFrame(X_val_encoded.toarray(), columns=title_terms),
    validation_df['SDG_related'].reset_index(drop=True),
]
# Column labels are cast to str so every feature name is a string
X_train_combined = pd.concat(train_parts, axis=1).rename(columns=str)
X_val_combined = pd.concat(val_parts, axis=1).rename(columns=str)

# Fit the gradient-boosted classifier on the training features
model = XGBClassifier(objective='binary:logistic', seed=42)
model.fit(X_train_combined, y_train)

# Score the validation split
y_pred = model.predict(X_val_combined)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.76
Precision: 0.70
Recall: 0.61
F1 Score: 0.66


In [None]:
# Subject-classification columns whose cardinality is compared below
columns_to_analyze = [
    'All Science Journal Classification (ASJC) field name',
    'Quacquarelli Symonds (QS) Subject area field name',
    'Quacquarelli Symonds (QS) Subject field name',
    'Times Higher Education (THE) field name',
]

# Number of distinct values in each of those columns (computed on the full data set)
unique_counts = df.loc[:, columns_to_analyze].nunique()

# Show the counts
print("Number of unique values per column:")
print(unique_counts)

Number of unique values per column:
All Science Journal Classification (ASJC) field name    210
Quacquarelli Symonds (QS) Subject area field name        42
Quacquarelli Symonds (QS) Subject field name            190
Times Higher Education (THE) field name                  79
dtype: int64


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Read in the train and validation data sets
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Convert numeric-like columns to floats ('-' becomes -1)
train_df['Topic Cluster number'] = train_df['Topic Cluster number'].apply(lambda x: float(x) if x != '-' else -1)
train_df['Topic number'] = train_df['Topic number'].apply(lambda x: float(x) if x != '-' else -1)
validation_df['Topic Cluster number'] = validation_df['Topic Cluster number'].apply(lambda x: float(x) if x != '-' else -1)
validation_df['Topic number'] = validation_df['Topic number'].apply(lambda x: float(x) if x != '-' else -1)

# Generate a new feature based on Sustainable Development Goals
sdg_keys = ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15']
train_df['SDG_related'] = train_df['Sustainable Development Goals (2023)'].apply(lambda x: 1 if any(value in x for value in sdg_keys) else 0)
validation_df['SDG_related'] = validation_df['Sustainable Development Goals (2023)'].apply(lambda x: 1 if any(value in x for value in sdg_keys) else 0)

# Encode the Title column using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_title_encoded = tfidf_vectorizer.fit_transform(train_df['Title'])
X_val_title_encoded = tfidf_vectorizer.transform(validation_df['Title'])

# Prepare initial combined training and validation sets with encoded titles.
# Every title token is prefixed with "title_": XGBoost requires unique feature names, and
# the same word (e.g. 'energy', 'health') can occur both in titles and in the category
# columns encoded below, which otherwise raises
# "ValueError: feature_names must be unique".
title_feature_names = [f"title_{name}" for name in tfidf_vectorizer.get_feature_names_out()]
X_train_base = pd.DataFrame(X_train_title_encoded.toarray(), columns=title_feature_names)
X_val_base = pd.DataFrame(X_val_title_encoded.toarray(), columns=title_feature_names)

# Encode categorical columns and evaluate the model for each
columns_to_encode = ['All Science Journal Classification (ASJC) field name',
                     'Quacquarelli Symonds (QS) Subject area field name',
                     'Quacquarelli Symonds (QS) Subject field name',
                     'Times Higher Education (THE) field name']

for col in columns_to_encode:
    vectorizer = CountVectorizer()
    X_train_encoded_col = vectorizer.fit_transform(train_df[col])
    X_val_encoded_col = vectorizer.transform(validation_df[col])

    # Prefix the category tokens with the source column name for the same uniqueness reason
    col_feature_names = [f"{col}_{name}" for name in vectorizer.get_feature_names_out()]

    # Combine all features: topic numbers, SDG-related, encoded titles, and current encoded categorical column
    X_train_combined = pd.concat([
        train_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        train_df['SDG_related'].reset_index(drop=True),
        X_train_base,
        pd.DataFrame(X_train_encoded_col.toarray(), columns=col_feature_names)
    ], axis=1)

    X_val_combined = pd.concat([
        validation_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        validation_df['SDG_related'].reset_index(drop=True),
        X_val_base,
        pd.DataFrame(X_val_encoded_col.toarray(), columns=col_feature_names)
    ], axis=1)

    # Train the XGBoost model
    model = XGBClassifier(objective='binary:logistic', seed=42)
    model.fit(X_train_combined, train_df['Target'])

    # Predict and calculate metrics
    y_pred = model.predict(X_val_combined)
    accuracy = accuracy_score(validation_df['Target'], y_pred)
    precision = precision_score(validation_df['Target'], y_pred)
    recall = recall_score(validation_df['Target'], y_pred)
    f1 = f1_score(validation_df['Target'], y_pred)

    # Print results for each category
    print(f"Results for column '{col}':")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


ValueError: feature_names must be unique. Duplicates found: ['and', 'artificial', 'behavior', 'building', 'care', 'cell', 'change', 'chemical', 'cognitive', 'community', 'conservation', 'construction', 'control', 'design', 'development', 'ecology', 'economic', 'education', 'energy', 'engineering', 'environment', 'environmental', 'ethics', 'evolution', 'experimental', 'factors', 'food', 'fuel', 'global', 'health', 'history', 'human', 'in', 'information', 'intelligence', 'international', 'life', 'management', 'manufacturing', 'matter', 'mechanical', 'media', 'mental', 'modeling', 'molecular', 'monitoring', 'networks', 'of', 'oral', 'organic', 'performing', 'physical', 'planetary', 'planning', 'plant', 'policy', 'political', 'pollution', 'power', 'public', 'quality', 'renewable', 'research', 'resource', 'risk', 'safety', 'science', 'sciences', 'simulation', 'social', 'studies', 'surface', 'sustainability', 'systems', 'technology', 'the', 'transportation', 'urban', 'visual']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Reload the saved train / validation splits
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/train_data.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CI_pubs_project/data/validation_data.csv')

# Topic identifiers -> float, with a '-' entry encoded as -1
for split_df in (train_df, validation_df):
    for id_col in ('Topic Cluster number', 'Topic number'):
        split_df[id_col] = split_df[id_col].apply(lambda raw: -1 if raw == '-' else float(raw))

# Binary flag: 1 when the SDG field mentions at least one of the selected goals
sdg_keys = ['SDG 7', 'SDG 11', 'SDG 13', 'SDG 12', 'SDG 14', 'SDG 15']
for split_df in (train_df, validation_df):
    split_df['SDG_related'] = split_df['Sustainable Development Goals (2023)'].apply(
        lambda sdg_text: int(any(key in sdg_text for key in sdg_keys)))

# TF-IDF encoding of the titles; the vocabulary is learned on the training titles only
tfidf_vectorizer = TfidfVectorizer()
X_train_title_encoded = tfidf_vectorizer.fit_transform(train_df['Title'])
X_val_title_encoded = tfidf_vectorizer.transform(validation_df['Title'])

# Title features as DataFrames; the "title_" prefix keeps their names distinct from the
# category-token features added inside the loop
title_feature_names = [f"title_{name}" for name in tfidf_vectorizer.get_feature_names_out()]
X_train_base = pd.DataFrame(X_train_title_encoded.toarray(), columns=title_feature_names)
X_val_base = pd.DataFrame(X_val_title_encoded.toarray(), columns=title_feature_names)

# Candidate classification columns; each one is added to the base features in turn
columns_to_encode = [
    'All Science Journal Classification (ASJC) field name',
    'Quacquarelli Symonds (QS) Subject area field name',
    'Quacquarelli Symonds (QS) Subject field name',
    'Times Higher Education (THE) field name',
]

for col in columns_to_encode:
    # Bag-of-words encoding of the current classification column
    vectorizer = CountVectorizer()
    X_train_encoded_col = vectorizer.fit_transform(train_df[col])
    X_val_encoded_col = vectorizer.transform(validation_df[col])
    col_feature_names = [f"{col}_{name}" for name in vectorizer.get_feature_names_out()]

    # Feature order: topic numbers, SDG flag, title TF-IDF, current column's token counts
    train_parts = [
        train_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        train_df['SDG_related'].reset_index(drop=True),
        X_train_base,
        pd.DataFrame(X_train_encoded_col.toarray(), columns=col_feature_names),
    ]
    val_parts = [
        validation_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        validation_df['SDG_related'].reset_index(drop=True),
        X_val_base,
        pd.DataFrame(X_val_encoded_col.toarray(), columns=col_feature_names),
    ]
    X_train_combined = pd.concat(train_parts, axis=1)
    X_val_combined = pd.concat(val_parts, axis=1)

    # Fit a fresh classifier for this feature set
    model = XGBClassifier(objective='binary:logistic', seed=42)
    model.fit(X_train_combined, train_df['Target'])

    # Score the validation split
    y_pred = model.predict(X_val_combined)
    y_true = validation_df['Target']
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Report the metrics obtained with this column added
    print(f"Results for column '{col}':")
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


Results for column 'All Science Journal Classification (ASJC) field name':
Accuracy: 0.77, Precision: 0.70, Recall: 0.68, F1 Score: 0.69
Results for column 'Quacquarelli Symonds (QS) Subject area field name':
Accuracy: 0.79, Precision: 0.79, Recall: 0.61, F1 Score: 0.69
Results for column 'Quacquarelli Symonds (QS) Subject field name':
Accuracy: 0.80, Precision: 0.78, Recall: 0.68, F1 Score: 0.72
Results for column 'Times Higher Education (THE) field name':
Accuracy: 0.78, Precision: 0.74, Recall: 0.65, F1 Score: 0.69


In [None]:
# prompt: Use 'Quacquarelli Symonds (QS) Subject field name' and add it to the XGboost model then produce the results using the validation set. Use the code above fore reference.

import pandas as pd
# Bag-of-words encoding of the QS subject field. The vocabulary is learned from
# the training split only and then re-applied unchanged to the validation split.
qs_subject_col = 'Quacquarelli Symonds (QS) Subject field name'
vectorizer = CountVectorizer()
X_train_encoded_qs_subject = vectorizer.fit_transform(train_df[qs_subject_col])
X_val_encoded_qs_subject = vectorizer.transform(validation_df[qs_subject_col])

# One prefixed column label per vocabulary token, shared by both splits so the
# train and validation matrices end up with identical column names.
qs_subject_feature_names = [f"QS_subject_{name}" for name in vectorizer.get_feature_names_out()]
qs_subject_train_frame = pd.DataFrame(X_train_encoded_qs_subject.toarray(), columns=qs_subject_feature_names)
qs_subject_val_frame = pd.DataFrame(X_val_encoded_qs_subject.toarray(), columns=qs_subject_feature_names)

# Feature matrix layout: topic ids | SDG flag | base features | QS subject counts.
# X_train_base / X_val_base come from an earlier cell (presumably the encoded
# titles -- confirm against the cell that builds them).
# The index is reset on the slices taken from the split frames so the
# column-wise concat lines up with the freshly built (0..n-1 indexed) frames.
X_train_combined = pd.concat(
    [
        train_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        train_df['SDG_related'].reset_index(drop=True),
        X_train_base,
        qs_subject_train_frame,
    ],
    axis=1,
)

X_val_combined = pd.concat(
    [
        validation_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        validation_df['SDG_related'].reset_index(drop=True),
        X_val_base,
        qs_subject_val_frame,
    ],
    axis=1,
)

# Fit the binary XGBoost classifier on the training split.
model = XGBClassifier(objective='binary:logistic', seed=42)
model.fit(X_train_combined, train_df['Target'])

# Score the validation split against its ground-truth labels.
y_true_val = validation_df['Target']
y_pred = model.predict(X_val_combined)
accuracy = accuracy_score(y_true_val, y_pred)
precision = precision_score(y_true_val, y_pred)
recall = recall_score(y_true_val, y_pred)
f1 = f1_score(y_true_val, y_pred)

# Report the validation metrics
print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


Accuracy: 0.80, Precision: 0.78, Recall: 0.68, F1 Score: 0.72


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Held-out test evaluation. This cell relies on objects created by earlier cells:
#   test_df          - the test split (must have the same columns as train_df)
#   vectorizer       - the CountVectorizer last fitted on the training
#                      'Quacquarelli Symonds (QS) Subject field name' column
#   tfidf_vectorizer - title vectorizer defined earlier in the notebook
#                      (presumably fitted on the training titles -- confirm)
#   model            - the XGBoost classifier trained in the previous cell
# Nothing is re-fitted here; the test split is only transformed and scored.

# QS subject token counts for the test split, using the training vocabulary.
X_test_encoded_qs_subject = vectorizer.transform(test_df['Quacquarelli Symonds (QS) Subject field name'])

# Dense, labelled frames for the two text-derived feature groups.
title_feature_names = [f"title_{name}" for name in tfidf_vectorizer.get_feature_names_out()]
test_title_frame = pd.DataFrame(
    tfidf_vectorizer.transform(test_df['Title']).toarray(),
    columns=title_feature_names,
)
qs_subject_test_feature_names = [f"QS_subject_{name}" for name in vectorizer.get_feature_names_out()]
qs_subject_test_frame = pd.DataFrame(
    X_test_encoded_qs_subject.toarray(),
    columns=qs_subject_test_feature_names,
)

# Same layout as the training matrix: topic ids | SDG flag | title features | QS subject counts.
X_test_combined = pd.concat(
    [
        test_df[['Topic Cluster number', 'Topic number']].reset_index(drop=True),
        test_df['SDG_related'].reset_index(drop=True),
        test_title_frame,
        qs_subject_test_frame,
    ],
    axis=1,
)

# Predict with the already-trained model
y_test_pred = model.predict(X_test_combined)

# Score the predictions against the test labels.
y_true_test = test_df['Target']
test_accuracy = accuracy_score(y_true_test, y_test_pred)
test_precision = precision_score(y_true_test, y_test_pred)
test_recall = recall_score(y_true_test, y_test_pred)
test_f1 = f1_score(y_true_test, y_test_pred)

# Report the test-set metrics
print(f"Test Set Results - Accuracy: {test_accuracy:.2f}, Precision: {test_precision:.2f}, Recall: {test_recall:.2f}, F1 Score: {test_f1:.2f}")


Test Set Results - Accuracy: 0.77, Precision: 0.83, Recall: 0.69, F1 Score: 0.75
