In [1]:
#import data
import pandas as pd
url = "https://storage.googleapis.com/adsp-nlp-open/data/elsevier-oa-cc-by/abstracts.json"
data = pd.read_json(url, orient="records")

In [2]:
# View the target data columns
data.head()
subject=data['subject'].unique()
print(subject)

['phys_sci' 'healh_sci' 'engi_tech' 'life_sci' 'soc_sci']


The data comes with "title" and "abstract" fields. You may use either (or both) for your modeling.
Using both gave better results hence using it to compare the models using techniques of BOW and Lemmatization

In [3]:
#import labelencoder and combine 2 columns
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
#fit the encoder
data['subject_encoded']=encoder.fit_transform(data['subject'])
data['text']=data['abstract']+' ' +data['title']

In [4]:
#clean
import re
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text=text.lower()  # Convert to lowercase
    return text.strip() 

data['cleaned_text'] = data['text'].apply(clean_text)


In [5]:
#vectorization using bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000, stop_words='english')  # Limit to 1000 features for simplicity
X = vectorizer.fit_transform(data['cleaned_text'])
#target variable
y = data['subject_encoded']
#get feature names
words=vectorizer.get_feature_names_out()

In [6]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Model 1: Ensemble of 1000 logistic regression model using BOW

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Vectorize the text data using CountVectorizer
vectorizer = CountVectorizer(max_features=1000, stop_words='english')  # Limit to 1000 features for simplicity
X = vectorizer.fit_transform(data['cleaned_text'])

# Step 2: Define the target variable
y = data['subject_encoded']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize the Logistic Regression model
classifier1 = LogisticRegression(max_iter=1000)

# Step 5: Create an ensemble of Logistic Regression models using BaggingClassifier
ensemble = BaggingClassifier(estimator=classifier1, n_estimators=1000, random_state=42, n_jobs=-1)

# Step 6: Train the ensemble model
ensemble.fit(X_train, y_train)

# Step 7: Predict on the test set
y_pred = ensemble.predict(X_test)

# Step 8: Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(class_report)


Accuracy: 0.6539157405014215
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.70      0.71      1809
           1       0.75      0.72      0.73      1525
           2       0.60      0.47      0.53      1191
           3       0.60      0.68      0.64      2578
           4       0.60      0.60      0.60       635

    accuracy                           0.65      7738
   macro avg       0.65      0.63      0.64      7738
weighted avg       0.66      0.65      0.65      7738



#Model 2: Using Logistic regression with Lemmatization for prediction

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and lemmatize text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    
    # Lemmatize each word
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join the tokens back into a string
    return " ".join(tokens)

# Assuming 'data' is a DataFrame with 'cleaned_abstract' and 'subject_encoded' columns
data['cleaned_text_lemmatized'] = data['cleaned_text'].apply(preprocess_text)

# Vectorize the cleaned abstract using CountVectorizer or TfidfVectorizer
vectorizer = CountVectorizer(max_features=1000)  # Limit to 1000 features for simplicity
XX = vectorizer.fit_transform(data['cleaned_text_lemmatized'])

# Target variable
y = data['subject_encoded']

# Split the data into training and testing sets
XX_train, XX_test, y_train, y_test = train_test_split(XX, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression classifier
classifier2 = LogisticRegression(max_iter=500)  # Increase max_iter to ensure convergence

# Train the model
classifier2.fit(XX_train, y_train)

# Make predictions
y_pred = classifier2.predict(XX_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(class_report)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cicily.mathew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cicily.mathew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.6561126906177307
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      1809
           1       0.74      0.72      0.73      1525
           2       0.61      0.46      0.53      1191
           3       0.60      0.68      0.64      2578
           4       0.60      0.62      0.61       635

    accuracy                           0.66      7738
   macro avg       0.65      0.64      0.64      7738
weighted avg       0.66      0.66      0.65      7738



#Model3: Random forest Classifier with BOW

In [10]:
#Use Random forest to predict the text
from sklearn.ensemble import RandomForestClassifier
# Initialize the model
classifier3 = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the classifier on the training data
classifier3.fit(X_train, y_train)
# Predict on the test set
y_pred_RF3 = classifier3.predict(X_test)
# Evaluate performance
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred_RF3))
print("Classification Report:\n", classification_report(y_test, y_pred_RF3))

Accuracy: 0.6492633755492375
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.74      0.71      1809
           1       0.72      0.80      0.76      1525
           2       0.69      0.30      0.42      1191
           3       0.59      0.71      0.64      2578
           4       0.62      0.45      0.52       635

    accuracy                           0.65      7738
   macro avg       0.66      0.60      0.61      7738
weighted avg       0.65      0.65      0.64      7738



What kind of models did you try and why?

Tried different variations of Logistic regression with each column seperately and combined along with SVM and random forest classifier with techniques BOW and lemmatization. Final 3 models are #Model 1: Ensemble of 1000 logistic regression model using BOW, #Model 2: Using Logistic regression with Lemmatization for prediction, #Model3: Random forest Classifier with BOW.

How did you evaluate the model and which metric do you think is most important.
Focus on evaluating the model based on accuracy rather than performance, as the university prioritizes relevance for search results. The model can run overnight to classify the documents, so performance is less of a concern.

How did you investigate misclassifications? Ans: Used classification report to investigate misclassifications, as it provides detailed metrics for evaluating the performance of all the classification model. It includes Precision, Recall, F1-Score, and Support for each class

Were the misclassifications understandable (genuinely difficult examples) or were they blatant errors? Provide a few examples.