NLP Coursework - Baseline models with preprocessing

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Coll

In [2]:
#PACKAGES USED:

#basic
import numpy as np
import pandas as pd

#to load dataset
from datasets import load_dataset

#for pre-processing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

#for feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#for modeling
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#performance metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
#Step 1: Load the dataset and split into train, validation and test dataset
dataset = load_dataset('rotten_tomatoes')

train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

print(train_dataset)
print(val_dataset)
print(test_dataset)


# Split the dataset into reviews and labels as x and y. 
x_train = train_dataset['text']
y_train = train_dataset['label']

x_val = val_dataset['text']
y_val = val_dataset['label']

x_test = val_dataset['text']
y_test = val_dataset['label']


# check the dataset reviews mannually for a better understanding, 
print(train_dataset[0:5])
#NOTE: first half is fully postive reviews and 2nd half is fully negative reviews. 


# to check the distribution of labels, 
y_train_df = pd.DataFrame(y_train)
y_val_df = pd.DataFrame(y_val)
y_test_df = pd.DataFrame(y_test)

print(y_train_df.value_counts())
print(y_val_df.value_counts())
print(y_test_df.value_counts())

# NOTE: The positive and negative reviews are distributed equally, else there could be an issue of class imbalance which has to addressed by down sampling or up sampling.

Downloading builder script:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading and preparing dataset rotten_tomatoes/default to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes downloaded and prepared to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'effective but too-tepid biopic', 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one ."], 'label': [1, 1, 1, 1, 1]}
0    4265
1    4265
dtype: int64
0    533
1    533
dtype: int64
0    533
1    533
dtype: int64


PRE-PROCESSING

In [4]:
# Step 2: Pre-processing

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text.lower())
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the tokens
    #stemmer = PorterStemmer()
    #stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    #since stemming is creating some spelling mistakes which may affect the contextual representation, we have not performed this. eg: Centurys ----stemming---> centuri

    #lemmatizing the tokens
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the stemmed tokens
    preprocessed_text = ' '.join(lemma_tokens)
    
    return preprocessed_text

# apply this in x_train, x_val, and x_test,
x_train_preprocessed = [preprocess_text(text) for text in x_train]
x_val_preprocessed = [preprocess_text(text) for text in x_val]
x_test_preprocessed = [preprocess_text(text) for text in x_test]

print(x_train_preprocessed[0:2])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['rock destined st century new conan he going make splash even greater arnold schwarzenegger jeanclaud van damme steven segal', 'gorgeously elaborate continuation lord ring trilogy huge column word adequately describe cowriterdirector peter jackson expanded vision j r r tolkien middleearth']


MODEL 1 - Logistic Regression

In [5]:
# Step 3: Feature selction & Modeling: 
# 3.1 - Logistic Regression with combinations of vectorizers and ngrams 

# Define the vectorizers and logistic regression model
vectorizers = [('count', CountVectorizer()),('tfidf', TfidfVectorizer())]
ngrams = [(1,1), (1,2), (1,3)]

# Loop over each combination of vectorizer, ngram and fit a logistic regression model
for vectorizer_name, vectorizer in vectorizers:
    for ngram in ngrams:
        # Define the pipeline for each combination
        pipe = Pipeline([
            ('vectorizer', vectorizer),
            ('clf', LogisticRegression(random_state=42))
        ])
        pipe.set_params(vectorizer__ngram_range=ngram)

        # Fit the model and make predictions on validation data
        pipe.fit(x_train_preprocessed, y_train)
        y_pred = pipe.predict(x_test_preprocessed)

        # Calculate and print the accuracy on validation data
        acc = accuracy_score(y_test, y_pred)
        print(f'{vectorizer_name} vectorizer with {ngram} ngram accuracy: {acc:.3f}')
        print('Confusion matrix:')
        print(confusion_matrix(y_test, y_pred))
        print('Classification report:')
        print(classification_report(y_test, y_pred))
        print() 


count vectorizer with (1, 1) ngram accuracy: 0.747
Confusion matrix:
[[409 124]
 [146 387]]
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.77      0.75       533
           1       0.76      0.73      0.74       533

    accuracy                           0.75      1066
   macro avg       0.75      0.75      0.75      1066
weighted avg       0.75      0.75      0.75      1066


count vectorizer with (1, 2) ngram accuracy: 0.754
Confusion matrix:
[[418 115]
 [147 386]]
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       533
           1       0.77      0.72      0.75       533

    accuracy                           0.75      1066
   macro avg       0.76      0.75      0.75      1066
weighted avg       0.76      0.75      0.75      1066


count vectorizer with (1, 3) ngram accuracy: 0.753
Confusion matrix:
[[421 112]
 [151 382]]
Classification report:

MODEL 2 - Support Vector Machine

In [6]:
# Step 3: Feature selection & Modeling
# 3.2 - SVM with combinations of vectorizers and ngrams 

# Define the vectorizers and ngrams to test
vectorizers = [('count', CountVectorizer()), ('tfidf', TfidfVectorizer())]
ngrams = [(1, 1), (1, 2), (1, 3)]

# Define the SVM model
svm = SVC(kernel='linear')

# Loop over each combination of vectorizer, ngram and fit an SVM model
for vectorizer_name, vectorizer in vectorizers:
    for ngram in ngrams:
        # Define the pipeline for each combination
        pipe = Pipeline([
            ('vectorizer', vectorizer),
            ('svm', svm)
        ])
        pipe.set_params(vectorizer__ngram_range=ngram)

        # Fit the model and make predictions on validation data
        pipe.fit(x_train_preprocessed, y_train)
        y_pred = pipe.predict(x_test_preprocessed)

        # Calculate and print the accuracy on validation data
        acc = accuracy_score(y_test, y_pred)
        print(f'{vectorizer_name} vectorizer with {ngram} ngram accuracy: {acc:.3f}')
        print('Confusion matrix:')
        print(confusion_matrix(y_test, y_pred))
        print('Classification report:')
        print(classification_report(y_test, y_pred))
        print() 

count vectorizer with (1, 1) ngram accuracy: 0.735
Confusion matrix:
[[400 133]
 [150 383]]
Classification report:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74       533
           1       0.74      0.72      0.73       533

    accuracy                           0.73      1066
   macro avg       0.73      0.73      0.73      1066
weighted avg       0.73      0.73      0.73      1066


count vectorizer with (1, 2) ngram accuracy: 0.753
Confusion matrix:
[[410 123]
 [140 393]]
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       533
           1       0.76      0.74      0.75       533

    accuracy                           0.75      1066
   macro avg       0.75      0.75      0.75      1066
weighted avg       0.75      0.75      0.75      1066


count vectorizer with (1, 3) ngram accuracy: 0.751
Confusion matrix:
[[416 117]
 [148 385]]
Classification report:

In [7]:
# 3.2.1 - SVM(Hyperparameter optimized) with combinations of vectorizers and ngrams 

# Define the pipeline with TfidfVectorizer and SVM model
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
    ('svm', SVC())
])

# Define the parameter grid to search through
param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf']
}

# Perform grid search with cross validation
svm_grid = GridSearchCV(svm_pipeline, param_grid=param_grid, cv=5)
svm_grid.fit(x_train_preprocessed, y_train)

# Print the best hyperparameters and the mean cross validation score
print("Best hyperparameters: ", svm_grid.best_params_)
print("Best cross validation score: ", svm_grid.best_score_)

# Make predictions and evaluate the model on the test set
y_pred = svm_grid.predict(x_test_preprocessed)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Best hyperparameters:  {'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__max_features': 10000}
Best cross validation score:  0.7558030480656506
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       533
           1       0.78      0.73      0.75       533

    accuracy                           0.76      1066
   macro avg       0.76      0.76      0.76      1066
weighted avg       0.76      0.76      0.76      1066

[[425 108]
 [146 387]]


In [8]:
#plot the incorrect prediction for analysis. 
# Create a dataframe to store the incorrect predictions
df_incorrect_predictions_SVM = pd.DataFrame(columns=['text', 'true_label', 'predicted_label'])

y_test = np.array(y_test)

# Loop through the predictions and add any incorrect ones to the dataframe
for i in range(len(y_test)):
    if y_test[i] != y_pred[i]:
        text = x_test_preprocessed[i]
        true_label = y_test[i]
        predicted_label = y_pred[i]
        df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)

print(df_incorrect_predictions_SVM)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Save the DataFrame to a CSV file
df_incorrect_predictions_SVM.to_csv('df_incorrect_predictions_SVM.csv', index=False)
from google.colab import files

# Download the CSV file
files.download('df_incorrect_predictions_SVM.csv')


  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df

                                                  text true_label  \
0    compassionately explores seemingly irreconcila...          1   
1               soundtrack alone worth price admission          1   
2    importance earnest thick wit play like reading...          1   
3    made teen reviewed recommended year age mild r...          1   
4    competent unpretentious entertainment destined...          1   
..                                                 ...        ...   
249               there comedic moment romantic comedy          0   
250  andunders looking caddyshack adopt generationa...          0   
251  distinctly mixed bag occasional burst sharp wr...          0   
252  hilarious musical comedy though stymied accent...          0   
253  splatter movie probably reasonably good time s...          0   

    predicted_label  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  
..              ...  
249              

  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df_incorrect_predictions_SVM = df_incorrect_predictions_SVM.append({'text': text, 'true_label': true_label, 'predicted_label': predicted_label}, ignore_index=True)
  df

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
svm_grid_results = pd.DataFrame(svm_grid.cv_results_)
svm_grid_results = svm_grid_results.sort_values('rank_test_score')
svm_grid_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,param_svm__kernel,param_tfidf__max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,6.413836,0.235297,1.200237,0.132538,1.0,rbf,10000.0,"{'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__ma...",0.757327,0.76143,0.763775,0.745604,0.750879,0.755803,0.00672,1
8,8.568086,0.218623,1.371643,0.061225,1.0,linear,,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidf_...",0.756155,0.753224,0.762016,0.750879,0.747362,0.753927,0.004965,2
16,5.971281,0.270963,1.211983,0.093578,10.0,rbf,10000.0,"{'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__m...",0.755569,0.748535,0.770809,0.739742,0.754982,0.753927,0.010189,2
14,8.200811,0.216285,1.39753,0.112813,10.0,linear,,"{'svm__C': 10, 'svm__kernel': 'linear', 'tfidf...",0.758499,0.751465,0.766706,0.741501,0.749707,0.753576,0.008507,4
7,4.999832,0.193624,0.881422,0.069263,1.0,linear,10000.0,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidf_...",0.757913,0.749707,0.762603,0.743845,0.749121,0.752638,0.006714,5
9,5.535867,0.148757,1.085819,0.137535,1.0,rbf,5000.0,"{'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__ma...",0.759672,0.749707,0.756741,0.737397,0.747362,0.750176,0.007806,6
17,8.458928,0.191565,1.607746,0.172875,10.0,rbf,,"{'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__m...",0.748535,0.747948,0.749707,0.740328,0.757327,0.748769,0.005405,7
15,5.387565,0.272719,0.989848,0.016733,10.0,rbf,5000.0,"{'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__m...",0.754396,0.740328,0.75381,0.733294,0.740914,0.744549,0.008251,8
6,4.357544,0.256946,0.708404,0.047343,1.0,linear,5000.0,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidf_...",0.747362,0.740328,0.748535,0.729191,0.74619,0.742321,0.007146,9
11,8.965044,0.259068,1.629704,0.182771,1.0,rbf,,"{'svm__C': 1, 'svm__kernel': 'rbf', 'tfidf__ma...",0.740914,0.743259,0.737984,0.739742,0.746776,0.741735,0.003048,10
