In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from models.tbnb import TbNB
from utils.benchmarking import evaluate_model
from pathlib import Path

DATA_DIR = Path("../datasets")


## Data Loading

Dataset contains 50k IMDB movie reviews with binary response labels

In [2]:
file_path = DATA_DIR / "IMDB Dataset.csv"
data = pd.read_csv(file_path)
X = data["review"]
y = data["sentiment"]
print(f"Dataset size: {X.shape[0]} samples")


Dataset size: 50000 samples


## Cross-Validation
Scikit-learn readily provides an efficient Stratified Cross-Validation class 

In [3]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)

# Computational efficiency test

For each iteration we apply the following pipeline:
1. Train CountVectorizer on train split
2. Transform both train and test reviews into BoW matrices
3. Initialize models (base and iterative)
4. Apply the evaluate function which computes: accuracy and f1 score, training time and predicting time
5. Store everything withing a pandas dataframe for inspection
  


We also test different configurations for CountVectorizer, namely whether we use ngrams or not. Results indicate ngrams improve performance but require more training time since the training space is expanded

In [4]:
results = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"\nFold {i+1} ============================")

    train_x, train_y = X[train_index], y[train_index]
    test_x, test_y = X[test_index], y[test_index]
    
    vectorizer = CountVectorizer(stop_words="english", ngram_range=(1,2))
    train_vec = vectorizer.fit_transform(train_x)
    test_vec = vectorizer.transform(test_x)
    
    model_iter = TbNB(iterative=True)
    model_base = TbNB()
    
    r1 = evaluate_model(model_iter, train_vec, train_y, test_vec, test_y, "TbNB Iterative", pos_label="positive")
    r2 = evaluate_model(model_base, train_vec, train_y, test_vec, test_y, "TbNB Base", pos_label="positive")
    

    r1["Fold"] = i + 1
    r2["Fold"] = i + 1
    results.extend([r1, r2])



df_results = pd.DataFrame(results)








In [5]:
print("\nFinal Benchmark Results (ngrams):")
print(df_results)


Final Benchmark Results (ngrams):
            Model  Accuracy  F1-score  Train Time (s)  Predict Time (s)  Fold
0  TbNB Iterative    0.8798  0.885196        1.301789          0.030372     1
1       TbNB Base    0.8756  0.882353        1.187003          0.016791     1
2  TbNB Iterative    0.8849  0.888285        1.435097          0.032234     2
3       TbNB Base    0.8862  0.885490        1.327603          0.035238     2
4  TbNB Iterative    0.8785  0.881683        2.564238          0.121928     3
5       TbNB Base    0.8749  0.871495        1.760657          0.033807     3
6  TbNB Iterative    0.8746  0.878700        1.601097          0.035856     4
7       TbNB Base    0.8713  0.866729        1.445526          0.026662     4
8  TbNB Iterative    0.8833  0.886842        1.540773          0.072547     5
9       TbNB Base    0.8848  0.883164        1.360712          0.024121     5


In [6]:
print("\nAverage Performance (ngrams):")
print(df_results.groupby("Model").mean(numeric_only=True))


Average Performance (ngrams):
                Accuracy  F1-score  Train Time (s)  Predict Time (s)  Fold
Model                                                                     
TbNB Base        0.87856  0.877846        1.416300          0.027324   3.0
TbNB Iterative   0.88022  0.884141        1.688599          0.058587   3.0


In [7]:
results = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"\nFold {i+1} ============================")

    train_x, train_y = X[train_index], y[train_index]
    test_x, test_y = X[test_index], y[test_index]
    
    vectorizer = CountVectorizer()
    train_vec = vectorizer.fit_transform(train_x)
    test_vec = vectorizer.transform(test_x)
    
    model_iter = TbNB(iterative=True)
    model_base = TbNB()
    
    r1 = evaluate_model(model_iter, train_vec, train_y, test_vec, test_y, "TbNB Iterative", pos_label="positive")
    r2 = evaluate_model(model_base, train_vec, train_y, test_vec, test_y, "TbNB Base", pos_label="positive")
    

    r1["Fold"] = i + 1
    r2["Fold"] = i + 1
    results.extend([r1, r2])



df_results = pd.DataFrame(results)








In [8]:
print("\nFinal Benchmark Results:")
print(df_results)


Final Benchmark Results:
            Model  Accuracy  F1-score  Train Time (s)  Predict Time (s)  Fold
0  TbNB Iterative    0.8609  0.862563        0.368036          0.010521     1
1       TbNB Base    0.8571  0.856281        0.316214          0.003059     1
2  TbNB Iterative    0.8594  0.862749        0.299840          0.005858     2
3       TbNB Base    0.8590  0.863399        0.282916          0.003298     2
4  TbNB Iterative    0.8593  0.863411        0.285548          0.004849     3
5       TbNB Base    0.8593  0.863411        0.283168          0.003024     3
6  TbNB Iterative    0.8525  0.852603        0.280194          0.005017     4
7       TbNB Base    0.8531  0.853962        0.281799          0.003084     4
8  TbNB Iterative    0.8580  0.862056        0.328515          0.009181     5
9       TbNB Base    0.8585  0.861965        0.300602          0.003067     5


In [9]:
print("\nAverage Performance:")
print(df_results.groupby("Model").mean(numeric_only=True))


Average Performance:
                Accuracy  F1-score  Train Time (s)  Predict Time (s)  Fold
Model                                                                     
TbNB Base        0.85740  0.859803        0.292940          0.003106   3.0
TbNB Iterative   0.85802  0.860676        0.312427          0.007085   3.0
