In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from TbNB import TbNB
from utils.benchmarking import evaluate_model
from pathlib import Path

# Directory containing the input datasets; a relative path, so it is resolved
# against the notebook's current working directory.
DATA_DIR = Path("../datasets")


## Data Loading

The dataset contains 50k IMDB movie reviews, each with a binary sentiment label.

In [2]:
# Read the IMDB reviews CSV, then pull out the review text (features) and the
# sentiment column (labels) used by every later cell.
file_path = DATA_DIR / "IMDB Dataset.csv"
data = pd.read_csv(file_path)
X, y = data["review"], data["sentiment"]
print(f"Dataset size: {len(X)} samples")


Dataset size: 50000 samples


## Cross-Validation
Scikit-learn provides the `StratifiedKFold` class, which splits the data into folds while preserving the class proportions in each fold.

In [3]:
# Preview how StratifiedKFold partitions the data.
# The seed is fixed so the folds printed here are identical on every run and
# match the splits used by the benchmark cells (same n_splits / shuffle / seed).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Fold 0:
  Train: index=[    2     3     4 ... 49997 49998 49999]
  Test:  index=[    0     1     6 ... 49987 49990 49992]
Fold 1:
  Train: index=[    0     1     2 ... 49996 49997 49999]
  Test:  index=[    4    11    15 ... 49983 49993 49998]
Fold 2:
  Train: index=[    0     1     2 ... 49997 49998 49999]
  Test:  index=[    7     8     9 ... 49976 49984 49991]
Fold 3:
  Train: index=[    0     1     4 ... 49995 49998 49999]
  Test:  index=[    2     3     5 ... 49994 49996 49997]
Fold 4:
  Train: index=[    0     1     2 ... 49996 49997 49998]
  Test:  index=[   14    27    29 ... 49989 49995 49999]


# Computational efficiency test

For each iteration we apply the following pipeline:
1. Train CountVectorizer on train split
2. Transform both train and test reviews into BoW matrices
3. Initialize models (base and iterative)
4. Apply the evaluate function, which computes accuracy, F1 score, training time and prediction time
5. Store everything within a pandas DataFrame for inspection
  


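We also test different configurations for CountVectorizer, namely whether we use n-grams or not. Results indicate that n-grams improve performance but require more training time, since the feature space is expanded. Note that the n-gram configuration below also removes English stop words, whereas the default configuration does not, so the two runs differ in more than the n-gram range.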

In [None]:
# Benchmark both TbNB variants on unigram + bigram bag-of-words features using
# 5-fold stratified cross-validation, collecting one result record per model per fold.
# NOTE(review): this run also sets stop_words="english", while the plain
# CountVectorizer() run later in the notebook does not, so the two runs differ in
# more than the n-gram range.
results = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"\nFold {i+1} ============================")

    # split() yields positional indices, so select rows by position with .iloc
    # rather than relying on the Series having a default 0..n-1 label index.
    train_x, train_y = X.iloc[train_index], y.iloc[train_index]
    test_x, test_y = X.iloc[test_index], y.iloc[test_index]

    # Fit the vocabulary on the training split only; the test split is merely
    # transformed so no test-set tokens leak into the features.
    vectorizer = CountVectorizer(stop_words="english", ngram_range=(1,2))
    train_vec = vectorizer.fit_transform(train_x)
    test_vec = vectorizer.transform(test_x)

    model_iter = TbNB(iterative=True)
    model_base = TbNB()

    r1 = evaluate_model(model_iter, train_vec, train_y, test_vec, test_y, "TbNB Iterative", pos_label="positive")
    r2 = evaluate_model(model_base, train_vec, train_y, test_vec, test_y, "TbNB Base", pos_label="positive")

    # Tag each record with its 1-based fold number before storing it.
    r1["Fold"] = i + 1
    r2["Fold"] = i + 1
    results.extend([r1, r2])

# Build the results table once from the accumulated records.
df_results = pd.DataFrame(results)







In [None]:
# Print the heading and the per-fold results table in a single call.
print("\nFinal Benchmark Results (ngrams):", df_results, sep="\n")

In [None]:
# Average every numeric metric per model across the folds.
# "Fold" is only an identifier, so it is dropped first; otherwise its mean
# would show up in the summary as if it were a metric.
print("\nAverage Performance (ngrams):")
print(df_results.drop(columns="Fold").groupby("Model").mean(numeric_only=True))

In [None]:
# Benchmark both TbNB variants on bag-of-words features from a default
# CountVectorizer, using the same seeded 5-fold stratified split as the n-gram run.
# NOTE(review): unlike the n-gram run, no stop-word removal is applied here, so
# the two runs differ in more than the n-gram range.
results = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"\nFold {i+1} ============================")

    # split() yields positional indices, so select rows by position with .iloc
    # rather than relying on the Series having a default 0..n-1 label index.
    train_x, train_y = X.iloc[train_index], y.iloc[train_index]
    test_x, test_y = X.iloc[test_index], y.iloc[test_index]

    # Fit the vocabulary on the training split only; the test split is merely
    # transformed so no test-set tokens leak into the features.
    vectorizer = CountVectorizer()
    train_vec = vectorizer.fit_transform(train_x)
    test_vec = vectorizer.transform(test_x)

    model_iter = TbNB(iterative=True)
    model_base = TbNB()

    r1 = evaluate_model(model_iter, train_vec, train_y, test_vec, test_y, "TbNB Iterative", pos_label="positive")
    r2 = evaluate_model(model_base, train_vec, train_y, test_vec, test_y, "TbNB Base", pos_label="positive")

    # Tag each record with its 1-based fold number before storing it.
    r1["Fold"] = i + 1
    r2["Fold"] = i + 1
    results.extend([r1, r2])

# Build the results table once from the accumulated records.
df_results = pd.DataFrame(results)

In [None]:
# Print the heading and the per-fold results table in a single call.
print("\nFinal Benchmark Results:", df_results, sep="\n")

In [None]:
# Average every numeric metric per model across the folds.
# "Fold" is only an identifier, so it is dropped first; otherwise its mean
# would show up in the summary as if it were a metric.
print("\nAverage Performance:")
print(df_results.drop(columns="Fold").groupby("Model").mean(numeric_only=True))