In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from utils.helper import find_project_root

In [None]:
# Directory holding the NLP datasets, resolved relative to the project root.
DATASET_PATH = find_project_root() / Path("datasets", "nlp")

# The CSV is read straight from the zip archive; pandas infers the
# compression from the ".zip" extension.
imdb_archive = DATASET_PATH / "IMDB Dataset.csv.zip"
df = pd.read_csv(imdb_archive)

In [None]:
from nlp.data.preprocessing import TextPreprocessor
from nlp.features.embeddings import GensimWord2Vec, SentenceEmbedder, PretrainedSentenceEmbedder
from nlp.pipelines.embedding_pipeline import EmbeddingPipeline

In [None]:
# Encode the string sentiment labels as integers: positive -> 1, negative -> 0.
SENTIMENT_TO_LABEL = {"positive": 1, "negative": 0}

# Only encode while the column still contains the raw string labels. Without
# this guard, re-running the cell would map the already-encoded 0/1 values
# (which are not keys of the mapping) to NaN and silently wipe the target.
if df["sentiment"].isin(list(SENTIMENT_TO_LABEL)).any():
    df["sentiment"] = df["sentiment"].map(SENTIMENT_TO_LABEL)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
reviews, labels = df["review"], df["sentiment"]
split_parts = train_test_split(reviews, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Model Training

In [None]:
from nlp.models.linear import LogisticRegressionModel
from sklearn.metrics import classification_report, confusion_matrix

# Pipeline stages, in the order EmbeddingPipeline receives them:
# text preprocessing, Word2Vec model, sentence embedder, classifier.
# NOTE(review): SentenceEmbedder is constructed with None - presumably the
# pipeline hands it the trained word vectors during fit; confirm.
w2v_stages = [
    TextPreprocessor(),
    GensimWord2Vec(workers=10),
    SentenceEmbedder(None),
    LogisticRegressionModel(),
]
pipeline = EmbeddingPipeline(*w2v_stages)

pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out reviews with the fitted Word2Vec pipeline.
y_pred = pipeline.predict(X_test)

In [None]:
# Per-class metrics on the test split (labels: 0 = negative, 1 = positive).
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels
# (scikit-learn convention).
confusion_matrix(y_test, y_pred)

In [None]:
# Positions (0-based, relative to X_test / y_test order) of misclassified reviews.
# Both sides are converted to plain arrays so the comparison is strictly
# positional: y_test keeps the shuffled original index from the split, and a
# label-aligned pandas comparison would raise (or misalign) if `predict`
# ever returned a Series instead of an array. These positions are meant to be
# used with `.iloc`, not `.loc`.
mismatch_mask = np.asarray(y_pred) != np.asarray(y_test)
mis_idx = np.flatnonzero(mismatch_mask)

len(mis_idx)

In [None]:
# Print the first ten misclassified reviews together with their true and
# predicted labels; `mis_idx` holds positions, hence the `.iloc` lookups.
for pos in mis_idx[:10]:
    print("\n\n----------")
    fields = (
        ("TEXT:", X_test.iloc[pos]),
        ("TRUE:", y_test.iloc[pos]),
        ("PRED:", y_pred[pos]),
    )
    for tag, value in fields:
        print(tag, value)

# Using pre-trained GloVe embeddings

In [None]:
import gensim.downloader as api

# Load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's downloader.
# NOTE: per the gensim docs, the first call downloads the model and later
# calls reuse the local cache, so this cell needs network access once.
glove = api.load("glove-wiki-gigaword-100")

In [None]:
# Same pipeline layout as the Word2Vec run, but the embedding stage is the
# already-loaded GloVe vectors; `pretrained_w2v=True` is passed so the
# pipeline treats them as pre-trained.
# NOTE(review): presumably this flag makes `fit` skip embedding training - confirm
# against EmbeddingPipeline.
glove_stages = [
    TextPreprocessor(),
    glove,
    PretrainedSentenceEmbedder(glove),
    LogisticRegressionModel(),
]
pipeline_glove = EmbeddingPipeline(*glove_stages, pretrained_w2v=True)

pipeline_glove.fit(X_train, y_train)

In [None]:
y_pred = pipeline_glove.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

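The pre-trained GloVe embeddings (trained on Wikipedia + Gigaword) did not outperform the Word2Vec embeddings trained locally on the IMDB reviews. This is likely due to domain mismatch (encyclopedic and news text versus informal movie reviews) and task misalignment (general-purpose vectors are not tuned for sentiment, so words of opposite polarity can end up close together).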