In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from pathlib import Path

from utils.helper import find_project_root

In [None]:
# Folder holding the NLP datasets, joined onto whatever find_project_root()
# returns. The right-hand operand stays a Path so the "/" join keeps working
# even if the helper hands back a plain string.
DATASET_PATH = find_project_root() / Path("datasets", "nlp")

# The archive is passed to pandas as-is; read_csv infers the zip compression
# from the file extension.
imdb_archive = DATASET_PATH / "IMDB Dataset.csv.zip"
df = pd.read_csv(imdb_archive)

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
from nlp.preprocessing import TextPreprocessor

# Before/after comparison of the preprocessor on the review stored under
# index label 0.
preprocessor = TextPreprocessor()
sample = df.loc[0, "review"]

# The ".500" format spec truncates the text to its first 500 characters.
print(format(sample, ".500") + "\n")

cleaned_sample = preprocessor.preprocess(sample)
print(format(cleaned_sample, ".500"))


In [None]:
from nlp.preprocessing import TextPreprocessor
from nlp.vectorizers import TfidfVectorizerWrapper

# Toy corpus: three sentences that differ only in their last word, used to
# eyeball what the vectorizer wrapper produces.
texts = [
    "this movie was great",
    "this movie was terrible",
    "this movie was fantastic",
]

pre = TextPreprocessor()
texts_clean = pre.preprocess_batch(texts)

# The demo vectorizer gets its own name (`demo_vec`) so it never overwrites
# the `vec` that is fitted on the real training data further down; re-running
# this cell after training would otherwise make later `vec.get_feature_names()`
# calls describe this 3-sentence vocabulary instead of the model's.
#
# NOTE(review): if the wrapper forwards min_df/max_df to scikit-learn's
# TfidfVectorizer, the integer max_df=1 means "keep terms that occur in at
# most ONE document" (so the shared words are dropped), whereas the float
# max_df=1.0 means "no upper limit". Confirm which one is intended.
demo_vec = TfidfVectorizerWrapper(min_df=1, max_df=1)
X = demo_vec.fit_transform(texts_clean)

print(X.shape)
print(demo_vec.get_feature_names())
X

In [None]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
#
# The guard makes the cell idempotent. A bare `.map(...)` executed a second
# time finds no "positive"/"negative" keys in the already-encoded column and
# silently replaces every label with NaN.
label_to_id = {"positive": 1, "negative": 0}

if not df["sentiment"].isin(list(label_to_id.values())).all():
    encoded_sentiment = df["sentiment"].map(label_to_id)

    # Fail loudly on labels outside the mapping instead of letting NaN
    # targets leak into the train/test split.
    unknown_labels = df.loc[encoded_sentiment.isna(), "sentiment"].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unexpected sentiment labels: {unknown_labels.tolist()}")

    df["sentiment"] = encoded_sentiment.astype("int64")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Imported here as well so this cell does not depend on the exploratory
# TF-IDF demo cell having been executed first.
from nlp.vectorizers import TfidfVectorizerWrapper

# Hold-out configuration, named so the split is easy to find and tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# df_train / df_test hold the review texts (Series, not DataFrames);
# y_train / y_test hold the matching sentiment labels.
df_train, df_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Fit the vectorizer on the training texts only, then reuse it for the test
# texts so no test-set vocabulary or statistics enter the fit.
# NOTE(review): the reviews are passed in as loaded; TextPreprocessor is not
# applied in this cell. Confirm whether the wrapper cleans text internally.
vec = TfidfVectorizerWrapper()
X_train = vec.fit_transform(df_train)
X_test = vec.transform(df_test)

In [None]:
from nlp.models import LogisticRegressionModel

# Train the classifier on the vectorized training reviews.
model = LogisticRegressionModel(C=1.0)
model.fit(X_train, y_train)

# Ask the model for its 10 top features per direction, keyed by the
# vectorizer's feature names.
top_pos, top_neg = model.get_top_features(vec.get_feature_names(), k=10)

# Each entry unpacks into a pair that is printed as-is
# (presumably term and weight; verify against get_top_features).
for heading, ranked in (
    ("Top positive features:", top_pos),
    ("\nTop negative features:", top_neg),
):
    print(heading)
    for term, weight in ranked:
        print(term, weight)


In [None]:
# Score the held-out feature matrix with the fitted model.
# y_pred feeds the accuracy, classification report and confusion matrix cells.
# NOTE(review): y_proba is not read by any later cell visible in this notebook.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

In [None]:
# Accuracy of the predictions against the held-out labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
# Per-class metrics on the held-out split; the heading and the report are
# emitted on separate lines.
print("Classification report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
import seaborn as sns

print("Confusion matrix:")

# Labels are pinned to [0, 1] so the matrix is always 2x2 and its order matches
# the tick names below (0 = negative, 1 = positive, per the sentiment encoding).
class_names = ["negative", "positive"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# scikit-learn puts true labels on rows and predicted labels on columns.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
# Trailing ";" keeps the return value of ax.set(...) out of the cell output.
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix");