In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from pathlib import Path

from utils.helper import find_project_root

In [None]:
# Folder holding the NLP datasets, resolved relative to the project root.
DATASET_PATH = find_project_root() / Path("datasets", "nlp")

# The IMDB reviews ship as a zipped CSV; read_csv is handed the archive path directly.
imdb_archive = DATASET_PATH / "IMDB Dataset.csv.zip"
df = pd.read_csv(imdb_archive)

## Vectorize the text using TF-IDF
TF-IDF (Term Frequency-Inverse Document Frequency) is a common technique to convert text into numerical features. It assigns a weight to each word based on its frequency in the document and its rarity across the corpus. This helps to highlight important words while downplaying common ones.

In [None]:
from nlp.data.preprocessing import TextPreprocessor
from nlp.features.tfidf import TfidfVectorizerWrapper
from nlp.pipelines.tfidf_pipeline import TFIDFPipeline

In [None]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# running the mapping a second time on the already-encoded column would
# silently wipe out all labels. The dtype guard makes this cell safe to re-run.
SENTIMENT_TO_LABEL = {"positive": 1, "negative": 0}

if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    encoded_sentiment = df["sentiment"].map(SENTIMENT_TO_LABEL)

    # Fail fast on labels outside the mapping instead of passing NaN targets
    # on to the train/test split and the models.
    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")

    df["sentiment"] = encoded_sentiment.astype("int64")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
reviews = df["review"]
sentiments = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)

## Train a Logistic Regression model
Logistic Regression is a simple yet effective linear model for binary classification tasks. It estimates the probability that a given input belongs to a particular class (positive or negative in this case) based on the features extracted from the text. The model learns to assign weights to each feature (word) to make predictions. It is often used as a baseline in text classification tasks due to its simplicity and interpretability.

In [None]:
from nlp.models.linear import LogisticRegressionModel

# Assemble the three pipeline stages explicitly, then fit on the training split.
text_preprocessor = TextPreprocessor()
tfidf_vectorizer = TfidfVectorizerWrapper()
logreg_model = LogisticRegressionModel(C=1.0)

pipeline = TFIDFPipeline(text_preprocessor, tfidf_vectorizer, logreg_model)
pipeline.fit(X_train, y_train)

# Ask the fitted model for its 10 top features in each direction, using the
# vectorizer's feature names to label them.
feature_names = pipeline.vectorizer.get_feature_names()
top_pos, top_neg = pipeline.model.get_top_features(feature_names, k=10)

# Print both rankings with the same loop; each entry unpacks into two values.
for heading, ranked_features in (
    ("Top positive features:", top_pos),
    ("\nTop negative features:", top_neg),
):
    print(heading)
    for term, weight in ranked_features:
        print(term, weight)


In [None]:
from nlp.evaluation.metrics import evaluate_model, compute_confusion_matrix
import seaborn as sns

# Score the fitted pipeline on the held-out split and show the metrics.
metrics = pipeline.evaluate(X_test, y_test)
print(metrics)

# Predict once, then render the confusion matrix as an annotated heatmap.
y_pred = pipeline.predict(X_test)
confusion = compute_confusion_matrix(y_test, y_pred)
heatmap_style = {"annot": True, "fmt": "d", "cmap": "Blues"}
sns.heatmap(confusion, **heatmap_style)

## Compare Naive Bayes and Logistic Regression

In [None]:
from nlp.models.linear import LogisticRegressionModel
from nlp.models.naive_bayes import  MultinomialNBModel

def _fit_and_score(model):
    """Fit a fresh TF-IDF pipeline around ``model`` and evaluate it on the test split.

    Returns a ``(pipeline, metrics)`` pair, where ``metrics`` is whatever
    ``TFIDFPipeline.evaluate`` returns for ``X_test`` / ``y_test``.
    """
    fitted = TFIDFPipeline(TextPreprocessor(), TfidfVectorizerWrapper(), model)
    fitted.fit(X_train, y_train)
    return fitted, fitted.evaluate(X_test, y_test)


# Both models go through the same preprocessing, vectorizer and evaluation,
# so only the classifier differs between the two runs.
lr_pipeline, lr_metrics = _fit_and_score(LogisticRegressionModel(C=1.0))
nb_pipeline, nb_metrics = _fit_and_score(MultinomialNBModel(alpha=1.0))

print("Logistic Regression:", lr_metrics)
print("Naive Bayes:", nb_metrics)

In [None]:
# One row per model, one column per metric, for a side-by-side comparison.
model_names = ["LogisticRegression", "NaiveBayes"]
metric_rows = [lr_metrics, nb_metrics]
results_df = pd.DataFrame(metric_rows, index=model_names)

results_df

In [None]:
# Confusion matrix of the logistic-regression pipeline on the test split.
# The notebook draws several identically styled heatmaps, so a title is needed
# to tell them apart. No axis labels are set because the row/column orientation
# is decided by compute_confusion_matrix, which is not visible from here.
lr_confusion = compute_confusion_matrix(y_test, lr_pipeline.predict(X_test))
lr_ax = sns.heatmap(lr_confusion, annot=True, fmt="d", cmap="Blues")
lr_ax.set_title("Logistic Regression: confusion matrix");  # ';' hides the Text repr

In [None]:
# Confusion matrix of the Naive Bayes pipeline on the test split.
# The notebook draws several identically styled heatmaps, so a title is needed
# to tell them apart. No axis labels are set because the row/column orientation
# is decided by compute_confusion_matrix, which is not visible from here.
nb_confusion = compute_confusion_matrix(y_test, nb_pipeline.predict(X_test))
nb_ax = sns.heatmap(nb_confusion, annot=True, fmt="d", cmap="Blues")
nb_ax.set_title("Naive Bayes: confusion matrix");  # ';' hides the Text repr

### Analyze misclassified examples
Analyzing misclassified examples can provide insights into the model's weaknesses and help identify patterns that lead to incorrect predictions. By examining these examples, we can understand whether the model struggles with certain types of reviews, specific words, or phrases that may be ambiguous. This analysis can guide further improvements to the model, such as adding more features, using different preprocessing techniques, or even collecting more data to address specific cases where the model fails.

In [None]:
# Show the first misclassified review. It can be a false positive or a false
# negative; the printed "Predicted / Actual" line says which.
# NOTE: `y_pred` is whatever an earlier cell last assigned, so run the notebook
# top to bottom to be sure it belongs to the model being analysed.

# Compare as plain arrays so the check is purely positional and cannot be
# affected by pandas index alignment between predictions and y_test.
predicted_labels = np.asarray(y_pred)
actual_labels = np.asarray(y_test)
misclassified_idxs = np.where(predicted_labels != actual_labels)

if misclassified_idxs[0].size == 0:
    # Nothing to show; indexing [0][0] below would raise IndexError.
    print("No misclassified examples in the test set.")
else:
    first_error = misclassified_idxs[0][0]
    predicted_name = "Positive" if predicted_labels[first_error] == 1 else "Negative"
    actual_name = "Positive" if actual_labels[first_error] == 1 else "Negative"

    exemple = X_test.iloc[first_error]
    print(f"Predicted: {predicted_name}, Actual: {actual_name}")
    print(exemple)

In [None]:
import numpy as np

# From here on y_pred holds the predictions of the logistic-regression pipeline.
y_pred = lr_pipeline.predict(X_test)

# Positions (not index labels) of the test reviews the model got wrong.
is_wrong = y_pred != y_test
mis_idx = np.flatnonzero(is_wrong)

# Number of misclassified test reviews.
mis_idx.size


In [None]:
# Dump the first ten misclassified reviews with their true and predicted labels.
# mis_idx holds positions, hence .iloc on the pandas objects.
for row_pos in mis_idx[:10]:
    print("\n\n----------")
    review_text = X_test.iloc[row_pos]
    print("TEXT:", review_text)
    true_label = y_test.iloc[row_pos]
    print("TRUE:", true_label)
    predicted_label = y_pred[row_pos]
    print("PRED:", predicted_label)
