In [1]:
import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the review dataset. The path is relative, so it resolves against the
# directory the notebook kernel was started in.
df = pd.read_csv("../data/Britannia.csv")

In [2]:
# Drop rows whose review text is one of the literal placeholder strings.
# NOTE(review): "No Negative" / "No Positive" presumably mark an absent review
# in the source data — confirm against the dataset description.
negative_mask = df["Negative_Review"].ne("No Negative")
positive_mask = df["Positive_Review"].ne("No Positive")
negative_reviews = df.loc[negative_mask, "Negative_Review"]
positive_reviews = df.loc[positive_mask, "Positive_Review"]

# Stack both review kinds into a single column. The label list is built in the
# same negative-then-positive order as the concatenation, so rows and labels
# line up by position.
stacked_reviews = pd.concat([negative_reviews, positive_reviews], ignore_index=True)
sentiment_labels = ["Negative"] * len(negative_reviews)
sentiment_labels = sentiment_labels + ["Positive"] * len(positive_reviews)
review_df = pd.DataFrame({"Review": stacked_reviews, "Sentiment": sentiment_labels})

# Resources shared by the preprocessing helpers; built once at module level.
# The pattern matches every character that is neither an ASCII letter nor whitespace.
regex_pattern = re.compile(r"[^a-zA-Z\s]")
stop_words_set = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
# First letter of a POS tag -> WordNet part-of-speech constant.
pos_mapping = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}


def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected and looked up in
    ``pos_mapping``; any first character without an entry falls back to
    ``wordnet.NOUN``.
    """
    initial = tag[0]
    if initial in pos_mapping:
        return pos_mapping[initial]
    return wordnet.NOUN


def preprocess_text(text):
    """Normalise one raw review into a space-joined string of lemmas.

    Steps, in order: lower-case the text, delete every character that is not
    an ASCII letter or whitespace, tokenise, drop English stop words, POS-tag
    the remaining tokens and lemmatise each token using the WordNet POS that
    ``get_wordnet_pos`` derives from its tag.

    Args:
        text: Raw review text. Non-string values (for example the float NaN
            that pandas produces for an empty CSV cell) are treated as an
            empty review instead of raising.

    Returns:
        The cleaned review as a single string; ``""`` when ``text`` is not a
        string or when no token survives cleaning.
    """
    # A missing cell arrives as NaN (a float), which has no .lower(); without
    # this guard a single empty review aborts the whole .apply() pass.
    if not isinstance(text, str):
        return ""

    letters_only = regex_pattern.sub("", text.lower())
    tokens = [
        token for token in word_tokenize(letters_only) if token not in stop_words_set
    ]
    # Tagging happens after stop-word removal, matching the original order, so
    # the cleaned text produced for string input is unchanged.
    return " ".join(
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    )


# Clean every review and store the result next to the raw text.
cleaned_reviews = review_df["Review"].map(preprocess_text)
review_df["Cleaned_Review"] = cleaned_reviews

# Share of each label, to check how balanced the two classes are.
class_distribution = review_df["Sentiment"].value_counts(normalize=True)
print("Class Distribution:\n", class_distribution)

Class Distribution:
 Sentiment
Negative    0.509748
Positive    0.490252
Name: proportion, dtype: float64


In [3]:
# One seed for every stochastic step in this cell, so a Restart-and-Run-All
# reproduces the same split, the same CV scores and the same winning model.
RANDOM_STATE = 42

# Hold out 20% of the reviews for the final evaluation; stratifying on the
# label keeps the Negative/Positive proportions approximately equal in both
# splits.
X_train, X_test, y_train, y_test = train_test_split(
    review_df["Cleaned_Review"],
    review_df["Sentiment"],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=review_df["Sentiment"],
)

# Candidate estimators, each paired with the hyper-parameter grid to search.
# Grid keys carry the "clf__" prefix because perform_grid_search installs the
# estimator as the "clf" step of a Pipeline.
# The tree-based estimators draw random numbers while fitting; they were
# previously unseeded, which let the reported scores (and potentially the
# selected model) drift from run to run.
classifiers = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000),
        {"clf__C": [0.1, 1, 10]},
    ),
    "Naive Bayes": (MultinomialNB(), {"clf__alpha": [0.1, 0.5, 1.0]}),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        {"clf__max_depth": [10, 50, 100]},
    ),
    "SVM": (SVC(), {"clf__C": [0.1, 1, 10], "clf__kernel": ["linear", "rbf"]}),
    "Random Forest": (
        RandomForestClassifier(random_state=RANDOM_STATE),
        {"clf__n_estimators": [10, 50, 100]},
    ),
}


# Running best over all candidates; updated by the selection loop below.
best_score, best_model, best_params = 0, None, None


def perform_grid_search(name, clf, param_grid):
    """Tune ``clf`` inside a TF-IDF pipeline with 5-fold grid search.

    The pipeline vectorises text with unigram+bigram TF-IDF features (capped
    at 5000) and feeds them to ``clf``. The search is scored by accuracy and
    fitted on the module-level ``X_train`` / ``y_train``.

    Args:
        name: Label of the candidate; accepted for the caller's convenience
            but not used inside this function.
        clf: Estimator instance placed in the pipeline's "clf" step.
        param_grid: Grid passed to ``GridSearchCV``.

    Returns:
        Tuple ``(best_score_, best_estimator_, best_params_)`` taken from the
        fitted search object.
    """
    steps = [
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ("clf", clf),
    ]
    search = GridSearchCV(
        estimator=Pipeline(steps=steps),
        param_grid=param_grid,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_score_, search.best_estimator_, search.best_params_


# Tune each candidate in turn and keep whichever reaches the highest
# cross-validated score; on a tie the earlier candidate is retained.
for name, (clf, param_grid) in classifiers.items():
    print(f"Training {name} ...")
    score, model, params = perform_grid_search(name, clf, param_grid)
    if not score > best_score:
        continue
    best_score = score
    best_model = model
    best_params = params

# Summary of the winning pipeline.
print(f"Best Classifier: {best_model.named_steps['clf'].__class__.__name__}")
print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

# joblib.dump(best_model, "best_model.joblib")

# Score the winner on the held-out test split.
y_pred = best_model.predict(X_test)
print("Classification Report:")
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)

Training Logistic Regression ...
Training Naive Bayes ...
Training Decision Tree ...
Training SVM ...
Training Random Forest ...
Best Classifier: MultinomialNB
Best Score: 0.8848659460027525
Best Parameters: {'clf__alpha': 0.1}
Classification Report:
              precision    recall  f1-score   support

    Negative     0.8669    0.9238    0.8944       853
    Positive     0.9149    0.8524    0.8826       820

    accuracy                         0.8888      1673
   macro avg     0.8909    0.8881    0.8885      1673
weighted avg     0.8904    0.8888    0.8886      1673



In [4]:
# Error analysis: collect the held-out reviews the selected model got wrong.
# (numpy is imported with the other dependencies in the notebook's first cell.)
y_pred = best_model.predict(X_test)

# Positional offsets into the test split (not index labels) of the misses.
wrong_indices = np.where(y_pred != y_test)[0]

# Keep the original row labels as the index so each miss can be traced back to
# its row in review_df.
wrong_predictions = (
    X_test.iloc[wrong_indices]
    .to_frame()
    .assign(
        Predicted_Sentiment=y_pred[wrong_indices],
        True_Sentiment=y_test.values[wrong_indices],
    )
)

print("Incorrectly Predicted Samples:")
wrong_predictions

Incorrectly Predicted Samples:


Unnamed: 0,Cleaned_Review,Predicted_Sentiment,True_Sentiment
6966,coffee,Negative,Positive
3171,location bit trek tube,Positive,Negative
3545,need little l c would fab view iff window clean,Positive,Negative
7518,water,Negative,Positive
5376,international hotel international staff team p...,Negative,Positive
...,...,...,...
975,leave lovely suite,Positive,Negative
6700,like staff smile,Negative,Positive
2193,nothing,Positive,Negative
4422,dreadful breakfast actually get money back not...,Negative,Positive
