In [1]:
import re
import joblib
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer


# Output file for the serialized best pipeline (a file path, despite the "_dir" name);
# it is passed directly to joblib.dump after model selection.
model_dir = "../model/best_ml_model.joblib"

In [2]:
# Load the raw export; each row has a negative and a positive free-text column.
df = pd.read_csv("../data/Britannia.csv")

# Skip the placeholder values (presumably written when a guest left the field blank).
has_negative = df["Negative_Review"] != "No Negative"
has_positive = df["Positive_Review"] != "No Positive"
negative_reviews = df.loc[has_negative, "Negative_Review"]
positive_reviews = df.loc[has_positive, "Positive_Review"]

# Stack both columns into one long table (negatives first) and label every
# review with the column it came from.
all_reviews = pd.concat([negative_reviews, positive_reviews], ignore_index=True)
sentiment_labels = ["Negative"] * len(negative_reviews)
sentiment_labels = sentiment_labels + ["Positive"] * len(positive_reviews)
review_df = pd.DataFrame({"Review": all_reviews, "Sentiment": sentiment_labels})
review_df.describe()

Unnamed: 0,Review,Sentiment
count,8361,8361
unique,7689,2
top,Location,Negative
freq,151,4262


In [3]:
# Shared resources for preprocess_text, built once instead of once per review.
# NOTE(review): these need the NLTK stopwords / wordnet data (and the tokenizer
# and tagger models used later) to be installed; no visible cell downloads them.
lemmatizer = WordNetLemmatizer()
stop_words_set = set(stopwords.words("english"))

# Matches every character that is neither an ASCII letter nor whitespace.
regex_pattern = re.compile(r"[^a-zA-Z\s]")

# First letter of a pos_tag tag -> WordNet part of speech for the lemmatizer.
pos_mapping = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmas.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words, POS-tag the remaining
    tokens and lemmatize each one with the WordNet part of speech derived
    from the first letter of its tag (noun when the letter is unmapped).

    Args:
        text: Raw review text. Non-string values (None, or the float NaN that
            pandas produces for empty CSV cells) are treated as an empty review.

    Returns:
        The cleaned review as a single string; "" when no token survives.
    """
    # A missing value would otherwise raise AttributeError on .lower() and
    # abort the whole .apply() over the review column.
    if not isinstance(text, str):
        return ""

    # Unwanted characters are deleted, not replaced by a space, so words
    # separated only by punctuation ("a,b") are fused into one token ("ab").
    text = regex_pattern.sub("", text.lower())

    # NOTE(review): stop-word removal also drops negations (the saved outputs
    # show "not water" cleaned to "water"); consider keeping them for
    # sentiment work.
    tokens = [word for word in word_tokenize(text) if word not in stop_words_set]

    return " ".join(
        lemmatizer.lemmatize(word, pos_mapping.get(tag[0], wordnet.NOUN))
        for word, tag in pos_tag(tokens)
    )


# Clean every review and keep the result next to the raw text.
cleaned_reviews = review_df["Review"].apply(preprocess_text)
review_df["Cleaned_Review"] = cleaned_reviews
review_df.describe()

Unnamed: 0,Review,Sentiment,Cleaned_Review
count,8361,8361,8361
unique,7689,2,7270
top,Location,Negative,location
freq,151,4262,235


In [4]:
# Hold out 20% of the reviews, stratified on the label. Both the raw and the
# cleaned text are carried along so test-set errors can be read in their
# original wording.
# NOTE(review): the describe() output above reports 7689 unique texts among
# 8361 reviews, so identical reviews can end up on both sides of this split.
features = review_df[["Review", "Cleaned_Review"]]
labels = review_df["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)
print(X_train.shape, X_test.shape)

(6688, 2) (1673, 2)


In [5]:
# Candidate estimators, each paired with the hyper-parameter grid to search.
# Grid keys carry the "clf__" prefix because the estimator is inserted as the
# "clf" step of the pipeline built in the search loop.
# The decision tree and the random forest are randomized; they are seeded
# (same seed as the train/test split) so that model selection gives the same
# result on every Restart & Run All.
classifiers = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000),
        {"clf__C": [0.1, 1, 10]},
    ),
    "Naive Bayes": (MultinomialNB(), {"clf__alpha": [0.1, 0.5, 1.0]}),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {"clf__max_depth": [10, 50, 100]},
    ),
    "SVM": (SVC(), {"clf__C": [0.1, 1, 10], "clf__kernel": ["linear", "rbf"]}),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {"clf__n_estimators": [10, 50, 100]},
    ),
}

# Running winner across all candidates, ranked by cross-validated accuracy.
best_score = 0
best_model = None
best_params = None

for estimator, grid in tqdm(classifiers.values()):
    # TF-IDF over unigrams and bigrams (at most 5000 features) feeds the classifier.
    steps = [
        ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ("clf", estimator),
    ]
    grid_search = GridSearchCV(
        Pipeline(steps), grid, cv=5, scoring="accuracy", n_jobs=-1
    )
    grid_search.fit(X_train["Cleaned_Review"], y_train)

    # Strict comparison: on a tie the earlier candidate stays the winner.
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

# Persist the winning pipeline (vectorizer + classifier) and report it.
joblib.dump(best_model, model_dir)
winner_name = type(best_model.named_steps["clf"]).__name__
print(
    f"Best Model: {winner_name}\n"
    f"Best Score: {best_score}\n"
    f"Best Params: {best_params}"
)

  0%|          | 0/5 [00:00<?, ?it/s]

Best Model: MultinomialNB
Best Score: 0.8848659460027525
Best Params: {'clf__alpha': 0.1}


In [6]:
# Score the selected pipeline on the held-out reviews.
y_pred = best_model.predict(X_test["Cleaned_Review"])
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)

              precision    recall  f1-score   support

    Negative     0.8669    0.9238    0.8944       853
    Positive     0.9149    0.8524    0.8826       820

    accuracy                         0.8888      1673
   macro avg     0.8909    0.8881    0.8885      1673
weighted avg     0.8904    0.8888    0.8886      1673



In [7]:
# Collect the held-out reviews the model got wrong, for manual error analysis.
y_pred = best_model.predict(X_test["Cleaned_Review"])
mismatch = y_pred != y_test.values
wrong_idx = np.where(mismatch)[0]  # positional indices into X_test / y_test

# .assign returns a new frame, so X_test itself is left untouched.
wrong_pred_df = X_test.iloc[wrong_idx].assign(
    Predicted_Sentiment=y_pred[wrong_idx],
    True_Sentiment=y_test.values[wrong_idx],
)

# index=False: the original row labels are not written to the CSV.
wrong_pred_df.to_csv("../data/wrong_pred_ml.csv", index=False)
wrong_pred_df

Unnamed: 0,Review,Cleaned_Review,Predicted_Sentiment,True_Sentiment
6966,Coffee,coffee,Negative,Positive
3171,Location a bit of a trek from tube,location bit trek tube,Positive,Negative
3545,Just needs a little T L C Would of had a fab ...,need little l c would fab view iff window clean,Positive,Negative
7518,not water,water,Negative,Positive
5376,The international hotel has very internationa...,international hotel international staff team p...,Negative,Positive
...,...,...,...,...
975,That we had to leave our lovely suite,leave lovely suite,Positive,Negative
6700,i like the staff all of them with the smile,like staff smile,Negative,Positive
2193,Nothing,nothing,Positive,Negative
4422,Dreadful breakfast Actually had to get my mon...,dreadful breakfast actually get money back not...,Negative,Positive
