##### Train

In [None]:
import os
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train every (vectorizer, classifier) combination, tune each classifier with a
# 5-fold grid search, keep the combination with the highest hold-out accuracy,
# and persist that model together with its fitted vectorizer.

# Input data: this cell reads the "processed_review" and "sentiment" columns.
df = pd.read_csv("../data/processed_reviews.csv")

# Candidate feature extractors, both limited to 5000 features over
# unigrams and bigrams.
vectorizers = {
    "Bag-of-Words": CountVectorizer(max_features=5000, ngram_range=(1, 2)),
    "TF-IDF": TfidfVectorizer(max_features=5000, ngram_range=(1, 2)),
}

# Candidate classifiers, each paired with the hyper-parameter grid to search.
models = {
    "SVM": (LinearSVC(), {"C": [0.1, 1, 10]}),
    "Logistic Regression": (LogisticRegression(max_iter=500), {"C": [0.1, 1, 10]}),
    "Naive Bayes": (MultinomialNB(), {"alpha": [0.1, 0.5, 1.0]}),
}

# Per-combination scores, keyed by (vectorizer name, model name), so the full
# comparison can be inspected after the loop instead of only the winner.
results = {}

X = df["processed_review"]
y = df["sentiment"]
# The evaluation cell repeats this split with the same test_size and
# random_state; keep the two in sync.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

best_model = None
best_score = 0
best_model_name = ""
best_vectorizer_name = ""
best_params = {}

for vec_name, vectorizer in vectorizers.items():
    print(f"\nUsing {vec_name} feature extraction...")

    # Fit the vocabulary on the training split only, then reuse it for the
    # hold-out split.
    # NOTE(review): the vectorizer is fitted before cross-validation, so every
    # CV fold sees a vocabulary built from all training rows. Wrapping the
    # vectorizer and classifier in a Pipeline would avoid that.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for model_name, (model, param_grid) in models.items():
        print(f"Tuning and evaluating {model_name}...")

        grid_search = GridSearchCV(
            model, param_grid, n_jobs=-1, cv=5, scoring="accuracy", verbose=2
        )
        grid_search.fit(X_train_vec, y_train)

        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X_test_vec)

        accuracy = accuracy_score(y_test, y_pred)

        results[(vec_name, model_name)] = {
            "accuracy": accuracy,
            "cv_accuracy": grid_search.best_score_,
            "best_params": grid_search.best_params_,
        }

        # NOTE(review): the winner is chosen on hold-out accuracy, so the
        # reported accuracy of the selected model is optimistically biased.
        # Selecting on grid_search.best_score_ would keep the test set unseen.
        if accuracy > best_score:
            best_score = accuracy
            best_model = best_estimator
            best_model_name = model_name
            best_vectorizer_name = vec_name
            best_params = grid_search.best_params_

# Without this guard a None model would be pickled and the vectorizer lookup
# below would fail with an unhelpful KeyError on the empty name.
if best_model is None:
    raise RuntimeError("No model achieved an accuracy above 0; nothing to save.")

print(f"\nBest Model: {best_model_name} with {best_vectorizer_name}")
print(f"Best Parameters: {best_params}")
print(f"Accuracy: {best_score:.4f}")

model_dir = "../model"
# Create the output folder on a fresh checkout; joblib.dump does not create
# missing parent directories.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(model_dir, "best_ml_model.pkl"))
# The dict entry is the same instance that was fitted on X_train in the loop.
joblib.dump(
    vectorizers[best_vectorizer_name], os.path.join(model_dir, "best_ml_vectorizer.pkl")
)

print("Best model and vectorizer saved to the 'model' folder.")


Using Bag-of-Words feature extraction...
Tuning and evaluating SVM...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ..............................................C=0.1; total time=  53.4s
[CV] END ..............................................C=0.1; total time=  53.4s
[CV] END ..............................................C=0.1; total time=  58.8s
[CV] END ..............................................C=0.1; total time= 1.0min
[CV] END ..............................................C=0.1; total time= 1.0min
[CV] END ................................................C=1; total time= 1.2min
[CV] END ................................................C=1; total time= 1.2min
[CV] END ................................................C=1; total time= 1.3min
[CV] END ................................................C=1; total time=  50.0s
[CV] END ................................................C=1; total time=  52.0s
[CV] END ...............................................C=1

##### Evaluate

In [2]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Reload the persisted classifier and vectorizer, rebuild the hold-out split,
# and print a per-class classification report for it.
model_dir = "../model"
model_path = os.path.join(model_dir, "best_ml_model.pkl")
vectorizer_path = os.path.join(model_dir, "best_ml_vectorizer.pkl")
best_model = joblib.load(model_path)
best_vectorizer = joblib.load(vectorizer_path)

df = pd.read_csv("../data/processed_reviews.csv")

X = df["processed_review"]
y = df["sentiment"]

# Split the processed and the raw review text together so each test row keeps
# both columns. test_size and random_state match the training cell, presumably
# so the same rows end up in the hold-out set.
text_columns = df[["processed_review", "review"]]
X_train, X_test, y_train, y_test = train_test_split(
    text_columns, y, random_state=42, test_size=0.2
)

# Only the processed text is vectorized; the raw "review" column is carried
# along untouched.
X_test_vec = best_vectorizer.transform(X_test["processed_review"])
y_pred = best_model.predict(X_test_vec)

print("Classification Report:")
report = classification_report(y_test, y_pred, digits=4)
print(report)

Classification Report:
              precision    recall  f1-score   support

    negative     0.9311    0.9482    0.9396     77233
    positive     0.9577    0.9435    0.9505     95909

    accuracy                         0.9456    173142
   macro avg     0.9444    0.9459    0.9451    173142
weighted avg     0.9458    0.9456    0.9457    173142



In [8]:
# Collect the test rows whose prediction differs from the true label and write
# them, with both labels attached, to a CSV file.
is_wrong = y_pred != y_test  # boolean mask, computed once and reused below
misclassified = X_test.loc[is_wrong].assign(
    **{
        "True Sentiment": y_test[is_wrong],
        "Predicted Sentiment": y_pred[is_wrong],
    }
)
misclassified.to_csv("../data/ml_misclassified_reviews.csv", index=False)