In [7]:
from scipy.sparse import load_npz
import pandas as pd

# Features: the sparse matrix saved as tfidf_matrix.npz by an earlier processing step.
X = load_npz("../data/processed/tfidf_matrix.npz")
# Targets: the 'label' column of the labels CSV.
y = pd.read_csv("../data/processed/labels.csv").loc[:, 'label']

# Quick look at both objects. X is sparse, so its text form lists only the
# stored entries as "(row, column)  value" instead of a dense grid.
print(f"Show the first two rows of X: \n{X[0:2]}")
print(f"Show the first five labels: \n {y.head()}")

Show the first two rows of X: 
  (0, 927)	0.29139783035410916
  (0, 334)	0.2395913846995883
  (0, 428)	0.41898667089873387
  (0, 972)	0.34731064288019076
  (0, 338)	0.2844542469263201
  (0, 49)	0.3886271165799766
  (0, 168)	0.3957882453544288
  (0, 637)	0.34896914617041114
  (0, 325)	0.22613205283344187
  (1, 948)	0.6596320170674406
  (1, 433)	0.6240184231409794
  (1, 579)	0.4189112192818252
Show the first five labels: 
 0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified and the classes look imbalanced —
# consider passing stratify=y so both partitions keep the same ham/spam ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
# Report the shape of each partition, training set first.
print(X_train.shape, X_test.shape, sep="\n")

(4457, 1000)
(1115, 1000)


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Multinomial Naive Bayes: a common baseline for word-frequency style text features.
# fit() returns the estimator itself, so construction and training chain into one step.
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict on the held-out test split and report the standard metrics.
y_pred_nb = nb_model.predict(X_test)

print("Naive Bayes \nAccuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred_nb))
print("Classification report: \n", classification_report(y_true=y_test, y_pred=y_pred_nb))
print("Confusion matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred_nb))

Naive Bayes 
Accuracy:  0.979372197309417
Classification report: 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix: 
 [[964   2]
 [ 21 128]]


In [11]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default regularisation; the seed is fixed for
# repeatability and the iteration cap is raised to 500.
# fit() returns the estimator itself, so construction and training chain into one step.
lr_model = LogisticRegression(max_iter=500, random_state=42).fit(X_train, y_train)

# Predict on the held-out test split and report the standard metrics.
y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression \nAccuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print("Classification report: \n", classification_report(y_true=y_test, y_pred=y_pred_lr))
print("Confusion matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred_lr))


Logistic Regression 
Accuracy:  0.9659192825112107
Classification report: 
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.98      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion matrix: 
 [[964   2]
 [ 36 113]]


In [12]:
from sklearn.model_selection import GridSearchCV

# Use GridSearchCV to find the best parameters.
# The grid covers the inverse regularisation strength C and two solvers.
param_grid = {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}

# max_iter=500 mirrors the un-tuned baseline LogisticRegression in this notebook,
# so the tuned-vs-baseline comparison is like-for-like. It also gives the solvers
# room to converge at weak regularisation (C=10); the library default is 100.
# cv=5 --> 5-fold cross validation on the training split only; candidates are
# ranked by mean validation accuracy.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42, max_iter=500),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'C': 10, 'solver': 'lbfgs'}


In [13]:
# Evaluate the tuned model on the held-out test split. With the default refit
# behaviour, GridSearchCV.predict delegates to the best estimator from the search.
y_pred_gs = grid_search.predict(X_test)

# The header names the tuned model explicitly so this report cannot be mistaken
# for the baseline logistic regression, which prints the same set of metrics.
print("Logistic Regression (Optimized) \nAccuracy: ", accuracy_score(y_test, y_pred_gs))
print("Classification report: \n", classification_report(y_test, y_pred_gs))
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred_gs))

Logistic Regression 
Accuracy:  0.9838565022421525
Classification report: 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion matrix: 
 [[964   2]
 [ 16 133]]


In [14]:
# Side-by-side test accuracy of the three fitted models.
models = [
    'MultinomialNB',
    'Logistic Regression',
    'Logistic Regression (Optimized)',
]
# Predictions are listed in the same order as the display names above.
accuracies = [
    accuracy_score(y_test, preds)
    for preds in (y_pred_nb, y_pred_lr, y_pred_gs)
]

for model, acc in zip(models, accuracies):
    print(f"{model}: {acc}")


MultinomialNB: 0.979372197309417
Logistic Regression: 0.9659192825112107
Logistic Regression (Optimized): 0.9838565022421525


In [15]:
import os

# joblib --> library to save machine learning models
import joblib

# Single source of truth for the output location, used by both the dump and the message.
model_path = "../models/best_model.pkl"

# Create the target directory if it is missing; joblib.dump does not create
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the best estimator selected by the grid search.
joblib.dump(grid_search.best_estimator_, model_path)

print(f"Saved model into: {model_path}")

Saved model into: ../models/best_model.pkl
