### Import - libraries and dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [21]:
# Load the preprocessed SMS corpus and discard rows whose 'processed_text' is missing,
# since the text column is what gets vectorized further down.
sms_csv_path = '/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/data/sms_preprocessed.csv'
df = pd.read_csv(sms_csv_path).dropna(subset=['processed_text'])

# Features are the processed text column; the target is the 'label' column.
X, y = df['processed_text'], df['label']

### Train/Test split

In [23]:
# Hold out 20% of the rows for testing. The split is stratified on y so both parts keep
# the same label proportions, and seeded so it can be reproduced.
split_options = {
    'test_size': 0.2,
    'stratify': y,
    'random_state': 42,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Text vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the training texts only; the test texts are transformed with the already
# fitted vectorizer, never fitted on.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

### Train models

#### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the TF-IDF training matrix, with the solver allowed up to
# 1000 iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X=X_train_vec, y=y_train)

#### Multinomial Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, fitted on the TF-IDF training matrix.
nb = MultinomialNB()
nb.fit(X=X_train_vec, y=y_train)

In [36]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest: 500 trees with a fixed seed, fitted on the TF-IDF training matrix.
# n_jobs=-1 asks scikit-learn to use every available CPU core for fitting/prediction
# instead of a single one; random_state is kept so the run remains seeded.
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rf.fit(X_train_vec, y_train)

#### XGBoost

In [62]:
from xgboost import XGBClassifier

# Class-imbalance weight for the positive class (label 1): negatives / positives.
# The counts come from the TRAINING labels only, so no information about the
# held-out test rows influences how the model is configured.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()

xgb = XGBClassifier(
    n_estimators=500,
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=n_negative / n_positive,
)

xgb.fit(X_train_vec, y_train)

### Evaluate models

In [64]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, X_test, y_test):
    """Print classification metrics for a fitted model on a held-out set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature matrix handed to ``model.predict``.
    y_test
        True labels the predictions are compared against.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 score and the confusion matrix are
        written to stdout, in that order.
    """
    predictions = model.predict(X_test)

    # (caption, metric function) pairs, listed in the order they are reported.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 score:", f1_score),
    )
    for caption, metric in scalar_metrics:
        print(caption, metric(y_test, predictions))

    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

# Report every fitted model on the same vectorized test set, in training order.
fitted_models = [
    ("Logistic regression", lr),
    ("Multinomial Naive Bayes", nb),
    ("Random Forest", rf),
    ("XGBoost", xgb),
]

for position, (title, classifier) in enumerate(fitted_models):
    if position > 0:
        # Separator between consecutive reports (none before the first or after the last).
        print("\n")
    print(title)
    evaluate(classifier, X_test_vec, y_test)

Logistic regression
Accuracy: 0.9614003590664273
Precision: 0.9908256880733946
Recall: 0.72
F1 score: 0.833976833976834
Confusion Matrix:
 [[963   1]
 [ 42 108]]


Multinomial Naive Bayes
Accuracy: 0.9560143626570916
Precision: 1.0
Recall: 0.6733333333333333
F1 score: 0.8047808764940239
Confusion Matrix:
 [[964   0]
 [ 49 101]]


Random Forest
Accuracy: 0.9631956912028725
Precision: 0.9823008849557522
Recall: 0.74
F1 score: 0.844106463878327
Confusion Matrix:
 [[962   2]
 [ 39 111]]


XGBoost
Accuracy: 0.9694793536804309
Precision: 0.953125
Recall: 0.8133333333333334
F1 score: 0.8776978417266187
Confusion Matrix:
 [[958   6]
 [ 28 122]]


### Save the best model

In [66]:
import joblib

# Persist the XGBoost classifier together with the TF-IDF vectorizer it was trained
# on: the model was fitted on this vectorizer's output, so both are needed to score
# new text.
xgb_model_path = '/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/models/xgboost_sms.pkl'
vectorizer_path = '/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/models/tfidf_vectorizer.pkl'

joblib.dump(xgb, xgb_model_path)
joblib.dump(vectorizer, vectorizer_path)

['/Users/adriendavidson/Documents/Davidson/sms-spam-nlp/models/tfidf_vectorizer.pkl']