In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import TransformerMixin
    
from joblib import dump



In [2]:
# Load the SMS dataset; "L1" is one of Python's codec aliases for Latin-1 (ISO-8859-1).
df = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="L1",
)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ã_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# The message text (column "v2") is the model input and the ham/spam label
# (column "v1") is the target.
# `to_numpy()` always yields a NumPy array, whereas `.values` can hand back a
# pandas extension array depending on the column dtype.
X = df["v2"].to_numpy()
y = df["v1"].to_numpy()

In [4]:
# Hold out part of the data for evaluation; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, random_state=345)
X_train, X_test, y_train, y_test = splits

In [5]:
# Binary bag-of-words (token present / absent rather than raw counts) feeding
# a multinomial naive Bayes classifier.
nb_vectorizer = CountVectorizer(binary=True)
nb_classifier = MultinomialNB()
nb = make_pipeline(nb_vectorizer, nb_classifier)

In [6]:
# Train the naive Bayes pipeline on the training split (trailing ";" hides the estimator repr).
nb.fit(X=X_train, y=y_train);

In [7]:
# Label the held-out messages with the naive Bayes pipeline.
y_pred = nb.predict(X=X_test)

In [8]:
# Per-class precision / recall / F1 for the predictions currently held in `y_pred`.
nb_report = classification_report(y_true=y_test, y_pred=y_pred)
print(nb_report)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1208
        spam       0.99      0.91      0.95       185

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [9]:
# Word-level token counts feeding a logistic regression classifier.
lr_w_vectorizer = CountVectorizer()
lr_w_classifier = LogisticRegression()
lr_w = make_pipeline(lr_w_vectorizer, lr_w_classifier)

In [10]:
# Train the word-level logistic regression on the training split.
lr_w.fit(X=X_train, y=y_train);

In [11]:
# Overwrite `y_pred` with the word-level logistic regression predictions.
y_pred = lr_w.predict(X=X_test)

In [12]:
# Same evaluation as before, now for the word-level logistic regression output in `y_pred`.
lr_w_report = classification_report(y_true=y_test, y_pred=y_pred)
print(lr_w_report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1208
        spam       0.99      0.85      0.92       185

    accuracy                           0.98      1393
   macro avg       0.99      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [13]:
# Character n-grams (3 to 7 characters long), re-weighted with TF-IDF,
# feeding a logistic regression classifier.
char_ngram_counts = CountVectorizer(analyzer="char", ngram_range=(3, 7))
tfidf_weighting = TfidfTransformer()
lr_c_classifier = LogisticRegression()
lr_c = make_pipeline(char_ngram_counts, tfidf_weighting, lr_c_classifier)

In [14]:
# Train the character n-gram logistic regression on the training split.
lr_c.fit(X=X_train, y=y_train);

In [15]:
# Overwrite `y_pred` with the character n-gram model's predictions.
y_pred = lr_c.predict(X=X_test)

In [16]:
# Evaluate the character n-gram model's predictions held in `y_pred`.
lr_c_report = classification_report(y_true=y_test, y_pred=y_pred)
print(lr_c_report)

              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1208
        spam       1.00      0.63      0.77       185

    accuracy                           0.95      1393
   macro avg       0.97      0.82      0.87      1393
weighted avg       0.95      0.95      0.95      1393



In [17]:
# Word-level token counts feeding a random forest.
# The forest is seeded (same value as the train/test split) so that repeated
# runs build the same trees and the reported metrics are reproducible; without
# a seed every Restart & Run All would give slightly different numbers.
rf = make_pipeline(
    CountVectorizer(),
    RandomForestClassifier(random_state=345)
)

In [18]:
# Train the random forest pipeline on the training split.
rf.fit(X=X_train, y=y_train);

In [19]:
# Overwrite `y_pred` with the random forest predictions.
y_pred = rf.predict(X=X_test)

In [20]:
# Evaluate the random forest predictions held in `y_pred`.
rf_report = classification_report(y_true=y_test, y_pred=y_pred)
print(rf_report)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1208
        spam       0.99      0.83      0.90       185

    accuracy                           0.98      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.98      0.98      0.98      1393



In [21]:
# Refit the naive Bayes pipeline on the complete dataset before exporting it.
# NOTE: this refits `nb` in place, so the metrics printed earlier describe the
# train-split fit, not the model as it exists after this cell.
nb.fit(X=X, y=y)

Pipeline(steps=[('countvectorizer', CountVectorizer(binary=True)),
                ('multinomialnb', MultinomialNB())])

In [22]:
# Serialize the fitted naive Bayes pipeline to "clf.joblib" (path is relative
# to the current working directory).
dump(value=nb, filename="clf.joblib")

['clf.joblib']