In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import BernoulliNB

In [3]:
# Column names for the message body and its label in the working frame.
text, target = "text", "target"

In [4]:
# Load the raw SMS spam dataset, keeping only the two columns we need.
# - `usecols` stops pandas from parsing any other column in the file, since
#   only v1/v2 survive the final selection anyway.
# - The rename/selection goes through the `text` / `target` constants so the
#   column names are defined in exactly one place.
df = pd.read_csv("spam.csv", encoding="L1", usecols=["v1", "v2"]).rename(
    columns={"v1": target, "v2": text}
)[[text, target]]

In [5]:
# Peek at the first five rows to sanity-check the renamed columns.
df.head(n=5)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [6]:
# Absolute number of messages per label (raw counts, not proportions).
counts = df[target].value_counts(normalize=False)

In [7]:
# Display the per-label message counts.
counts

ham     4825
spam     747
Name: target, dtype: int64

In [8]:
# Share of "ham" among ham + spam messages: the accuracy a classifier would
# get by always predicting "ham", i.e. the baseline to beat.
counts["ham"] / counts[["spam", "ham"]].sum()

0.8659368269921034

In [9]:
# Feature (raw message text) and label arrays for scikit-learn.
# Columns are addressed through the `text` / `target` constants, matching how
# `counts` is computed, and `.to_numpy()` is the accessor pandas recommends
# over `.values`.
X = df[text].to_numpy()
y = df[target].to_numpy()

In [10]:
# Hold-out split. `stratify=y` keeps the ham/spam ratio the same in both
# parts; the fixed `random_state` makes the split reproducible. No test_size
# is passed, so scikit-learn's default is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=345,
)

In [11]:
# Text -> binary word-presence features -> Bernoulli naive Bayes.
# `binary=True` makes the vectorizer emit 0/1 indicators instead of counts;
# `alpha` is the smoothing parameter, set to a very small value here.
bernoulli_nb_pipeline = make_pipeline(
    CountVectorizer(binary=True),
    BernoulliNB(alpha=1e-6),
)

In [12]:
# Fit on the training split only, then score the held-out messages.
# `fit` returns the pipeline itself, so the prediction can be chained.
y_test_pred = bernoulli_nb_pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       0.99      0.90      0.94       187

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [13]:
# Refit the same pipeline object on the full dataset (train + test).
# NOTE: this overwrites the fit from the evaluation cell in place, so the
# classification report above describes the earlier train-only fit.
bernoulli_nb_pipeline.fit(X, y=y)

Pipeline(steps=[('countvectorizer', CountVectorizer(binary=True)),
                ('bernoullinb', BernoulliNB(alpha=1e-06))])

In [14]:
# Persist the fitted pipeline (vectorizer + classifier) to disk.
joblib.dump(value=bernoulli_nb_pipeline, filename="clf.joblib")

['clf.joblib']