In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
data = pd.read_csv('spam.csv')

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.isna().sum()

Category    0
Message     0
dtype: int64

In [11]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [13]:
data['spam'] = data['Category'].map({'ham': 0, 'spam': 1})

In [14]:
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [19]:
data = data.drop(['Category'], axis=1)

In [20]:
data.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [21]:
cv = CountVectorizer()

In [22]:
naive_bayes_model = MultinomialNB()

In [23]:
operations = [('cv', cv), ('naive_bayes_model', naive_bayes_model)]

In [24]:
pipe = Pipeline(operations)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(data.Message, data.spam, test_size=0.2)

In [26]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('naive_bayes_model', MultinomialNB())])

In [28]:
model_predictions = pipe.predict(X_test)

In [29]:
confusion_matrix(model_predictions, y_test)

array([[973,  10],
       [  0, 132]], dtype=int64)

In [30]:
print(classification_report(model_predictions, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       983
           1       0.93      1.00      0.96       132

    accuracy                           0.99      1115
   macro avg       0.96      0.99      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [32]:
pipe.score(X_test,y_test)

0.9910313901345291