# Spam mail detection Naive Bayes, CountVectorizer, Sklearn Pipeline


In [44]:
import pandas as pd


In [45]:
df = pd.read_csv("https://raw.githubusercontent.com/codebasics/py/master/ML/14_naive_bayes/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [47]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [48]:
df.drop_duplicates(inplace = True)

In [49]:
df.shape

(5157, 3)

In [50]:
df.drop(['Category'], axis=1)

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [51]:
x= df.Message.values

In [52]:
y= df.spam.values

In [53]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

**Preprocesing CountVectorizer**

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [64]:
x_train_count = cv.fit_transform(x)
x_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Naive Bayes 

In [56]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train_count,ytrain)

MultinomialNB()

In [57]:
x_test_count = cv.transform(xtest)
model.score(x_test_count, ytest)

0.9864341085271318

In [58]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = cv.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

**Sklearn Pipeline**

In [59]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [60]:
clf.fit(xtrain, ytrain)


Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [61]:
clf.score(xtest,ytest)


0.9864341085271318

In [62]:
clf.predict(emails)


array([0, 1], dtype=int64)

**K-Fold Cross Validation**

In [72]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(model,x_train_count, y, cv=10)
cv_score 

array([0.98643411, 0.98449612, 0.98255814, 0.97674419, 0.97286822,
       0.97093023, 0.97674419, 0.98252427, 0.96699029, 0.98834951])

In [73]:
cv_score.mean()

0.9788639271468351