# Naive Bayes Email Spam Filter 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled messages from the CSV file in the working directory.
csv_path = 'spam.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summary statistics; the recorded output reports count / unique / top / freq per column.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
# Class-balance check: count how many messages carry each Category label.
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
# (rows, columns) of the frame at this point in the kernel session.
# NOTE(review): the recorded output shows 3 columns, which presumably includes the
# `spam` column created by the following cell (the execution counts are out of
# order) -- confirm with Restart Kernel -> Run All.
df.shape

(5572, 3)

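First we encode the `Category` label as a binary `spam` column (1 for spam, 0 for ham). After splitting the data, we will convert the message text into numerical values using a preprocessing technique called **CountVectorizer**.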

In [10]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# A vectorized comparison is used instead of a per-row Python lambda; the cast
# to int64 makes the column an integer 0/1 target.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [28]:
# Features are the raw message text; the target is the binary spam flag.
X, y = df['Message'], df['spam']

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible across kernel restarts; stratify=y keeps the ham/spam ratio
# the same in both splits, which matters because spam is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and turn each message
# into a sparse row of token counts.
v = CountVectorizer()
X_train_count_vect = v.fit_transform(X_train.values)
# Slice the sparse matrix first and densify only the three preview rows;
# calling .toarray() on the whole matrix would materialize every row as dense.
X_train_count_vect[:3].toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix
# and the matching 0/1 spam labels.
model = MultinomialNB()
model.fit(X=X_train_count_vect, y=y_train)

In [35]:
# Vectorize the held-out messages with the vocabulary already fitted on the
# training split (transform only -- the vectorizer is not refit here).
X_test_count_vect = v.transform(X_test)
# Mean accuracy of the classifier on the held-out split.
model.score(X=X_test_count_vect, y=y_test)

0.9901345291479821

In [36]:
# Two hand-written sample messages used to sanity-check the trained classifier.
casual_msg = "Hey mohan, can we get together to watch footbal game tomorrow?"
promo_msg = "Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!"
emails = [casual_msg, promo_msg]

# Reuse the fitted vectorizer so the samples share the training vocabulary,
# then predict a 0/1 label for each message.
email_count_vect = v.transform(emails)
model.predict(email_count_vect)

array([0, 1])

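### Using an `sklearn` `Pipeline`

A `Pipeline` chains the `CountVectorizer` and `MultinomialNB` steps into a single estimator, so raw text can be passed directly to `fit`, `predict`, and `score`.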

In [37]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into one estimator so raw text can
# be handed straight to fit / predict / score.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [40]:
# Fit the whole pipeline on the raw training text; the vectorizer step is fitted as part of this call.
clf.fit(X_train, y_train)

In [41]:
# Classify the raw sample messages directly; no separate vectorizer call is needed here.
clf.predict(emails)

array([0, 1])

In [42]:
# Score the pipeline on the raw held-out text and labels.
clf.score(X_test, y_test)

0.9901345291479821