### SPAM FILTERING USING NAIVE BAYES

##### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


##### Importing data

In [2]:
df = pd.read_csv('spam.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

##### Inspecting Data

In [6]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


##### Turning the Spam/Ham data into numerical values

In [7]:
df['Spam'] = df['Category'].apply(lambda x : 1 if x == 'spam' else 0)

In [8]:
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


#### Splitting Data into Train and Test

In [9]:
X = df['Message']
y = df['Spam']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

#### Finding Word Count and storing as matrix

In [17]:
cv =CountVectorizer()
X_train_count = cv.fit_transform(X_train.values)
X_test_count = cv.transform(X_test)

In [18]:
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##### Training Model

In [19]:
model = MultinomialNB()

In [20]:
model.fit(X_train_count, y_train)

#### Testing Model

In [21]:
Emaiil_Ham = ["Hey Going to Uni today?"]
Email_Ham_count = cv.transform(Emaiil_Ham)
model.predict(Email_Ham_count)

array([0])

In [22]:
Email_Spam = ['Reward Reward Click on the Link Now']
email_spam_count = cv.transform(Email_Spam)
model.predict(email_spam_count)

array([1])

In [23]:
y_pred = model.predict(X_test_count)
from sklearn.metrics import r2_score

In [24]:
r2_score(y_test, y_pred)

0.9302805452499062