# Naive Bayes

In [16]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

In [21]:
keywords = ['click', 'CASH!', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent', 'prize', 'credit']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
    

In [25]:
sms_raw.head(20)

Unnamed: 0,spam,message,click,CASH!,offer,winner,buy,free,cash,urgent,prize,credit,allcaps
0,False,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False,False,False
1,False,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False,False,False
2,True,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,False,False,False,False,False,False,False,False
3,False,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False,False,False
4,False,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False,False,False
5,True,FreeMsg Hey there darling it's been 3 week's n...,False,False,False,False,False,False,False,False,False,False,False
6,False,Even my brother is not like to speak with me. ...,False,False,False,False,False,False,False,False,False,False,False
7,False,As per your request 'Melle Melle (Oru Minnamin...,False,False,False,False,False,False,False,False,False,False,False
8,True,WINNER!! As a valued network customer you have...,False,False,False,False,False,False,False,False,True,False,False
9,True,Had your mobile 11 months or more? U R entitle...,False,False,False,False,False,True,False,False,False,False,False


In [23]:
sms_raw['allcaps'] = sms_raw.message.str.isupper()

In [24]:
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [None]:
sns.heatmap(sms_raw.corr())

In [None]:
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

In [None]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

In [None]:
sms_raw.head()