In [4]:
from scipy.stats import norm, t, ttest_1samp, ttest_ind, normaltest, poisson
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

***
# part ii

In [5]:
df = pd.read_csv('spam.csv', delimiter=',', encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


In [7]:
df = df[['v1', 'v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
len(df[df.v1 == 'ham'])

4825

In [9]:
len(df[df.v1 == 'spam'])

747

In [10]:
747/5572

0.13406317300789664

In [11]:
# we are estimating that the percentage of spam in our inbox is greater than 12.5%.
# alpha = 0.025

p = 0.125
q = 1 - p
n = 5572
p_hat = 0.13406317300789664
se = np.sqrt((p*q)/n)
1 - norm(p, se).cdf(p_hat)

0.020396461702911872

***
# part iii

In [58]:
# 747 is number of spam messages per 4 weeks,
# so 747 / 30 ~ 27 (spam per day)

In [59]:
747/(4*7)

26.678571428571427

In [60]:
# the probability that you will get at least 30 spam messages per day
rv = poisson(26.678571428571427)
1 - rv.cdf(29)

0.2847229977401552

***
# part iv

In [15]:
# human spam detector
# create an interactive program that will randomly show you a message
# you have to determine if it's spam or not
# record your answer to each message
# check how accurate your predictions were
# how many type 1 and type 2 error did you have

***
# part v

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [17]:
counts = count_vect.fit_transform(df.v2)
tfidfs = tfidf_transformer.fit_transform(counts)
counts.shape, '---', tfidfs.shape

((5572, 8672), '---', (5572, 8672))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(tfidfs, df.v1, test_size=0.33, random_state=42)

In [19]:
nb = MultinomialNB().fit(X_train, y_train)
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
predictions = nb.predict(X_test)

In [49]:
true_results = predictions == y_test
true_results[:5]

3245     True
944      True
1044    False
2484     True
812      True
Name: v1, dtype: bool

In [50]:
# baseline accuracy on whole dataset
1 - (747/5572)

0.8659368269921034

In [51]:
# number of type 1 and type 2 errors

In [52]:
from sklearn.metrics import confusion_matrix

In [53]:
tn, fp, fn, tp  = confusion_matrix(y_test, predictions).ravel()

In [54]:
# false positive - type 1
fp

0

In [55]:
# false negative - type 2
fn

85

In [57]:
# accuracy of test set
(tp + tn) / (fp + fn + tp + tn)

0.953779227841218

***
# part vi

In [63]:
all_preds = nb.predict(tfidfs)
all_accuracy = all_preds == df.v1

In [67]:
# accuracy of entire dataset
sum(all_accuracy)/len(df.v1)

0.9603374012921752

In [68]:
tn1, fp1, fn1, tp1  = confusion_matrix(df.v1, all_preds).ravel()

In [69]:
# accuracy of test set
(tp1 + tn1) / (fp1 + fn1 + tp1 + tn1)

0.9603374012921752

In [70]:
# false positive - type 1
fp1

0

In [72]:
# false negative - type 2 (actual spam labeled as NOT spam)
fn1

221

In [77]:
# the 221 spams that were labeled 'not spam'
df[~all_accuracy].head(10)

Unnamed: 0,v1,v2
5,spam,FreeMsg Hey there darling it's been 3 week's n...
15,spam,"XXXMobileMovieClub: To use your credit, click ..."
34,spam,Thanks for your subscription to Ringtone UK yo...
54,spam,SMS. ac Sptv: The New Jersey Devils and the De...
56,spam,Congrats! 1 year special cinema pass for 2 is ...
65,spam,"As a valued customer, I am pleased to advise y..."
68,spam,"Did you hear about the new \Divorce Barbie\""? ..."
95,spam,Your free ringtone is waiting to be collected....
138,spam,You'll not rcv any more msgs from the chat svc...
146,spam,FreeMsg Why haven't you replied to my text? I'...


In [92]:
# all ham messages
sum(df.v1 == 'ham')

4825

In [99]:
4825 + 221 # total messages in inbox
           # went from 747 to 221 spam in inbox

5046

In [100]:
221/5046

0.04379706698374951

In [101]:
# we are estimating that the percentage of spam in our inbox is greater than 12.5%.
# alpha = 0.025

p = 0.125
q = 1 - p
n = 5351
p_hat = 0.04379706698374951
se = np.sqrt((p*q)/n)
# change to a two tail test
norm(p, se).cdf(p_hat) * 2

3.937554869860578e-72

In [102]:
# we still reject the null hypothesis