In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# ML ---> Classification or Regression

In [3]:
sms = pd.read_csv('SMSSpamCollection', sep = "\t", 
                  header= None, names = ['Label', 'SMS'])

In [4]:
# OR
# sms.rename({0;'Label', 1:'SMS'}, axis=1)

In [5]:
sms.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
sms.shape

(5572, 2)

In [7]:
sms['Label'].value_counts(normalize=True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [8]:
shuffle_sms = sms.sample(frac = 1, random_state = 1)

In [9]:
training = shuffle_sms[:4457]
testing = shuffle_sms[4457:]

In [10]:
training.shape

(4457, 2)

In [11]:
training.reset_index(inplace = True, drop=True)
testing.reset_index(inplace = True, drop=True)

In [12]:
p_ham = training['Label'].value_counts(normalize = True)[0]
p_spam = training['Label'].value_counts(normalize = True)[1]

In [13]:
#Naive Bayes
# P(Spam_message|newmsg) ∝ P(spam)*P(new_msg|spam) 

# P(Spamc|newmsg) ∝ P(spamc)*P(new_msg|spamc) 

In [14]:
training.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [15]:
training['SMS'] = training['SMS'].str.replace('\W', ' ').str.lower().str.split()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training['SMS'] = training['SMS'].str.replace('\W', ' ').str.lower().str.split()


In [16]:
vocabulary = []
for lst in training['SMS']:
#     print(lst)
    for word in lst:
#         print(word)
        vocabulary.append(word)

In [17]:
vocabulary = list(set(vocabulary))

In [18]:
len(vocabulary)

7782

In [19]:
training

Unnamed: 0,Label,SMS
0,ham,"[yep, by, the, pretty, sculpture]"
1,ham,"[yes, princess, are, you, going, to, make, me,..."
2,ham,"[welp, apparently, he, retired]"
3,ham,[havent]
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,..."
...,...,...
4452,ham,"[how, about, clothes, jewelry, and, trips]"
4453,ham,"[sorry, i, ll, call, later, in, meeting, any, ..."
4454,ham,"[babe, i, fucking, love, you, too, you, know, ..."
4455,spam,"[u, ve, been, selected, to, stay, in, 1, of, 2..."


In [20]:
len(training)

4457

In [21]:
dictionary = {}

for i in vocabulary:
    dictionary[i] = [0]*len(training)

In [22]:
# pd.DataFrame(dictionary)

In [23]:
training['SMS'].head()

0                    [yep, by, the, pretty, sculpture]
1    [yes, princess, are, you, going, to, make, me,...
2                      [welp, apparently, he, retired]
3                                             [havent]
4    [i, forgot, 2, ask, ü, all, smth, there, s, a,...
Name: SMS, dtype: object

In [24]:
for index,lst in enumerate(training['SMS']):
    for word in lst:
        dictionary[word][index] +=1

In [25]:
df = pd.DataFrame(dictionary)

In [26]:
df['yep']

0       1
1       0
2       0
3       0
4       0
       ..
4452    0
4453    0
4454    0
4455    0
4456    0
Name: yep, Length: 4457, dtype: int64

In [27]:
df.loc[0, ['yep', 'by', 'the', 'pretty', 'sculpture'] ]

yep          1
by           1
the          1
pretty       1
sculpture    1
Name: 0, dtype: int64

nword + alpha/(nspam + alpha*num_vocab)

In [28]:
# df['Label'] = training['Label']

In [29]:
training_set = pd.concat([training['Label'], df], axis = 1)

In [30]:
training_set[training_set['Label'] == 'spam']

Unnamed: 0,Label,6230,amt,truth,email,faith,lark,wine,stomach,max,...,films,listn,phone750,mth,since,ref,frying,wins,mtmsgrcvd18,itwhichturnedinto
16,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4437,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4439,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4443,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4449,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
n_spam = training_set[training_set['Label'] == 'spam'].sum(axis = 1).sum()

In [32]:
n_ham = training_set[training_set['Label'] == 'ham'].sum(axis = 1).sum()

P(Spam|New_msg) = P(Spam)*(word + alpha/(nspam + alpha*num_vocab))

P(Hpam|New_msg) = P(Hpam)*(word + alpha/(nham + alpha*num_vocab))

In [33]:
spam = training_set[training_set['Label'] == 'spam']
ham = training_set[training_set['Label'] == 'ham']

In [34]:
spam_parameter = {i:0 for i in vocabulary}
ham_parameter = {i:0 for i in vocabulary}

In [35]:
for i in vocabulary:
    nword = spam[i].sum()
    p_word_given_spam = (nword + 1) / ((n_spam) + 1*len(vocabulary))
    spam_parameter[i] = p_word_given_spam

In [36]:
for i in vocabulary:
    nword = ham[i].sum()
    p_word_given_ham = (nword + 1) / ((n_ham) + 1*len(vocabulary))
    ham_parameter[i] = p_word_given_ham

In [37]:
#Naive Bayes
# P(Spam_message|newmsg) ∝ P(spam)*P(w1|spam) * P(spam)*P(w2|spam) ............... P(spam)*P(wn|spam) 

# P(ham|newmsg) ∝ P(ham)*P(nw1|ham)*P(ham)*P(nw2|ham) .........................P(ham)*P(wn|ham) 

In [38]:
import re
def classify(new_message):
    
    msg = re.sub(r'\W', ' ', new_message).lower().split()
    p_spam_given_new_message = p_spam
    p_ham_given_new_message = p_ham
    
    
    
    for word in msg:
        if word in spam_parameter:
        
            p_spam_given_new_message *= spam_parameter[word]
            p_ham_given_new_message *= ham_parameter[word]
        else:
            return 'Human will classify'
    
    if p_spam_given_new_message > p_ham_given_new_message:
        return 'Spam Message'
    elif p_spam_given_new_message <= p_ham_given_new_message:
        return 'Non Spam'
    else:
        return 'Human will classify'

In [39]:
msgg = 'you won money secret secret secret'
classify(msgg)

'Spam Message'

In [40]:
from sklearn.naive_bayes import MultinomialNB

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
vect = CountVectorizer(stop_words='english')

model = MultinomialNB()

inp = vect.fit(training['SMS'])
output = training['Label']

MultinomialNB.fit(inp, out)
model(testing['SMS'])

In [43]:
testing['Prediction'] = testing['SMS'].apply(classify)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['Prediction'] = testing['SMS'].apply(classify)


In [44]:
testing['Prediction'] == testing['Label']

0       False
1       False
2       False
3       False
4       False
        ...  
1110    False
1111    False
1112    False
1113    False
1114    False
Length: 1115, dtype: bool

In [45]:
accuracy = total true / total_value

SyntaxError: invalid syntax (<ipython-input-45-55c87415e9ae>, line 1)