In [45]:
import nltk
import pandas as pd
from collections import defaultdict
from nltk import FreqDist

In [46]:
dt = pd.read_csv('dt.csv')
dt.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3
0,ham,"Go until jurong point, crazy.. Available only ...",,
1,ham,Ok lar... Joking wif u oni...,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,
3,ham,U dun say so early hor... U c already then say...,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,


In [47]:
dt.drop(labels=['Unnamed: 2','Unnamed: 3'],axis=1,inplace=True)
dt.rename(columns={'v1':'Labels','v2':'Text'},inplace=True)
dt = dt[['Text','Labels']]
dt.head()

Unnamed: 0,Text,Labels
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [48]:
dt = dt.sample(frac=1,random_state=42)
dtTrain = dt.iloc[:int(0.9*(len(dt)))]
dtTest = dt.iloc[int(0.9*(len(dt))):]

In [49]:
posTokens = [] 
negTokens = []
for row in dtTrain.values:
    if row[1] == 'ham':
        posTokens.extend(nltk.wordpunct_tokenize(row[0].lower()))
    else:
        negTokens.extend(nltk.wordpunct_tokenize(row[0].lower()))

In [50]:
pos_word_count = defaultdict(int,FreqDist(posTokens))
neg_word_count = defaultdict(int,FreqDist(negTokens))

In [51]:
corpusSize = len(posTokens) + len(negTokens)
print(f'Corpus has {corpusSize} words')

Corpus has 4578 words


Developing Unigram Model with add one smoothing 

In [52]:
unigramOneSmoothing = defaultdict(lambda: 1/(len(posTokens)+len(pos_word_count)),
                                        {x:(y+1)/(len(posTokens)+len(pos_word_count)) 
                                     for x,y in pos_word_count.items()}) 

unigramOneSmoothingNeg = defaultdict(lambda: 1/(len(negTokens)+len(neg_word_count)),
                                        {x:(y+1)/(len(negTokens)+len(neg_word_count)) 
                                     for x,y in neg_word_count.items()}) 

probHam = len(posTokens)/corpusSize
probSpam = len(negTokens)/corpusSize


pred = []
acc = 0
for row in dtTest.values:
    tokens = nltk.wordpunct_tokenize(row[0])
    prob = probHam/probSpam
    for tok in tokens:
        prob *= unigramOneSmoothing[tok]/unigramOneSmoothingNeg[tok]
    
    if prob >= 1:
        temp = 'ham'
        pred.append('ham')
    else:
        temp = 'spam'
        pred.append('spam')
    if temp == row[1]:
        acc += 1


In [53]:
print(f'Accuracy = {acc/len(dtTest)}')

Accuracy = 0.95


In [54]:
dtTest['Predicted'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [55]:
dtTest

Unnamed: 0,Text,Labels,Predicted
1,Ok lar... Joking wif u oni...,ham,ham
52,K fyi x has a ride early tomorrow morning but ...,ham,ham
149,Valentines Day Special! Win over 澹1000 in our ...,spam,spam
130,URGENT! We are trying to contact you. Last wee...,spam,spam
151,Congratulations ur awarded 500 of CD vouchers ...,spam,spam
103,wow. You're right! I didn't mean to do that. I...,ham,ham
99,Please don't text me anymore. I have nothing e...,ham,ham
116,You are a winner U have been specially selecte...,spam,spam
87,Yes I started to send requests to make it but ...,ham,ham
74,U can call me now...,ham,ham
