## Imports

In [6]:
import pandas as pd
import math
from tqdm import tqdm
import string

## Reading the Dataset

In [7]:
# Load the tab-separated SMS corpus. The file is read without a header row,
# so the two columns are named here: the label ("type") and the message ("text").
# NOTE(review): read_csv's default quote handling may merge rows that contain
# double-quote characters - confirm the row count against the corpus documentation.
dataSet = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["type", "text"],
    index_col=False,
)

In [8]:
# Display the loaded frame to sanity-check the parsed label/text columns.
dataSet

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Splitting Dataset

In [9]:
# Share of the rows (taken in file order, from the top) that is used for training.
TRAIN_FRACTION = 0.7

# len() counts every row. The previous `dataSet.count()[0]` counted only the
# non-null cells of one column and relied on a positional integer lookup into a
# Series indexed by column labels, which newer pandas versions deprecate.
total = len(dataSet)
numTraining = math.floor(total * TRAIN_FRACTION)

In [10]:
# Sequential (unshuffled) split: the first numTraining rows train the model,
# the remaining rows are held out for validation.
trainingSet = dataSet.iloc[:numTraining]
validationSet = dataSet.iloc[numTraining:]

In [11]:
# Display the training slice (rows before index numTraining).
trainingSet

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
3895,spam,Dear Dave this is your final notice to collect...
3896,ham,Yes. Last practice
3897,spam,tells u 2 call 09066358152 to claim £5000 priz...
3898,ham,No. Thank you. You've been wonderful


In [12]:
# Display the held-out validation slice (rows from index numTraining onward).
validationSet

Unnamed: 0,type,text
3900,ham,Ü mean it's confirmed... I tot they juz say on...
3901,ham,Okie
3902,ham,That depends. How would you like to be treated...
3903,ham,"Right on brah, see you later"
3904,ham,Waiting in e car 4 my mum lor. U leh? Reach ho...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Converting the Training Set to Dictionaries and Tokenizing

In [13]:
# Translation table that deletes every ASCII punctuation character.
punctuationTable = str.maketrans('', '', string.punctuation)

# One dict per training row; "text" becomes the list of whitespace-separated
# tokens left after punctuation is stripped (case is preserved at this stage).
dicSet = [
    {**record, "text": record["text"].translate(punctuationTable).split()}
    for record in trainingSet.to_dict(orient="records")
]

In [14]:
# Display the tokenized training records (one dict per message; the output is long).
dicSet

[{'type': 'ham',
  'text': ['Go',
   'until',
   'jurong',
   'point',
   'crazy',
   'Available',
   'only',
   'in',
   'bugis',
   'n',
   'great',
   'world',
   'la',
   'e',
   'buffet',
   'Cine',
   'there',
   'got',
   'amore',
   'wat']},
 {'type': 'ham', 'text': ['Ok', 'lar', 'Joking', 'wif', 'u', 'oni']},
 {'type': 'spam',
  'text': ['Free',
   'entry',
   'in',
   '2',
   'a',
   'wkly',
   'comp',
   'to',
   'win',
   'FA',
   'Cup',
   'final',
   'tkts',
   '21st',
   'May',
   '2005',
   'Text',
   'FA',
   'to',
   '87121',
   'to',
   'receive',
   'entry',
   'questionstd',
   'txt',
   'rateTCs',
   'apply',
   '08452810075over18s']},
 {'type': 'ham',
  'text': ['U',
   'dun',
   'say',
   'so',
   'early',
   'hor',
   'U',
   'c',
   'already',
   'then',
   'say']},
 {'type': 'ham',
  'text': ['Nah',
   'I',
   'dont',
   'think',
   'he',
   'goes',
   'to',
   'usf',
   'he',
   'lives',
   'around',
   'here',
   'though']},
 {'type': 'spam',
  'text': ['Fr

In [15]:
# Spam dictionary: lower-cased token -> number of occurrences across all
# training messages labelled "spam".
trainingDicSpam = {}
for message in dicSet:
    if message["type"] != "spam":
        continue
    for token in message["text"]:
        word = token.lower()
        trainingDicSpam[word] = trainingDicSpam.get(word, 0) + 1

In [16]:
# Not-spam dictionary: lower-cased token -> number of occurrences across all
# training messages labelled "ham".
trainingDicNotSpam = {}
for message in dicSet:
    if message["type"] != "ham":
        continue
    for token in message["text"]:
        word = token.lower()
        trainingDicNotSpam[word] = trainingDicNotSpam.get(word, 0) + 1

In [17]:
# Display the spam token counts (lower-cased token -> occurrences).
trainingDicSpam

{'free': 151,
 'entry': 21,
 'in': 54,
 '2': 128,
 'a': 259,
 'wkly': 9,
 'comp': 7,
 'to': 498,
 'win': 46,
 'fa': 4,
 'cup': 5,
 'final': 14,
 'tkts': 4,
 '21st': 2,
 'may': 6,
 '2005': 3,
 'text': 83,
 '87121': 3,
 'receive': 21,
 'questionstd': 2,
 'txt': 116,
 'ratetcs': 2,
 'apply': 20,
 '08452810075over18s': 2,
 'freemsg': 8,
 'hey': 5,
 'there': 9,
 'darling': 2,
 'its': 7,
 'been': 28,
 '3': 11,
 'weeks': 8,
 'now': 138,
 'and': 100,
 'no': 41,
 'word': 17,
 'back': 19,
 'id': 1,
 'like': 9,
 'some': 5,
 'fun': 7,
 'you': 196,
 'up': 17,
 'for': 132,
 'it': 20,
 'still': 6,
 'tb': 1,
 'ok': 4,
 'xxx': 11,
 'std': 6,
 'chgs': 1,
 'send': 49,
 '£150': 20,
 'rcv': 2,
 'winner': 13,
 'as': 23,
 'valued': 5,
 'network': 16,
 'customer': 32,
 'have': 93,
 'selected': 16,
 'receivea': 1,
 '£900': 5,
 'prize': 65,
 'reward': 2,
 'claim': 80,
 'call': 228,
 '09061701461': 1,
 'code': 17,
 'kl341': 1,
 'valid': 16,
 '12': 5,
 'hours': 2,
 'only': 55,
 'had': 8,
 'your': 170,
 'mobile': 

In [18]:
# Display the ham token counts (lower-cased token -> occurrences).
trainingDicNotSpam

{'go': 164,
 'until': 16,
 'jurong': 1,
 'point': 11,
 'crazy': 8,
 'available': 9,
 'only': 86,
 'in': 567,
 'bugis': 6,
 'n': 96,
 'great': 73,
 'world': 24,
 'la': 5,
 'e': 58,
 'buffet': 2,
 'cine': 7,
 'there': 136,
 'got': 163,
 'amore': 1,
 'wat': 69,
 'ok': 201,
 'lar': 27,
 'joking': 2,
 'wif': 15,
 'u': 664,
 'oni': 2,
 'dun': 37,
 'say': 64,
 'so': 275,
 'early': 28,
 'hor': 1,
 'c': 46,
 'already': 65,
 'then': 159,
 'nah': 9,
 'i': 1526,
 'dont': 193,
 'think': 91,
 'he': 129,
 'goes': 21,
 'to': 1086,
 'usf': 6,
 'lives': 3,
 'around': 44,
 'here': 80,
 'though': 17,
 'even': 39,
 'my': 534,
 'brother': 9,
 'is': 523,
 'not': 288,
 'like': 163,
 'speak': 18,
 'with': 195,
 'me': 525,
 'they': 81,
 'treat': 12,
 'aids': 1,
 'patent': 1,
 'as': 114,
 'per': 12,
 'your': 289,
 'request': 6,
 'melle': 6,
 'oru': 4,
 'minnaminunginte': 3,
 'nurungu': 3,
 'vettam': 3,
 'has': 60,
 'been': 64,
 'set': 13,
 'callertune': 10,
 'for': 365,
 'all': 176,
 'callers': 5,
 'press': 7,
 

## Calculating Probability

In [39]:
# Class prior P(spam): the share of training messages labelled "spam".
numSpamNotSpam = dict(trainingSet["type"].value_counts())
pSpam = numSpamNotSpam["spam"] / trainingSet.shape[0]
pSpam

0.13307692307692306

In [40]:
# Class prior P(not spam): the share of training messages labelled "ham".
numSpamNotSpam = dict(trainingSet["type"].value_counts())
pNotSpam = numSpamNotSpam["ham"] / trainingSet.shape[0]
pNotSpam

0.8669230769230769

In [41]:
# Total number of token occurrences seen in spam training messages.
totalSpam = sum(trainingDicSpam.values())

In [42]:
# Total number of token occurrences seen in ham training messages.
totalNotSpam = sum(trainingDicNotSpam.values())

In [43]:
def pTextSpam(tokens):
    """Return the smoothed likelihood of a token list under the spam class.

    Args:
        tokens: list of string tokens; each is lower-cased before lookup.

    Returns:
        The product of pTokenSpamNotSpam(...) over all tokens, using the
        spam counts in trainingDicSpam. Tokens never seen in spam contribute
        a count of 0. An empty token list yields 1.
    """
    spamTokenTotal = sum(trainingDicSpam.values())
    messageLength = len(tokens)
    likelihood = 1
    for token in tokens:
        seenCount = trainingDicSpam.get(token.lower(), 0)
        likelihood *= pTokenSpamNotSpam(seenCount, spamTokenTotal, messageLength)
    return likelihood

In [44]:
def pTextNotSpam(tokens):
    """Return the smoothed likelihood of a token list under the not-spam class.

    Args:
        tokens: list of string tokens; each is lower-cased before lookup.

    Returns:
        The product of pTokenSpamNotSpam(...) over all tokens, using the
        ham counts in trainingDicNotSpam. Tokens never seen in ham contribute
        a count of 0. An empty token list yields 1.
    """
    hamTokenTotal = sum(trainingDicNotSpam.values())
    messageLength = len(tokens)
    likelihood = 1
    for token in tokens:
        seenCount = trainingDicNotSpam.get(token.lower(), 0)
        likelihood *= pTokenSpamNotSpam(seenCount, hamTokenTotal, messageLength)
    return likelihood

In [45]:
def pTokenSpamNotSpam(number, total, length):
    """Return the add-one smoothed probability of a single token.

    Args:
        number: how many times the token was counted for the class.
        total: sum of all token counts for the class.
        length: value added to the denominator (callers in this notebook
            pass the number of tokens in the message being scored).

    Returns:
        (number + 1) / (total + length)
    """
    smoothedCount = number + 1
    smoothedTotal = total + length
    return smoothedCount / smoothedTotal

## Evaluation

In [47]:
def _logClassScore(tokens, prior, counts):
    """Return log(prior) plus the summed log of each token's smoothed probability.

    Args:
        tokens: list of string tokens; each is lower-cased before lookup.
        prior: class prior probability (must be > 0).
        counts: dict mapping lower-cased token -> occurrences for the class.
    """
    classTotal = sum(counts.values())
    score = math.log(prior)
    for token in tokens:
        # Same smoothing as pTextSpam / pTextNotSpam, so the decision only
        # differs where the plain product would have underflowed.
        # NOTE(review): textbook Laplace smoothing adds the vocabulary size to
        # the denominator rather than the message length - confirm intent.
        score += math.log(
            pTokenSpamNotSpam(counts.get(token.lower(), 0), classTotal, len(tokens))
        )
    return score


def evaluate(text):
    """Classify a raw message as "spam" or "not spam" with naive Bayes.

    Punctuation is stripped and the text is split on whitespace. The two class
    scores are compared in log space: multiplying many small per-token
    probabilities underflows to 0.0 for long messages, which previously made
    both scores equal and forced the answer to "not spam".

    Args:
        text: the raw message string.

    Returns:
        "spam" if the spam score is strictly greater, otherwise "not spam"
        (ties, including an empty message with equal priors, go to "not spam").
    """
    tokens = text.translate(str.maketrans('', '', string.punctuation)).split()
    spamScore = _logClassScore(tokens, pSpam, trainingDicSpam)
    notSpamScore = _logClassScore(tokens, pNotSpam, trainingDicNotSpam)
    if spamScore > notSpamScore:
        return "spam"
    return "not spam"

In [48]:
# Smoke test: classify a message consisting of the single token "1000".
evaluate("1000")

'spam'

## Validate the Classifier

In [49]:
# Score the classifier on the held-out rows and report accuracy / failure rate.
valSet = validationSet.to_dict(orient="records")
total = len(valSet)
T = 0  # correctly classified messages
F = 0  # misclassified messages
for record in tqdm(valSet):
    # Translate the dataset label into evaluate()'s vocabulary. The variable is
    # named `expected` rather than `type` so the builtin type() is not shadowed
    # for every later cell in the kernel.
    if record["type"].lower() == "ham":
        expected = "not spam"
    else:
        expected = "spam"
    if evaluate(record["text"]) == expected:
        T += 1
    else:
        F += 1
accuracy = T/total*100
failing = F/total*100
print(f"accuracy = {accuracy}%\nfailing rate = {failing}%")

100%|█████████████████████████████████████| 1672/1672 [00:00<00:00, 2040.84it/s]

accuracy = 94.79665071770334%
failing rate = 5.203349282296651%



