In [153]:
%%html
<style>
.h1_cell, .just_text {
    box-sizing: border-box;
    padding-top:5px;
    padding-bottom:5px;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    padding: 10px;
    border-style: groove;
}

hr { 
    display: block;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: auto;
    margin-right: auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<h1>
<center>
Spam email Classifier using NLTK and Bayes Net
</center>

<h2>
Summary
</h2>
<div class=h1_cell>
    <p>I will get spam emails table from Kaggle and make a data model in json that will be used for spam classifier.</p>
    <p><strong>Part 1, Data Loading: </strong>I first load the data using pandas</p>
    <p><strong>Part 2, Sentence Wrangler: </strong>Then, I made a sentence_wrangler which classify useful words and useless words from each table using tokenizer. I removed the special characters using regular expression</p>
    <p><strong>Part 3, bag of words: </strong>Make a bad of words using sentence_wrangler.</p>
    <p><strong>Part 4, class counts: </strong>Prepare the class counts that will be used for the Naive Bayes.</p>
    <p><strong>Part 5, Naive Bayes: </strong>Make the Naive Bayes function that will give spam and ham probabilities of each email.</p>
    <p><strong>Part 6, Run and Accuracy: </strong>Run Naive Bayes for the whole table and get predictions for each email.</p>
    <p><strong>Data Source: </strong> https://www.kaggle.com/llabhishekll/fraud-email-dataset.</p>
    
</div>

In [154]:
import os
import pandas as pd

import re
import nltk
import string

from nltk import sent_tokenize
from nltk.tree import Tree
punctuation = string.punctuation

from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()          #instantiate class

from nltk.tokenize import TreebankWordTokenizer
treeb_tokenizer = TreebankWordTokenizer()

nltk.download('punkt')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package punkt to /Users/edward/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/edward/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

<h1>
1. Data Loading
</h1>

In [155]:
spam_table = pd.read_csv("fraud_email_.csv")

In [156]:
spam_table.head()

Unnamed: 0,Text,Class
0,Supply Quality China's EXCLUSIVE dimensions at...,1
1,over. SidLet me know. Thx.,0
2,"Dear Friend,Greetings to you.I wish to accost ...",1
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1
4,Not a surprising assessment from Embassy.,0


In [157]:
len(spam_table)

11929

In [158]:
spam_table['spam'] = spam_table.apply(lambda row: 'ham' if row['Class']==0 else 'spam', axis=1 )

In [159]:
spam_table.head()

Unnamed: 0,Text,Class,spam
0,Supply Quality China's EXCLUSIVE dimensions at...,1,spam
1,over. SidLet me know. Thx.,0,ham
2,"Dear Friend,Greetings to you.I wish to accost ...",1,spam
3,MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....,1,spam
4,Not a surprising assessment from Embassy.,0,ham


<h1>
2. Sentence Wrangling
</h1>

In [160]:
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()          #instantiate class

from nltk.tokenize import TreebankWordTokenizer
treeb_tokenizer = TreebankWordTokenizer()            #instantiate class

In [161]:
from nltk.corpus import stopwords  # see more at http://xpo6.com/list-of-english-stop-words/
swords = stopwords.words('english')

In [164]:
import string
punctuation = string.punctuation

In [165]:
def containsAnyPunc(st, punctuation):
    ##help function to filter string containing punctuation
    return 1 in [c in st for c in punctuation]

In [300]:
def sentence_wrangler(sentence, swords, punctuation):
    """
    return list of lists
    first list is containing meaningful or useful words and 
    the second one is from stop words or non-words (special characters).
    """
    ans = [[],[]]
    word_tokes = word_punct_tokenizer.tokenize(str(sentence).lower())
    for unit in word_tokes:             ##unit small part of word like here, 's, my, people.
        if containsAnyPunc(unit, punctuation) or unit in swords:
            ans[1].append(unit)
        elif re.findall(r'[\W]', unit):   ##remove special charaters
            ans[1].append(unit)
        else:
            ans[0].append(unit)
    return ans

In [301]:
test_sentence = 'Here\'s is my whitelist - re pattern would be better. Extra credit if you do it'

In [302]:
sentence_wrangler(test_sentence, swords, punctuation)

[['whitelist', 'pattern', 'would', 'better', 'extra', 'credit'],
 ['here', "'", 's', 'is', 'my', '-', 're', 'be', '.', 'if', 'you', 'do', 'it']]

<h1>
3. Bag of Words
</h1>

In [303]:
def all_words(table):
    words_dictionary = dict()
    empty = []
    for i in range(len(table)):
        sentence = str(table.loc[i, 'Text'])
        spam_class = table.loc[i,'Class']
        li = sentence_wrangler(sentence, swords, punctuation)
        for word in li[0]:
            if word in words_dictionary:
                words_dictionary[word][spam_class] +=1
            else:
                words_dictionary[word] = [0,0]
                words_dictionary[word][spam_class] +=1
    return words_dictionary

In [304]:
bag_of_words = all_words(spam_table)

In [305]:
len(bag_of_words)  #unique words

109450

In [306]:
first10pairs = {k: bag_of_words[k] for k in list(bag_of_words)[:10]}
first10pairs

{'supply': [14, 126],
 'quality': [21, 32],
 'china': [337, 123],
 'exclusive': [14, 12],
 'dimensions': [4, 2],
 'unbeatable': [0, 12],
 'price': [12, 77],
 'dear': [40, 1883],
 'sir': [20, 1185],
 'pleased': [28, 63]}

<h1>
4. Class count
</h1>

In [307]:
class_counts = dict()

In [308]:
ham_count, spam_count = spam_table.groupby('Class').size()
class_counts['class_count'] = ham_count, spam_count
class_counts['spam_count'] = len(spam_table)
class_counts['class_prob'] = ham_count / class_counts['spam_count'], spam_count / class_counts['spam_count']
# class_counts['naked_count'] = spam_table.groupby(['Class'])

In [309]:
# useful_counts['naked_count'] = tweet_table.groupby(['label', 'hash_count']).size()[0][0],\
# tweet_table.groupby(['label', 'hash_count']).size()[1][0]

<h1>
5. Bayes Net
</h1>

In [310]:
def naive_bayes(text, count_dictionary, patc, bag_of_words, swords, punctuation):
    ans_list = []
    wrangle_li = sentence_wrangler(text, swords, punctuation)
    word_list = list(word for word in wrangle_li[0] if not re.findall(patc, word))
    for i in range(2):
        case_i = count_dictionary['class_prob'][i]
        num = 1
        for word in word_list:
            if word not in bag_of_words:
                num *=1
            else:
                num *= bag_of_words[word][i] / count_dictionary['class_count'][i]
        ans_list.append(num* case_i)
    return tuple(ans_list)

In [311]:
patc = r'^[0-9]'

In [200]:
patc = r'^[0-9\W]*$'

In [312]:
for i in range(5):
    print(naive_bayes(spam_table.loc[i, 'Text'], class_counts, patc, bag_of_words, swords, punctuation))
    print(spam_table.loc[i, 'Class'])

(0.0, 1.7400676946902776e-262)
1
(2.7678419422269e-07, 0.0)
0
(0.0, 1.9102425502583288e-221)
1
(0.0, 4.510351283e-314)
1
(5.8160090278563125e-08, 1.0015286672506927e-07)
0


<h1>
6. Run and Accuracy 
</h1>

In [313]:
import time

In [314]:
start = time.time()

copytable = spam_table
predictions = []
val = 0
for i,row in copytable.iterrows():
    if i%1000 ==0:
        print('did 1000')
    pair = naive_bayes(row['Text'], class_counts, patc, bag_of_words, swords, punctuation)
    predictions.append(pair.index(max(pair)))
    
end = time.time()
print(end - start)  # in seconds

did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
did 1000
18.65878391265869


In [315]:
#build zipped
actuals = spam_table['Class']
print(actuals[:5])
# for widx in range(len(actuals)):
#     actuals[widx] = widx
zipped = list(zip(predictions, actuals))

0    1
1    0
2    1
3    1
4    0
Name: Class, dtype: int64


In [316]:
confusion_dictionary = {(1, 1):0, (1, 0):0, (0, 1):0, (0, 0):0}

In [317]:
for pair in zipped:
    confusion_dictionary[pair] +=1
confusion_dictionary

{(1, 1): 4314, (1, 0): 753, (0, 1): 873, (0, 0): 5989}

In [318]:
accuracy = (1.0*confusion_dictionary[(0,0)]+confusion_dictionary[(1,1)])/len(spam_table)
accuracy

0.8636935199932937

<h1>
Analyzing and save
</h1>

<div class=h1_cell>
    <p>I got 86% accuracy.</p>
    <p>I will save my bad_of_words into JSON to make a spam classifying program in the future.</p>
</div>

In [319]:
import json

with open('bag_of_words.txt', 'w') as file:
    file.write(json.dumps(bag_of_words))

In [320]:
bag2 = json.load(open("bag_of_words.txt"))  # making sure I can read it in again

In [325]:
sorted(bag2.items())[:10]

[('0', [92, 642]),
 ('00', [674, 1266]),
 ('000', [3, 2768]),
 ('0000', [0, 3]),
 ('00000', [0, 4]),
 ('000000', [0, 161]),
 ('00000000000000', [1, 0]),
 ('000000000066', [0, 4]),
 ('000000b2', [0, 1]),
 ('00000e25', [0, 1])]