In [3]:
import os
import codecs

def read_in(folder):
    files = os.listdir(folder)
    a_list = []
    for a_file in files:
        if not a_file.startswith("."):
            f = codecs.open(folder + a_file, "r", encoding = "ISO-8859-1", errors="ignore")
            a_list.append(f.read())
            f.close()
    return a_list

In [5]:
spam_list = read_in("enron1/spam/")
ham_list = read_in("enron1/ham/")
print(len(spam_list))
print(len(ham_list))
print(spam_list[0])
print(ham_list[0])

1500
3672
Subject: hot jobs
Global marketing specialties po box 300 east amherst, ny 14051 this e - mail message is an advertisement and/or solicitation.

Subject: timeline calendars
Per the management offsite meeting, please provide me with your monthly
Calendars showing your vital business days, close, bid - week, etc. If
Necessary to capture all those days please provide me with the months of
January and february.
Please work with your assistants to get this accomplished before next monday.
For those of you without, please give me a call and we can set a time to sit
Down and I will handle your calendars.
Thank you!
Yvette
X 3. 5953


In [6]:
import random
  
all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
random.seed(42)
random.shuffle(all_emails)
print ("Dataset size = " + str(len(all_emails)) + " emails")

Dataset size = 5172 emails


In [8]:
import nltk
from nltk import word_tokenize

def tokenize(input):
    word_list = []
    for word in word_tokenize(input):
        word_list.append(word)
    return word_list

input = "What's the best way to split a sentence into words?"
print(tokenize(input))

['What', "'s", 'the', 'best', 'way', 'to', 'split', 'a', 'sentence', 'into', 'words', '?']


In [10]:
def get_features(text):
    features = {}
    word_list = [word for word in word_tokenize(text.lower())]
    for word in word_list:
        features[word] = True
    return features

all_features = [(get_features(email), label) for (email, label) in all_emails]

print(get_features("Participate In Our New Lottery NOW!"))
print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[99][0]))
print(all_features[0])

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172
32
315
({'subject': True, ':': True, 'for': True, 'all': True, 'your': True, 'transportation': True, 'needs': True, 'sky': True, 'limotoronto': True, 'ontario': True, 'canadaskylimo': True, '@': True, 'copper': True, '.': True, 'netmarch': True, 'special': True, '10': True, '%': True, 'discount': True, '(': True, 'cash': True, 'only': True, ')': True, 'do': True, 'not': True, 'drink': True, 'drivecall': True, 'the': True, 'skyfor': True, '416': True, '979': True, '5466': True}, 'spam')


In [11]:
from nltk import NaiveBayesClassifier, classify
  
def train(features, proportion):
    train_size = int(len(features) * proportion)
    train_set, test_set = features[:train_size], features[train_size:]
    print ("Training set size = " + str(len(train_set)) + " emails")
    print ("Test set size = " + str(len(test_set)) + " emails")
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
  
train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 4137 emails
Test set size = 1035 emails


In [12]:
def evaluate(train_set, test_set, classifier):
    print ("Accuracy on the training set = " +  str(classify.accuracy(classifier, train_set)))
    print ("Accuracy of the test set = " + str(classify.accuracy(classifier, test_set)))
    classifier.show_most_informative_features(50)
  
evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9586656997824511
Accuracy of the test set = 0.9391304347826087
Most Informative Features
               forwarded = True              ham : spam   =    196.0 : 1.0
                    2004 = True             spam : ham    =    145.4 : 1.0
                     nom = True              ham : spam   =    123.5 : 1.0
            prescription = True             spam : ham    =    119.7 : 1.0
                     ect = True              ham : spam   =    107.2 : 1.0
                    pain = True             spam : ham    =    106.8 : 1.0
                   cheap = True             spam : ham    =     89.1 : 1.0
                    spam = True             spam : ham    =     81.1 : 1.0
                    2005 = True             spam : ham    =     79.5 : 1.0
                  differ = True             spam : ham    =     76.3 : 1.0
                   adobe = True             spam : ham    =     73.1 : 1.0
              nomination = True              ham : sp

In [13]:
from nltk.text import Text
  
def concordance(data_list, search_word):
    for email in data_list:
        word_list = [word for word in word_tokenize(email.lower())]
        text_list = Text(word_list)
        if search_word in word_list:
            text_list.concordance(search_word)


print ("STOCKS in HAM:")
concordance(ham_list, "stocks")
print ("\n\nSTOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files


STOCKS in SPAM:
Displaying 5 of 5 matches:
ck monday some of these littie voip stocks have been really moving lately . an
t can happen with these sma | | cap stocks when they take off . and it happens
 statements . as with many microcap stocks , today ' s company has additiona |
is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this report . none 
Displaying 1 of 1 matches:
 one trade monday ! go wysk . penny stocks are considered highiy specuiative a
Displaying 6 of

Displaying 3 of 3 matches:
might occur . as with many microcap stocks , today ' s company has additiona |
is emai | pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 3 of 3 matches:
5 how many times have you seen good stocks but you couldn ' t get your hands o
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 4 of 4 matches:
k tuesday some of these littie voip stocks have been reaily moving lateiy . an
 statements . as with many microcap stocks , today ' s company has additional 
is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this report . none 
Displaying 1 of 1 matches:
or information puposes only . penny stocks are considered highly speculative a
Displaying 2 of 2 match