# Chapter 2: Your first practical NLP application, spam filtering

Read in spam and ham file lists:

In [1]:
import os
import codecs

def read_in(folder):
    files = os.listdir(folder)
    a_list = []
    for a_file in files:
        if not a_file.startswith("."):
            f = codecs.open(folder + a_file, "r", encoding = "ISO-8859-1", errors="ignore")
            a_list.append(f.read())
            f.close()
    return a_list

Initialise lists and print out length – should be 100:

In [2]:
spam_list = read_in("enron1/spam/")
print(len(spam_list))
print(spam_list[0])
ham_list = read_in("enron1/ham/")
print(len(ham_list))
print(ham_list[0])

1500
Subject: dobmeos with hgh my energy level has gone up! Stukm
Introducing
Doctor - formulated
Hgh
Human growth hormone - also called hgh
Is referred to in medical science as the master hormone. It is very plentiful
When we are young, but near the age of twenty - one our bodies begin to produce
Less of it. By the time we are forty nearly everyone is deficient in hgh,
And at eighty our production has normally diminished at least 90 - 95%.
Advantages of hgh:
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
Read
More at this website
Unsubscribe

36

Combine all emails together, keeping the label, and shuffle them: 

In [8]:
import random

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
random.seed(42)
random.shuffle(all_emails)
print (f"Dataset size = {str(len(all_emails))} emails")

Dataset size = 5172 emails


Preprocess the texts by tokenising them and removing the stopwords:


In [9]:
import nltk
from nltk import word_tokenize

def get_features(text): 
    features = {}
    word_list = [word for word in word_tokenize(text.lower())]
    for word in word_list:
        features[word] = True
    return features

all_features = [(get_features(email), label) for (email, label) in all_emails]

print(get_features("Participate In Our New Lottery NOW!"))
print(len(all_features))
print(len(all_features[0][0]))
print(len(all_features[99][0]))

{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}
5172
27
53


Apply Naive Bayes classifier:

In [11]:
from nltk import NaiveBayesClassifier, classify

def train(features, proportion):
    train_size = int(len(features) * proportion)
    # initialise the training and test sets
    train_set, test_set = features[:train_size], features[train_size:]
    print (f"Training set size = {str(len(train_set))} emails")
    print (f"Test set size = {str(len(test_set))} emails")
    # train the classifier
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier

train_set, test_set, classifier = train(all_features, 0.8)

Training set size = 4137 emails
Test set size = 1035 emails


Evaluate the performance:

In [14]:
def evaluate(train_set, test_set, classifier):
    # check how the classifier performs on the training and test sets
    print (f"Accuracy on the training set = {str(classify.accuracy(classifier, train_set))}")
    print (f"Accuracy on the test set = {str(classify.accuracy(classifier, test_set))}")    
    # check which words are most informative for the classifier
    classifier.show_most_informative_features(50)

evaluate(train_set, test_set, classifier)

Accuracy on the training set = 0.9613246313753928
Accuracy of the test set = 0.9420289855072463
Most Informative Features
               forwarded = True              ham : spam   =    198.3 : 1.0
                    2004 = True             spam : ham    =    143.8 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
            prescription = True             spam : ham    =    122.9 : 1.0
                    pain = True             spam : ham    =     98.8 : 1.0
                  health = True             spam : ham    =     82.7 : 1.0
                     ect = True              ham : spam   =     76.8 : 1.0
                    2001 = True              ham : spam   =     75.8 : 1.0
                featured = True             spam : ham    =     74.7 : 1.0
              nomination = True              ham : spam   =     72.1 : 1.0
             medications = True             spam : ham    =     69.9 : 1.0
                  differ = True             spam : ha

Explore the contexts of use:

In [15]:
from nltk.text import Text

def concordance(data_list, search_word):
    for email in data_list:
        word_list = [word for word in word_tokenize(email.lower())]
        text_list = Text(word_list)
        if search_word in word_list:
            text_list.concordance(search_word)


print ("STOCKS in HAM:")
concordance(ham_list, "stocks")
print ("\n\nSTOCKS in SPAM:")
concordance(spam_list, "stocks")

STOCKS in HAM:
Displaying 1 of 1 matches:
ad my portfolio is diversified into stocks that have lost even more money than
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files
Displaying 1 of 1 matches:
ur member directory . * follow your stocks and news headlines , exchange files


STOCKS in SPAM:
Displaying 2 of 2 matches:
ims and do your own due diligence . stocks to play ( s 2 p ) profiles are not 
s obtained . investing in micro cap stocks is extremely risky and , investors 
Displaying 1 of 1 matches:
cautions that small and micro - cap stocks are high - risk investments and tha
Displaying 1 of 1 matches:
s obtained . investing in micro cap stocks is extremely risky and , investors 
Displaying 3 of 3 matches:
ancements but may be one of the few stocks left in this industry group that is
his email pertaining to investing , stoc

is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this report . none 
Displaying 2 of 2 matches:
his email pertaining to investing , stocks , securities must be understood as 
ntative before deciding to trade in stocks featured within this email . none o
Displaying 1 of 1 matches:
scovering value in natural resource stocks elgin resources ( elr - tsx ) extra
Displaying 2 of 2 matches:
 the last 12 months , many of these stocks made tripie and even quadruple retu
one trade tuesday ! go mogi . penny stocks are considered highly speculative a
Displaying 5 of 5 matches:
hursday ! some of these littie voip stocks have been realiy moving lateiy . an
t can happen with these sma | | cap stocks when they take off . and it happens
 statements . as with many microcap stocks , today ' s company has additiona |
is report pertaining to investing , stocks , securities must be understood as 
ntative before deciding

Input some of your own messages:

In [16]:
test_spam_list = ["Participate in our new lottery!", "Try out this new medicine"]
test_ham_list = ["See the minutes from the last meeting attached", 
                 "Investors are coming to our office on Monday"]

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]

new_test_set = [(get_features(email), label) for (email, label) in test_emails]

evaluate(train_set, new_test_set, classifier)

Accuracy on the training set = 0.9613246313753928
Accuracy of the test set = 1.0
Most Informative Features
               forwarded = True              ham : spam   =    198.3 : 1.0
                    2004 = True             spam : ham    =    143.8 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
            prescription = True             spam : ham    =    122.9 : 1.0
                    pain = True             spam : ham    =     98.8 : 1.0
                  health = True             spam : ham    =     82.7 : 1.0
                     ect = True              ham : spam   =     76.8 : 1.0
                    2001 = True              ham : spam   =     75.8 : 1.0
                featured = True             spam : ham    =     74.7 : 1.0
              nomination = True              ham : spam   =     72.1 : 1.0
             medications = True             spam : ham    =     69.9 : 1.0
                  differ = True             spam : ham    =     66.7

See how they get classified:

In [17]:
for email in test_spam_list:
    print (email)
    print (classifier.classify(get_features(email)))
for email in test_ham_list:
    print (email)
    print (classifier.classify(get_features(email)))

Participate in our new lottery!
spam
Try out this new medicine
spam
See the minutes from the last meeting attached
ham
Investors are coming to our office on Monday
ham


Run in an interactive manner:

In [18]:
while True:
    email = input("Type in your email here (or press 'Enter'): ")
    if len(email)==0:
        break
    else: 
        prediction = classifier.classify(get_features(email))
        print (f"This email is likely {prediction}\n")

Type in your email here (or press 'Enter'): Buy new meds
This email is likely spam

Type in your email here (or press 'Enter'): Buy new meds here!
This email is likely spam

Type in your email here (or press 'Enter'): Get you stock options fast
This email is likely spam

Type in your email here (or press 'Enter'): Let's schedule a meeting for tomorrow
This email is likely ham

Type in your email here (or press 'Enter'): 


Run on a different dataset:

# Assignment:

Apply the classifier to a different test set, e.g. the emails from `enron2/`. As before, you need to read in the data, extract textual content, extract the features and evaluate the classifier. What do the results tell you?

In [19]:
test_spam_list = read_in("enron2/spam/")
print(len(test_spam_list))
print(test_spam_list[0])
test_ham_list = read_in("enron2/ham/")
print(len(test_ham_list))
print(test_ham_list[0])

test_emails = [(email_content, "spam") for email_content in test_spam_list]
test_emails += [(email_content, "ham") for email_content in test_ham_list]
random.shuffle(test_emails)

new_test_set = [(get_features(email), label) for (email, label) in test_emails]

evaluate(train_set, new_test_set, classifier)

1496
Subject: fw: this is the solution I mentioned lsc
Oo
Thank you,
Your email address was obtained from a purchased list,
Reference #2020 mid = 3300. If you wish to unsubscribe
From this list, please click here and enter
Your name into the remove box. If you have previously unsubscribed
And are still receiving this message, you may email our abuse
Control center, or call 1 - 888 - 763 - 2497, or write us at: nospam,
6484 coral way, miami, fl, 33155". ÃÂÃÂ© 2002
Web credit inc. All rights reserved.
4361
Subject: re: rankings
Thank you.
Accuracy on the training set = 0.9613246313753928
Accuracy of the test set = 0.7604575721359057
Most Informative Features
               forwarded = True              ham : spam   =    198.3 : 1.0
                    2004 = True             spam : ham    =    143.8 : 1.0
                     nom = True              ham : spam   =    125.8 : 1.0
            prescription = True             spam : ham    =    122.9 : 1.0
                    pain = True 

Combine the two datasets:

In [20]:
spam_list = read_in("enron1/spam/") + read_in("enron2/spam/")
print(len(spam_list))
ham_list = read_in("enron1/ham/") + read_in("enron2/ham/")
print(len(ham_list))

all_emails = [(email_content, "spam") for email_content in spam_list]
all_emails += [(email_content, "ham") for email_content in ham_list]
random.shuffle(test_emails)

all_features = [(get_features(email), label) for (email, label) in all_emails]
print(len(all_features))

train_set, test_set, classifier = train(all_features, 0.8)
evaluate(train_set, new_test_set, classifier)

2996
8033
11029
Training set size = 8823 emails
Test set size = 2206 emails
Accuracy on the training set = 0.9822055990026068
Accuracy of the test set = 0.9735359399009732
Most Informative Features
                   meter = True              ham : spam   =    264.5 : 1.0
                    2004 = True             spam : ham    =    245.7 : 1.0
                   vince = True              ham : spam   =    200.0 : 1.0
                     nom = True              ham : spam   =    195.9 : 1.0
                     sex = True             spam : ham    =    195.1 : 1.0
                     ect = True              ham : spam   =    174.0 : 1.0
            prescription = True             spam : ham    =    169.2 : 1.0
                    spam = True             spam : ham    =    145.8 : 1.0
                     fyi = True              ham : spam   =    135.2 : 1.0
               forwarded = True              ham : spam   =    134.0 : 1.0
              nomination = True              ham : s