In [54]:
import os
import numpy as np

from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix

# 1. Preparing the text data

In [55]:
# a) Removal of stop words – Stop words like “and”, “the”, “of”, etc. are very common in all English sentences
#                            and are not very meaningful in deciding spam or legitimate status, so these words
#                            have been removed from the emails.
# b) Lemmatization – It is the process of grouping together the different inflected forms of a word so they can be
#                    analysed as a single item. For example, “include”, “includes”, and “included” would all be
#                    represented as “include”. Lemmatization takes the context of the word in the sentence into
#                    account, as opposed to stemming (another common text-mining technique, which does not consider
#                    the meaning of the sentence).
# c) Remove non-words – Removal of the non-words like punctuation marks or special characters from the mail documents.

# a) and b) are already done in the Ling-spam corpus. c) is done in the next step, while creating the dictionary.

# 2. Creating word dictionary 

In [56]:
def make_dictionary(train_dir):
    """Build the vocabulary of the most frequent words in the training mails.

    Every entry directly inside ``train_dir`` is opened as a text file and
    only its third line (index 2) is split on whitespace and counted. Tokens
    that are not purely alphabetic, or that are a single character long, are
    then discarded.

    Parameters
    ----------
    train_dir : str
        Directory containing one text file per mail.

    Returns
    -------
    list of (str, int)
        At most 3000 ``(word, count)`` pairs, most frequent first.
    """
    # Sorted so the traversal order (and therefore tie-breaking between
    # equally frequent words) does not depend on the file system.
    emails = [os.path.join(train_dir, f) for f in sorted(os.listdir(train_dir))]

    dictionary = Counter()
    for mail in emails:
        with open(mail) as m:
            for i, line in enumerate(m):
                if i == 2:  # Body of email is only 3rd line of text file
                    dictionary.update(line.split())
                    break  # nothing after the body line is used

    # Remove the non-words like punctuation marks or special characters from the mail documents.
    # Iterate over a snapshot of the keys: deleting entries while iterating the
    # live key view raises "dictionary changed size during iteration" on Python 3.
    for item in list(dictionary):
        if not item.isalpha() or len(item) == 1:
            del dictionary[item]

    return dictionary.most_common(3000)

# Feature extraction 

In [49]:
def extract_features(mail_dir, vocabulary=None):
    """Turn every mail in ``mail_dir`` into a row of word counts.

    Files are processed in sorted name order, one matrix row per file. Only
    the third line (index 2) of each file is read. For every vocabulary word
    that occurs on that line, the column with the word's position in the
    vocabulary receives the number of occurrences.

    Parameters
    ----------
    mail_dir : str
        Directory containing one text file per mail.
    vocabulary : sequence of (str, int), optional
        ``(word, count)`` pairs whose order defines the column index of each
        word. Defaults to the notebook-level ``dictionary``. Words are assumed
        to be unique and the sequence to hold at most 3000 entries.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_files, 3000)``; files with fewer than
        three lines yield an all-zero row.
    """
    if vocabulary is None:
        # Fall back to the vocabulary defined at notebook level.
        vocabulary = dictionary

    # Build the word -> column lookup once instead of scanning the whole
    # vocabulary for every token of every mail.
    word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}

    files = [os.path.join(mail_dir, fi) for fi in sorted(os.listdir(mail_dir))]
    features_matrix = np.zeros((len(files), 3000))

    for doc_id, path in enumerate(files):
        with open(path) as handle:
            for line_no, line in enumerate(handle):
                if line_no == 2:  # body of the mail is the 3rd line
                    for word, count in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = count
                    break  # nothing after the body line is used
    return features_matrix

# Training classifiers

In [52]:
# Paths to train and test mails
train_dir = '../data/train-mails'
test_dir = '../data/test-mails'

# Create a dictionary of words with its frequency
dictionary = make_dictionary(train_dir)

# Prepare feature vectors per training mail and its labels.
# Labels: 0 = non-spam, 1 = spam. The rows of the feature matrix follow the
# sorted file listing; this assumes the first 351 files are non-spam and the
# remaining ones spam -- TODO confirm against the data directory.
train_labels = np.zeros(702)
# Open-ended slice so the last mail (index 701) is labelled too; the previous
# [351:701] slice stopped one short and left it marked as non-spam.
train_labels[351:] = 1
train_matrix = extract_features(train_dir)

# Training SVM and Naive bayes classifier
model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(train_matrix,train_labels)
model2.fit(train_matrix,train_labels)

# Test the unseen mails for Spam
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)

In [53]:
# Rows are the true labels, columns the predicted labels.
# result1 comes from model1 and result2 from model2.
# Parenthesised print works on both Python 2 and Python 3.
print(confusion_matrix(test_labels, result1))
print(confusion_matrix(test_labels, result2))

[[129   1]
 [  9 121]]
[[126   4]
 [  6 124]]


In [58]:
# Build the word-count matrix for the training mails (one row per file) and
# display its dimensions as a sanity check.
feature_matrix = extract_features(train_dir)
feature_matrix.shape

(702, 3000)

In [63]:
# Labels for the 702 training mails: 0 = non-spam, 1 = spam.
train_labels = np.zeros(702)
# Open-ended slice so the final mail (index 701) is labelled as well; the
# previous [351:701] slice stopped one element short.
train_labels[351:] = 1
train_labels

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [64]:
# Recompute the training feature matrix and display its dimensions.
# NOTE(review): the recorded output of this cell shows 703 rows, while
# train_labels is created with 702 entries -- possibly an extra file was
# present in train_dir when this cell ran; verify before fitting on it.
train_matrix = extract_features(train_dir)
train_matrix.shape

(703, 3000)

In [71]:
# Fit a linear SVM on the training features.
# NOTE(review): this rebinds model1, which an earlier cell assigns to a
# MultinomialNB instance; later uses of model1 depend on execution order.
model1 = LinearSVC()

model1.fit(feature_matrix, train_labels)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [67]:
# Word-count features for the held-out mails, plus their ground-truth labels:
# 130 zeros (non-spam) followed by 130 ones (spam).
test_matrix = extract_features(test_dir)
test_labels = np.concatenate((np.zeros(130), np.ones(130)))

In [72]:
# Predict on the held-out mails with the model currently bound to model1 and
# show the confusion matrix (rows = true labels, columns = predictions).
result1 = model1.predict(test_matrix)
# Parenthesised print works on both Python 2 and Python 3.
print(confusion_matrix(test_labels, result1))

[[126   4]
 [  6 124]]


In [73]:
# Labels -- non-spam = 0, spam = 1
# Layout of the confusion matrix (rows = truth, columns = prediction):
#   cm[0, 0] -- prediction = non spam, truth = non spam
#   cm[0, 1] -- prediction = spam, truth = non spam
#   cm[1, 0] -- prediction = non spam, truth = spam
#   cm[1, 1] -- prediction = spam, truth = spam
# Accuracy = correctly classified mails (the diagonal) / all mails. It is
# computed from the matrix of the current predictions rather than from
# hand-copied counts, so it cannot drift out of sync with the model in use.
cm = confusion_matrix(test_labels, result1)
acc = float(np.trace(cm)) / float(cm.sum())
acc

0.9615384615384616