In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import nltk, re, string
import scipy.io as sio
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize']  = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

#np.random.seed(1)
%load_ext autoreload
%autoreload 2

In [2]:
def getVocabList(df_path, reverse=False):
    with open(df_path) as f:
        vocab_dict = {}
        for line in f:                    
            (val, key) = line.split()     
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
    return vocab_dict # Returns the dictionary of vocabulary

In [3]:
def processEmail(email):
    '''
    PROCESSEMAIL preprocesses a the body of an email and returns a list of word_indices 
       word_indices = PROCESSEMAIL(email_contents) preprocesses 
       the body of an email and returns a list of indices of the 
       words contained in the email.
    '''
    # ========================== Preprocess Email ===========================
    '''
    #Find the Headers ( \n\n and remove )
    #Uncomment the following lines if you are working with raw emails with the full headers
    hdrstart = strfind(email_contents, ([char(10) char(10)]));
    email_contents = email_contents(hdrstart(1):end);
    '''
    email = email.lower()
    email = re.sub('<[^<>]+>', ' ', email);
    email = re.sub('[0-9]+', 'number', email)
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email); 
    email = re.sub('[$]+', 'dollar', email);
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    table  = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens] # remove punctuation from each word
    tokenList = []
    stemmer   = PorterStemmer()
    for token in tokens:
        token = re.sub('[^a-zA-Z0-9]', '', token)#Remove any non alphanumeric characters
        stemmed = stemmer.stem(token) #Use the Porter stemmer to stem the word
        if not len(token): continue #Throw out empty tokens
        tokenList.append(stemmed) #Store a list of all unique stemmed words
    return tokenList

In [4]:
def wordToIndices(tokenList, vocab_dict):
    word_indices = []
    for token in tokenList:
        if token in vocab_dict:
            value = vocab_dict[token]
            word_indices.append(value)
    return word_indices

In [5]:
def emailFeatures(indexList, vocabList):
    n = len(vocabList)
    x = np.zeros((n,1))
    for idx in indexList:
        x[idx] = 1
    return x

In [6]:
def preprocessing_model(email,df_path_vocab):
    tokenList = processEmail(email)
    vocab_dict = getVocabList(df_path_vocab,reverse=False)
    indexList = wordToIndices(tokenList, vocab_dict)
    x = emailFeatures(indexList,vocab_dict)
    
    print("Length of feature vector is %d" % len(x))
    print("Number of non-zero entries is: %d" % sum(x==1))
    return x

In [7]:
# Training set
df_train = 'data/spamTrain.mat'
tr_data = sio.loadmat(df_train)
X = tr_data['X']
y = tr_data['y'].ravel()

# Test set
df_test = 'data/spamTest.mat'
tes_data = sio.loadmat(df_test)
Xtest = tes_data['Xtest']
ytest = tes_data['ytest'].ravel()

In [8]:
#Find Indices of Positive and Negative Examples
pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])
print('Total number of training emails = ',X.shape[0])
print('Number of training spam emails = ',pos.shape[0])
print('Number of training nonspam emails = ',neg.shape[0])

Total number of training emails =  4000
Number of training spam emails =  1277
Number of training nonspam emails =  2723


In [9]:
C = 0.1  # SVM regularization parameter
svm_clf = LinearSVC(C=C, loss="hinge", random_state=42)
model = svm_clf.fit(X, y)

In [10]:
train_predictions = model.predict(X).reshape(y.shape[0],1)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(train_predictions, y)]  
accuracy = 100. * (sum(map(int, correct)) / len(correct))  
print("\nTraining Accuracy(UNPARTITIONED DATA) = %.2f"%(accuracy)+"%")


Training Accuracy(UNPARTITIONED DATA) = 99.85%


In [11]:
train_predictions = model.predict(Xtest).reshape(ytest.shape[0],1)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(train_predictions, ytest)]  
accuracy = 100. * (sum(map(int, correct)) / len(correct))  
print("\nTraining Accuracy(UNPARTITIONED DATA) = %.2f"%(accuracy)+"%")


Training Accuracy(UNPARTITIONED DATA) = 98.90%


In [12]:
def email_classifier(model,processed_email_fv):
    email_fv   = processed_email_fv.reshape(1,-1) # for a single training example
    prediction = model.predict(email_fv)
    if prediction[0] == 0:
        print("SVM has classified email as NOT A SPAM")
    else:
        print("SVM has classified email as SPAM")

In [13]:
# Loading sample email & vocabulary list for preprocessing
df_path_file = 'data/emailSample2.txt'
df_path_vocab = 'data/vocab.txt'
file = open(df_path_file, 'rt')
email = file.read()
file.close()

In [14]:
vocab_dict_flipped = getVocabList(df_path_vocab,reverse=True)

#Sort indicies from most important to least-important (high to low weight)
sorted_indices = np.argsort( model.coef_, axis=None )[::-1]
print("The 15 most important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[:15] ])
print()
print("The 15 least important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[-15:] ])
print()

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'll', 'natur', 'mortgag', 'futur', 'hot']

The 15 least important words to classify a spam e-mail are:
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']



In [15]:
# Calling the preprocessing model
processed_email = preprocessing_model(email,df_path_vocab)
email_classifier(model,processed_email)

Length of feature vector is 1899
Number of non-zero entries is: 120
SVM has classified email as NOT A SPAM
