# Exercise 6 | Spam Classification with SVMs

In [None]:
%matplotlib inline

import re
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from nltk import tokenize, stem
from scipy.io import loadmat
from scipy.optimize import minimize
from sklearn import preprocessing
from sklearn import svm

## Part 1: Email Preprocessing

To use an SVM to classify emails into Spam vs. Non-Spam, you first
need to convert each email into a vector of features. In this part,
you will implement the preprocessing steps for each email. You should
complete the code in processEmail.m (`process_email` in this notebook)
to produce a word indices vector for a given email.

In [None]:
def get_vocab_list(path='vocab.txt'):
    """Load the vocabulary file into a dict mapping index -> word.

    Every non-blank line of the file must consist of an integer index and
    a word separated by a single tab character. Blank lines are skipped.

    Args:
        path: Location of the vocabulary file. Defaults to 'vocab.txt',
            resolved against the current working directory.

    Returns:
        dict mapping each integer index to its word. If an index occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or its first field is not an integer.
    """
    vocab_list = {}
    with open(path) as f:
        # Stream the file line by line instead of materialising it.
        for line in f:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            index, word = entry.split('\t')
            vocab_list[int(index)] = word
    return vocab_list

def process_email(file_contents):
    """Normalize raw email text and map its words to vocabulary indices.

    The text is lower-cased, HTML tags are replaced by a space, and runs
    of digits, URLs, email addresses and dollar signs are replaced by the
    placeholder words 'number', 'httpaddr', 'emailaddr' and 'dollar'.
    The result is tokenized with NLTK, stemmed with the Porter stemmer
    and trimmed of surrounding punctuation. The normalized text is
    printed as a side effect.

    Args:
        file_contents: The email as a single string.

    Returns:
        list of vocabulary keys (as stored in the dict returned by
        get_vocab_list()), one per token that is found in the vocabulary,
        in order of appearance; repeated words yield repeated keys.
    """
    # Characters trimmed from both ends of every token.
    # NOTE(review): the apostrophe is not part of this set, which matches
    # the previous behaviour -- confirm whether it should be stripped too.
    punctuation = ' @$/#.-:&*+=[]?!(){},">_<;%'

    vocab_list = get_vocab_list()
    rev_vocab_list = {word: index for index, word in vocab_list.items()}

    # preprocess email

    # lower case
    email_contents = file_contents.lower()

    # strip all HTML
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # handle numbers
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)

    # handle URLs
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # handle email addresses
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # handle $ sign
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # tokenize, stem, then get rid of punctuation
    stemmer = stem.PorterStemmer()
    strs = []
    for token in tokenize.word_tokenize(email_contents):
        # Strip first and only then test for emptiness, so that tokens made
        # up entirely of punctuation (e.g. '...') do not survive as ''.
        word = stemmer.stem(token).strip(punctuation)
        if word:
            strs.append(word)

    print(f'''
Processed Email
===============
{' '.join(strs)}
===============''')
    return [rev_vocab_list[w] for w in strs if w in rev_vocab_list]


In [None]:
# Preprocessing sample email (emailSample1.txt)

# Read the whole sample message into memory as one string.
with open('emailSample1.txt') as sample_file:
    file_contents = sample_file.read()

# Map the message to vocabulary indices (this also prints the normalized
# text), then show the resulting index list.
word_indices = process_email(file_contents)
print(f'''
Word Indices:

{word_indices}''')

## Part 2: Feature Extraction

Now, you will convert each email into a vector of features in $\mathbb{R}^n$. You
should complete the code in emailFeatures.m (`email_features` in this
notebook) to produce a feature vector for a given email.

In [None]:
def email_features(word_indices, vocab_size):
    """Build the binary bag-of-words feature vector for one email.

    Args:
        word_indices: Sequence of 1-based vocabulary indices, i.e. the
            vocabulary keys returned by process_email. Repeats are
            allowed; an empty sequence yields an all-zero vector.
        vocab_size: Number of words in the vocabulary.

    Returns:
        numpy array of shape (vocab_size, 1). Row ``i - 1`` is 1.0 when
        vocabulary index ``i`` occurs in word_indices and 0.0 otherwise.

    Raises:
        IndexError: If any index lies outside ``1..vocab_size``.
    """
    x = np.zeros((vocab_size, 1))

    # Vocabulary keys start at 1 while the feature columns are 0-based
    # (the top-predictors code looks words up with ``idx + 1``), so shift
    # by one. dtype=int keeps an empty input usable as an index array.
    rows = np.asarray(word_indices, dtype=int).ravel() - 1
    if rows.size and (rows.min() < 0 or rows.max() >= vocab_size):
        # Without this check index 0 would silently wrap to the last row.
        raise IndexError(
            f'word indices must lie in 1..{vocab_size}')
    x[rows] = 1

    return x

In [None]:
# Extracting features from sample email (emailSample1.txt)

# Load the vocabulary once; its length fixes the feature-vector size.
vocab_list = get_vocab_list()
features = email_features(word_indices, len(vocab_list))

# Entries are 0.0 or 1.0, so their sum is the number of distinct
# vocabulary words present in the email.
print(f'''Length of feature vector: {len(features)}
Number of non-zero entries: {int(np.sum(features))}''')

## Part 3: Train Linear SVM for Spam Classification

In this section, you will train a linear classifier to determine if an
email is Spam or Not-Spam.

In [None]:
# load the spam email dataset
mat = loadmat('spamTrain.mat')
X, y = mat['X'], mat['y']

# training Linear SVM (Spam Classification)
C = 0.1
model = svm.LinearSVC(C=C)
model.fit(X, y.ravel())

# Percentage of training examples whose predicted label matches y.
p = model.predict(X)
print(f'Training Accuracy: {np.mean(p == y.ravel()) * 100:.4}')

## Part 4: Test Spam Classification

After training the classifier, we can evaluate it on a test set. We
have included a test set in spamTest.mat.

In [None]:
# load the test dataset
mat = loadmat('spamTest.mat')
Xtest, ytest = mat['Xtest'], mat['ytest']

# evaluate the trained Linear SVM on a test set:
# percentage of test examples whose predicted label matches ytest
p = model.predict(Xtest)
print(f'''Test Accuracy: {np.mean(p == ytest.ravel()) * 100:.4}''')

## Part 5: Top Predictors of Spam

Since the model we are training is a linear SVM, we can inspect the
weights learned by the model to understand better how it is
determining whether an email is spam or not. The following code finds
the words with the highest weights in the classifier. Informally, the
classifier 'thinks' that these words are the most likely indicators of
spam.

In [None]:
# One learned weight per feature column, flattened to 1-D.
weights = model.coef_.ravel()
# sort the weights in descending order
idx = np.argsort(weights)[::-1]

print('Top predictors of spam:')
# Slicing (rather than a fixed range) also copes with fewer than 15 weights.
for col in idx[:15]:
    # Feature columns are 0-based; vocabulary keys start at 1.
    print(f'{vocab_list[col + 1]:15}{weights[col]:.5}')

## Part 6: Try Your Own Emails

Now that you've trained the spam classifier, you can use it on your
own emails! In the starter code, we have included spamSample1.txt,
spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
The following code reads in one of these emails and then uses your
learned SVM classifier to determine whether the email is Spam or Not
Spam.

In [None]:
filename = 'spamSample1.txt'

# Read the chosen message into memory as one string.
with open(filename) as email_file:
    file_contents = email_file.read()

# Same pipeline as for the sample email: indices -> feature vector.
word_indices = process_email(file_contents)
x = email_features(word_indices, len(vocab_list))

# The classifier expects one row per example, so turn the column
# vector into a single-row matrix before predicting.
p = model.predict(x.reshape(1, -1))

print(f'''
Processed {filename}
Spam Classification: {p}
(1 indicates spam, 0 indicates not spam)
''')