# Ex6c - Spam classification with SVM
Here we take the email feature extraction steps developed in the previous notebook and wrap them in a single function that does the work for us.

In [1]:
import re
from nltk.stem import PorterStemmer
import numpy as np

# We'll use loadmat to load the MATLAB dataset
from scipy.io import loadmat

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Email Features Extraction Function

In [2]:
# Read the sample email and collect its distinct whitespace-separated tokens.
# `dict.fromkeys` removes duplicates while keeping the order of first
# appearance (no sorting is applied), and the `with` block guarantees the file
# is closed even if reading fails part-way through.
fname = 'emailSample1.txt'
with open(fname) as fh:
    words = list(dict.fromkeys(word for line in fh for word in line.split()))

In [3]:
def eFeatExtract(words, vocabFile='vocab.txt'):
    """Convert a list of raw email tokens into a binary vocabulary feature vector.

    Each token is lower-cased, normalised with a fixed sequence of regex
    substitutions, stripped of non-alphanumeric characters, stemmed with the
    Porter stemmer and finally looked up in the vocabulary file.

    Parameters
    ----------
    words : iterable of str
        Raw tokens of the email (e.g. the result of splitting on whitespace).
    vocabFile : str, optional
        Path of the vocabulary file. On every line the first whitespace-separated
        field is skipped and the remaining fields are taken as vocabulary words.
        Defaults to 'vocab.txt'.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one entry per distinct vocabulary word (in order
        of first appearance in the file); entry ``i`` is 1 if vocabulary word
        ``i`` occurs among the processed tokens and 0 otherwise.
    """
    # Email Preprocess
    ##################
    # Ordered (pattern, replacement) pairs applied to every lower-cased token.
    # Raw strings keep `\s` from being treated as a string escape sequence.
    substitutions = [
        (r'<[^<>]+>', ' '),                      # Stripping HTML
        (r'[0-9]+', 'number'),                   # Handle numbers
        (r'(http|https)://[^\s]*', 'httpaddr'),  # Handle URLs
        (r'[^\s]+@[^\s]+', 'emailaddr'),         # Handle Email Addresses
        (r'[$]+', 'dollar'),                     # Handle dollar signs
        (r'[^a-zA-Z0-9]', ''),                   # Remove any non alphanumeric characters
    ]

    # Email stemming
    ###########################
    ps = PorterStemmer()
    tokens = []
    for word in words:
        word = word.lower()
        for pattern, replacement in substitutions:
            word = re.sub(pattern, replacement, word)
        # Tokens that end up empty after cleaning are discarded before stemming.
        if word:
            tokens.append(ps.stem(word))

    # VocabList
    #######################
    # Map each vocabulary word to its position; duplicates keep the index of
    # their first occurrence.
    vocabIndex = {}
    with open(vocabFile) as fh:
        for line in fh:
            for word in line.split()[1:]:
                vocabIndex.setdefault(word, len(vocabIndex))

    featVector = np.zeros(len(vocabIndex), dtype=int)
    for token in tokens:
        idx = vocabIndex.get(token)
        # Compare against None explicitly: index 0 is a valid vocabulary
        # position and must not be discarded as a falsy value.
        if idx is not None:
            featVector[idx] = 1

    return featVector


In [4]:
# Build the feature vector for the tokens read from the sample email and print it.
print(eFeatExtract(words))

[0 0 0 ... 0 0 0]


## Training SVM for Spam Classification

Loading the Spam Email dataset. <code>spamTrain.mat</code> contains 4000 examples of spam and non-spam emails, while <code>spamTest.mat</code> contains 1000 examples.

In [5]:
# Training set: loadmat returns a dictionary keyed by the MATLAB variable names.
data = loadmat('spamTrain.mat')
# Feature matrix as stored; labels flattened to a 1-D array.
X, y = data['X'], data['y'].flatten()

# Test set, loaded and unpacked the same way.
data = loadmat('spamTest.mat')
Xtest, ytest = data['Xtest'], data['ytest'].flatten()


In [13]:
# A wider search (linear and RBF kernels over several C / gamma values) is kept
# here for reference; it is commented out and only the single setting defined
# below is actually evaluated.
# param_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# param_grid = [{'C' : param_range,
#               'kernel': ['linear']},
#              {'C' : param_range,
#               'gamma': param_range,
#               'kernel': ['rbf']}]

param_range = [0.1]
param_grid = [dict(C=param_range, kernel=['linear'])]

# 10-fold cross-validated accuracy; refit=True retrains the best configuration
# on all of X, y once the search is done.
gs = GridSearchCV(
    SVC(random_state=1),
    param_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
)

gs.fit(X, y)

GridSearchCV(cv=10, estimator=SVC(random_state=1), n_jobs=-1,
             param_grid=[{'C': [0.1], 'kernel': ['linear']}],
             scoring='accuracy')

In [14]:
# Show the best parameter combination found by the grid search together with
# its mean cross-validated score (accuracy, per the `scoring` setting).
gs.best_params_, gs.best_score_

({'C': 0.1, 'kernel': 'linear'}, 0.9770000000000001)

In [15]:
# Score the refit best estimator on the held-out test set (Xtest, ytest).
print('Test accuracy: %.3f' %gs.score(Xtest, ytest))

Test accuracy: 0.989
