In [1]:
%pylab inline
from scipy.io import loadmat
from sklearn.svm import SVC
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the first sample e-mail into memory as a single string.
with open('emailSample1.txt') as sample_file:
    email1_contents = sample_file.read()

# Preprocessing Emails

In [3]:
# Load the vocabulary: each row of vocab.txt is parsed as "<index>\t<word>".
# keep_default_na=False stops pandas from converting NA-looking tokens
# (e.g. "null", "nan", "NA") into float NaN, so every entry of `vocab`
# stays a string that can be matched against stems and joined for display.
vocab_df = pd.read_csv('vocab.txt',
                       delimiter='\t',
                       header=None,
                       names=['index', 'word'],
                       keep_default_na=False)
# A word's 0-based position in this list is the id used by the notebook
# (`vocab.index(...)` / `vocab[...]`); the file's own 'index' column is not used.
vocab = vocab_df.word.tolist()

In [4]:
def processEmail(text):
    """Convert a raw e-mail body into a list of vocabulary indices.

    The text is lower-cased, HTML tags are replaced by a space, digit runs
    become ``number``, http(s) URLs become ``httpaddr``, e-mail addresses
    become ``emailaddr`` and runs of ``$`` become ``dollar``. The result is
    tokenized with ``word_tokenize``; purely alphabetic tokens are stemmed
    with ``PorterStemmer`` and looked up in the module-level ``vocab`` list.

    Parameters
    ----------
    text : str
        Raw e-mail contents.

    Returns
    -------
    list of int
        0-based positions in ``vocab`` of the stems that occur in it, in
        order of appearance (repeats kept). Stems not in ``vocab`` are
        dropped.
    """
    # convert to lowercase
    text = text.lower()
    # strip all html tags
    text = re.sub(r'<[^<>]*>', ' ', text)
    # handle numbers
    text = re.sub(r'[0-9]+', 'number', text)
    # handle urls
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)
    # handle email addresses
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)
    # handle $ sign
    text = re.sub(r'\$+', 'dollar', text)
    # tokenize words
    words = word_tokenize(text)
    stemmer = PorterStemmer()
    # keep only purely alphabetic tokens before stemming
    stemmed = [stemmer.stem(w) for w in words if w.isalpha()]
    # Build a word -> position table in one pass so each stem costs a single
    # hash lookup instead of two linear scans of ``vocab``. ``setdefault``
    # keeps the first position of a repeated word, matching ``list.index``.
    positions = {}
    for position, word in enumerate(vocab):
        positions.setdefault(word, position)
    return [positions[s] for s in stemmed if s in positions]

In [5]:
# Map the first sample e-mail to vocabulary indices and show them
# as one space-separated string.
word_indices = processEmail(email1_contents)
' '.join(map(str, word_indices))

'85 915 793 1076 882 369 1698 789 1821 1830 882 430 1170 793 1001 1892 591 1675 237 161 88 687 944 1662 1119 1061 1698 374 1161 478 1892 1509 798 1181 1236 809 1894 1439 1546 1698 1757 1895 687 1675 991 960 1476 70 529 1698 530'

# Extracting Features from Emails

In [6]:
def emailFeatures(indices, n_features=1899):
    """Build a binary indicator vector from a collection of indices.

    Parameters
    ----------
    indices : iterable of int
        Positions to switch on. Repeated positions have no extra effect.
    n_features : int, optional
        Length of the returned vector. Defaults to 1899, the length that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_features`` holding 1.0 at every
        listed position and 0.0 elsewhere.

    Raises
    ------
    IndexError
        If an index lies outside the vector.
    """
    features = np.zeros(n_features)
    for i in indices:
        features[i] = 1
    return features

In [7]:
# Display the indicator vector built from `word_indices`.
emailFeatures(word_indices)

array([0., 0., 0., ..., 0., 0., 0.])

# Training SVM for Spam Classification

In [8]:
# Training split: the 'X' entry of spamTrain.mat is used as the feature
# matrix and the 'y' entry (flattened to 1-D) as the label vector.
spam_train = loadmat('spamTrain.mat')
X_train, y_train = spam_train['X'], spam_train['y'].flatten()

# Test split: same layout, stored under 'Xtest' / 'ytest' in spamTest.mat.
spam_test = loadmat('spamTest.mat')
X_test, y_test = spam_test['Xtest'], spam_test['ytest'].flatten()

In [9]:
# Fit a linear-kernel SVM (C=0.1) on the training split, then display the
# score on the training and test splits as a (train, test) tuple.
clf = SVC(kernel="linear", C=0.1)
clf.fit(X_train, y_train)
train_test_splits = ((X_train, y_train), (X_test, y_test))
tuple(clf.score(features, labels) for features, labels in train_test_splits)

(0.99825, 0.989)

# Top Predictors for Spam

In [10]:
# Rank vocabulary positions by their weight in the first row of the fitted
# coefficients (largest first) and show the 20 top-ranked words.
' '.join(
    vocab[word_idx]
    for word_idx, _weight in sorted(
        enumerate(clf.coef_[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )[:20]
)

'our click remov guarante visit basenumb dollar will price pleas most nbsp lo ga hour al da se want dollarnumb'

# Predict Spam Samples

In [11]:
# Read the four sample messages (two 'emailSample' and two 'spamSample'
# files), each as a single string, in the order they are unpacked below.
sample_texts = []
for sample_path in ('emailSample1.txt', 'emailSample2.txt',
                    'spamSample1.txt', 'spamSample2.txt'):
    with open(sample_path) as f:
        sample_texts.append(f.read())
email1_contents, email2_contents, spam1_contents, spam2_contents = sample_texts

In [12]:
# Turn each sample message into a feature vector, stack the four vectors
# row-wise and display the classifier's prediction for each row.
email1_features, email2_features, spam1_features, spam2_features = (
    emailFeatures(processEmail(contents))
    for contents in (email1_contents, email2_contents,
                     spam1_contents, spam2_contents)
)
sample_matrix = np.vstack([email1_features,
                           email2_features,
                           spam1_features,
                           spam2_features])
clf.predict(sample_matrix)

array([0, 0, 1, 1], dtype=uint8)