# Spam Classifier

In [11]:
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as io
import sklearn.svm as svm # For SVM.
import re # For matching regular expressions.
import nltk.tokenize as tkn # For tokenizing.
import nltk.stem.porter as stem # For stemming.
%matplotlib inline

In [7]:
# Glimpse of the data.
print("emailSample1.txt:")
!cat data/emailSample1.txt

emailSample1.txt:
> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com



## Cleaning the data

### Email preprocessing
- Lower-casing: The entire email is converted into lower case.
- Stripping HTML: All HTML tags are removed from the emails. 
- Normalizing URLs: All URLs are replaced with the text “httpaddr”.
- Normalizing Email Addresses: All email addresses are replaced with the text “emailaddr”.
- Normalizing Numbers: All numbers are replaced with the text "number".
- Normalizing Dollars: All dollar signs are replaced with the text “dollar”.

In [12]:
def preprocessor(email):
    # Lower casing
    email = email.lower()
    
    # Stripping HTML tags (<..> .... </..>)
    email = re.sub('<[^<>]+>', ' ', email)
    
    # Normalizing URLs
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    
    # Normalizing Email Adrresses
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    
    # Normalizing Numbers
    email = re.sub('[0-9]+', 'number', email)
    
    # Normalizig Dollars
    email = re.sub('[$]+', 'dollar', email)
    
    return email

### Tokenizing and stemming

The following function takes in a raw email, preprocesses it using the above preprocessor, then tokenizes it, stems each word and returns an ordered list of tokens. 

In [13]:
def email_to_tokens(email):
    # Preprocessing
    email = preprocessor(email)
    
    # Tokenizing
    tokenizer = tkn.RegexpTokenizer('\w+')
    tokens = tokenizer.tokenize(email)
    
    # Stemming
    stemmer = stem.PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    return sorted(set(stemmed_tokens))

## Vocabulary and indexing

In [26]:
vocabulary = {}
with open('data/vocab.txt') as f:
    for line in f:
        val, key = line.split()
        vocabulary[key] = int(val)

Here we convert the tokens in the processed email to the indices corresponding to those in the vocabulary.

In [34]:
def tokens_to_indices(email, vocabulary):
    tokens = email_to_tokens(email)
    indices = [vocabulary[token] for token in tokens if token in vocabulary]
    return indices

In [35]:
email = open('data/emailSample1.txt').read()
print(tokens_to_indices(email, vocabulary))

[71, 86, 89, 162, 181, 238, 370, 375, 431, 479, 530, 531, 592, 688, 790, 794, 799, 810, 883, 916, 945, 961, 992, 1002, 1062, 1077, 1120, 1162, 1171, 1182, 1237, 1364, 1440, 1477, 1510, 1547, 1663, 1676, 1699, 1758, 1822, 1831, 1893, 1895, 1896]


## Feature vector creation
The feature vectors show if a word in the email is present in the vocabulary. The function below will create a feature vector from a list of indices, if the $i$-th word from the vocabulary appears in the email then the corresponding entry in the vector will be $1$, else $0$.  

In [36]:
def feature_vector_from_email(email, vocabulary):
    total_words = len(vocabulary)
    email_indices = tokens_to_indices(email, vocabulary)
    feature_vector = np.zeros(total_words)
    feature_vector[email_indices] = 1
    return feature_vector

In [41]:
email = open('data/emailSample1.txt').read()
fv = feature_vector_from_email(email, vocabulary)
print(f'Length of feature vector: {fv.size}')
print(f'Number of non-zero entries: {int(np.sum(fv))}')

Length of feature vector: 1899
Number of non-zero entries: 45


## SVM as spam classifier

In [45]:
# Load data
training_data = io.loadmat(os.path.join('data', 'spamTrain.mat'))
X, y = training_data['X'], training_data['y'].ravel()

test_data = io.loadmat(os.path.join('data', 'spamTest.mat'))
Xtest, ytest = test_data['Xtest'], test_data['ytest'].ravel()

In [50]:
pos, neg = X[y==1], X[y==0]
print(f'Total number of training emails {X.shape[0]}')
print(f'Total number of spam emails {pos.shape[0]}')
print(f'Total number of nonspam emails {neg.shape[0]}')

Total number of training emails 4000
Total number of spam emails 1277
Total number of nonspam emails 2723


In [51]:
# Linear SVM, with C = 0.1
linear_svm = svm.SVC(C=0.1, kernel='linear')
linear_svm.fit(X, y)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [53]:
# Test
training_prediction = linear_svm.predict(X)
test_prediction = linear_svm.predict(Xtest)

training_accuracy = (np.sum(training_prediction == y) / y.size) * 100
test_accuracy = (np.sum(test_prediction == ytest) / ytest.size) * 100

print(f'The training accuracy is {training_accuracy}.')
print(f'The test accuracy is {test_accuracy}.')

The training accuracy is 99.825.
The test accuracy is 98.9.


### Top predictors for spam
Now we check which words, according to the classifier, are the most predictive of spam.

In [70]:
flipped_vocabulary = {k:v for v,k in vocabulary.items()}
sorted_indices = np.argsort(linear_svm.coef_, axis=None)[::-1]

print('The fifteen top predictors for spam are:')
print(*[flipped_vocabulary[index] for index in sorted_indices[:15]])

print('The fifteen bottom predictors for spam are:')
print(*[flipped_vocabulary[index] for index in sorted_indices[-15:]])

The fifteen top predictors for spam are:
otherwis clearli remot gt visa base doesn wife previous player mortgag natur ll futur hot
The fifteen bottom predictors for spam are:
http toll xp ratio august unsubscrib useless numberth round linux datapow wrong urgent that spam


In [82]:
# Optional predictions
def prediction(email_file):
    email = open(email_file).read()
    fv = feature_vector_from_email(email, vocabulary)
    prediction = linear_svm.predict(fv.reshape(1, -1))
    if (prediction[0] == 1):
        print('Spam')
    else:
        print('Not Spam')

In [83]:
prediction('data/emailSample2.txt')

Not Spam


In [84]:
prediction('data/spamSample1.txt')

Not Spam


In [85]:
prediction('data/spamSample2.txt')

Spam


Thus the classifier correctly classifies the second non-spam and spam email samples but fails to classify the first spam email sample.