## Spam Classification
Based on Andre Ng Lectures.
Supervised Learning with SVM from sklearn package.

In [52]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import scipy.io as sio
from sklearn import svm

In [128]:
def ProcessEmail(email,vocabulary):
#   Takes .txt file containing the email and returns the feature vector.
#   Performs various operations to clean, tokenize, stem, and index.
    with open(email, 'r') as f:
        emailContents = f.read()
        
    WordIndices = []
    vocab = pd.read_csv(vocabulary,delimiter='\t',names=['Index','Word'],index_col=False)
    # vocab.head()
    emailContents = emailContents.lower() 

    # Expressions that begin with < and end with >
    emailContents = re.sub('<[^<>]+>',' ',emailContents)

    emailContents = re.sub('[0-9]+', 'number',emailContents)

    emailContents = re.sub('(http|https)://[^\s]*', 'httpaddr',emailContents)

    emailContents = re.sub( '[^\s]+@[^\s]+', 'emailaddr',emailContents)

    emailContents = re.sub( '[$]+', 'dollar', emailContents) 
    # Dollar is currently joined to the number
    emailContents = re.sub( 'dollarnumber', 'dollar', emailContents) 

    emailContents = word_tokenize(emailContents)

    emailContents = [word for word in emailContents if word.isalpha()]
    ps = PorterStemmer()
    emailContents = [ps.stem(word) for word in emailContents]

    #Remove Short words
    emailContents = [word for word in emailContents if len(word)>=2]
#     emailContents = ['dollar' for word in emailContents if word =='dollarnumb']
    
#     print(emailContents)
    for word in emailContents:
        ind = vocab[vocab['Word'] == word].index.tolist()
        if ind:
            WordIndices.append(ind[0]+1)
#     print(WordIndices)
    
    # Create Features
    n = vocab.shape[0]
    X = np.zeros(n)
    for ind in WordIndices:
        X[ind] = 1
#     print(X)
    return X.reshape(-1, 1)

    
X = ProcessEmail('emailSample1.txt','vocab.txt')
# X = ProcessEmail('spamSample1.txt','vocab.txt')


> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




## Train Spam Classifier

In [112]:
rawdata = sio.loadmat('spamTrain.mat',mat_dtype=True)
X = rawdata['X']
y = rawdata['y'].flatten()
print(X.shape)
clf = svm.SVC(kernel='linear', C = 0.1) #, gamma=gamma)
clf.fit(X, y)
Ypred = clf.predict(X)
accuracy = np.mean(np.equal(y,Ypred))
print('Training Accuracy:',accuracy)

(4000, 1899)
Training Accuracy: 0.99825


In [113]:
# Verify accuracy with test data
rawdata = sio.loadmat('spamTest.mat',mat_dtype=True)
Xtest = rawdata['Xtest']
ytest = rawdata['ytest'].flatten()
Ypred = clf.predict(Xtest)
accuracy = np.mean(np.equal(ytest,Ypred))
print('Test Accuracy:',accuracy)

Test Accuracy: 0.989


## What were the top classifiers of spam?

In [126]:
weight, ind = np.sort(clf.coef_),np.argsort(clf.coef_)

weight,ind  = weight[0],ind[0]

for i in range(0,16):
    print(' %-15s  (%f) \n' %(vocab.iloc[ind[i],1],weight[i]))

 spamassassin     (-0.605132) 

 the              (-0.438072) 

 url              (-0.428355) 

 wrote            (-0.409923) 

 date             (-0.405492) 

 list             (-0.361847) 

 rpm              (-0.352484) 

 numbertnumb      (-0.327918) 

 user             (-0.322528) 

 until            (-0.315433) 

 author           (-0.308427) 

 razor            (-0.301497) 

 yahoo            (-0.275555) 

 tom              (-0.269043) 

 httpaddr         (-0.268191) 

 jim              (-0.253419) 



## Predict a New email

In [130]:
newemail = 'spamSample2.txt'
Xm = ProcessEmail(newemail,'vocab.txt')
Predic = clf.predict(Xm.T)
print('Is spam? 0-not spam | 1-is spam\n',Predic)

Best Buy Viagra Generic Online

Viagra 100mg x 60 Pills $125, Free Pills & Reorder Discount, Top Selling 100% Quality & Satisfaction guaranteed!

We accept VISA, Master & E-Check Payments, 90000+ Satisfied Customers!
http://medphysitcstech.ru



Is spam? 0-not spam | 1-is spam
 [1.]
