In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from sklearn import svm

In [2]:
# Read the whole sample email into memory, then echo the raw text so the
# untouched input is visible before any preprocessing step runs.
with open('ex6/emailSample1.txt', 'r') as email_file:
    email = email_file.read()
print(email)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com




# Preprocess

1. Lower-casing

In [3]:
# Fold the message to lower case so later matching ignores capitalisation.
email = str.lower(email)
email

"> anyone knows how much it costs to host a web portal ?\n>\nwell, it depends on how many visitors you're expecting.\nthis can be anywhere from less than 10 bucks a month to a couple of $100. \nyou should checkout http://www.rackspace.com/ or perhaps amazon ec2 \nif youre running something big..\n\nto unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

2. Stripping HTML

In [4]:
# Replace every HTML tag with a single space.
# `[^<>]+` is used instead of `.*?` because `.` does not match a newline, and
# newlines are still present at this stage; a tag wrapped across two lines
# would otherwise be left in the text. Excluding '<' and '>' also keeps each
# match confined to one innermost tag.
email = re.sub(r'<[^<>]+>', ' ', email)
email

"> anyone knows how much it costs to host a web portal ?\n>\nwell, it depends on how many visitors you're expecting.\nthis can be anywhere from less than 10 bucks a month to a couple of $100. \nyou should checkout http://www.rackspace.com/ or perhaps amazon ec2 \nif youre running something big..\n\nto unsubscribe yourself from this mailing list, send an email to:\ngroupname-unsubscribe@egroups.com\n\n"

In [5]:
# Put the whole message on one line: every newline becomes a single space.
email = ' '.join(email.split('\n'))
email

"> anyone knows how much it costs to host a web portal ? > well, it depends on how many visitors you're expecting. this can be anywhere from less than 10 bucks a month to a couple of $100.  you should checkout http://www.rackspace.com/ or perhaps amazon ec2  if youre running something big..  to unsubscribe yourself from this mailing list, send an email to: groupname-unsubscribe@egroups.com  "

3. Normalizing URLs

In [6]:
# Collapse every http:// or https:// URL into the placeholder token 'httpaddr'.
# NOTE(review): inside the third character class, `$-_` is a range running
# from '$' to '_', not three literal characters — confirm that is intended.
pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
email = pattern.sub('httpaddr', email)
email

"> anyone knows how much it costs to host a web portal ? > well, it depends on how many visitors you're expecting. this can be anywhere from less than 10 bucks a month to a couple of $100.  you should checkout httpaddr or perhaps amazon ec2  if youre running something big..  to unsubscribe yourself from this mailing list, send an email to: groupname-unsubscribe@egroups.com  "

4. Normalizing Email Addresses

In [7]:
# Collapse every e-mail address into the placeholder token 'emailaddr'.
# NOTE(review): the local part is quantified `{0,64}`, so a bare "@domain"
# with nothing before the '@' is also replaced — confirm that is acceptable.
pattern = re.compile(r'[-_\w\.]{0,64}@([-\w]{1,63}\.)*[-\w]{1,63}')
email = pattern.sub('emailaddr', email)
email

"> anyone knows how much it costs to host a web portal ? > well, it depends on how many visitors you're expecting. this can be anywhere from less than 10 bucks a month to a couple of $100.  you should checkout httpaddr or perhaps amazon ec2  if youre running something big..  to unsubscribe yourself from this mailing list, send an email to: emailaddr  "

5. Normalizing Dollars

In [8]:
# Replace each run of one or more dollar signs with the word 'dollar'.
# The pattern is written as a raw string: the earlier non-raw '[\$]+' relied on
# the invalid string escape '\$', which Python reports as a warning and has
# announced will become an error. The characters matched are unchanged.
email = re.sub(r'[$]+', 'dollar', email)
email

"> anyone knows how much it costs to host a web portal ? > well, it depends on how many visitors you're expecting. this can be anywhere from less than 10 bucks a month to a couple of dollar100.  you should checkout httpaddr or perhaps amazon ec2  if youre running something big..  to unsubscribe yourself from this mailing list, send an email to: emailaddr  "

6. Normalizing Numbers

In [9]:
# Replace each run of digits with the word 'number'.
# Raw string so that `\d` reaches the regex engine as written instead of
# depending on the invalid string escape '\d' in a normal string literal.
email = re.sub(r'\d+', 'number', email)
email

"> anyone knows how much it costs to host a web portal ? > well, it depends on how many visitors you're expecting. this can be anywhere from less than number bucks a month to a couple of dollarnumber.  you should checkout httpaddr or perhaps amazon ecnumber  if youre running something big..  to unsubscribe yourself from this mailing list, send an email to: emailaddr  "

7. Word Stemming
8. Removal of non-words

In [10]:
import nltk

stemmer = nltk.stem.porter.PorterStemmer()

# Characters that separate tokens. Written as a raw string so no backslash is
# interpreted as a (previously invalid) string escape. `\s` is used rather than
# a literal space so that tabs or other leftover whitespace also split tokens;
# otherwise the non-alphanumeric cleanup below would glue neighbouring words
# together (e.g. "foo\tbar" -> "foobar").
delimiters = re.compile(r'''[\s@$/#.\-:&*+=\[\]?!(){},'">_<;%]''')

token_list = []
tokens = delimiters.split(email)

for token in tokens:
    # Drop whatever non-alphanumeric characters survived the split.
    token = re.sub(r'[^a-zA-Z0-9]', '', token)
    # Skip empty fragments before stemming; consecutive delimiters produce many.
    if not token:
        continue
    token_list.append(stemmer.stem(token))

print(token_list)

['anyon', 'know', 'how', 'much', 'it', 'cost', 'to', 'host', 'a', 'web', 'portal', 'well', 'it', 'depend', 'on', 'how', 'mani', 'visitor', 'you', 're', 'expect', 'thi', 'can', 'be', 'anywher', 'from', 'less', 'than', 'number', 'buck', 'a', 'month', 'to', 'a', 'coupl', 'of', 'dollarnumb', 'you', 'should', 'checkout', 'httpaddr', 'or', 'perhap', 'amazon', 'ecnumb', 'if', 'your', 'run', 'someth', 'big', 'to', 'unsubscrib', 'yourself', 'from', 'thi', 'mail', 'list', 'send', 'an', 'email', 'to', 'emailaddr']


# Encoder

In [19]:
# Load the vocabulary file as a tab-separated table with a single named column,
# then keep the underlying values array (one one-element row per word).
# NOTE(review): pandas' default NA parsing would turn a literal word such as
# "null" or "nan" into NaN — confirm the vocabulary contains none, or pass
# keep_default_na=False.
df = pd.read_csv('ex6/vocab.txt', sep='\t', names=['words'])
vocab = df.values

vocab

array([['aa'],
       ['ab'],
       ['abil'],
       ...,
       ['zdnet'],
       ['zero'],
       ['zip']], dtype=object)

In [21]:
# Feature vector with one slot per vocabulary entry, all initially zero.
n_words = vocab.shape[0]
vector = np.zeros(n_words)

vector.shape

(1899,)

In [25]:
# Positions of the vocabulary entries that occur among the email's stemmed tokens.
# np.ravel flattens `vocab` so every entry is compared as a plain string instead
# of as a one-element array (which only worked through size-1 array truthiness),
# and the set turns each membership check into a hash lookup rather than a scan
# of the whole token list.
token_set = set(token_list)
idx = [i for i, word in enumerate(np.ravel(vocab)) if word in token_set]

idx[:5]

[70, 85, 88, 161, 180]

In [26]:
# Flag every vocabulary position found in the email with a 1.
vector[idx] = 1

In [27]:
# Number of flagged entries alongside the total length of the feature vector.
np.sum(vector), vector.size

(45.0, 1899)

# Train

In [29]:
from scipy.io import loadmat

# Training set: pull the feature matrix and the label array out of the .mat file.
data = loadmat('ex6/spamTrain')
X = data['X']
y = data['y']

X.shape, y.shape

((4000, 1899), (4000, 1))

In [30]:
# Held-out test set, stored under the keys 'Xtest' and 'ytest'.
data = loadmat('ex6/spamTest')

X_test = data['Xtest']
y_test = data['ytest']

X_test.shape, y_test.shape

((1000, 1899), (1000, 1))

In [33]:
# Linear-kernel support vector classifier with C=1.
# `y` is loaded as a column of shape (n_samples, 1); ravel() hands fit() a 1-D
# label vector, which avoids the column_or_1d conversion warning that the
# column-shaped input triggers. ravel() is a no-op if `y` is already 1-D.
model = svm.SVC(C=1, kernel='linear')
model.fit(X, y.ravel())

  y = column_or_1d(y, warn=True)


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [36]:
# Score the fitted classifier on the held-out test set and display the result.
test_pre = model.score(X=X_test, y=y_test)
test_pre

0.978