# Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.io
import re
from math import *
from sklearn import svm

In [5]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Initializing the PorterStemmer
ps = PorterStemmer()

# Downloading the punkt model
#nltk.download('punkt')

In [6]:
sns.set_style('whitegrid')
%matplotlib inline

# Functions

In [7]:
def readFile(fileText):
    try:
        # Read The text file
        file = open(fileText, 'r')
        fileContent = file.read()
        
        # Closing stream after reading it
        file.close()
        
        # Returing file Content
        return { "status": True, "content": fileContent, "msg": '' }
    
    except FileNotFoundError as e:
        # File can't be found
        print(e)
        
        # Returning empty string
        return { "status": False, "content": " ", "msg": e } 
    
def getVocabList():

    # Reading VocabList
    file = readFile('vocab.txt')

    if(file["status"]):
        
        # Getting content of the file
        fileContent = file["content"]
        
        
        # Replacing Numbers with ' '
        numberPattern = "(\d+)"
        fileContent = re.sub(numberPattern, ' ', fileContent)

        
        # Remove any non alphanumeric characters
        nonWordPattern = '[^a-zA-Z0-9]'
        fileContent = re.sub( nonWordPattern, ' ', fileContent)

        
        # Replace multiple spaces with single space
        spacePattern = "[ ]+"
        fileContent = re.sub( spacePattern ,' ', fileContent)

        # Tokenize words
        try:
            
            # Tokenize all of the words
            words = word_tokenize(fileContent)
            return words

        # Error Occured  
        except:
            print("Some Error Occured in Stemming Process")
            return ['']
    else:
        
        # reading file has some problems
        print("We have some problems in Reading File")
        print(file["msg"])
    

    
def processEmail(fileName):

    # Read The text file
    file = readFile(fileName)
    
    if(file["status"]):
        
        # Getting content of the file
        fileContent = file["content"]
        
        # Convert string to lowercase
        fileContent = fileContent.lower()

        # Strip HTML
        htmlPattern = "<[^>]*>"
        fileContent = re.sub(htmlPattern,' ', fileContent)

        # Normalize URLs
        urlPattern = "(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
        fileContent = re.sub(urlPattern,'httpaddr', fileContent)

        # Normalize Numbers
        numberPattern = "(\d+)"
        fileContent = re.sub(numberPattern, 'number', fileContent)

        # Normalize Email Address
        emailPattern = r'[\w\.-]+@[\w\.-]+'
        fileContent = re.sub(emailPattern, 'emailaddr', fileContent)

        # Normalize Dollars
        dollarPattern = '[$]+'
        fileContent = re.sub(dollarPattern, 'dollar', fileContent)

        # Remove any non alphanumeric characters
        nonWordPattern = '[^a-zA-Z0-9]'
        fileContent = re.sub( nonWordPattern, ' ', fileContent)

        # Replace multiple spaces with single space
        spacePattern = "[ ]+"
        fileContent = re.sub( spacePattern ,' ', fileContent)

        # Words Stemming
        try:
            # Tokenize all of the words
            words = word_tokenize(fileContent)

            # Word Stemming
            words = [ps.stem(x) for x in words]

        except:
            print("Some Error Occured in Stemming Process")
        
        # Initialzing word_indices
        word_indices = []
        
        for w in words:  
            # Constructing Word_indices
            try:
                idx = vocab.index(w)
                word_indices.append(idx)
                
            except ValueError as e:
                # Words doesn't exist in Vobabulary
                continue
        
        return word_indices
    else:
       
        # reading file has some problems
        print("We have some problems in Reading File")
        print(file["msg"])
        
def emailFeatures(word_indices):
    # Total number of words in the dictionary
    n = 1900
    
    # creating feature vector
    matrix = np.zeros((n,1))
    
    # Mapping word_indices to feature vector
    matrix[word_indices] = 1
    
    return matrix

def findBestModel(X,y, Xval, yval):
    # Initializing the Possible values for both C and Sigma
    pValues = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]);
    
    # Creating matrix for holding the error of each model
    error = np.zeros((len(pValues) ** 2,1))
    
    # Computing model error for each permutation of the sigma and C
    for i in range(len(pValues)):
        for j in range(len(pValues)):
            # Initializing The Model
            model = svm.SVC(C=pValues[i] ,kernel= 'rbf' ,gamma= 2 * ( pValues[j] ** 2 ))
            
            # Fitting Data to The Model
            model.fit(X,y)
            
            # Computing error of the Model on the Cross Validation Dataset
            error[ i * len(pValues) + j ] = 1 - model.score(Xval, yval)
            
    # Getting the minimum value index in error matrix
    idx = np.argmin(error)
    
    # Finding C, sigma for model with minimum error
    i = np.floor(idx / len(pValues))
    j = idx - i * len(pValues)
    
    C = pValues[int(i)]
    sigma = pValues[int(j)]
    
    return { "C": C,
           "sigma": sigma }

# Spam Classifier

## Load Data

In [8]:
mat = scipy.io.loadmat('spamTrain.mat')
X = mat["X"][0:3400]
y = mat["y"].T[0][0:3400]

Xval = mat["X"][3400:4000]
yval = mat["y"].T[0][3400:4000]

## Train The SVM

In [24]:
findBestModel(X,y,Xval,yval)

{'C': 10.0, 'sigma': 0.029999999999999999}

In [9]:
# Initializing The Model
model = svm.SVC(C=10 ,kernel= 'rbf' ,gamma= 2 * ( 0.3 ** 2 ))

# Fitting Data to The Model
model.fit(X,y)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.18, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
model.score(Xval,yval)

0.80666666666666664

## Find Best Model With Sklearn

In [11]:
from sklearn.grid_search import GridSearchCV

In [31]:
param_grid = { 'C' : [ 0.1, 0.4, 0.8, 2, 5, 10, 20, 40, 100, 200, 400, 1000], 'gamma' : [ 1, 0.1, 0.01, 0.001, 0.0001,]}
grid = GridSearchCV(svm.SVC(), param_grid, verbose= 3)

In [14]:
grid.fit(X,y)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.679894 -  13.9s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s remaining:    0.0s


[CV] ......................... C=0.1, gamma=1, score=0.679612 -  15.2s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   29.2s remaining:    0.0s


[CV] ......................... C=0.1, gamma=1, score=0.681377 -  15.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.697531 -  13.6s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.691086 -  16.9s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.687555 -  18.3s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.888889 -  12.7s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.895852 -  12.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.891439 -  12.6s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 33.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 0.4, 0.8, 2, 5, 10, 20, 40, 100, 200, 400, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [29]:
model = svm.SVC(C=5, gamma=0.01, kernel='rbf')
model.fit(X,y)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
model.score(Xval,yval)

0.97999999999999998