## SVM

In [1]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [2]:
def make_Dictionary(root_dir, max_words=3000):
    """Build a vocabulary of the most frequent words found in a mail directory.

    Every file directly under ``root_dir`` is read line by line and split on
    whitespace. Tokens that are not purely alphabetic, or that consist of a
    single character, are ignored.

    Args:
        root_dir: Directory containing one e-mail per file.
        max_words: Maximum number of vocabulary entries to keep (default 3000).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``max_words`` entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # tie-breaking between equally frequent words reproducible across runs.
    emails = [os.path.join(root_dir, f) for f in sorted(os.listdir(root_dir))]

    # Count incrementally instead of materialising every token in one list.
    word_counts = Counter()
    for mail in emails:
        with open(mail) as m:
            for line in m:
                # Keep only alphabetic tokens longer than one character.
                word_counts.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )

    # Consider only the `max_words` most common words.
    return word_counts.most_common(max_words)

In [3]:
def extract_features(mail_dir, vocabulary=None, num_features=3000):
    """Turn a directory of e-mails into a word-count matrix and a label vector.

    Only the third line of each file (line index 2) is tokenised. For every
    token that appears in the vocabulary, the matrix cell at
    ``[document, vocabulary position]`` is set to the number of times the
    token occurs on that line.

    A document is labelled 1 when its file name starts with ``"spmsg"`` and
    0 otherwise.

    Args:
        mail_dir: Directory containing one e-mail per file.
        vocabulary: Sequence of ``(word, count)`` entries such as the one
            returned by ``make_Dictionary``. When omitted, the notebook-level
            ``dictionary`` variable is used (the original behaviour).
        num_features: Number of columns of the feature matrix (default 3000).
            Vocabulary entries at positions >= ``num_features`` are ignored.

    Returns:
        ``(features_matrix, train_labels)``: a ``(n_files, num_features)``
        float array and a ``(n_files,)`` float array of 0/1 labels, in
        ``os.listdir`` order.
    """
    if vocabulary is None:
        # Backward compatible fallback to the global built by an earlier cell.
        vocabulary = dictionary

    # Map word -> column once, instead of scanning the vocabulary per token.
    word_to_column = {
        entry[0]: column
        for column, entry in enumerate(vocabulary)
        if column < num_features
    }

    files = [os.path.join(mail_dir, fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for docID, fil in enumerate(files):
        with open(fil) as fi:
            for line_no, line in enumerate(fi):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[docID, column] = occurrences
                    # Nothing after the third line is used.
                    break

        # basename() is separator-agnostic; splitting on '/' mislabels every
        # file on platforms where os.path.join uses a different separator.
        if os.path.basename(fil).startswith("spmsg"):
            train_labels[docID] = 1

    return features_matrix, train_labels

In [5]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Directories holding one e-mail per file for training and evaluation.
TRAIN_DIR = "data/train-mails"
TEST_DIR = "data/test-mails"

# Vocabulary is built from the training mails only.
dictionary = make_Dictionary(TRAIN_DIR)

print("reading and processing emails from file.")
# Both matrices are encoded against the same training vocabulary.
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)

# This is a very basic implementation: svm.SVC() is created with scikit-learn's
# default tuning parameters. Per the scikit-learn documentation the default
# kernel is 'rbf' (not linear) and C = 1.0; gamma is left at the library's
# version-dependent default rather than a fixed value of 1.

model = svm.SVC()

print("Training model.")
# train model on the full training set, then predict the test set
model.fit(features_matrix, labels)
predicted_labels = model.predict(test_feature_matrix)

print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))

reading and processing emails from file.
Training model.




FINISHED classifying. accuracy score : 
0.8153846153846154


In [19]:
# Reducing the training size by 90%
# Keeps only the first 70 rows of the training matrix and labels.
# NOTE(review): 70 rows is 10% only if the training set has about 700 mails - confirm.
# Rows are in os.listdir order, so the class balance of this subset is not controlled.
feature = features_matrix[:70]
label = labels[:70]

### Tuning the Parameters

In [20]:
# Kernel: explicitly select the RBF kernel (C = 1) and fit on the reduced subset
model = svm.SVC(kernel="rbf", C = 1)

print("Training model.")
# train model on the reduced subset (`feature`, `label`), then predict the test set
model.fit(feature, label)
predicted_labels = model.predict(test_feature_matrix)

print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))

Training model.
FINISHED classifying. accuracy score : 
0.7




In [22]:
# C: same RBF kernel, with the regularization parameter C raised from 1 to 100
model = svm.SVC(kernel="rbf", C = 100)

print("Training model.")
# train model on the reduced subset (`feature`, `label`), then predict the test set
model.fit(feature, label)
predicted_labels = model.predict(test_feature_matrix)

print("FINISHED classifying. accuracy score : ")
print(accuracy_score(test_labels, predicted_labels))

Training model.
FINISHED classifying. accuracy score : 
0.9


