In [18]:
import os 
# The collections module implements specialized container datatypes.
# Counter is used here for convenient and rapid word tallies.
from collections import Counter
import numpy as np
from sklearn.datasets import make_circles
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import sys  # needed only for the print threshold below
# Always print arrays in full. Recent NumPy releases reject NaN as a threshold
# (ValueError), and sys.maxsize is the documented way to disable summarization.
np.set_printoptions(threshold=sys.maxsize)
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
# Directories holding the training and test e-mails, given relative to the
# current working directory.
TRAIN_DIR = "./train-mails"
TEST_DIR = "./test-mails"

In [3]:
def make_Dictionary(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words in a mail directory.

    Every regular file directly inside ``root_dir`` is read line by line and
    split on whitespace. Tokens that are not purely alphabetic (numbers,
    punctuation, mixed tokens) or that are a single character long are
    discarded; the remaining words are tallied.

    Args:
        root_dir: Directory containing the mail files.
        vocab_size: Maximum number of words to keep. Defaults to 3000, the
            value the notebook originally hard-coded.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    # Only regular files are read, so a stray sub-directory (for example a
    # checkpoint folder) does not crash open(). The list is sorted because
    # os.listdir returns entries in arbitrary order and Counter breaks ties
    # between equally frequent words by first appearance; sorting therefore
    # makes the resulting vocabulary reproducible.
    mail_paths = sorted(
        path
        for path in (os.path.join(root_dir, name) for name in os.listdir(root_dir))
        if os.path.isfile(path)
    )

    word_counts = Counter()
    for mail_path in mail_paths:
        with open(mail_path) as mail:
            for line in mail:
                # Filter while counting instead of collecting every token in
                # one big list and deleting unwanted keys afterwards.
                word_counts.update(
                    word
                    for word in line.split()
                    if word.isalpha() and len(word) > 1
                )

    # Keep only the `vocab_size` most common words.
    return word_counts.most_common(vocab_size)

In [4]:
def extract_features(mail_dir, vocabulary=None, num_features=3000):
    """Turn every mail in ``mail_dir`` into a word-count vector plus a label.

    Only the third line of each file (line index 2) is used as the message
    text. Row ``r`` of the feature matrix belongs to the ``r``-th file (files
    are processed in sorted path order), and column ``c`` holds how many times
    the ``c``-th vocabulary word occurs in that text line. Words that are not
    in the vocabulary are ignored.

    Example: with vocabulary ``[("good", 9), ("bad", 4)]`` and the text line
    ``"good day good"``, the row starts with ``[2, 0, ...]``.

    A file whose name starts with ``"spmsg"`` is labelled 1; every other file
    is labelled 0.

    Args:
        mail_dir: Directory containing the mail files.
        vocabulary: Sequence of ``(word, count)`` entries such as the result
            of ``make_Dictionary``; only ``entry[0]`` is used. When omitted,
            the notebook-level global ``dictionary`` is used, as in the
            original implementation.
        num_features: Number of feature columns. Defaults to 3000, the value
            the notebook originally hard-coded. Vocabulary entries beyond
            this position are ignored.

    Returns:
        ``(features_matrix, train_labels)``: a float array of shape
        ``(n_files, num_features)`` and a float array of shape ``(n_files,)``.
    """
    if vocabulary is None:
        # Backward-compatible fallback to the global built in a later cell.
        vocabulary = dictionary

    # Map each word to its column once, instead of scanning the whole
    # vocabulary for every word of every mail.
    word_to_column = {}
    for column, entry in zip(range(num_features), vocabulary):
        word_to_column[entry[0]] = column

    # Regular files only, in a reproducible order (os.listdir is arbitrary).
    files = sorted(
        path
        for path in (os.path.join(mail_dir, name) for name in os.listdir(mail_dir))
        if os.path.isfile(path)
    )

    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))
    text_line_index = 2  # the message text is on the third line of each file

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_index, line in enumerate(mail):
                if line_index == text_line_index:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # later lines are never used

        # basename copes with both "/" and "\\" separators, so the label does
        # not silently become 0 for every file on Windows.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1

    return features_matrix, train_labels

In [21]:
def plot_decision_surface_sklearn(clf,X,y):
    """Plot a classifier's decision regions and margins over 2-D data.

    The first two columns of ``X`` span a 200 x 200 grid on which
    ``clf.decision_function`` is evaluated. The grid has exactly two columns,
    so ``clf`` must have been fitted on two features.

    Args:
        clf: Fitted classifier exposing ``decision_function``.
        X: Sample array; columns 0 and 1 are used as plot coordinates.
        y: Labels aligned with ``X``; samples labelled 0 are drawn in red,
            samples labelled 1 in blue.
    """
    plt.figure()

    first_lo, first_hi = X[:, 0].min(), X[:, 0].max()
    second_lo, second_hi = X[:, 1].min(), X[:, 1].max()

    # A complex step (200j) makes mgrid produce 200 evenly spaced points.
    grid_first, grid_second = np.mgrid[first_lo:first_hi:200j, second_lo:second_hi:200j]
    grid_points = np.c_[grid_first.ravel(), grid_second.ravel()]
    scores = clf.decision_function(grid_points).reshape(grid_first.shape)

    # Background: which side of the decision boundary each grid point is on.
    plt.pcolormesh(grid_first, grid_second, scores > 0, cmap=plt.cm.Paired)

    # Solid line at the boundary (0), dashed lines at scores of -0.5 and +0.5.
    plt.contour(grid_first, grid_second, scores, colors=['k', 'k', 'k'],
              linestyles=['--', '-', '--'],
              levels=[-.5, 0, .5])

    for label, colour in ((0, 'r'), (1, 'b')):
        members = X[np.where(y == label)]
        plt.scatter(members[:, 0], members[:, 1], c=colour,s=50)
    plt.show()

In [22]:
# Display one sample non-spam training mail, one print call per line.
with open("./train-mails/6-73msg1.txt") as mail_file:
    print("non spam file")
    for text_line in mail_file:
        print(text_line)

non spam file
Subject: kant innateness



reference innateness kant easy , here one relevant current controversy regard linguistic " realism " v . " conceptualism . " middle course propose bewtween two above mention , namely , category neither self-thought first principle apriorus our knowledege nor derive experience , sujective disposition , implant us first momement , order our creator employment complete harmony law nature accordance experience proceeds - - kind preformation-system pure reason . . . decisive ojection . . . necessity category . . . sacrifice . ( b167-168 . _ critique pure reason _ tran . kemp smith . ) noam chomsky observe " rationalist " " empiricist " theory language incorporate " innate disposition . " ( _ reflection language _ . pantheon . 1975 . p . 215 ) whether logical modality enter linguistic picture uncertain , one imagine formulation minimality connection economy derivation necessary sense , choose possible example purpose illustration . , modality exclude

In [6]:
# Display one sample spam training mail, one print call per line.
with open("./train-mails/spmsgb154.txt") as mail_file:
    print("spam file")
    for text_line in mail_file:
        print(text_line)

spam file
Subject: luck ! ! ! !



- - adult - - most fun sit ! ! click here



In [7]:
# Build the vocabulary from the training mails only. extract_features reads
# this module-level `dictionary`, so this cell must run before it is called.
dictionary = make_Dictionary(TRAIN_DIR)

In [8]:
# Vectorise the training and test mails; both calls use the same vocabulary,
# so their feature columns refer to the same words.
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)

In [24]:
# Baseline: an SVC with scikit-learn's default hyper-parameters.
# Note that the default kernel is RBF, not linear, and C defaults to 1.0.
# The default gamma depends on the installed scikit-learn version; check the
# SVC documentation for that version.
model = svm.SVC()
model.fit(features_matrix, labels)
predicted_labels = model.predict(test_feature_matrix)

In [25]:
# Fraction of test mails whose predicted label equals the true label.
print("accuracy: ", accuracy_score(test_labels, predicted_labels))

accuracy:  0.8153846153846154


(702, 3000) (702,)


ValueError: X.shape[1] = 2 should be equal to 3000, the number of features at training time

<Figure size 432x288 with 0 Axes>

In [16]:
# To improve accuracy, set the SVM hyper-parameters explicitly:
#   kernel - "rbf" (this is also the default kernel)
#   C      - regularization parameter
#   gamma  - kernel coefficient for the RBF kernel
# NOTE(review): if C and gamma were picked by looking at test accuracy, the
# reported score is optimistic - confirm a separate validation split was used.
model = svm.SVC(kernel="rbf",C=100, gamma=0.001)
model.fit(features_matrix, labels)
predicted_labels = model.predict(test_feature_matrix)

In [17]:
# Accuracy of the tuned model on the same test mails.
print("accuracy: ", accuracy_score(test_labels, predicted_labels))

accuracy:  0.9730769230769231


In [None]:
# 97% accuracy is quite good