In [93]:
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [83]:
# preprocess() hands back the train/test split as four objects, in this order:
#   features_train / features_test - features for the training and testing sets
#   labels_train / labels_test     - the item labels matching those features
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = preprocess()

No. of Chris training emails :  7936
No. of Sara training emails :  7884


In [76]:
# Build a linear-kernel SVM and fit it on the training split,
# reporting how long the fit takes (wall-clock seconds, 3 decimals).
clf = SVC(kernel="linear")
fit_start = time()
clf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  117.647 s


In [77]:
# Classify the test features with the fitted linear SVM and time the call.
predict_start = time()
pred = clf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time: ", predict_seconds, "s")

prediction time:  11.172 s


In [17]:
# Fraction of test emails whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
acc = accuracy_score(labels_test, pred)
print("accuracy: ", acc)

accuracy:  0.9840728100113766


## Speeding up the training time

One way to speed up SVM training is to fit on a smaller slice of the training dataset. As an experiment, we will see how the classifier performs when trained on only 1% of the original training data.

In [84]:
# Keep only the first 1% of the training features and labels so fitting is fast.
# NOTE: this overwrites the full training data, so re-running this cell shrinks
# it by another factor of 100; call preprocess() again to restore the full split.
features_keep = len(features_train) // 100
labels_keep = len(labels_train) // 100
features_train = features_train[:features_keep]
labels_train = labels_train[:labels_keep]

In [85]:
# Refit the existing `clf` classifier on the reduced training data and time it.
fit_start = time()
clf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  0.083 s


In [86]:
# Predict labels for the (unsliced) test features with the refitted `clf`.
predict_start = time()
pred = clf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time: ", predict_seconds, "s")

prediction time:  0.566 s


In [87]:
# Fraction of test emails whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
acc = accuracy_score(labels_test, pred)
print("accuracy: ", acc)

accuracy:  0.8845278725824801


In [23]:
### rbf kernel ###

In [36]:
# Same experiment with an RBF kernel (gamma='auto', C left at its default):
# fit on the current training data and report the fit time.
clf_rbf = SVC(kernel="rbf", gamma="auto")
fit_start = time()
clf_rbf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  0.08 s


In [38]:
# Predict test labels with the RBF classifier and time the call.
predict_start = time()
pred = clf_rbf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time: ", predict_seconds, "s")

prediction time:  1.096 s


In [39]:
# Fraction of test emails whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
acc = accuracy_score(labels_test, pred)
print("accuracy: ", acc)

accuracy:  0.6160409556313993


In [40]:
### rbf kernel + C ###

In [50]:
# RBF kernel again, this time with a large C (10000); fit on the current
# training data and report the fit time.
clf_rbf = SVC(kernel="rbf", gamma="auto", C=10000)
fit_start = time()
clf_rbf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  0.08 s


In [51]:
# Predict test labels with the C=10000 RBF classifier and time the call.
predict_start = time()
pred = clf_rbf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time: ", predict_seconds, "s")

prediction time:  0.861 s


In [52]:
# Fraction of test emails whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
acc = accuracy_score(labels_test, pred)
print("accuracy: ", acc)

accuracy:  0.8924914675767918


With an increased C parameter, we get better accuracy. Note, however, that a larger C penalizes misclassified training points more heavily (weaker regularization), so it results in a more complex decision boundary.

In [53]:
### going back to the full dataset ###
# Calling preprocess() again replaces the sliced training data with a fresh,
# full-size train/test split.
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = preprocess()

No. of Chris training emails :  7936
No. of Sara training emails :  7884


In [54]:
# RBF kernel with C=10000, fitted on the current (reloaded) training data;
# report the fit time.
clf_rbf = SVC(kernel="rbf", gamma="auto", C=10000)
fit_start = time()
clf_rbf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  65.722 s


In [55]:
# Predict test labels with the freshly fitted RBF classifier and time the call.
predict_start = time()
pred = clf_rbf.predict(features_test)
predict_seconds = round(time() - predict_start, 3)
print("prediction time: ", predict_seconds, "s")

prediction time:  9.795 s


In [56]:
# Fraction of test emails whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first.
acc = accuracy_score(labels_test, pred)
print("accuracy: ", acc)

accuracy:  0.9908987485779295


In [57]:
### back to 1% dataset ###

In [58]:
# Keep only the first 1% of the training features and labels.
# NOTE: this overwrites the training data in place, so re-running this cell
# without reloading via preprocess() shrinks it by another factor of 100.
features_keep = len(features_train) // 100
labels_keep = len(labels_train) // 100
features_train = features_train[:features_keep]
labels_train = labels_train[:labels_keep]

In [59]:
# RBF kernel with C=10000, fitted on the reduced training data; report the fit time.
clf_rbf = SVC(kernel="rbf", gamma="auto", C=10000)
fit_start = time()
clf_rbf.fit(features_train, labels_train)
fit_seconds = round(time() - fit_start, 3)
print("training time: ", fit_seconds, "s")

training time:  0.096 s


In [63]:
# Predict test labels with the RBF classifier and time the call.
t0 = time()
pred = clf_rbf.predict(features_test)
print("prediction time: ", round(time() - t0, 3), "s")
# Show the predicted class at positions 10, 26 and 50 of `pred`
# (these are zero-based indices into the prediction array).
print("prediction for the 10th, 26th, 50th element: ", pred[10], pred[26], pred[50])

prediction time:  0.776 s
pediction for the 10th, 26th, 50th element:  1 0 1


In [None]:
### back to full dataset to check how many emails are predicted to be written by Chris ###

In [73]:
# Reload the full train/test split, since the training data was sliced down earlier.
features_train, features_test, labels_train, labels_test = preprocess()

# Create the classifier explicitly rather than reusing whatever `clf_rbf` is
# left in the kernel, so this cell does not depend on hidden state.
# Same configuration as the preceding RBF experiments (C=10000).
clf_rbf = SVC(kernel='rbf', gamma='auto', C=10000)
clf_rbf.fit(features_train, labels_train)

# Predictions equal to 1 are counted as Chris. Use the array's own .sum()
# instead of the Python builtin, which iterates element by element.
chris_mask = clf_rbf.predict(features_test) == 1
print("Emails predicted to be written by Chris: ", int(chris_mask.sum()))

No. of Chris training emails :  7936
No. of Sara training emails :  7884
Emails predicted to be written by Chris:  877
