# Problem Set 4: Build your own spam filter

In [1]:
# Import all necessary packages
import pandas as pd
import numpy as np
import string
import time
import sklearn
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Pre-processing

### Cleaning emails

In [2]:
# Read in data set as a dataframe
emails = pd.read_csv("emails.csv")

In [3]:
# Make a copy of dataframe so we do not alter the original
emailsplit = emails.copy()

In [4]:
# Import tools necessary to create a "bag of words"
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# words are already lemmatized, so we just remove punctuation
emailsplit['text'] = emailsplit['text'].apply(lambda x: re.sub(r'[^\w\s]','',x)) # remove punctuation with regex

# create cached object of stopwords to improve speed
cachedStopwords = set(stopwords.words('english'))

# remove stop words
emailsplit['text'] = emailsplit.apply(lambda x: [item for item in x if item not in cachedStopwords])
emailsplit['text'] = emailsplit['text'].str[8:] # remove "subject" from beginning of each email


#========================================
# NOTES:
# create one preprocessing function where we can choose what processing to do. Easier to check how it affects results.

### Create bag of words and tf-idf sparse matrix

In [5]:
# create bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = [] # list of email content, where each item in list is an email
for i in range(len(emailsplit['text'])):
    corpus.append(emailsplit.loc[i]['text'])
    
vectorizer = CountVectorizer() # create vectorizer

wordmatrix = vectorizer.fit_transform(corpus) # sparse matrix where each value ij is how many times the word j occurs in email i

In [6]:
# TF-IDF (term frequency-inverse document frequency), a weighting scheme for words.
# The weight increases when a word occurs many times in a small number of documents, leading to increased discriminatory power of the word. 
# The weight decreases when the word occurrs infrequently, or occurs in a large number of documents. 

tfidf_transformer = TfidfTransformer().fit(wordmatrix) # create transformer based on vocab in train set
tfidf = tfidf_transformer.transform(wordmatrix) # calculate tf-idf of each word

In [7]:
# Add extra variables

from scipy.sparse import hstack

# count dollar sign punctuation
count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
emailsplit = emailsplit.assign(dollar = emails['text'].apply(count_dollar))

tfidf_ex = hstack((tfidf ,np.array(emailsplit[['dollar']])))

In [8]:
# Create a sparse matrix
tfidf_ex = tfidf_ex.tocsr()

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_ex, emailsplit['spam'], test_size=0.33)

## Train Model 1: Naive Bayes

### Tune hyperparameters

In [11]:
# NB
# Tuning alpha
from sklearn.naive_bayes import MultinomialNB
alphas = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
model3 = MultinomialNB()
grid3 = GridSearchCV(estimator=model3, param_grid=dict(alpha=alphas), return_train_score=True)
grid3.fit(X_train, y_train)
print(grid3.best_score_)
print(grid3.best_estimator_.alpha)

0.9671618451915559
0.1


### Train model

In [12]:
# Keep track of how long it takes to fit/train model
start_time1 = time.time()

# Create model with best estimators from grid search
clf1 = MultinomialNB(alpha=grid3.best_estimator_.alpha)

# Train model
clf1.fit(X_train, y_train)

time1 = str(round(time.time() - start_time1, 2))
print(time1 + " seconds")

# Perform k-fold cross-validation on training data to test accuracy of the model
cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.01 seconds


0.9731566495621049

## Model 2: Support Vector Machine

### Tune hyperparameters

In [13]:
# SVM
# Tuning C and tol
from sklearn.svm import LinearSVC

grid2 = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'tol': [0.0001, 0.001, 0.01, 0.1, 1.0]}]
model2 = LinearSVC()
gridcv2 = GridSearchCV(estimator=model2, param_grid=grid2, return_train_score=True)
gridcv2.fit(X_train, y_train)
print(gridcv2.best_score_)
print(gridcv2.best_estimator_.C)
print(gridcv2.best_estimator_.tol)

0.9921813917122753
1.0
0.1


### Train model

In [None]:
# Keep track of how long it takes to fit/train model
start_time2 = time.time()

# Create model with best estimators from grid search
clf2 = LinearSVC(C=gridcv2.best_estimator_.C, 
                 tol=gridcv2.best_estimator_.tol,
                 random_state=0)
clf2.fit(X_train, y_train)

time2 = str(round(time.time() - start_time2, 2))
print(time2 + " seconds")

# Perform k-fold cross-validation on training data to test accuracy of the model
cross_val_score(clf2, X_train, y_train, cv=10).mean()

1.02 seconds


## Model 3: Random Forest

### Tune hyperparameters

In [14]:
# Random Forest
# Tuning for max_features

from sklearn.ensemble import RandomForestClassifier
grid1 = [{'max_features' : ['auto', 'sqrt', 'log2', 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95]}]

model1 = RandomForestClassifier()
gridcv1 = GridSearchCV(estimator=model1, param_grid=grid1, return_train_score=True)
gridcv1.fit(X_train, y_train)
print(gridcv1.best_score_)
print(gridcv1.best_estimator_.max_features)


0.9728954912692207
0.4


### Train model

In [15]:
# Keep track of how long it takes to fit/train model
start_time3 = time.time()

# Create model with best estimators from grid search
clf3 = RandomForestClassifier(max_features = gridcv1.best_estimator_.max_features, 
                              random_state=0)
clf3.fit(X_train, y_train)

time3 = str(round(time.time() - start_time3, 2))

print(time3 + " seconds")

# Perform k-fold cross-validation on training data to test accuracy of the model
cross_val_score(clf3, X_train, y_train, cv=10).mean()

3.62 seconds


0.9726304123150566

## Model 4: Lasso Regularized Logistic Regression

### Train model

In [16]:
# Lasso
# Tuning C, the inverse of the regularization parameter
from sklearn import linear_model

grid4 = [{'C' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10,100,1000,10000]}]

model4 = linear_model.LogisticRegression()
gridcv4 = GridSearchCV(estimator=model4, param_grid=grid4, return_train_score=True)
gridcv4.fit(X_train, y_train)
print(gridcv4.best_score_)
print(gridcv4.best_estimator_.C)

0.9913995308835027
1000


### Train model

In [17]:
# Keep track of how long it takes to fit/train model
start_time4 = time.time()

# Create model with best estimators from grid search
clf4 = linear_model.LogisticRegression(C = gridcv4.best_estimator_.C, random_state=0)
clf4.fit(X_train, y_train)

time4 = str(round(time.time() - start_time4, 2))

print(time4 + " seconds")

# Perform k-fold cross-validation on training data to test accuracy of the model
cross_val_score(clf4, X_train, y_train, cv=10).mean()

0.16 seconds


0.994266073759791

## Run models on test data

In [18]:
"""
Function
--------
compute_rmse

Given two arrays, one of actual values and one of predicted values,
compute the Roote Mean Squared Error

Parameters
----------
predictions : array
    Array of numerical values corresponding to predictions for each of the N observations

yvalues : array
    Array of numerical values corresponding to the actual values for each of the N observations

Returns
-------
rmse : int
    Root Mean Squared Error of the prediction

Example
-------
>>> print compute_rmse((2,2,3),(0,2,6)
2.08
"""
def compute_rmse(predictions, yvalues):
    sigma = np.sum(np.square(np.array(predictions)-np.array(yvalues)))
    rmse = np.sqrt(sigma/len (predictions))
    return rmse

In [19]:
# Predict on test data using Naive Bayes, calculate accuracy using RMSE
clf1_pred = clf1.predict(X_test)
compute_rmse(clf1_pred, y_test)

0.16097271743639588

In [20]:
# Predict on test data using Support Vector Machine, calculate accuracy using RMSE
clf2_pred = clf2.predict(X_test)
compute_rmse(clf2_pred, y_test)

0.10284169677447053

In [21]:
# Predict on test data using Random Forest, calculate accuracy using RMSE
clf3_pred = clf3.predict(X_test)
compute_rmse(clf3_pred, y_test)

0.19376860378775274

In [22]:
# Predict on test data using Lasso, calculate accuracy using RMSE
clf4_pred = clf4.predict(X_test)
compute_rmse(clf4_pred, y_test)

0.09481535954756656

## Compare performance

ROC/AUC, confusion matrix, precision vs. recall, speed


In [24]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Specificity, Sensitivity
def specificity(y_test, clf_pred):
    spec = np.sum((y_test == 0) & (clf_pred == 0)) / (np.sum((y_test == 0) & (clf_pred == 0)) + np.sum((y_test == 0) & (clf_pred == 1)))
    return spec

def sensitivity(y_test, clf_pred):
    sens = np.sum((y_test == 1) & (clf_pred == 1)) / (np.sum((y_test == 1) & (clf_pred == 1)) + np.sum((y_test == 1) & (clf_pred == 0)))
    return sens

print ('Naive Bayes: Specificity vs Sensitivity')
print ('Specificity: ' + str(specificity(y_test, clf1_pred)))
print ('Sensitivity: ' + str(sensitivity(y_test, clf1_pred)))
print('')

print ('Support Vector Machine: Specificity vs Sensitivity')
print ('Specificity: ' + str(specificity(y_test, clf2_pred)))
print ('Sensitivity: ' + str(sensitivity(y_test, clf2_pred)))
print('')

print ('Random Forest: Specificity vs Sensitivity')
print ('Specificity: ' + str(specificity(y_test, clf3_pred)))
print ('Sensitivity: ' + str(sensitivity(y_test, clf3_pred)))
print('')

print ('Lasso Regularized Logistic Regression: Specificity vs Sensitivity')
print ('Specificity: ' + str(specificity(y_test, clf4_pred)))
print ('Sensitivity: ' + str(sensitivity(y_test, clf4_pred)))
print('')

# Time
print ('Naive Bayes: Time to Train')
print(time1 + " seconds")
print()
print ('Support Vector Machine: Time to Train')
print(time2 + " seconds")
print()
print ('Random Forest: Time to Train')
print(time3 + " seconds")
print()
print ('Lasso Regularized Logistic Regression: Time to Train')
print(time4 + " seconds")
print()

# Confusion matrix
print ('Naive Bayes: Confusion Matrix')
print(confusion_matrix(y_test, clf1_pred))
print()

print ('Support Vector Machine: Confusion Matrix')
print(confusion_matrix(y_test, clf2_pred))
print()

print ('Random Forest: Confusion Matrix')
print(confusion_matrix(y_test, clf3_pred))
print()

print ('Lasso Regularized Logistic Regression: Confusion Matrix')
print(confusion_matrix(y_test, clf4_pred))
print()

# ROC/AUC
print ('Naive Bayes: ROC/AUC')
fpr1, tpr1, thresholds1 = metrics.roc_curve(y_test, clf1_pred)
print('False Positive Rate: ' + str(fpr1))
print('True Positive Rate: ' + str(tpr1))
print('Thresholds: '+ str(thresholds1))
print('AUC: ' + str(roc_auc_score(y_test, clf1_pred)))
print()

print ('Support Vector Machine: ROC/AUC')
fpr2, tpr2, thresholds2 = metrics.roc_curve(y_test, clf2_pred)
print('False Positive Rate: ' + str(fpr2))
print('True Positive Rate: ' + str(tpr2))
print('Thresholds: '+ str(thresholds2))
print('AUC: ' + str(roc_auc_score(y_test, clf2_pred)))
print()

print ('Random Forest: ROC/AUC')
fpr3, tpr3, thresholds3 = metrics.roc_curve(y_test, clf3_pred)
print('False Positive Rate: ' + str(fpr3))
print('True Positive Rate: ' + str(tpr3))
print('Thresholds: '+ str(thresholds3))
print('AUC: ' + str(roc_auc_score(y_test, clf3_pred)))
print()

print ('Lasso Regularized Logistic Regression: Confusion Matrix: ROC/AUC')
fpr4, tpr4, thresholds4 = metrics.roc_curve(y_test, clf4_pred)
print('False Positive Rate: ' + str(fpr4))
print('True Positive Rate: ' + str(tpr4))
print('Thresholds: '+ str(thresholds4))
print('AUC: ' + str(roc_auc_score(y_test, clf4_pred)))
print()



Naive Bayes: Specificity vs Sensitivity
Specificity: 0.9965588437715073
Sensitivity: 0.8995433789954338

Support Vector Machine: Specificity vs Sensitivity
Specificity: 0.9965588437715073
Sensitivity: 0.9657534246575342

Random Forest: Specificity vs Sensitivity
Specificity: 0.9683413626978665
Sensitivity: 0.9429223744292238

Lasso Regularized Logistic Regression: Specificity vs Sensitivity
Specificity: 0.9965588437715073
Sensitivity: 0.9726027397260274

Naive Bayes: Time to Train
0.02 seconds

Support Vector Machine: Time to Train
0.72 seconds

Random Forest: Time to Train
3.62 seconds

Lasso Regularized Logistic Regression: Time to Train
0.16 seconds

Naive Bayes: Confusion Matrix
[[1448    5]
 [  44  394]]

Support Vector Machine: Confusion Matrix
[[1448    5]
 [  15  423]]

Random Forest: Confusion Matrix
[[1407   46]
 [  25  413]]

Lasso Regularized Logistic Regression: Confusion Matrix
[[1448    5]
 [  12  426]]

Naive Bayes: ROC/AUC
False Positive Rate: [0.         0.00344116 1.