# Problem Set 4: Build your own spam filter

In [89]:
import pandas as pd
import numpy as np
import string
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## Pre-processing

### Cleaning emails

In [3]:
emails = pd.read_csv("emails.csv")

In [22]:
# use smaller dataset to play with
emailsplit = emails.copy()
# emailsplit = emailsplit.loc[0:200]

In [23]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# words are already lemmatized, so we just remove punctuation
emailsplit['text'] = emailsplit['text'].apply(lambda x: re.sub(r'[^\w\s]','',x)) # remove punctuation with regex

# create cached object of stopwords to improve speed
cachedStopwords = set(stopwords.words('english'))

# remove stop words
emailsplit['text'] = emailsplit.apply(lambda x: [item for item in x if item not in cachedStopwords])
emailsplit['text'] = emailsplit['text'].str[8:] # remove "subject" from beginning of each email

# remove numbers (alternatively, could convert to a single constant like a punctuation symbol to preserve some info)
# emailsplit['text'] = emailsplit['text'].apply(lambda x: [item for item in x if item not in set())])

#========================================
# NOTES:
# create one preprocessing function where we can choose what processing to do. Easier to check how it affects results.
# does feature_extraction already do the above? may be redundant

### Create bag of words and tf-idf sparse matrix

In [24]:
# create bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = [] # list of email content, where each item in list is an email
for i in range(len(emailsplit['text'])):
    corpus.append(emailsplit.loc[i]['text'])
    
vectorizer = CountVectorizer() # create vectorizer

wordmatrix = vectorizer.fit_transform(corpus) # sparse matrix where each value ij is how many times the word j occurs in email i

In [25]:
# TF-IDF (term frequency-inverse document frequency), a weighting scheme for words.
# The weight increases when a word occurs many times in a small number of documents, leading to increased discriminatory power of the word. 
# The weight decreases when the word occurrs infrequently, or occurs in a large number of documents. 

tfidf_transformer = TfidfTransformer().fit(wordmatrix) # create transformer based on vocab in train set
tfidf = tfidf_transformer.transform(wordmatrix) # calculate tf-idf of each word

In [32]:
# # Add extra variables

# from scipy.sparse import hstack

# # # punctuation count
# # punct = set(string.punctuation)
# # count_punct = lambda l1: sum([1 for x in l1 if x in punct])
# # emailsplit = emailsplit.assign(punct = emails['text'].apply(count_punct))

# # # message length
# # emailsplit = emailsplit.assign(length = emails['text'].apply(len))

# # count dollar sign punctuation
# count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
# emailsplit = emailsplit.assign(dollar = emails['text'].apply(count_dollar))

# tfidf_ex = hstack((tfidf ,np.array(emailsplit[['dollar']])))

In [33]:
# tfidf_ex = tfidf_ex.tocsr()

In [71]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_ex, emailsplit['spam'], test_size=0.33)

## Train Model 1: Naive Bayes

### Tune hyperparameters

In [None]:
# NB
# Tuning n_estimators
alphas = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
model3 = MultinomialNB()
grid3 = GridSearchCV(estimator=model3, param_grid=dict(alpha=alphas), return_train_score=True)
grid3.fit(X_train, y_train)
print(grid3.best_score_)
print(grid3.best_estimator_.alpha)

### Train model

In [72]:
from sklearn.naive_bayes import MultinomialNB

start_time1 = time.time()

clf1 = MultinomialNB(random_state=0)
# train model
clf1.fit(X_train, y_train)

time1 = str(round(time.time() - start_time1, 2))
print(time1 + " seconds")

cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.860565370410874

## Model 2: Support Vector Machine

### Tune hyperparameters

In [110]:
# SVM
# Tuning n_estimators
grid2 = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'tol': [0.0001, 0.001, 0.01, 0.1, 1.0]}]
model2 = LinearSVC()
gridcv2 = GridSearchCV(estimator=model2, param_grid=grid2, return_train_score=True)
gridcv2.fit(X_train, y_train)
print(gridcv2.best_score_)
print(gridcv2.best_estimator_.n_estimators)

0.9900964295022153


AttributeError: 'LinearSVC' object has no attribute 'n_estimators'

### Train model

In [74]:
from sklearn.svm import LinearSVC

start_time2 = time.time()

clf2 = LinearSVC(random_state=0)
clf2.fit(X_train, y_train)

time2 = str(round(time.time() - start_time2, 2))
print(time2 + " seconds")

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf2, X_train, y_train, cv=10).mean()

0.9929639868665673

## Model 3: Random Forest

### Tune hyperparameters

In [85]:
# Random Forest
# Tuning for n_estimators
# n_estimators = np.array(np.arange(50,70,1)) # the optimal parameters always seem to be the bigger ones
# grid1 = [{'n_estimators' = [np.arange(0, 50, 1)],
#          'max_features' = ['none', 'all', np.arange(0,1,0.05)]}]

grid1 = [{'max_features' : ['auto', 'sqrt', 'log2', 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

model1 = RandomForestClassifier()
gridcv1 = GridSearchCV(estimator=model1, param_grid=grid1, return_train_score=True)
gridcv1.fit(X_train, y_train)
print(gridcv1.best_score_)
print(gridcv1.best_estimator_.max_features)

# gridcv1.best_estimator_.max_features = 0.2

0.9713317696116758
0.2


In [111]:
np.arange(0.1,1.0,0.1)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

### Train model

In [91]:
from sklearn.ensemble import RandomForestClassifier
start_time3 = time.time()
clf3 = RandomForestClassifier(max_features = gridcv1.best_estimator_.max_features, 
                              random_state=0)
clf3.fit(X_train, y_train)

time3 = str(round(time.time() - start_time3, 2))

print(time3 + " seconds")

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf3, X_train, y_train, cv=10).mean()



3.49seconds


0.9760232833923421

## Run models on test data

In [95]:
clf1_pred = clf1.fit(X_train, y_train).predict(X_test)

## Compare performance

ROC/AUC, confusion matrix, precision vs. recall, speed


In [103]:
from sklearn import metrics

# Times

# false-positive, true-positive
print(np.mean((y_test == 1) & (clf1_pred == 1)), #tpr
      np.mean((y_test == 1) & (clf1_pred == 0))) #fpr

# Precision vs. recall

# Confusion matrix

0.10999471179270227 0.13802221047065044
