# Problem Set 4: Build your own spam filter

In [1]:
import pandas as pd
import numpy as np
import string
import re
import sklearn

## Pre-processing

In [2]:
# Clean text data
# Create word dictionary
# Extract features
# Train classifier

# ideas:
# use tf-idf to weight features

### Cleaning emails

In [53]:
emails = pd.read_csv("emails.csv")

In [54]:
# use smaller dataset to play with
emailsplit = emails.copy()
# emailsplit = emailsplit.loc[0:200]

In [55]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# words are already lemmatized, so we just remove punctuation
emailsplit['text'] = emailsplit['text'].apply(lambda x: re.sub(r'[^\w\s]','',x)) # remove punctuation with regex

# create cached object of stopwords to improve speed
cachedStopwords = set(stopwords.words('english'))

# remove stop words
emailsplit['text'] = emailsplit.apply(lambda x: [item for item in x if item not in cachedStopwords])
emailsplit['text'] = emailsplit['text'].str[8:] # remove "subject" from beginning of each email

# remove numbers (alternatively, could convert to a single constant like a punctuation symbol to preserve some info)
# emailsplit['text'] = emailsplit['text'].apply(lambda x: [item for item in x if item not in set())])

#========================================
# NOTES:
# create one preprocessing function where we can choose what processing to do. Easier to check how it affects results.
# does feature_extraction already do the above? may be redundant

### Create bag of words and tf-idf sparse matrix

In [56]:
# create bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = [] # list of email content, where each item in list is an email
for i in range(len(emailsplit['text'])):
    corpus.append(emailsplit.loc[i]['text'])
    
vectorizer = CountVectorizer() # create vectorizer

wordmatrix = vectorizer.fit_transform(corpus) # sparse matrix where each value ij is how many times the word j occurs in email i

In [57]:
# TF-IDF (term frequency-inverse document frequency), a weighting scheme for words.
# The weight increases when a word occurs many times in a small number of documents, leading to increased discriminatory power of the word. 
# The weight decreases when the word occurrs infrequently, or occurs in a large number of documents. 

tfidf_transformer = TfidfTransformer().fit(wordmatrix) # create transformer based on vocab in train set
tfidf = tfidf_transformer.transform(wordmatrix) # calculate tf-idf of each word

## Train Model 1: Naive Bayes

## Train Model 2: Decision Trees

In [82]:
from sklearn import tree
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, emailsplit['spam'], test_size=0.33)

clf1 = tree.DecisionTreeClassifier()
clf1 = clf1.fit(X_train, y_train)

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.9582958797258259

## Train Model 3: Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, emailsplit['spam'], test_size=0.33)

clf2 = RandomForestClassifier(max_depth=2, random_state=0)
clf2 = clf2.fit(X_train, y_train)

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.946052273716275

### Tuning hyperparameters

In [81]:
# Random Forest
# tuning for max_depth

# Create grid search
# def make_grid_search(classifier, grid, train_features, train_outcome, test_features, test_outcome):
#     from sklearn.pipeline import make_pipeline
#     from sklearn.preprocessing import MinMaxScaler
#     scaler = MinMaxScaler()
#     param_grid = grid
#     model = classifier
#     pipe = make_pipeline(scaler, model)
    
#     from sklearn.model_selection import GridSearchCV
#     grid = GridSearchCV(pipe, param_grid)
#     search_model = grid.fit(train_features, train_outcome)
#     return search_model
from sklearn.model_selection import GridSearchCV

# n_estimators = np.array(np.arange(50,70,1)) # the optimal parameters always seem to be the bigger ones
# model = RandomForestClassifier()
# grid = GridSearchCV(estimator=model, param_grid=dict(n_estimators=n_estimators), return_train_score=True)
# grid.fit(X_train, y_train)
# print(grid.best_score_)
# print(grid.best_estimator_.n_estimators)

# SVM
# Tuning

0.9465728433672139
63


#### Performance Metrics

ROC/AUC, confusion matrix, precision vs. recall



array([[0, 1, 2, 3, 4, 5, 6]])

### Creating extra variables

In [None]:
# summary stats .. any difference in length, word diversity, number of punctuation symbols?

# might not even need these ... instead of counting dollar signs, just doing word/punct frequency in naive bayes might be more effective
emails_copy = pd.read_csv("emails.csv")

# punctuation count
count_punct = lambda l1: sum([1 for x in l1 if x in set(string.punctuation)])
emails_copy = emails_copy.assign(punct = emails_copy['text'].apply(count_punct))

# message length
emails_copy = emails_copy.assign(length = emails_copy['text'].apply(len))

# count dollar sign punctuation
count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
emails_copy = emails_copy.assign(dollar = emails_copy['text'].apply(count_dollar))

In [None]:
count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
emails_copy = emails_copy.assign(dollar = emails_copy['text'].apply(count_dollar))