# Problem Set 4: Build your own spam filter

In [1]:
import pandas as pd
import numpy as np
import string
import re
import sklearn

## Pre-processing

In [2]:
# Clean text data
# Create word dictionary
# Extract features
# Train classifier

# ideas:
# use tf-idf to weight features

### Cleaning emails

In [53]:
emails = pd.read_csv("emails.csv")

In [54]:
# use smaller dataset to play with
emailsplit = emails.copy()
# emailsplit = emailsplit.loc[0:200]

In [55]:
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# words are already lemmatized, so we just remove punctuation
emailsplit['text'] = emailsplit['text'].apply(lambda x: re.sub(r'[^\w\s]','',x)) # remove punctuation with regex

# create cached object of stopwords to improve speed
cachedStopwords = set(stopwords.words('english'))

# remove stop words
emailsplit['text'] = emailsplit.apply(lambda x: [item for item in x if item not in cachedStopwords])
emailsplit['text'] = emailsplit['text'].str[8:] # remove "subject" from beginning of each email

# remove numbers (alternatively, could convert to a single constant like a punctuation symbol to preserve some info)
# emailsplit['text'] = emailsplit['text'].apply(lambda x: [item for item in x if item not in set())])

#========================================
# NOTES:
# create one preprocessing function where we can choose what processing to do. Easier to check how it affects results.
# does feature_extraction already do the above? may be redundant

### Create bag of words and tf-idf sparse matrix

In [56]:
# create bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = [] # list of email content, where each item in list is an email
for i in range(len(emailsplit['text'])):
    corpus.append(emailsplit.loc[i]['text'])
    
vectorizer = CountVectorizer() # create vectorizer

wordmatrix = vectorizer.fit_transform(corpus) # sparse matrix where each value ij is how many times the word j occurs in email i

In [57]:
# TF-IDF (term frequency-inverse document frequency), a weighting scheme for words.
# The weight increases when a word occurs many times in a small number of documents, leading to increased discriminatory power of the word. 
# The weight decreases when the word occurrs infrequently, or occurs in a large number of documents. 

tfidf_transformer = TfidfTransformer().fit(wordmatrix) # create transformer based on vocab in train set
tfidf = tfidf_transformer.transform(wordmatrix) # calculate tf-idf of each word

## Train Model 1: Naive Bayes

## Train Model 2: Decision Trees

In [82]:
from sklearn import tree
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, emailsplit['spam'], test_size=0.33)

clf1 = tree.DecisionTreeClassifier()
clf1 = clf1.fit(X_train, y_train)

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.9582958797258259

## Train Model 3: Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf, emailsplit['spam'], test_size=0.33)

clf2 = RandomForestClassifier(max_depth=2, random_state=0)
clf2 = clf2.fit(X_train, y_train)

# cross validation
from sklearn.cross_validation import cross_val_score
cross_val_score(clf1, X_train, y_train, cv=10).mean()

0.946052273716275

### Tuning hyperparameters

In [81]:
# Random Forest
# tuning for max_depth

# Create grid search
# def make_grid_search(classifier, grid, train_features, train_outcome, test_features, test_outcome):
#     from sklearn.pipeline import make_pipeline
#     from sklearn.preprocessing import MinMaxScaler
#     scaler = MinMaxScaler()
#     param_grid = grid
#     model = classifier
#     pipe = make_pipeline(scaler, model)
    
#     from sklearn.model_selection import GridSearchCV
#     grid = GridSearchCV(pipe, param_grid)
#     search_model = grid.fit(train_features, train_outcome)
#     return search_model
from sklearn.model_selection import GridSearchCV

# n_estimators = np.array(np.arange(50,70,1)) # the optimal parameters always seem to be the bigger ones
# model = RandomForestClassifier()
# grid = GridSearchCV(estimator=model, param_grid=dict(n_estimators=n_estimators), return_train_score=True)
# grid.fit(X_train, y_train)
# print(grid.best_score_)
# print(grid.best_estimator_.n_estimators)

# SVM
# Tuning

0.9465728433672139
63


#### Performance Metrics

ROC/AUC, confusion matrix, precision vs. recall



array([[0, 1, 2, 3, 4, 5, 6]])

### Creating extra variables

In [None]:
# summary stats .. any difference in length, word diversity, number of punctuation symbols?

# might not even need these ... instead of counting dollar signs, just doing word/punct frequency in naive bayes might be more effective
emails_copy = pd.read_csv("emails.csv")

# punctuation count
count_punct = lambda l1: sum([1 for x in l1 if x in set(string.punctuation)])
emails_copy = emails_copy.assign(punct = emails_copy['text'].apply(count_punct))

# message length
emails_copy = emails_copy.assign(length = emails_copy['text'].apply(len))

# count dollar sign punctuation
count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
emails_copy = emails_copy.assign(dollar = emails_copy['text'].apply(count_dollar))

In [None]:
count_dollar = lambda l1: sum([1 for x in l1 if x in set('$')])
emails_copy = emails_copy.assign(dollar = emails_copy['text'].apply(count_dollar))

In [2]:
for i in range(0,6579):
    print(i*1.1)

0.0
1.1
2.2
3.3000000000000003
4.4
5.5
6.6000000000000005
7.700000000000001
8.8
9.9
11.0
12.100000000000001
13.200000000000001
14.3
15.400000000000002
16.5
17.6
18.700000000000003
19.8
20.900000000000002
22.0
23.1
24.200000000000003
25.3
26.400000000000002
27.500000000000004
28.6
29.700000000000003
30.800000000000004
31.900000000000002
33.0
34.1
35.2
36.300000000000004
37.400000000000006
38.5
39.6
40.7
41.800000000000004
42.900000000000006
44.0
45.1
46.2
47.300000000000004
48.400000000000006
49.50000000000001
50.6
51.7
52.800000000000004
53.900000000000006
55.00000000000001
56.1
57.2
58.300000000000004
59.400000000000006
60.50000000000001
61.60000000000001
62.7
63.800000000000004
64.9
66.0
67.10000000000001
68.2
69.30000000000001
70.4
71.5
72.60000000000001
73.7
74.80000000000001
75.9
77.0
78.10000000000001
79.2
80.30000000000001
81.4
82.5
83.60000000000001
84.7
85.80000000000001
86.9
88.0
89.10000000000001
90.2
91.30000000000001
92.4
93.50000000000001
94.60000000000001
95.7
96.8000000

1074.7
1075.8000000000002
1076.9
1078.0
1079.1000000000001
1080.2
1081.3000000000002
1082.4
1083.5
1084.6000000000001
1085.7
1086.8000000000002
1087.9
1089.0
1090.1000000000001
1091.2
1092.3000000000002
1093.4
1094.5
1095.6000000000001
1096.7
1097.8000000000002
1098.9
1100.0
1101.1000000000001
1102.2
1103.3000000000002
1104.4
1105.5
1106.6000000000001
1107.7
1108.8000000000002
1109.9
1111.0
1112.1000000000001
1113.2
1114.3000000000002
1115.4
1116.5
1117.6000000000001
1118.7
1119.8000000000002
1120.9
1122.0
1123.1000000000001
1124.2
1125.3000000000002
1126.4
1127.5
1128.6000000000001
1129.7
1130.8000000000002
1131.9
1133.0
1134.1000000000001
1135.2
1136.3000000000002
1137.4
1138.5
1139.6000000000001
1140.7
1141.8000000000002
1142.9
1144.0
1145.1000000000001
1146.2
1147.3000000000002
1148.4
1149.5
1150.6000000000001
1151.7
1152.8000000000002
1153.9
1155.0
1156.1000000000001
1157.2
1158.3000000000002
1159.4
1160.5
1161.6000000000001
1162.7
1163.8000000000002
1164.9
1166.0
1167.10000000000

2174.7000000000003
2175.8
2176.9
2178.0
2179.1000000000004
2180.2000000000003
2181.3
2182.4
2183.5
2184.6000000000004
2185.7000000000003
2186.8
2187.9
2189.0
2190.1000000000004
2191.2000000000003
2192.3
2193.4
2194.5
2195.6000000000004
2196.7000000000003
2197.8
2198.9
2200.0
2201.1000000000004
2202.2000000000003
2203.3
2204.4
2205.5
2206.6000000000004
2207.7000000000003
2208.8
2209.9
2211.0
2212.1000000000004
2213.2000000000003
2214.3
2215.4
2216.5
2217.6000000000004
2218.7000000000003
2219.8
2220.9
2222.0
2223.1000000000004
2224.2000000000003
2225.3
2226.4
2227.5
2228.6000000000004
2229.7000000000003
2230.8
2231.9
2233.0
2234.1000000000004
2235.2000000000003
2236.3
2237.4
2238.5
2239.6000000000004
2240.7000000000003
2241.8
2242.9
2244.0
2245.1000000000004
2246.2000000000003
2247.3
2248.4
2249.5
2250.6000000000004
2251.7000000000003
2252.8
2253.9
2255.0
2256.1000000000004
2257.2000000000003
2258.3
2259.4
2260.5
2261.6000000000004
2262.7000000000003
2263.8
2264.9
2266.0
2267.10000000000

3273.6000000000004
3274.7000000000003
3275.8
3276.9
3278.0000000000005
3279.1000000000004
3280.2000000000003
3281.3
3282.4
3283.5000000000005
3284.6000000000004
3285.7000000000003
3286.8
3287.9
3289.0000000000005
3290.1000000000004
3291.2000000000003
3292.3
3293.4
3294.5000000000005
3295.6000000000004
3296.7000000000003
3297.8
3298.9
3300.0000000000005
3301.1000000000004
3302.2000000000003
3303.3
3304.4
3305.5000000000005
3306.6000000000004
3307.7000000000003
3308.8
3309.9
3311.0000000000005
3312.1000000000004
3313.2000000000003
3314.3
3315.4
3316.5000000000005
3317.6000000000004
3318.7000000000003
3319.8
3320.9
3322.0000000000005
3323.1000000000004
3324.2000000000003
3325.3
3326.4
3327.5000000000005
3328.6000000000004
3329.7000000000003
3330.8
3331.9
3333.0000000000005
3334.1000000000004
3335.2000000000003
3336.3
3337.4
3338.5000000000005
3339.6000000000004
3340.7000000000003
3341.8
3342.9
3344.0000000000005
3345.1000000000004
3346.2000000000003
3347.3
3348.4
3349.5000000000005
3350.6

4373.6
4374.700000000001
4375.8
4376.900000000001
4378.0
4379.1
4380.200000000001
4381.3
4382.400000000001
4383.5
4384.6
4385.700000000001
4386.8
4387.900000000001
4389.0
4390.1
4391.200000000001
4392.3
4393.400000000001
4394.5
4395.6
4396.700000000001
4397.8
4398.900000000001
4400.0
4401.1
4402.200000000001
4403.3
4404.400000000001
4405.5
4406.6
4407.700000000001
4408.8
4409.900000000001
4411.0
4412.1
4413.200000000001
4414.3
4415.400000000001
4416.5
4417.6
4418.700000000001
4419.8
4420.900000000001
4422.0
4423.1
4424.200000000001
4425.3
4426.400000000001
4427.5
4428.6
4429.700000000001
4430.8
4431.900000000001
4433.0
4434.1
4435.200000000001
4436.3
4437.400000000001
4438.5
4439.6
4440.700000000001
4441.8
4442.900000000001
4444.0
4445.1
4446.200000000001
4447.3
4448.400000000001
4449.5
4450.6
4451.700000000001
4452.8
4453.900000000001
4455.0
4456.1
4457.200000000001
4458.3
4459.400000000001
4460.5
4461.6
4462.700000000001
4463.8
4464.900000000001
4466.0
4467.1
4468.200000000001
4469.3

5472.5
5473.6
5474.700000000001
5475.8
5476.900000000001
5478.0
5479.1
5480.200000000001
5481.3
5482.400000000001
5483.5
5484.6
5485.700000000001
5486.8
5487.900000000001
5489.0
5490.1
5491.200000000001
5492.3
5493.400000000001
5494.5
5495.6
5496.700000000001
5497.8
5498.900000000001
5500.0
5501.1
5502.200000000001
5503.3
5504.400000000001
5505.5
5506.6
5507.700000000001
5508.8
5509.900000000001
5511.0
5512.1
5513.200000000001
5514.3
5515.400000000001
5516.5
5517.6
5518.700000000001
5519.8
5520.900000000001
5522.0
5523.1
5524.200000000001
5525.3
5526.400000000001
5527.5
5528.6
5529.700000000001
5530.8
5531.900000000001
5533.0
5534.1
5535.200000000001
5536.3
5537.400000000001
5538.5
5539.6
5540.700000000001
5541.8
5542.900000000001
5544.0
5545.1
5546.200000000001
5547.3
5548.400000000001
5549.5
5550.6
5551.700000000001
5552.8
5553.900000000001
5555.0
5556.1
5557.200000000001
5558.3
5559.400000000001
5560.5
5561.6
5562.700000000001
5563.8
5564.900000000001
5566.0
5567.1
5568.200000000001

6572.500000000001
6573.6
6574.700000000001
6575.8
6576.900000000001
6578.000000000001
6579.1
6580.200000000001
6581.3
6582.400000000001
6583.500000000001
6584.6
6585.700000000001
6586.8
6587.900000000001
6589.000000000001
6590.1
6591.200000000001
6592.3
6593.400000000001
6594.500000000001
6595.6
6596.700000000001
6597.8
6598.900000000001
6600.000000000001
6601.1
6602.200000000001
6603.3
6604.400000000001
6605.500000000001
6606.6
6607.700000000001
6608.8
6609.900000000001
6611.000000000001
6612.1
6613.200000000001
6614.3
6615.400000000001
6616.500000000001
6617.6
6618.700000000001
6619.8
6620.900000000001
6622.000000000001
6623.1
6624.200000000001
6625.3
6626.400000000001
6627.500000000001
6628.6
6629.700000000001
6630.8
6631.900000000001
6633.000000000001
6634.1
6635.200000000001
6636.3
6637.400000000001
6638.500000000001
6639.6
6640.700000000001
6641.8
6642.900000000001
6644.000000000001
6645.1
6646.200000000001
6647.3
6648.400000000001
6649.500000000001
6650.6
6651.700000000001
6652.