In [58]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# Read in the raw text (file downloaded from kaggle and renamed https://www.kaggle.com/uciml/sms-spam-collection-dataset)
rawData = open("SMSSpamCollection.csv").read()
parsedData = rawData.replace("ham,", "ham\t")
parsedData = parsedData.replace("spam,", "spam\t")
parsedData = parsedData.replace("\t", "\n").split("\n")
parsedData = parsedData[1:]
labelList = parsedData[0::2]
#labelList.pop()
textList = parsedData[1::2]
data = pd.DataFrame({'label': labelList, 'body_text': textList})
data.head()

Unnamed: 0,label,body_text
0,ham,"""Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,"U dun say so early hor... U c already then say...,,,"
4,ham,"""Nah I don't think he goes to usf, he lives around here though"",,,"


In [59]:
data['label'].unique()

array(['ham', 'spam'], dtype=object)

In [60]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))

import string

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3) * 100

data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8376,8377,8378,8379,8380,8381,8382,8383,8384,8385
0,97,14.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27,33.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,131,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42,21.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,54,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.ensemble import RandomForestClassifier
dir(RandomForestClassifier)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_oob_score',
 '_validate_X_predict',
 '_validate_data',
 '_validate_estimator',
 '_validate_y_class_weight',
 'apply',
 'decision_path',
 'feature_importances_',
 'fit',
 'get_params',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params']

In [10]:
print(RandomForestClassifier())

RandomForestClassifier()


## Explore RandomForestCLassifier through Cross-validation

In [12]:
from sklearn.model_selection import KFold, cross_val_score

In [67]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_features, data['label'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.97399103, 0.9793722 , 0.97757848, 0.96858169, 0.97486535])

## Build RandomForest model with holdout test set

In [19]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [69]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [70]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.05151704196199005, 'body_len'),
 (0.048547918204973596, 8371),
 (0.03211340039077488, 2023),
 (0.028156633428548475, 'punct%'),
 (0.028058241078234836, 3363),
 (0.02497231213611817, 5970),
 (0.022944573905466104, 7606),
 (0.01837095157149955, 7274),
 (0.017544918781514464, 5033),
 (0.016028728315289825, 7474),
 (0.015196062168334785, 6233),
 (0.014755645489067371, 3672),
 (0.014146498051238883, 2252),
 (0.013347498344495903, 8045),
 (0.012727357793451173, 695),
 (0.012690985196193029, 6529),
 (0.010781215882258667, 1115),
 (0.010298365782251755, 2316),
 (0.010094306969315322, 5361),
 (0.009592073750421648, 6991),
 (0.009231140778753942, 2392),
 (0.008548936776821298, 354),
 (0.008397489574342944, 1576),
 (0.00834121151855442, 392),
 (0.00776262547249227, 397),
 (0.007640008742715338, 7719),
 (0.007621043922187197, 7848),
 (0.007574272152727351, 2104),
 (0.007266475181581469, 6162),
 (0.0072595251224430715, 6295),
 (0.007029894214186041, 437),
 (0.006683794862710232, 696),
 (0.00660

In [71]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {}, Recall: {}, Accuracy: {}'.format(round(precision, 3), round(recall, 3), (y_pred==y_test).sum() / len(y_test)))

Precision: 1.0, Recall: 0.623, Accuracy: 0.9533632286995516


### Explore Random Forest model with grid-search
#### Build your own grid-search

In [79]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {}, Recall: {}, Accuracy: {}'.
          format(n_est, depth, round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_test), 3)))  

In [81]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 1.0, Recall: 0.348, Accuracy: 0.919
Est: 10 / Depth: 20 ---- Precision: 1.0, Recall: 0.623, Accuracy: 0.953
Est: 10 / Depth: 30 ---- Precision: 0.991, Recall: 0.768, Accuracy: 0.97
Est: 10 / Depth: None ---- Precision: 0.973, Recall: 0.797, Accuracy: 0.972
Est: 50 / Depth: 10 ---- Precision: 1.0, Recall: 0.333, Accuracy: 0.917
Est: 50 / Depth: 20 ---- Precision: 1.0, Recall: 0.667, Accuracy: 0.959
Est: 50 / Depth: 30 ---- Precision: 1.0, Recall: 0.761, Accuracy: 0.97
Est: 50 / Depth: None ---- Precision: 1.0, Recall: 0.797, Accuracy: 0.975
Est: 100 / Depth: 10 ---- Precision: 1.0, Recall: 0.29, Accuracy: 0.912
Est: 100 / Depth: 20 ---- Precision: 1.0, Recall: 0.587, Accuracy: 0.949
Est: 100 / Depth: 30 ---- Precision: 1.0, Recall: 0.739, Accuracy: 0.968
Est: 100 / Depth: None ---- Precision: 1.0, Recall: 0.862, Accuracy: 0.983


## Evaluate Random Forest model performance

In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

In [160]:
# some pre-processing
# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)

print(X_tfidf_feat)
X_count_feat.head()

      body_len  punct%    0    1    2    3    4    5    6    7  ...  8376  \
0           97    14.4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1           27    33.3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
2          131     6.9  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3           42    21.4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
4           54    13.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
...        ...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
5568       137     9.5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
5569        33    15.2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
5570        53    22.6  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
5571       103     3.9  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
5572        24    16.7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   

      8377  8378  8379  8380  8381  8382  8383  8384  8385  
0      0.0   0

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8376,8377,8378,8379,8380,8381,8382,8383,8384,8385
0,97,14.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,27,33.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,131,6.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42,21.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,54,13.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
rf = RandomForestClassifier()
param ={'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,36.978561,0.7385,0.627119,0.138091,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977578,0.977578,0.976682,0.970377,0.976661,0.975775,0.002729,1
10,38.360199,1.648807,0.505153,0.074082,,150,"{'max_depth': None, 'n_estimators': 150}",0.977578,0.974888,0.976682,0.970377,0.978456,0.975596,0.002865,2
8,71.629715,1.363582,0.750297,0.020217,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.975785,0.977578,0.969479,0.976661,0.975596,0.003188,3
11,62.179225,1.043739,0.433688,0.079459,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.975785,0.976682,0.969479,0.974865,0.974878,0.002847,4
4,32.910108,0.679701,0.584578,0.078852,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.974888,0.974888,0.977578,0.969479,0.976661,0.974699,0.002809,5


In [92]:
rf = RandomForestClassifier()
param ={'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,37.536535,0.490694,0.673007,0.168374,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978475,0.974888,0.976682,0.971275,0.975763,0.975417,0.002388,1
11,60.977668,0.884955,0.401941,0.055645,,300,"{'max_depth': None, 'n_estimators': 300}",0.976682,0.975785,0.975785,0.970377,0.974865,0.974699,0.002236,2
10,36.863991,1.20071,0.466119,0.103793,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.975785,0.974888,0.968582,0.975763,0.974699,0.003287,3
8,71.975272,1.251871,0.854194,0.045987,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.975785,0.975785,0.975785,0.969479,0.975763,0.974519,0.00252,4
3,5.465225,0.296444,0.326072,0.034061,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.977578,0.973094,0.969507,0.97307,0.97307,0.973264,0.002563,5


## Introduction to Gradient boosting

### Explore GradientBoostingClassifier

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

In [163]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {}, Recall: {}, Accuracy: {}'.
          format(n_est, max_depth, lr, round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_test), 3)))  

In [None]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 1.0, Recall: 0.077, Accuracy: 0.892
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.964, Recall: 0.815, Accuracy: 0.975
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.829, Recall: 0.785, Accuracy: 0.956


  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.0, Recall: 0.0, Accuracy: 0.883
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.935, Recall: 0.892, Accuracy: 0.98
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.85, Recall: 0.831, Accuracy: 0.963
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0, Recall: 0.015, Accuracy: 0.885
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.923, Recall: 0.923, Accuracy: 0.982
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.88, Recall: 0.9, Accuracy: 0.974
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 0.667, Recall: 0.015, Accuracy: 0.884
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.929, Recall: 0.908, Accuracy: 0.981
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.901, Recall: 0.908, Accuracy: 0.978
Est: 100 / Depth: 3 / LR: 0.01 ---- Precision: 1.0, Recall: 0.485, Accuracy: 0.94
Est: 100 / Depth: 3 / LR: 0.1 ---- Precision: 0.941, Recall: 0.862, Accuracy: 0.978
Est: 100 / Depth: 3 / LR: 1 ---- Precision: 0.856, Recall: 0.823, Accuracy: 0.963
Est: 100 / De

### Evaluate Gradient Boosting with GridSearchCV

In [101]:
gb = GradientBoostingClassifier()
param ={'n_estimators': [100, 150],
        'max_depth': [7, 11, 15],
        'learning_rate': [0.1]}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,495.455216,6.217503,0.159341,0.015314,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}",0.970404,0.982063,0.974888,0.973968,0.97307,0.974878,0.003892,1
3,2134.97243,21.418881,0.28646,0.011101,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}",0.973094,0.982063,0.974888,0.971275,0.971275,0.974519,0.004003,2
1,373.022921,20.131748,0.393668,0.053233,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}",0.974888,0.976682,0.973094,0.976661,0.968582,0.973981,0.003008,3
2,1040.436149,811.846202,0.301794,0.057283,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 100}",0.970404,0.980269,0.976682,0.972172,0.970377,0.973981,0.003896,4
4,693.663395,674.37539,0.269183,0.027125,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100}",0.970404,0.982063,0.972197,0.971275,0.97307,0.973802,0.004226,5


In [102]:
gb = GradientBoostingClassifier()
param ={'n_estimators': [100, 150],
        'max_depth': [7, 11, 15],
        'learning_rate': [0.1]}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,495.324935,2.785917,0.190579,0.043517,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}",0.973991,0.979372,0.973094,0.97307,0.970377,0.973981,0.002955,1
3,2620.852999,3.585602,0.303053,0.037751,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}",0.973991,0.981166,0.9713,0.97307,0.968582,0.973622,0.004199,2
1,1711.79669,1165.469866,0.287431,0.032164,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}",0.973094,0.978475,0.970404,0.97307,0.970377,0.973084,0.002952,3
2,274.362426,1.695316,0.256189,0.007652,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 100}",0.972197,0.981166,0.973094,0.969479,0.968582,0.972904,0.004454,4
0,2569.074754,1.074256,0.322658,0.086858,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}",0.973991,0.976682,0.970404,0.971275,0.971275,0.972725,0.002317,5


# Model Selection

In [125]:
X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

## Vectorize text

In [152]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), pd.DataFrame(tfidf_test.toarray())], axis=1)

In [154]:
X_train_vect

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7348,7349,7350,7351,7352,7353,7354,7355,7356,7357
0,40,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,64,7.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,94,12.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,138,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,30,13.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4454,42,9.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4455,88,5.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4456,94,11.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Final evaluation of models

In [118]:
import time

In [156]:
rf = RandomForestClassifier(n_estimators=50, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start
start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
predict_time = end - start

precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

print('Fit time: {} / Predict_time: {} ---- Precision: {}, Recall: {}, Accuracy: {}'.
      format(round(fit_time, 3), round(predict_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_test), 3)))  

Fit time: 1.292 / Predict_time: 0.097 ---- Precision: 1.0, Recall: 0.795, Accuracy: 0.972


In [157]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start
start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
predict_time = end - start

precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

print('Fit time: {} / Predict_time: {} ---- Precision: {}, Recall: {}, Accuracy: {}'.
      format(round(fit_time, 3), round(predict_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum() / len(y_test), 3)))  

Fit time: 235.294 / Predict_time: 0.159 ---- Precision: 0.941, Recall: 0.848, Accuracy: 0.972
