### Random Forest Classifier and Gradient Boosting for Spam Detection

### Import Libraries

In [12]:
import nltk
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score,GridSearchCV
from sklearn.model_selection import train_test_split
import time
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()


### Read Data

In [3]:

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']


### Create New Features

In [4]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

In [5]:
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

###  Clean Data

In [6]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Vectorize Data

In [7]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])


In [8]:
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_tfidf_feat.head()
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray())], axis=1)
X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier()


### Explore RandomForestClassifier through Cross-Validation

In [10]:
rf = RandomForestClassifier(n_jobs = -1) #process to run in parallel n_jobs
k_fold = KFold(n_splits=5)
cross_val_score(rf,X_tfidf_feat,data['label'],cv = k_fold, scoring = 'accuracy', n_jobs = -1)

array([0.97755835, 0.97935368, 0.97753819, 0.96675651, 0.97394429])

### Explore RandomForestClassifier through Holdout Set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)

In [None]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [58]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[(0.05282243552076597, 'body_len'),
 (0.04507915670361323, 2031),
 (0.03449488572159723, 7350),
 (0.02735674823317056, 5724),
 (0.026792970564506732, 1803),
 (0.024913273508426596, 3134),
 (0.023583002887695912, 6285),
 (0.020926868241438922, 6746),
 (0.0181195290893347, 690),
 (0.014729427715585653, 5988)]

Body length is the most important feature

In [59]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

In [60]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 1.0 / Recall: 0.579 / Accuracy: 0.947


### Build our own Grid-search

In [61]:
 def train_RF(n_est,depth):
        rf = RandomForestClassifier(n_estimators = n_est, max_depth = depth, n_jobs = -1)
        rf_model = rf.fit(X_train,y_train)
        y_pred = rf_model.predict(X_test)
        precision, recall, fscore, support = score(y_test,y_pred, pos_label ='spam', average = 'binary')
        print('Est:{} / Depth:{}-----Precision:{}/Recall:{}/Accuracy:{}'.format(n_est,depth,round(precision,3),round(recall,3),round((y_pred == y_test).sum()/len(y_pred),3)))        

In [62]:
for n_est in [ 10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est,depth)

Est:10 / Depth:10-----Precision:1.0/Recall:0.264/Accuracy:0.908
Est:10 / Depth:20-----Precision:1.0/Recall:0.614/Accuracy:0.952
Est:10 / Depth:30-----Precision:0.978/Recall:0.643/Accuracy:0.953
Est:10 / Depth:None-----Precision:1.0/Recall:0.8/Accuracy:0.975
Est:50 / Depth:10-----Precision:1.0/Recall:0.25/Accuracy:0.906
Est:50 / Depth:20-----Precision:1.0/Recall:0.643/Accuracy:0.955
Est:50 / Depth:30-----Precision:1.0/Recall:0.721/Accuracy:0.965
Est:50 / Depth:None-----Precision:1.0/Recall:0.8/Accuracy:0.975
Est:100 / Depth:10-----Precision:1.0/Recall:0.193/Accuracy:0.899
Est:100 / Depth:20-----Precision:1.0/Recall:0.571/Accuracy:0.946
Est:100 / Depth:30-----Precision:1.0/Recall:0.736/Accuracy:0.967
Est:100 / Depth:None-----Precision:1.0/Recall:0.821/Accuracy:0.978


**Grid-search:** Exhaustively search all parameter combinations in a given grid to determine the best model.

**Cross-validation:** Divide a dataset into k subsets and repeat the holdout method k times where a different subset is used as the holdout set in each iteration.

### Exploring parameter settings using GridSearchCV

In [64]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)

gs_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,51.088594,0.136203,0.489423,0.02218,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.979354,0.978456,0.973944,0.969452,0.973046,0.97485,0.003647,1
10,30.78618,3.158017,0.411723,0.111499,,150,"{'max_depth': None, 'n_estimators': 150}",0.974865,0.975763,0.97664,0.969452,0.971249,0.973594,0.002766,2
7,26.878921,0.461756,0.357425,0.020511,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978456,0.976661,0.973944,0.965858,0.973046,0.973593,0.00432,3
11,60.963019,9.769666,0.560169,0.222967,,300,"{'max_depth': None, 'n_estimators': 300}",0.978456,0.977558,0.973944,0.967655,0.97035,0.973593,0.00413,4
3,2.808466,0.079664,0.248321,0.039679,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.97307,0.972172,0.969452,0.964061,0.977538,0.971259,0.004441,5


In [65]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,57.30887,6.182443,0.476446,0.099013,,300,"{'max_depth': None, 'n_estimators': 300}",0.979354,0.97307,0.973944,0.966757,0.972147,0.973054,0.004024,1
10,30.922411,0.762327,0.383424,0.045426,,150,"{'max_depth': None, 'n_estimators': 150}",0.977558,0.975763,0.974843,0.966757,0.969452,0.972875,0.004082,2
7,27.873789,0.236803,0.34816,0.018274,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978456,0.97307,0.973046,0.967655,0.969452,0.972336,0.003707,3
8,54.005328,0.861005,0.518463,0.036777,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977558,0.974865,0.973046,0.966757,0.969452,0.972336,0.003837,4
6,3.245726,0.059328,0.213365,0.013155,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.973968,0.972172,0.974843,0.963163,0.973046,0.971438,0.004233,5


### Explore GradientBoostingClassifier Attributes & Hyperparameters

In [67]:
print(dir(GradientBoostingClassifier))
print(GradientBoostingClassifier())

['_SUPPORTED_LOSS', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_initialized', '_check_n_features', '_check_params', '_clear_state', '_compute_partial_dependence_recursion', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_more_tags', '_raw_predict', '_raw_predict_init', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_data', '_validate_estimator', '_validate_y', '_warn_mae_for_criterion', 'apply

### Build our own Grid-search

In [69]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))

In [None]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.874
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.894 / Recall: 0.721 / Accuracy: 0.954
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.87 / Recall: 0.764 / Accuracy: 0.956


  _warn_prf(average, modifier, msg_start, len(result))


Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.874
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.856 / Recall: 0.807 / Accuracy: 0.959
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.835 / Recall: 0.793 / Accuracy: 0.954
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.007 / Accuracy: 0.875
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.853 / Recall: 0.829 / Accuracy: 0.961
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.879 / Recall: 0.829 / Accuracy: 0.964
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 1.0 / Recall: 0.014 / Accuracy: 0.876
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.854 / Recall: 0.836 / Accuracy: 0.961
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.866 / Recall: 0.829 / Accuracy: 0.962
Est: 100 / Depth: 3 / LR: 0.01 ---- Precision: 0.908 / Recall: 0.493 / Accuracy: 0.93
Est: 100 / Depth: 3 / LR: 0.1 ---- Precision: 0.908 / Recall: 0.771 / Accuracy: 0.961
Est: 100 / Depth: 3 / LR: 1 ---- Precision: 0.845 / Recall: 0.779 / 

### Exploring parameter settings using GridSearchCV

In [5]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,368.558762,0.308644,0.969643,1.0,0.1,11,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",1,0.965919,...,0.966757,1.0,0.968553,1.0,0.969452,1.0,7.128055,0.030754,0.004141,0.0
2,214.185205,0.303001,0.969283,1.0,0.1,7,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",2,0.965919,...,0.968553,1.0,0.96496,1.0,0.967655,1.0,2.241176,0.043978,0.005181,0.0
8,381.475534,0.206334,0.968744,1.0,0.1,15,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",3,0.965022,...,0.969452,1.0,0.965858,1.0,0.967655,1.0,44.316129,0.036329,0.003816,0.0
1,157.876316,0.351555,0.968205,1.0,0.1,7,100,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",4,0.965919,...,0.968553,1.0,0.96496,1.0,0.965858,1.0,3.158135,0.187909,0.003954,0.0
7,353.960683,0.258787,0.968205,1.0,0.1,15,100,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",4,0.964126,...,0.969452,1.0,0.966757,1.0,0.967655,1.0,10.086312,0.056744,0.002968,0.0


In [7]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [50, 100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_count_feat, data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,341.866984,0.335839,0.969463,1.0,0.1,11,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",1,0.965022,...,0.97035,1.0,0.963163,1.0,0.97035,1.0,3.450213,0.048011,0.00532,0.0
7,335.012637,0.264707,0.969283,1.0,0.1,15,100,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",2,0.965022,...,0.967655,1.0,0.96496,1.0,0.972147,1.0,7.205083,0.027251,0.004513,0.0
2,213.266939,0.279388,0.968385,0.999955,0.1,7,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",3,0.965919,...,0.968553,1.0,0.960467,1.0,0.966757,0.999775,2.651567,0.022217,0.006508,9e-05
8,356.912155,0.183561,0.968385,1.0,0.1,15,150,"{'n_estimators': 150, 'learning_rate': 0.1, 'm...",3,0.962332,...,0.967655,1.0,0.963163,1.0,0.971249,1.0,45.305546,0.031481,0.005594,0.0
4,241.101819,0.263556,0.968205,1.0,0.1,11,100,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",5,0.963229,...,0.97035,1.0,0.961366,1.0,0.968553,1.0,3.736889,0.026132,0.005716,0.0


### Final evaluation of models

In [14]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 7.417 / Predict time: 0.234 ---- Precision: 1.0 / Recall: 0.857 / Accuracy: 0.982


In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))