In [2]:
import nltk
import pandas as pd
import re
import string

In [27]:
pd.set_option('display.max_colwidth', 100)
dataset = pd.read_csv("SMSSpamCollection.txt", sep = '\t', header = None)
dataset.columns = ['label', 'body_text']

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [28]:
dataset['body_len'] = dataset['body_text'].apply(lambda x: len(x) - x.count(" "))

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)* 100
dataset['punct%'] = dataset['body_text'].apply(lambda x:count_punct(x))

def count_capital(text):
    text_no_punct = "".join([char for char in text if char not in string.punctuation])
    count = sum([1 for char in text_no_punct if (char == char.upper() and char != " ")])
    return count
dataset['cap_count'] = dataset['body_text'].apply(lambda x: count_capital(x))

In [29]:
def clean_text(text):
    text = " ".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
    

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(analyzer = clean_text)
Term_matrix_tv = tv.fit_transform(dataset['body_text'])
Term_matrix_tv.shape


(5572, 64)

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer = clean_text)
Term_matrix_cv = cv.fit_transform(dataset['body_text'])
Term_matrix_cv.shape

(5572, 64)

In [71]:
X_features_tv = pd.concat([dataset['body_len'], dataset['punct%'], pd.DataFrame(Term_matrix_tv.toarray())], axis = 1)
X_features_cv = pd.concat([dataset['body_len'], dataset['punct%'], pd.DataFrame(Term_matrix_cv.toarray())], axis = 1)
print(X_features_tv.shape)
print(X_features_cv.shape)

(5572, 66)
(5572, 66)


In [72]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1)

In [73]:
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=5)
cross_val_score(rf, X_features, dataset['label'], cv = kfold, scoring = 'accuracy', n_jobs=-1)

array([0.98206278, 0.98834081, 0.97935368, 0.97576302, 0.98384201])

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_features, dataset['label'], test_size = 0.2)
print(y_test)
rf = RandomForestClassifier(n_estimators = 50, max_depth = 20, n_jobs = -1)
rf_model = rf.fit(X_train, y_train)

792      ham
1453     ham
4786    spam
5242     ham
2821    spam
        ... 
2943     ham
1357     ham
1700     ham
1439     ham
1779     ham
Name: label, Length: 1115, dtype: object


In [76]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse = True)[:10]

[(0.16422827583710814, 1),
 (0.1330459689760953, 2),
 (0.10266677824092713, 9),
 (0.09450560459966086, 6),
 (0.0684412337585724, 8),
 (0.03884860703649875, 10),
 (0.03408300857692229, 7),
 (0.02685981742240459, 'body_len'),
 (0.026068759857867687, 3),
 (0.02428907917757818, 13)]

y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')
print('Precison {} // Recall {} // Accuracy {}'.format(round(precision,3), 
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [77]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth = depth, n_jobs =-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')
    print('Est: {}, Depth: {} -------> Precison {} // Recall {} // Accuracy {}'.format(n_est, depth,
                                                        round(precision,3), 
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [78]:
for n_est in [10,50,100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10, Depth: 10 -------> Precison 0.942 // Recall 0.89 // Accuracy 0.976
Est: 10, Depth: 20 -------> Precison 0.965 // Recall 0.853 // Accuracy 0.974
Est: 10, Depth: 30 -------> Precison 0.986 // Recall 0.865 // Accuracy 0.978
Est: 10, Depth: None -------> Precison 0.979 // Recall 0.877 // Accuracy 0.979
Est: 50, Depth: 10 -------> Precison 0.98 // Recall 0.908 // Accuracy 0.984
Est: 50, Depth: 20 -------> Precison 0.98 // Recall 0.883 // Accuracy 0.98
Est: 50, Depth: 30 -------> Precison 0.966 // Recall 0.883 // Accuracy 0.978
Est: 50, Depth: None -------> Precison 0.98 // Recall 0.902 // Accuracy 0.983
Est: 100, Depth: 10 -------> Precison 0.98 // Recall 0.896 // Accuracy 0.982
Est: 100, Depth: 20 -------> Precison 0.986 // Recall 0.89 // Accuracy 0.982
Est: 100, Depth: 30 -------> Precison 0.98 // Recall 0.908 // Accuracy 0.984
Est: 100, Depth: None -------> Precison 0.987 // Recall 0.902 // Accuracy 0.984


In [95]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
param = {'n_estimators': [10, 100, 150, 300], 'max_depth': [30, 60, 90, None] }
gs = GridSearchCV(rf, param, cv = kfold, n_jobs = -1)
gs_fit = gs.fit(X_features_tv, dataset['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[:5]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,1.242877,0.024505,0.029122,0.000977,90,100,"{'max_depth': 90, 'n_estimators': 100}",0.979372,0.988341,0.980251,0.977558,0.982944,0.981693,0.00375,1
6,1.876383,0.074699,0.040292,0.002054,60,150,"{'max_depth': 60, 'n_estimators': 150}",0.980269,0.987444,0.980251,0.973968,0.98474,0.981334,0.004593,2
1,1.141946,0.027925,0.026729,0.000747,30,100,"{'max_depth': 30, 'n_estimators': 100}",0.980269,0.988341,0.977558,0.974865,0.985637,0.981334,0.004997,3
3,3.628608,0.094954,0.084774,0.014508,30,300,"{'max_depth': 30, 'n_estimators': 300}",0.980269,0.987444,0.978456,0.977558,0.982047,0.981155,0.003503,4
2,1.760801,0.03874,0.040891,0.002275,30,150,"{'max_depth': 30, 'n_estimators': 150}",0.980269,0.989238,0.979354,0.973968,0.982047,0.980975,0.004933,5


In [96]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
param = {'n_estimators': [10, 100, 150, 300], 'max_depth': [30, 60, 90, None] }
gs = GridSearchCV(rf, param, cv = kfold, n_jobs = -1)
gs_fit = gs.fit(X_features_cv, dataset['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,2.526449,0.041469,0.084172,0.006487,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.982063,0.988341,0.980251,0.976661,0.983842,0.982232,0.00387,1
15,1.927051,0.155492,0.054855,0.007596,,300,"{'max_depth': None, 'n_estimators': 300}",0.981166,0.989238,0.979354,0.976661,0.983842,0.982052,0.004289,2
1,0.844143,0.020878,0.031316,0.001018,30.0,100,"{'max_depth': 30, 'n_estimators': 100}",0.98296,0.987444,0.979354,0.978456,0.982047,0.982052,0.003166,3
11,2.301647,0.244001,0.073204,0.016647,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.981166,0.987444,0.980251,0.978456,0.981149,0.981693,0.00304,4
5,0.850925,0.041205,0.030718,0.001164,60.0,100,"{'max_depth': 60, 'n_estimators': 100}",0.982063,0.987444,0.980251,0.975763,0.982944,0.981693,0.003795,5
