In [1]:
import pandas as pd
import nltk
import string
import re

In [2]:
dataset = pd.read_csv('SMSSpamCollection.txt', sep = '\t', header = None)
dataset.columns = ['label', 'body_text']


In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [13]:
dataset['body_len'] = dataset['body_text'].apply(lambda x : len(x) - x.count(' '))
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)* 100
dataset['punct%'] = dataset['body_text'].apply(lambda x:count_punct(x))

def count_capital(text):
    text_no_punct = "".join([char for char in text if char not in string.punctuation])
    count = sum([1 for char in text_no_punct if (char == char.upper() and char != " ")])
    return count
dataset['cap_count'] = dataset['body_text'].apply(lambda x: count_capital(x))

In [14]:
def clean_text(text):
    text = " ".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(analyzer = clean_text)
tm_tv = tv.fit_transform(dataset['body_text'])
X_features_tv = pd.concat([dataset['body_len'], dataset['punct%'], dataset['cap_count'], pd.DataFrame(tm_tv.toarray())], axis = 1)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer = clean_text)
tm_cv = tv.fit_transform(dataset['body_text'])
X_features_cv = pd.concat([dataset['body_len'], dataset['punct%'], dataset['cap_count'], pd.DataFrame(tm_tv.toarray())], axis = 1)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()

In [18]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features_tv, dataset['label'], test_size = 0.2)

In [28]:
def trainGB(n_est, depth, lr):
    gb = GradientBoostingClassifier(n_estimators = n_est, max_depth = depth, learning_rate = lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')
    print('Est: {}, depth: {}, learning_rate: {}----->Precision: {}, Recall: {}, Accuracy: {}'.format(n_est, depth, lr,
                                                                                                      round(precision, 3),
                                                                                                      round(recall, 3),
                                                                                                      round((y_pred == y_test).sum() / len(y_pred), 3)
                                                                                                     ))

In [29]:
for n_est in [100, 200, 400, 500]:
    for depth in [3, 7, 13, 19]:
        for lr in [0.01, 0.1, 1]:
            trainGB(n_est, depth, lr)

Est: 100, depth: 3, learning_rate: 0.01----->Precision: 0.958, Recall: 0.838, Accuracy: 0.976
Est: 100, depth: 3, learning_rate: 0.1----->Precision: 0.937, Recall: 0.875, Accuracy: 0.978
Est: 100, depth: 3, learning_rate: 1----->Precision: 0.922, Recall: 0.868, Accuracy: 0.975
Est: 100, depth: 7, learning_rate: 0.01----->Precision: 0.945, Recall: 0.882, Accuracy: 0.979
Est: 100, depth: 7, learning_rate: 0.1----->Precision: 0.937, Recall: 0.875, Accuracy: 0.978
Est: 100, depth: 7, learning_rate: 1----->Precision: 0.923, Recall: 0.882, Accuracy: 0.977
Est: 100, depth: 13, learning_rate: 0.01----->Precision: 0.917, Recall: 0.897, Accuracy: 0.978
Est: 100, depth: 13, learning_rate: 0.1----->Precision: 0.898, Recall: 0.904, Accuracy: 0.976
Est: 100, depth: 13, learning_rate: 1----->Precision: 0.879, Recall: 0.904, Accuracy: 0.973
Est: 100, depth: 19, learning_rate: 0.01----->Precision: 0.917, Recall: 0.897, Accuracy: 0.978
Est: 100, depth: 19, learning_rate: 0.1----->Precision: 0.904, Recal

In [30]:
from sklearn.model_selection import GridSearchCV

In [37]:
gb = GradientBoostingClassifier()
param = {'n_estimators':[100, 150], 'max_depth':[7,11,15,19], 'learning_rate':[0.1]}
gs = GridSearchCV(gb, param, cv = 5, n_jobs = -1)
gs_model = gs.fit(X_features_tv, dataset['label'])
pd.DataFrame(gs_model.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,11.6592,0.157523,0.013564,0.003253,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.98296,0.987444,0.980251,0.978456,0.982944,0.982411,0.00304,1
0,7.586119,0.156151,0.009575,0.000798,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.980269,0.986547,0.978456,0.976661,0.982944,0.980975,0.003475,2
3,9.184656,0.482383,0.011369,0.001018,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.979372,0.983857,0.979354,0.975763,0.979354,0.97954,0.002569,3
2,8.907515,0.724837,0.01137,0.002239,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.979372,0.984753,0.980251,0.975763,0.977558,0.97954,0.00303,4
4,8.874552,0.768765,0.015558,0.006663,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.972197,0.983857,0.979354,0.972172,0.970377,0.975591,0.005155,5


In [38]:
gb = GradientBoostingClassifier()
param = {'n_estimators':[100, 150], 'max_depth':[7,11,15,19], 'learning_rate':[0.1]}
gs_cv = GridSearchCV(gb, param, cv = 5, n_jobs = -1)
gs_model_cv = gs_cv.fit(X_features_cv, dataset['label'])
pd.DataFrame(gs_model_cv.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.804869,0.111613,0.011401,0.0008,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.981166,0.987444,0.980251,0.978456,0.981149,0.981693,0.00304,1
1,13.590291,0.167053,0.015503,0.001549,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.981166,0.986547,0.980251,0.977558,0.982047,0.981514,0.002932,2
2,10.457935,1.043579,0.012205,0.000749,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.979372,0.984753,0.979354,0.975763,0.978456,0.97954,0.002922,3
3,11.201084,0.740664,0.013963,0.00088,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.979372,0.984753,0.980251,0.975763,0.976661,0.97936,0.003166,4
4,10.706933,0.837969,0.0128,0.000977,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.972197,0.983857,0.979354,0.972172,0.971275,0.975771,0.004983,5


In [None]:
import 