In [1]:
import pandas as pd
import nltk
import string
import re

In [2]:
dataset = pd.read_csv('SMSSpamCollection.txt', sep = '\t', header = None)
dataset.columns = ['label', 'body_text']


In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [4]:
dataset['body_len'] = dataset['body_text'].apply(lambda x : len(x) - x.count(' '))
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)* 100
dataset['punct%'] = dataset['body_text'].apply(lambda x:count_punct(x))

def count_capital(text):
    text_no_punct = "".join([char for char in text if char not in string.punctuation])
    count = sum([1 for char in text_no_punct if (char == char.upper() and char != " ")])
    return count
dataset['cap_count'] = dataset['body_text'].apply(lambda x: count_capital(x))

In [5]:
def clean_text(text):
    text = " ".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [6]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset[['body_text', 'body_len', 'punct%', 'cap_count']],dataset['label'], test_size = 0.2)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(analyzer = clean_text)
tv_idf = tv.fit(X_train['body_text'])

Term_matrix_train = tv_idf.transform(X_train['body_text'])
Term_matrix_test = tv_idf.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%', 'cap_count']].reset_index(drop = True)
           , pd.DataFrame(Term_matrix_train.toarray())], axis = 1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%', 'cap_count']].reset_index(drop = True)
           , pd.DataFrame(Term_matrix_test.toarray())], axis = 1)

In [17]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [19]:
rf = RandomForestClassifier(n_estimators = 150, max_depth = None, n_jobs = -1)
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = end - start

precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')
print(print('Fit Time: {}, Predict Time: {} ----->Precision: {}, Recall: {}, Accuracy: {}'.format(round(fit_time, 3),
                                                                                                  round(pred_time, 3),
                                                                                                      round(precision, 3),
                                                                                                      round(recall, 3),
                                                                                                      round((y_pred == y_test).sum() / len(y_pred), 3)
                                                                                                     )))


Fit Time: 0.271, Predict Time: 0.104 ----->Precision: 0.992, Recall: 0.909, Accuracy: 0.987
None


In [20]:
gb = GradientBoostingClassifier(n_estimators = 150, max_depth = 11)
start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = end - start

precision, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')
print(print('Fit Time: {}, Predict Time: {} ----->Precision: {}, Recall: {}, Accuracy: {}'.format(round(fit_time, 3),
                                                                                                  round(pred_time, 3),
                                                                                                      round(precision, 3),
                                                                                                      round(recall, 3),
                                                                                                      round((y_pred == y_test).sum() / len(y_pred), 3)
                                                                                                     )))

Fit Time: 5.155, Predict Time: 0.005 ----->Precision: 0.977, Recall: 0.909, Accuracy: 0.986
None
