# Part 0: Preprocessing

In [1]:
# Import the relevant modules

import pandas as pd
import numpy as np
import os

In [2]:
# Specify data directory

data_dir = os.path.join(os.path.dirname(os.getcwd()),'Data')

In [3]:
# Specify model directory

model_dir = os.path.join(os.path.dirname(os.getcwd()), 'Model')

In [4]:
df = pd.read_json(os.path.join(data_dir, 'sms_to_ml.json'))
df = df.sort_index()
df.head()

Unnamed: 0,label,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url
0,ham,23,4.0,0,0,0,4,0,0,0,0
1,ham,8,3.0,0,0,0,0,0,0,0,0
2,spam,37,3.357143,3,1,4,5,0,0,1,0
3,ham,13,3.641026,0,0,2,2,0,0,0,0
4,ham,15,4.142857,0,0,1,5,0,0,0,0


In [5]:
# Importing raw text
df_raw = pd.read_csv(os.path.join(data_dir,'SMSSpamCollection.txt'), delimiter = '\t', header = None)
df_raw.columns = ['label', 'text']
df_raw.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#  Part 1: Machine Learning model with extracted feature (Baseline)

In [6]:
# Creating predictors (X) and and label (y)

X = df.loc[:, df.columns != 'label']
y = (df.label == 'spam').values.astype(np.int)
indices = df.index

In [7]:
# Import options and modules

%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

In [8]:
# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

In [9]:
# Set the best fbeta. Here, we want to punish misclassification of ham for spam 10 times as much

best_beta = 0.1

## Logistic Regression

# Create pipeline which includes Scaling and Logistic Regression, Create a classifier using GridSearch for the best parameters

steps = [('scaler', StandardScaler()),
         ('lr', LogisticRegression(solver = 'lbfgs'))]
pipeline = Pipeline(steps)
parameters = {'lr__C':[0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [10]:
# Load the trained model

filename = 'lr.sav'
clf = pickle.load(open(os.path.join(model_dir, filename), 'rb'))

In [11]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.954454
Accuracy on test data:     0.953363
AUC-ROC score     0.980943
Fbeta score     0.949581


In [12]:
# Record the score

tr_acc_lr = training_accuracy
te_acc_lr = test_accuracy
auc_lr = auc
fbeta_lr = fbeta

# Save the model

filename = 'lr.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Random Forest

from sklearn.ensemble import RandomForestClassifier

# Create pipeline which includes Scaling and Random Forest, Create a classifier using GridSearch for the best parameters

steps = [('scaler', StandardScaler()), ('rf', RandomForestClassifier())]
pipeline = Pipeline(steps)
parameters = {'rf__n_estimators':[10 , 20, 30, 40, 50],
             'rf__max_features': ['auto', 'sqrt', 'log2']}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

forest = RandomForestClassifier(n_estimators = 50, max_features = 'auto').fit(X,y)
importances = forest.feature_importances_

plt.bar(x = X.columns, height = importances)
plt.xticks(rotation = 90)
plt.xlabel('Features')
plt.ylabel('Importances')
plt.show()

In [13]:
# Load the trained model

filename = 'rf.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [14]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.999551
Accuracy on test data:     0.974888
AUC-ROC score     0.975051
Fbeta score     0.940790


In [15]:
# Record the score

tr_acc_rf = training_accuracy
te_acc_rf = test_accuracy
auc_rf = auc
fbeta_rf = fbeta

# Save the model

filename = 'rf.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Support Vector Classification (SVC)

from sklearn.svm import SVC

# Create pipeline which includes Scaling and SVC, Create a classifier using GridSearch for the best parameters

steps = [('scaler', StandardScaler()),
         ('svc', SVC(probability=True))]
pipeline = Pipeline(steps)
parameters = {'svc__C':[0.01, 0.1, 1],
             'svc__kernel':['linear', 'poly', 'rbf'],
             'svc__gamma':['auto', 'scale']}

clf = GridSearchCV(pipeline, parameters, cv = 3, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [16]:
# Load the trained model

filename = 'svc.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [17]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.972627
Accuracy on test data:     0.973094
AUC-ROC score     0.936384
Fbeta score     0.959767


In [18]:
# Record the score

tr_acc_svc = training_accuracy
te_acc_svc = test_accuracy
auc_svc = auc
fbeta_svc = fbeta

# Save the model

filename = 'svc.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Gradient Boosted Trees

from sklearn.ensemble import GradientBoostingClassifier

steps = [('scaler', StandardScaler()), ('gbc', GradientBoostingClassifier())]
pipeline = Pipeline(steps)
parameters = {'gbc__n_estimators':[10, 50, 100, 200, 500],
             'gbc__max_features': ['auto', 'sqrt', 'log2'],
             'gbc__learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

steps = [('scaler', StandardScaler()), 
         ('gbc', GradientBoostingClassifier(learning_rate = 0.1, max_features = 'sqrt', n_estimators = 100))]
clf = Pipeline(steps)
clf.fit(Xtrain, ytrain)

In [19]:
# Load the trained model

filename = 'gbc.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [20]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.975993
Accuracy on test data:     0.973991
AUC-ROC score     0.985705
Fbeta score     0.946712


In [21]:
# Record the score

tr_acc_gbc = training_accuracy
te_acc_gbc = test_accuracy
auc_gbc = auc
fbeta_gbc = fbeta

# Save the model

filename = 'gbc.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Results of baseline algorithms:

In [22]:
result = pd.DataFrame({'Model':['Logistic Regression', 'Random Forest', 'SVC', 'GBC'],
             'Training_Accuracy':[tr_acc_lr, tr_acc_rf, tr_acc_svc, tr_acc_gbc],
             'Test_Accuracy':[te_acc_lr, te_acc_rf, te_acc_svc, te_acc_gbc],
             'AUC':[auc_lr, auc_rf, auc_svc, auc_gbc],
             'Fbeta':[fbeta_lr, fbeta_rf, fbeta_svc, fbeta_gbc]})
result.round(3)

Unnamed: 0,Model,Training_Accuracy,Test_Accuracy,AUC,Fbeta
0,Logistic Regression,0.954,0.953,0.981,0.95
1,Random Forest,1.0,0.975,0.975,0.941
2,SVC,0.973,0.973,0.936,0.96
3,GBC,0.976,0.974,0.986,0.947


<div class="span5 alert alert-success">
Here we can see that GBC gave us the best result in both Test Accuracy and AUC. However, SVC gives us the best Fbeta score. We choose this algorithm to be our baseline.
</div>

In [23]:
# Load the trained model

filename = 'svc.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

# Confusion matrix and misclassified

In [24]:
from sklearn.metrics import confusion_matrix

In [25]:
confusion_matrix(ytest, clf.predict(Xtest), labels=None, sample_weight=None)

array([[961,   5],
       [ 25, 124]], dtype=int64)

<div class="span5 alert alert-success">
Our baseline algorithm isn't too good at the classification and we can still see a few instances where it misclassified a ham message as spam. These 5 misclassifications might mean a big deal (potentially leading to throwing out real messages!). Ideally, we don't want any of such.
</div>

In [26]:
my_indices = np.where(ytest != clf.predict(Xtest))[0]
misclassified = [itest[i] for i in my_indices]

In [27]:
df_mis = df[df.index.isin(misclassified)]
df_mis.head()

Unnamed: 0,label,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url
263,ham,13,3.166667,1,1,10,0,0,0,1,0
598,spam,15,3.772727,3,1,1,3,0,0,0,0
731,spam,16,3.428571,1,1,0,1,0,0,0,0
751,spam,21,3.851852,1,1,0,8,0,0,0,0
856,spam,32,3.823529,1,1,1,9,0,0,0,0


In [28]:
pd.set_option('display.max_colwidth', -1)

df_raw.iloc[df_mis.index]

Unnamed: 0,label,text
263,ham,MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*
598,spam,You have an important customer service announcement. Call FREEPHONE 0800 542 0825 now!
731,spam,Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123
751,spam,"Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?"
856,spam,Talk sexy!! Make new friends or fall in love in the worlds most discreet text dating service. Just text VIP to 83110 and see who you could meet.
907,spam,"all the lastest from Stereophonics, Marley, Dizzee Racal, Libertines and The Strokes! Win Nookii games with Flirt!! Click TheMob WAP Bookmark or text WAP to 82468"
1073,spam,Dear U've been invited to XCHAT. This is our final attempt to contact u! Txt CHAT to 86688
1086,ham,"FR'NDSHIP is like a needle of a clock. Though V r in d same clock, V r nt able 2 met. Evn if V meet,itz only 4few seconds. Bt V alwys stay conected. Gud 9t;-)"
1235,ham,Hello-/@drivby-:0quit edrunk sorry iff pthis makes no senrd-dnot no how ^ dancce 2 drum n basq!ihave fun 2nhite x ros xxxxxxx
1407,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TODAY IS YOUR LUCKY DAY! 2 FIND OUT WHY LOG ONTO HTTP://WWW.URAWINNER.COM THERE IS A FANTASTIC SURPRISE AWAITING FOR YOU"


<div class="span5 alert alert-success">
Here we can see the drawback of this approach. SMSs with clearly flaggable words like 'Txt', 'Free', the punctuation '!' were ignored as a spam signal (a few urls, phone numbers were missed because of its uniqueness). We can further code these features into our existing model to improve it, but this clearly shows the advantage of the Bag of Word model we will explore next.</div>

# Part 2: Machine Learning model with Bag of Words


In [29]:
# Create X and y set

X = df_raw.text
y = (df_raw.label == 'spam').values.astype(np.int)
indices = df_raw.index

In [30]:
# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

## Naive Bayes Bi-Gram on BOW model

In [31]:
# Import Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

steps = [('vec', CountVectorizer(stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')),
         ('nb', MultinomialNB())]
pipeline = Pipeline(steps)
parameters = {'vec__min_df':[0.01, 0.1, 1, 10, 100],
              'nb__alpha':[0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [32]:
# Load the trained model

filename = 'nb_bow.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [33]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.998654
Accuracy on test data:     0.993722
AUC-ROC score     0.983096
Fbeta score     0.999512


In [34]:
# Record the score

tr_acc_nb_bow = training_accuracy
te_acc_nb_bow = test_accuracy
auc_nb_bow = auc
fbeta_nb_bow = fbeta

# Save the model

filename = 'nb_bow.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Naive Bayes Bi-Gram on TFIDF model

In [35]:
# Import Naive Bayes and Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

steps = [('vec', TfidfVectorizer(stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')),
         ('nb', MultinomialNB())]
pipeline = Pipeline(steps)
parameters = {'vec__min_df':[0.01, 0.1, 1, 10, 100],
              'nb__alpha':[0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [36]:
# Load the trained model

filename = 'nb_tfidf.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [37]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.999551
Accuracy on test data:     0.989238
AUC-ROC score     0.992226
Fbeta score     0.972148


In [38]:
# Record the score

tr_acc_nb_tfidf = training_accuracy
te_acc_nb_tfidf = test_accuracy
auc_nb_tfidf = auc
fbeta_nb_tfidf = fbeta

# Save the model

filename = 'nb_tfidf.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## SVC on BOW model

steps = [('vec', CountVectorizer(min_df = 1, stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')),
         ('svc', SVC(probability = True))]
pipeline = Pipeline(steps)
parameters = {'svc__C':[0.01, 0.1, 1],
             'svc__kernel':['linear', 'poly', 'rbf'],
             'svc__gamma':['auto', 'scale']}

clf = GridSearchCV(pipeline, parameters, cv = 3, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [39]:
# Load the trained model

filename = 'svc_bow.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [40]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.998878
Accuracy on test data:     0.982960
AUC-ROC score     0.989405
Fbeta score     0.991160


In [41]:
# Record the score

tr_acc_svc_bow = training_accuracy
te_acc_svc_bow = test_accuracy
auc_svc_bow = auc
fbeta_svc_bow = fbeta

# Save the model

filename = 'svc_bow.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## SVC on TFIDF model

steps = [('vec', TfidfVectorizer(stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')),
         ('svc', SVC(probability = True))]
pipeline = Pipeline(steps)
parameters = {'vec__min_df':[0.01, 0.1, 1, 10, 100],
             'svc__C':[0.01, 0.1, 1],
             'svc__kernel':['linear', 'poly', 'rbf'],
             'svc__gamma':['auto', 'scale']}

clf = GridSearchCV(pipeline, parameters, cv = 3, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [42]:
# Load the trained model

filename = 'svc_tfidf.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [43]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.998654
Accuracy on test data:     0.992825
AUC-ROC score     0.991934
Fbeta score     0.985938


In [44]:
# Record the score

tr_acc_svc_tfidf = training_accuracy
te_acc_svc_tfidf = test_accuracy
auc_svc_tfidf = auc
fbeta_svc_tfidf = fbeta

# Save the model

filename = 'svc_tfidf.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Results from bag of word models

In [45]:
result = pd.DataFrame({'Model':['NB_BoW', 'NB_Tfidf', 'SVC_BoW', 'SVC_Tfidf'],
             'Training_Accuracy':[tr_acc_nb_bow, tr_acc_nb_tfidf, tr_acc_svc_bow, tr_acc_nb_tfidf],
             'Test_Accuracy':[te_acc_nb_bow, te_acc_nb_tfidf, te_acc_svc_bow, te_acc_nb_tfidf],
             'AUC':[auc_nb_bow, auc_nb_tfidf, auc_svc_bow, auc_svc_tfidf],
             'Fbeta':[fbeta_nb_bow, fbeta_nb_tfidf, fbeta_svc_bow, fbeta_svc_tfidf]})
result.round(3)

Unnamed: 0,Model,Training_Accuracy,Test_Accuracy,AUC,Fbeta
0,NB_BoW,0.999,0.994,0.983,1.0
1,NB_Tfidf,1.0,0.989,0.992,0.972
2,SVC_BoW,0.999,0.983,0.989,0.991
3,SVC_Tfidf,1.0,0.989,0.992,0.986


<div class="span5 alert alert-success">
We are able to get much better results by using a bag of words approach in this dataset. The test accuracy, AUC score and fbeta score increased by 1-2% point. Interestingly, using a TD-IDF vectorizer seems to increase our AUC score but reduce test accuracy and Fbeta score slightly for Naive Bayes. Otherwise, SVC and NB seems to perform very identically. It seems that if we care about Fbeta more, we should go with the simpler approach, using just a pure bag of word model. Here, we choose our Naive Bayes Bag of Word as the best model.
</div>

In [46]:
# Load the trained model

filename = 'nb_bow.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [47]:
my_indices = np.where(ytest != clf.predict(Xtest))[0]
misclassified = [itest[i] for i in my_indices]

In [48]:
df_mis = df[df.index.isin(misclassified)]
df_mis.head()

Unnamed: 0,label,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url
227,spam,34,3.65,2,1,3,4,0,0,0,0
731,spam,16,3.428571,1,1,0,1,0,0,0,0
751,spam,21,3.851852,1,1,0,8,0,0,0,0
2402,spam,41,3.52381,1,1,3,9,0,0,0,0
2663,spam,32,2.8,0,0,1,17,0,0,0,0


In [49]:
pd.set_option('display.max_colwidth', -1)

df_raw.iloc[df_mis.index]

Unnamed: 0,label,text
227,spam,"Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES"
731,spam,Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123
751,spam,"Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?"
2402,spam,Babe: U want me dont u baby! Im nasty and have a thing 4 filthyguys. Fancy a rude time with a sexy bitch. How about we go slo n hard! Txt XXX SLO(4msgs)
2663,spam,"Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?"
3360,spam,Sorry I missed your call let's talk when you have the time. I'm on 07090201529
3864,spam,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50"


<div class="span5 alert alert-success">
Our chosen spam filter made no mistakes misclassifying ham. It's conservative nature when predicting spam cause it to misclassify quite a few spams, however. In conclusion, it seems that TFidf model performs better generally, but BoW models perform better under business assumptions. We will use the Naive Bayes Bag of Word model for our combined model.
</div>

# Part 3: Various ensembling techniques

## Stacked Model (With dense and sparse features)

In [50]:
# Vectorizing the sparse features

df_combined = df
df_combined['text'] = df_raw['text']
vec = CountVectorizer(min_df = 1, stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')
vec_fit = vec.fit(df_combined.text)
sf = vec.fit_transform(df_combined.text)
sf

<5572x39684 sparse matrix of type '<class 'numpy.int64'>'
	with 93387 stored elements in Compressed Sparse Row format>

In [51]:
# Scaling dense features

from sklearn.preprocessing import MinMaxScaler

dense_feat = df.drop(['text', 'label'], axis =1)

ss = MinMaxScaler()

dense_feat = ss.fit_transform(dense_feat)
dense_feat

array([[0.10091743, 0.0625    , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03211009, 0.04166667, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16513761, 0.04910714, 0.33333333, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.06422018, 0.0317029 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.11926606, 0.04619565, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02752294, 0.06111111, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [52]:
# Combine the features

from scipy.sparse import coo_matrix, hstack

dense_feat = coo_matrix(dense_feat)
dense_feat

<5572x10 sparse matrix of type '<class 'numpy.float64'>'
	with 21906 stored elements in COOrdinate format>

In [53]:
# Derive X and Y

X = hstack([sf, dense_feat.astype(float)])
y = (df.label == 'spam').values.astype(np.int)
indices = df.index

In [54]:
# Derive train/test set

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

In [55]:
Xtrain

<4457x39694 sparse matrix of type '<class 'numpy.float64'>'
	with 92282 stored elements in Compressed Sparse Row format>

# Train a Multinomial Naive Bayes over the the combined dataset (with both dense and sparse features)

steps = [('nb', MultinomialNB())]
pipeline = Pipeline(steps)
parameters = {'nb__alpha':[0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

clf.best_params_

In [56]:
# Load the trained model

filename = 'stack_dense_sparse.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [57]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.988557
Accuracy on test data:     0.987444
AUC-ROC score     0.975468
Fbeta score     0.991841


In [58]:
# Glimps confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, clf.predict(Xtest), labels=None, sample_weight=None)

array([[965,   1],
       [ 13, 136]], dtype=int64)

In [59]:
# Glimpes misclassifications

my_indices = np.where(ytest != clf.predict(Xtest))[0]
misclassified = [itest[i] for i in my_indices]
df_mis = df[df.index.isin(misclassified)]
df_mis

Unnamed: 0,label,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url,text
227,spam,34,3.65,2,1,3,4,0,0,0,0,"Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES"
263,ham,13,3.166667,1,1,10,0,0,0,1,0,MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*
731,spam,16,3.428571,1,1,0,1,0,0,0,0,Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123
751,spam,21,3.851852,1,1,0,8,0,0,0,0,"Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?"
856,spam,32,3.823529,1,1,1,9,0,0,0,0,Talk sexy!! Make new friends or fall in love in the worlds most discreet text dating service. Just text VIP to 83110 and see who you could meet.
2402,spam,41,3.52381,1,1,3,9,0,0,0,0,Babe: U want me dont u baby! Im nasty and have a thing 4 filthyguys. Fancy a rude time with a sexy bitch. How about we go slo n hard! Txt XXX SLO(4msgs)
2575,spam,32,4.166667,0,0,2,10,0,0,0,0,Your next amazing xxx PICSFREE1 video will be sent to you enjoy! If one vid is not enough for 2day text back the keyword PICSFREE1 to get the next video.
2663,spam,32,2.8,0,0,1,17,0,0,0,0,"Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?"
2742,spam,36,3.2,2,1,3,11,0,0,0,0,I don't know u and u don't know me. Send CHAT to 86688 now and let's find each other! Only 150p/Msg rcvd. HG/Suite342/2Lands/Row/W1J6HL LDN. 18 years or over.
2770,spam,31,3.066667,1,1,0,9,0,0,0,0,Burger King - Wanna play footy at a top stadium? Get 2 Burger King before 1st Sept and go Large or Super with Coca-Cola and walk out a winner


In [60]:
# Record the score

tr_acc_stack_dense_sparse = training_accuracy
te_acc_stack_dense_sparse = test_accuracy
auc_stack_dense_sparse = auc
fbeta_stack_dense_sparse = fbeta

# Save the model

filename = 'stack_dense_sparse.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Stacked Model (With dense features and proba of sparse features)

In [61]:
# Create X and y set for sparse matrix

X = df_raw.text
y = (df_raw.label == 'spam').values.astype(np.int)
indices = df_raw.index

In [62]:
# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

In [63]:
# Fit sparse features

steps = [('vec', CountVectorizer(stop_words = 'english', ngram_range = (1, 2), token_pattern=r'\b\w+\b')),
         ('nb', MultinomialNB())]
pipeline = Pipeline(steps)
parameters = {'vec__min_df':[0.01, 0.1, 1, 10, 100],
              'nb__alpha':[0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain, ytrain)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [64]:
clf.best_params_

{'nb__alpha': 1, 'vec__min_df': 1}

In [65]:
best_vec = clf
%store best_vec

Stored 'best_vec' (GridSearchCV)


In [66]:
# Derive probability of sparse matrix
Xtrain_proba = pd.DataFrame(clf.predict_proba(Xtrain), index = itrain)
Xtest_proba = pd.DataFrame(clf.predict_proba(Xtest), index = itest)

In [67]:
# Creating predictors (X) and and label (y) for dense matrix
df = pd.read_json(os.path.join(data_dir, 'sms_to_ml.json'))
df = df.sort_index()

In [68]:
X = df.loc[:, df.columns != 'label']
y = (df.label == 'spam').values.astype(np.int)
indices = df.index

In [69]:
# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

In [70]:
Xtrain_combined = pd.merge(Xtrain, Xtrain_proba, left_index=True, right_index=True)
Xtest_combined = pd.merge(Xtest, Xtest_proba, left_index=True, right_index=True)

In [71]:
Xtrain_combined.head()

Unnamed: 0,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url,0,1
1978,23,4.0,2,1,2,6,0,1,0,0,1.540579e-18,1.0
3989,27,2.833333,0,0,1,5,0,0,0,0,1.0,4.417952e-11
3935,13,3.285714,0,0,0,2,0,0,0,0,0.9999999,7.412489e-08
4078,21,3.785714,0,0,0,9,0,0,0,0,1.0,1.865699e-08
4086,33,3.625,0,0,1,9,0,0,0,0,8.131628000000001e-18,1.0


# Train the model with SVC

steps = [('scaler', StandardScaler()), ('svc', SVC(probability=True))]
pipeline = Pipeline(steps)
parameters = {'svc__C':[0.01, 0.1, 1],
             'svc__kernel':['linear', 'poly', 'rbf'],
             'svc__gamma':['auto', 'scale']}

clf = GridSearchCV(pipeline, parameters, cv = 10, scoring="accuracy")
clf.fit(Xtrain_combined, ytrain)

clf.best_params_

In [72]:
# Load the trained model

filename = 'stack_dense_proba.sav'
clf = pickle.load(open(os.path.join(model_dir,filename), 'rb'))

In [73]:
# Derive accuracy scores
training_accuracy = clf.score(Xtrain_combined, ytrain)
test_accuracy = clf.score(Xtest_combined, ytest)

# Derive roc_score
probs = clf.predict_proba(Xtest_combined)[:, 1]
auc = roc_auc_score(ytest, probs)

# Derive fbeta_score

fbeta = fbeta_score(ytest, clf.predict(Xtest_combined), beta = best_beta)

# Print the accuracy and roc

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))
print("AUC-ROC score     {:2f}".format(auc))
print("Fbeta score     {:2f}".format(fbeta))

Accuracy on training data: 0.999103
Accuracy on test data:     0.993722
AUC-ROC score     0.996575
Fbeta score     0.999512


In [74]:
# Glimps confusion matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, clf.predict(Xtest_combined), labels=None, sample_weight=None)

array([[966,   0],
       [  7, 142]], dtype=int64)

In [75]:
# Glimpes misclassifications

my_indices = np.where(ytest != clf.predict(Xtest_combined))[0]
misclassified = [itest[i] for i in my_indices]
df_mis = df_combined[df_combined.index.isin(misclassified)]
df_mis

Unnamed: 0,label,n_token,avg_wlen,n_num,has_num,n_upper,n_stops,has_email,has_money,has_phone,has_url,text
227,spam,34,3.65,2,1,3,4,0,0,0,0,"Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES"
731,spam,16,3.428571,1,1,0,1,0,0,0,0,Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123
751,spam,21,3.851852,1,1,0,8,0,0,0,0,"Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?"
2402,spam,41,3.52381,1,1,3,9,0,0,0,0,Babe: U want me dont u baby! Im nasty and have a thing 4 filthyguys. Fancy a rude time with a sexy bitch. How about we go slo n hard! Txt XXX SLO(4msgs)
2663,spam,32,2.8,0,0,1,17,0,0,0,0,"Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?"
3360,spam,18,2.0,1,1,2,6,0,0,1,0,Sorry I missed your call let's talk when you have the time. I'm on 07090201529
3864,spam,27,2.677419,0,0,2,6,0,1,0,0,"Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50"


In [76]:
# Record the score

tr_acc_stack_dense_proba = training_accuracy
te_acc_stack_dense_proba = test_accuracy
auc_stack_dense_proba = auc
fbeta_stack_dense_proba = fbeta

# Save the model

filename = 'stack_dense_proba.sav'
pickle.dump(clf, open(os.path.join(model_dir,filename), 'wb'))

## Results of stacked model

In [77]:
result = pd.DataFrame({'Model':['stack_combined', 'stack_proba'],
             'Training_Accuracy':[tr_acc_stack_dense_sparse, tr_acc_stack_dense_proba],
             'Test_Accuracy':[te_acc_stack_dense_sparse, te_acc_stack_dense_proba],
             'AUC':[auc_stack_dense_sparse, auc_stack_dense_proba],
             'Fbeta':[fbeta_stack_dense_sparse, fbeta_stack_dense_proba]})
result.round(3)

Unnamed: 0,Model,Training_Accuracy,Test_Accuracy,AUC,Fbeta
0,stack_combined,0.989,0.987,0.975,0.992
1,stack_proba,0.999,0.994,0.997,1.0


<div class="span5 alert alert-success">
After much experimenting we were able to find our best model, which is the stack_proba model. This model is not only good in general (Accuracy score of 0.994 and AUC of 0.997) but also very strong given our business objectives (Fbeta almost 1!).
</div>

In [78]:
best_model = clf

In [79]:
# Save the stacked_combined model for our spam filter

%store best_model

Stored 'best_model' (GridSearchCV)
