In [30]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from scipy.stats import randint

In [2]:
df = pd.read_csv('sms_spam.csv', encoding = "ISO-8859-1")

This notebook's focus is on using classifiers, and later on classification with train-validation-test splits + cross-validation. I've chosen to use a ready-made dataset since the focus is on the classification methods rather than on preprocessing data.

In [3]:
# The last 3 columns are not meaningful, so we'll drop those
df = df.drop(df.columns[2:5], axis=1)

# We can also rename the columns to be a bit more informative
df = df.rename(index=str, columns={"v1": "labels", "v2": "text"})

# We'll also convert the spam/ham labels to 1 and 0
df['labels'] = df.labels.map({'ham':0, 'spam':1})

df.head()

Unnamed: 0,labels,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.labels.value_counts()

0    4825
1     747
Name: labels, dtype: int64

We have 747 spam texts and 4825 good texts, which indicates that we do not suffer from a severe class imbalance.

In [5]:
X = df.text
Y = df.labels

# Split the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

vectorizer = CountVectorizer()

In [6]:
# Fit vectorizer to the training data, transform training data
X_train_df = vectorizer.fit_transform(X_train)

# Just transform the test data 
X_test_df = vectorizer.transform(X_test)

# Multinomial Naive Bayes

In [7]:
nb = MultinomialNB()
nb.fit(X_train_df, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Create a dict for the predictions of all the models we'll try
prediction = dict()

prediction["Multinomial"] = nb.predict(X_test_df)

# Looks like this (an array of 0s, 1s)
prediction['Multinomial']

array([0, 1, 0, ..., 1, 0, 0])

In [9]:
# Score the predictions

accuracy_score(Y_test, prediction['Multinomial'])

0.9863603732950467

Nice! 98% accuracy. We should make sure to take a look at the sensitivity and specificity of these predictions.

In [16]:
print(classification_report(Y_test, prediction['Multinomial'], target_names = ['Ham', 'Spam']), sep='\n')

             precision    recall  f1-score   support

        Ham       0.99      1.00      0.99      1197
       Spam       0.97      0.93      0.95       196

avg / total       0.99      0.99      0.99      1393



Looking at the classification_report, we can see that the performance is excellent for real texts and very strong for spam. Since it's worse for a user to miss real messages from people they know than to get the occasional spam message, this is an ideal situation.

Let's take a look at some of the messages that were misclassified:

In [11]:
# Find texts in X_test that were actually spam (Y_test = 1), but predicted Ham (0)
X_test[Y_test > prediction["Multinomial"]]

4254    Block Breaker now comes in deluxe format with ...
68      Did you hear about the new \Divorce Barbie\"? ...
671            SMS. ac sun0819 posts HELLO:\You seem cool
1457    CLAIRE here am havin borin time & am now alone...
3458    Not heard from U4 a while. Call me now am here...
1939    More people are dogging in your area now. Call...
2247    Back 2 work 2morro half term over! Can U C me ...
2662    Hello darling how are you today? I would love ...
2821    ROMCAPspam Everyone around should be respondin...
3417    LIFE has never been this much fun and great un...
4067    TBS/PERSOLVO. been chasing us since Sept forå£...
4674    Hi babe its Chloe, how r u? I was smashed on s...
4371    Ur balance is now å£600. Next question: Comple...
3862    Oh my god! I've found your number again! I'm s...
Name: text, dtype: object

# Logistic Regression

In [12]:
logreg = LogisticRegression()
logreg.fit(X_train_df, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Generate predictions
prediction["Logistic"] = logreg.predict(X_test_df)

# Score predictions
accuracy_score(Y_test, prediction['Logistic'])

0.9791816223977028

Slightly worse performance than the Naive Bayes classifier, but still pretty darned good.

In [15]:
# Generate another classification report
print(classification_report(Y_test, prediction['Logistic'], target_names = ['Ham', 'Spam']), sep='\n')

             precision    recall  f1-score   support

        Ham       0.98      1.00      0.99      1197
       Spam       0.99      0.86      0.92       196

avg / total       0.98      0.98      0.98      1393



Recall for Spam is a bit worse with the logistic regression, pulling the f1-score down.

# Support Vector Machine

In [19]:
svm = SVC()
svm.fit(X_train_df, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Generate predictions
prediction["SVM"] = svm.predict(X_test_df)

# Score predictions
accuracy_score(Y_test, prediction['SVM'])

0.8592964824120602

Yikes. The SVM performed pretty poorly on this task out of the box. Let's investigate.

In [21]:
print(classification_report(Y_test, prediction['SVM'], target_names = ['Ham', 'Spam']), sep='\n')

             precision    recall  f1-score   support

        Ham       0.86      1.00      0.92      1197
       Spam       0.00      0.00      0.00       196

avg / total       0.74      0.86      0.79      1393



  'precision', 'predicted', average, warn_for)


We can see from the warning generated above that the SVM failed to predict "spam" for any texts. 

We can verify that this matches the accuracy score by noticing that the accuracy score is simply the percentage of non-spam texts in the dataset, meaning that the SVM chose "Ham" and was right 86% of the time simply because 86% of our texts are indeed not spam.

In [24]:
4825 / (4825 + 747)

0.8659368269921034

Perhaps the SVM just needs some tuning and coaxing in order to work well for this data, but we will come back to that later. First, we'll try one last classification method.

# Decision Tree Classifier

In [25]:
tree = DecisionTreeClassifier()
tree.fit(X_train_df, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
# Generate predictions
prediction["Tree"] = tree.predict(X_test_df)

# Score predictions
accuracy_score(Y_test, prediction['Tree'])

0.9633883704235463

This accuracy is the third best out of the four classifiers we've tried so far.

In [27]:
print(classification_report(Y_test, prediction['Tree'], target_names = ['Ham', 'Spam']), sep='\n')

             precision    recall  f1-score   support

        Ham       0.98      0.98      0.98      1197
       Spam       0.88      0.86      0.87       196

avg / total       0.96      0.96      0.96      1393



The tree seems to be letting a lot more spam slip through than our first two classifiers.

# Hyperparameter tuning
The Naive Bayes classifier was our best classifier by far. Let's see if we can tune the other classifiers a bit to get them up to, or near, the same accuracy score.

In [28]:
from sklearn.model_selection import GridSearchCV

# Set up a "grid" of values we'd like to test to find out which results in the best performance
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Perform a grid search for the logistic regression classifier, then re-fit the data
logreg_cv = GridSearchCV(logreg, param_grid, cv = 5)
logreg_cv.fit(X_train_df, Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-05, 8.48343e-05, 7.19686e-04, 6.10540e-03, 5.17947e-02,
       4.39397e-01, 3.72759e+00, 3.16228e+01, 2.68270e+02, 2.27585e+03,
       1.93070e+04, 1.63789e+05, 1.38950e+06, 1.17877e+07, 1.00000e+08])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 1389495.494373136}
Best score is 0.9856424982053122


Awesome. The performance is now on par with the NB classifier. Let's try this same strategy for the other classifiers.

In [31]:
# Decision trees take a lot of parameters, making them an ideal use case for RandomizedSearchCV

param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)
tree_cv.fit(X_train_df, Y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [3, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a1b8a0da0>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a1b8a0a90>, 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [32]:
print("Tuned Tree Parameters: {}".format(tree_cv.best_params_)) 
print("Best score is {}".format(tree_cv.best_score_))

Tuned Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 2}
Best score is 0.8724575257238574


Very interesting - we see that the tree with parameters chosen from the grid actually performs worse than an out-of-the-box tree with default parameters. Perhaps we ought to just leave this one alone.

Let's try to work on the SVM now.

In [34]:
# Reusing the same param_grid we used for the logistic regression, since both classifiers take a C value
svm_cv = GridSearchCV(svm, param_grid, cv = 5)
svm_cv.fit(X_train_df, Y_train)

print("Tuned SVM Parameters: {}".format(svm_cv.best_params_)) 
print("Best score is {}".format(svm_cv.best_score_))

Tuned SVM Parameters: {'C': 2275.845926074791}
Best score is 0.9844460397224216


Wow! The SVM is now our second-best performer, going by best score. Impressive how much it was able to improve. 

While running the CV, however, a significant amount of processing power and time were used, making SVM impractical for very large datasets.

To wrap up, let's play with the Naive Bayes classifier a bit and see if we can get it to perform any better than it already has.

In [42]:
alpha_values = np.arange(0.1, 4, 0.1)
alpha_grid = {'alpha': alpha_values}

nb_cv = GridSearchCV(nb, alpha_grid, cv = 5)
nb_cv.fit(X_train_df, Y_train)

print("Tuned NB Parameters: {}".format(nb_cv.best_params_)) 
print("Best score is {}".format(nb_cv.best_score_))

Tuned NB Parameters: {'alpha': 1.8000000000000003}
Best score is 0.9818138310600623


Hmm. Looks like the performance is roughly equal or even a little worse. For this dataset, using out-of-the-box NB seems to be more ideal than fussing with the hyperparameters.