**CLASS**: Naive Bayes SMS spam classifier using sklearn

Data source: https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [None]:
## READING IN THE DATA

# read tab-separated file using pandas
import pandas as pd
%matplotlib inline

In [None]:
# load the tab-separated SMS file; header=None means the first line is data,
# so the two columns are named explicitly
df = pd.read_csv('../data/sms.tsv', sep='\t', header=None,
                 names=['label', 'msg'])

In [None]:
# peek at the first five rows
df.head(n=5)

In [None]:
# bar chart of how many messages carry each label
df['label'].value_counts().plot(kind='bar')

In [None]:
# null accuracy rate: each label's share of all rows
df['label'].value_counts() / len(df)

In [None]:
# summary statistics for the message text column
df['msg'].describe()

In [None]:
# convert label to a quantitative binary variable (ham -> 0, spam -> 1)
# the identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run: without
# them a second run would map the already-converted 0/1 values to NaN
df['label'] = df.label.map({'ham': 0, 'spam': 1, 0: 0, 1: 1})
df.head()

In [None]:
# split into training and testing sets (random_state fixes the shuffle so the split is repeatable)
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# toy corpus of three short documents to start with
train_simple = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE 44!',
]

# fit the vectorizer on the toy corpus, then list the 'vocabulary' it learned
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version
vect = CountVectorizer()
train_simple_dtm = vect.fit_transform(raw_documents=train_simple)
vect.get_feature_names()

In [None]:
# build the 'document-term matrix' for the toy corpus using the fitted vectorizer
train_simple_dtm = vect.transform(raw_documents=train_simple)
train_simple_dtm

In [None]:
# show the document-term matrix with each column labelled by its vocabulary term
pd.DataFrame(data=train_simple_dtm.toarray(), columns=vect.get_feature_names())

In [None]:
# transform testing data into a document-term matrix using the existing vocabulary;
# notice that "don't" has no column because it was not in the training vocabulary
test_simple = ["please don't call me"]
test_simple_dtm = vect.transform(test_simple)
# (a bare toarray() call used to sit here; only a cell's last expression is
# displayed, so its result was silently discarded)
pd.DataFrame(test_simple_dtm.toarray(), columns=vect.get_feature_names())

In [None]:
## REPEAT PATTERN WITH SMS DATA

# fresh vectorizer for the real SMS messages
vect = CountVectorizer()

# fit on the training messages and build their document-term matrix in one call
train_dtm = vect.fit_transform(raw_documents=X_train)
train_dtm

In [None]:
# vectorize the test messages with the vocabulary fitted on the training messages
test_dtm = vect.transform(raw_documents=X_test)
test_dtm

In [None]:
# store feature names and examine them
train_features = vect.get_feature_names()
# only a cell's last expression is displayed, so the vocabulary size must be printed explicitly
print(len(train_features))
# first ten and last ten tokens of the vocabulary
train_features[:10], train_features[-10:]

In [None]:
# convert train_dtm (the training document-term matrix) to a regular dense array
train_arr = train_dtm.toarray()
# remember that each ROW is an sms
# and each COLUMN is a token
# the bare name below displays the array as the cell output
train_arr

In [None]:
## SIMPLE SUMMARIES OF THE TRAINING DATA
# numpy refresher: a small 2x4 array holding the integers 1..8 to practise on
import numpy as np
arr = np.arange(1, 9).reshape(2, 4)
arr

In [None]:
sum(arr[0])  # total of the first row

In [None]:
sum(row[0] for row in arr)  # total of the first column

In [None]:
################
### EXERCISE ###
################
# count how many times the 0th token appears across ALL messages in train_arr


In [None]:
################
### EXERCISE ###
################
#calculate the number of tokens in the 0th message in train_arr


In [None]:
arr.sum()  # grand total of every element

In [None]:
arr.sum(axis=0)  # one total per column

In [None]:
arr.sum(axis=1)  # one total per row

In [None]:
################
### EXERCISE ###
################
# count how many times EACH token appears across ALL messages in train_arr


In [None]:
# create a dataframe with two columns, "count" and "token", that holds the total count for each token
# summing train_arr down axis 0 adds up every message's count for a given token
train_token_counts = pd.DataFrame({'token':train_features, 'count':np.sum(train_arr, axis=0)})
# sort_index(by=...) is no longer accepted by current pandas; sort_values is the
# supported way to order rows by a column's values
train_token_counts.sort_values(by='count', ascending=False).head()

In [None]:
## MODEL BUILDING WITH NAIVE BAYES
## http://scikit-learn.org/stable/modules/naive_bayes.html

from sklearn.naive_bayes import MultinomialNB

# fit a multinomial Naive Bayes classifier on the training document-term matrix
nb = MultinomialNB()
nb.fit(X=train_dtm, y=y_train)

In [None]:
# predict a class for every test message from its document-term row
preds = nb.predict(X=test_dtm)
preds

In [None]:
# compare predictions to true labels: overall accuracy, then the confusion matrix
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
from sklearn import metrics
print(metrics.accuracy_score(y_test, preds))
print(metrics.confusion_matrix(y_test, preds))

In [None]:
# predict (poorly calibrated) probabilities and calculate AUC
# column 1 of predict_proba is the probability of the second class (label 1)
probs = nb.predict_proba(test_dtm)[:, 1]
print(metrics.roc_auc_score(y_test, probs))
# moved to the end of the cell: a bare expression is only displayed when it is last
probs

In [None]:
################
### EXERCISE ###
################
# show the message text for the false positives


In [None]:
################
### EXERCISE ###
################
# show the message text for the false negatives


In [None]:
## COMPARE NAIVE BAYES AND LOGISTIC REGRESSION
## USING ALL DATA AND CROSS-VALIDATION
# count 1- to 3-word n-grams, dropping English stop words
vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
# create a document-term matrix using all data
all_dtm = vect.fit_transform(df.msg)

# instantiate logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

# compare the two models using cross-validation (the timing cells below score on accuracy)
# note: this is slightly improper cross-validation... can you figure out why?
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import cross_val_score

from datetime import datetime

In [None]:
# time a 10-fold cross-validated accuracy estimate for logistic regression
now = datetime.now()
mean = cross_val_score(logreg, all_dtm, df.label, cv=10, scoring='accuracy').mean()
# %-formatting inside print(...) yields the same text on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3
print("%s Seconds for logistic regression %s" % ((datetime.now() - now).total_seconds(), mean))

In [None]:
# time a 10-fold cross-validated accuracy estimate for Naive Bayes
now = datetime.now()
mean = cross_val_score(nb, all_dtm, df.label, cv=10, scoring='accuracy').mean()
# %-formatting inside print(...) yields the same text on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3
print("%s Seconds for naive bayes %s" % ((datetime.now() - now).total_seconds(), mean))
# about 4 times faster (as noted by the original author)

In [None]:
'''
Model evaluation metrics (confusion matrix, ROC/AUC)
'''

## READ DATA AND SPLIT INTO TRAIN/TEST

# load the Default dataset from CSV and preview its first five rows
import pandas as pd
data = pd.read_csv('../data/Default.csv')
data.head(n=5)

In [None]:
# features: a one-column DataFrame holding 'balance'; target: the 'default' column
X = data[['balance']]
y = data['default']

In [None]:
# split into train and test (random_state fixes the shuffle so the split is repeatable)
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# fit a logistic regression with default settings on the training split
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# score() predicts on the test split and returns the accuracy in a single call
logreg.score(X=X_test, y=y_test)

In [None]:
# predict in one step, calculate accuracy in a separate step
preds = logreg.predict(X_test)
from sklearn import metrics
# print(...) with a single argument works on both Python 2 and Python 3
print(metrics.accuracy_score(y_test, preds))

In [None]:
# compare to the null accuracy rate: the accuracy obtained by always predicting
# the majority class (assumes y_test is coded 0/1, so its mean is the share of 1s)
# the original evaluated y_test.mean() on its own line, where the result was
# discarded; taking the max shows the majority share whichever class is larger
max(y_test.mean(), 1 - y_test.mean())

In [None]:
## CONFUSION MATRIX

# print confusion matrix (rows are actual classes, columns are predicted classes)
# print(...) with a single argument works on both Python 2 and Python 3
print(metrics.confusion_matrix(y_test, preds))

In [None]:
# nicer confusion matrix: nltk's version prints a labelled table
from nltk import ConfusionMatrix
# print(...) with a single argument works on both Python 2 and Python 3
print(ConfusionMatrix(list(y_test), list(preds)))

In [None]:
# read the four cells of the 2x2 confusion matrix instead of hard-coding counts
# copied from an earlier run (those go stale whenever the split or model changes);
# rows are actual classes and columns are predicted classes, so ravel() yields
# true negatives, false positives, false negatives, true positives in that order
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()

# sensitivity: percent of correct predictions when reference value is 'default'
print(tp / float(tp + fn))
# recall_score computes the same quantity, shown as a cross-check
print(metrics.recall_score(y_test, preds))

# specificity: percent of correct predictions when reference value is 'not default'
print(tn / float(tn + fp))


In [None]:
# predicted probability of class 1 for each test row, shown as a histogram
import matplotlib.pyplot as plt
probs = logreg.predict_proba(X=X_test)[:, 1]
plt.hist(x=probs)

In [None]:
# use 0.5 cutoff for predicting 'default': 1 where the probability exceeds 0.5, else 0
import numpy as np
preds = np.where(probs > 0.5, 1, 0)
# print(...) with a single argument works on both Python 2 and Python 3
print(ConfusionMatrix(list(y_test), list(preds)))

In [None]:
# change cutoff for predicting default to 0.2: 1 where the probability exceeds 0.2, else 0
preds = np.where(probs > 0.2, 1, 0)
# print(...) with a single argument works on both Python 2 and Python 3
print(ConfusionMatrix(list(y_test), list(preds)))

In [None]:
# check accuracy, sensitivity, specificity for the current preds
# the counts come from the confusion matrix rather than numbers copied from an
# earlier run; rows are actual classes and columns are predicted classes, so
# ravel() yields true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()
print(metrics.accuracy_score(y_test, preds))
print(tp / float(tp + fn))  # sensitivity
print(tn / float(tn + fp))  # specificity

In [None]:
## ROC CURVES and AUC

# compute the ROC points from the predicted probabilities and draw the curve,
# with both axes fixed to the [0, 1] range
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
plt.plot(fpr, tpr)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

In [None]:
# calculate AUC (area under the ROC curve) from the predicted probabilities
# print(...) with a single argument works on both Python 2 and Python 3
print(metrics.roc_auc_score(y_test, probs))

In [None]:
# use AUC as evaluation metric for cross-validation (mean over 10 folds)
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import cross_val_score
X = data[['balance']]
y = data.default
logreg = LogisticRegression()
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()

In [None]:
# same 10-fold AUC estimate, now with 'income' added as a second feature
X = data[['balance', 'income']]
cross_val_score(estimator=logreg, X=X, y=y, cv=10, scoring='roc_auc').mean()

In [None]:
# ROC/AUC are very meaningful when:
# 1. class sizes are imbalanced
# 2. comparing across different binary classification models

In [None]:
## BONUS: CALCULATE THE 'SPAMMINESS' OF EACH TOKEN


# split the messages into two DataFrames by label value (0 and 1)
df_ham = df.loc[df.label == 0]
df_spam = df.loc[df.label == 1]

In [None]:
# fit a unigram vectorizer (English stop words removed) on ALL messages
# and keep the resulting vocabulary; fit() returns the vectorizer itself
vect = CountVectorizer(ngram_range=(1,1), stop_words='english').fit(df.msg)
all_features = vect.get_feature_names()

In [None]:
# vectorize the ham messages with the shared vocabulary, keeping a dense copy too
ham_dtm = vect.transform(df_ham['msg'])
ham_arr = ham_dtm.toarray()
ham_dtm

In [None]:
# vectorize the spam messages with the shared vocabulary, keeping a dense copy too
spam_dtm = vect.transform(df_spam['msg'])
spam_arr = spam_dtm.toarray()
spam_dtm

In [None]:
# total occurrences of EACH token over ALL ham messages (sum down the rows)
ham_counts = ham_arr.sum(axis=0)

In [None]:
# total occurrences of EACH token over ALL spam messages (sum down the rows)
spam_counts = spam_arr.sum(axis=0)

In [None]:
# one row per token, with its ham count and its spam count side by side
all_token_counts = pd.DataFrame(dict(token=all_features, ham=ham_counts, spam=spam_counts))
all_token_counts.head(n=5)

In [None]:
# add one to ham counts and spam counts so that ratio calculations (below) make more sense
# (a token with zero ham occurrences would otherwise cause a division by zero)
# the columns are rebuilt from the raw ham_counts / spam_counts arrays rather
# than incremented in place, so re-running this cell does not keep adding 1
all_token_counts['ham'] = ham_counts + 1
all_token_counts['spam'] = spam_counts + 1

In [None]:
# calculate ratio of spam-to-ham for each token and show the ten highest ratios
all_token_counts['spam_ratio'] = all_token_counts.spam / all_token_counts.ham
# sort_index(by=...) is no longer accepted by current pandas; sort_values is the
# supported way to order rows by a column's values
all_token_counts.sort_values(by='spam_ratio', ascending=False).head(10)