In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Iris example

In [125]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()
gnb = GaussianNB()
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)

print("Number of mislabeled points out of a total {} points : {}".format(iris.data.shape[0],(iris.target != y_pred).sum()))

Number of mislabeled points out of a total 150 points : 6


# Spam or Ham

## You do

* Read in the file
* rename the columns
* print out the first five rows
* What are the labels and the label counts?

In [150]:
df = pd.read_csv('smsspamcollection/SMSSpamCollection', sep = '\t', header=None)
df.columns = ['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [151]:
df.label.value_counts()

ham     4825
spam     747
dtype: int64

In [152]:
df['label'] = df.label.map({'ham': 0 , 'spam':1})

In [153]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Working with text

In [139]:
from sklearn.feature_extraction.text import CountVectorizer

example = ['this is sentance one one', 'this is sentance two']

count_vect = CountVectorizer()
count_vect.fit(example)
example_count = count_vect.transform(example)

In [140]:
print(count_vect.get_feature_names())
print(example_count.toarray())

[u'is', u'one', u'sentance', u'this', u'two']
[[1 2 1 1 0]
 [1 0 1 1 1]]


### Try to do this on the spam data set

In [51]:
count_vect = CountVectorizer()
count_vect.fit(np.array(df['message']))
example_count = count_vect.transform(np.array(df['message'])).toarray()

In [98]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(example_count, df.label, random_state=1)

### Build a Naive Bayes model on the train set and predict on the test set

### How well did it do?

In [141]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [142]:
prediction = nb.predict(X_test)
1.0*sum(prediction == y_test)/len(prediction)

0.9849246231155779

In [143]:
from sklearn import metrics
print metrics.confusion_matrix(y_test, prediction)

[[1198   10]
 [  11  174]]


### Adding in some stop words and min df

In [119]:
count_vect = CountVectorizer(min_df = 3, stop_words='english')
count_vect.fit(np.array(df['message']))
example_count = count_vect.transform(np.array(df['message'])).toarray()

nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [120]:
prediction = nb.predict(X_test)
1.0*sum(prediction == y_test)/len(prediction)

0.9849246231155779

In [121]:
from sklearn import metrics
print metrics.confusion_matrix(y_test, prediction)

[[1198   10]
 [  11  174]]


In [122]:
len(count_vect.get_feature_names())

2586