# Naive Bayes and Text Data #
### 15 July 2015 ###

**Applying Bayes Theorem to the Iris Data set**

In [1]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [12]:
# load the iris data
iris = load_iris()

# round up the measurements
X = np.ceil(iris.data)

# clean up column names: drop the ' (cm)' unit and use underscores.
# An explicit replace (rather than slicing off the last five characters)
# leaves a name intact if it does not carry the unit suffix.
col_names = [name.replace(' (cm)', '').replace(' ', '_') for name in iris.feature_names]

# read into pandas
df = pd.DataFrame(X, columns=col_names)

# create a list of species: indexing target_names with the whole target array
# looks up every species name in one step
species = list(iris.target_names[iris.target])

# add the species list as a new DataFrame column
df['species'] = species

In [13]:
# display the raw object returned by load_iris() (it includes the 'data' array)
iris

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [14]:
# preview the first rows: rounded-up measurements plus the species column
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,6,4,2,1,setosa
1,5,3,2,1,setosa
2,5,4,2,1,setosa
3,5,4,2,1,setosa
4,5,4,2,1,setosa


Let's say there's an out of sample observation of 7, 3, 5, 2

Predict the species of iris

In [17]:
# select every observation whose rounded features are exactly [7, 3, 5, 2]
matches_7352 = (
    (df.sepal_length == 7)
    & (df.sepal_width == 3)
    & (df.petal_length == 5)
    & (df.petal_width == 2)
)
df[matches_7352]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
54,7,3,5,2,versicolor
58,7,3,5,2,versicolor
63,7,3,5,2,versicolor
68,7,3,5,2,versicolor
72,7,3,5,2,versicolor
73,7,3,5,2,versicolor
74,7,3,5,2,versicolor
75,7,3,5,2,versicolor
76,7,3,5,2,versicolor
77,7,3,5,2,versicolor


In [15]:
# tally the species among the observations with features [7, 3, 5, 2]
is_7352 = (
    (df.sepal_length == 7)
    & (df.sepal_width == 3)
    & (df.petal_length == 5)
    & (df.petal_width == 2)
)
df.loc[is_7352, 'species'].value_counts()

versicolor    13
virginica      4
dtype: int64

In [16]:
# count species for all observations (the per-class totals)
df.species.value_counts()

setosa        50
versicolor    50
virginica     50
dtype: int64

So, looking at it with Bayes' theorem: calculate the probability of each class given the measurements.

$$P(\text{versicolor} \mid 7352) = \frac{P(7352 \mid \text{versicolor}) \, P(\text{versicolor})}{P(7352)}$$

$$P(\text{versicolor} \mid 7352) = \frac{(13/50) \times (50/150)}{17/150} = \frac{13}{17} \approx 0.76$$

We frame the classification problem as three conditional probability equations (one per species) using Bayes' theorem, and predict the species with the highest probability.

## Naive Bayes Classification ##



In [14]:
# load the SMS data: a tab-separated file without a header row,
# so the two column names are supplied explicitly
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/sms.tsv'
sms_columns = ['label', 'msg']
sms = pd.read_csv(url, sep='\t', header=None, names=sms_columns)

In [15]:
# preview the first messages with their text labels
sms.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# check the class balance: how many messages carry each label
sms.label.value_counts()

ham     4825
spam     747
dtype: int64

In [17]:
# convert the text labels to numbers (ham -> 0, spam -> 1).
# Guarded so the cell is safe to re-run: mapping a column that has already
# been converted would turn every label into NaN.
label_codes = {'ham': 0, 'spam': 1}
if sms.label.isin(list(label_codes)).all():
    sms['label'] = sms.label.map(label_codes)

In [18]:
# confirm the labels are now numeric
sms.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# split messages and labels into training and testing sets.
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# sklearn.cross_validation is only tried for old installs that lack it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# random_state=1 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(sms.msg, sms.label, random_state=1)

(1393L,)

In [21]:
# sizes of the two splits; the parenthesised single-argument print works on
# both Python 2 and Python 3
print(X_train.shape)
print(X_test.shape)

(4179L,)
(1393L,)


### **Count Vectorizer** ###

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# three made-up text messages to experiment with before using the real data
train_simple = [
    'call you tonight',
    'Call me a cab',
    'please call me... PLEASE!',
]

In [24]:
# instantiate the vectorizer without overriding any settings
vect = CountVectorizer()

In [25]:
# Count Vectorizer is not a model but it has a fit method;
# here it is fitted on the three sample messages
vect.fit(train_simple)

CountVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
# list the tokens the vectorizer picked up from the sample messages
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version
vect.get_feature_names()

[u'cab', u'call', u'me', u'please', u'tonight', u'you']

Returns:
- lower case
- unique words
- ignores 'a' (the default `token_pattern` only keeps tokens of two or more characters)
- no punctuation

In [27]:
# turn the sample messages into a matrix using the fitted vectorizer
train_simple_dtm = vect.transform(train_simple)

*Transform* --> transform to *document term matrix*

In [28]:
# display the matrix summary (transform returns a sparse matrix)
train_simple_dtm

<3x6 sparse matrix of type '<type 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [29]:
# convert to a regular array so the individual counts are visible
train_simple_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

3 x 6 matrix
- one row for each message
- one column for each token (word)

In [30]:
# same counts as a DataFrame, with each column labelled by its token
pd.DataFrame(train_simple_dtm.toarray(), columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [31]:
# creating test data (pretend); note that "don't" does not occur in train_simple
test_simple = ["please don't call me"]

In [32]:
# transform only (no fit): reuse the vectorizer fitted on train_simple
test_simple_dtm = vect.transform(test_simple)

In [34]:
# convert to a regular array so the individual counts are visible
test_simple_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [35]:
# same counts as a DataFrame, with each column labelled by its token
pd.DataFrame(test_simple_dtm.toarray(), columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


It ignores words that it has never seen before. Why?

1. We can't do anything in Naive Bayes with words we've never seen before ("don't")
2. We're training the model on six features (the six individual words). Can't test on a different number of features!

So the steps for using a vectorizer:

1. Fit --> learn the vocabulary
    - Fit on the training data
    - Transform the training data
2. Transform --> use the learned vocabulary to construct the document-term matrix
    - Transform the testing data (never fit on it)

### Now using it on the main sms data ###

In [36]:
# need to re-instantiate
vect = CountVectorizer()
# since we have to do a fit and then a transform on the training data,
# there's a neat function that combines the two
train_dtm = vect.fit_transform(X_train)

In [37]:
# transform only (no fit): the test messages reuse the vectorizer fitted on X_train
test_dtm = vect.transform(X_test)

In [38]:
# display the summary of the training document-term matrix
train_dtm

<4179x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [39]:
# display the summary of the testing document-term matrix
test_dtm

<1393x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [50]:
## EXAMINING THE FEATURES AND THEIR COUNTS

# store feature names and examine them.
# get_feature_names_out() is the accessor on current scikit-learn; fall back to
# get_feature_names() on old installs that predate it. tolist() turns the
# returned array into a plain list of strings.
if hasattr(vect, 'get_feature_names_out'):
    train_features = vect.get_feature_names_out().tolist()
else:
    train_features = vect.get_feature_names()

# first and last 50 tokens; the parenthesised single-argument print works on
# both Python 2 and Python 3
print(train_features[:50])
print(train_features[-50:])

[u'00', u'000', u'008704050406', u'0121', u'01223585236', u'01223585334', u'0125698789', u'02', u'0207', u'02072069400', u'02073162414', u'02085076972', u'021', u'03', u'04', u'0430', u'05', u'050703', u'0578', u'06', u'07', u'07008009200', u'07090201529', u'07090298926', u'07123456789', u'07732584351', u'07734396839', u'07742676969', u'0776xxxxxxx', u'07781482378', u'07786200117', u'078', u'07801543489', u'07808', u'07808247860', u'07808726822', u'07815296484', u'07821230901', u'07880867867', u'0789xxxxxxx', u'07946746291', u'0796xxxxxx', u'07973788240', u'07xxxxxxxxx', u'08', u'0800', u'08000407165', u'08000776320', u'08000839402', u'08000930705']
[u'yer', u'yes', u'yest', u'yesterday', u'yet', u'yetunde', u'yijue', u'ym', u'ymca', u'yo', u'yoga', u'yogasana', u'yor', u'yorge', u'you', u'youdoing', u'youi', u'youphone', u'your', u'youre', u'yourjob', u'yours', u'yourself', u'youwanna', u'yowifes', u'yoyyooo', u'yr', u'yrs', u'ything', u'yummmm', u'yummy', u'yun', u'yunny', u'yuo', u'

In [51]:
# convert train_dtm to a regular array (every cell is stored explicitly,
# unlike the sparse matrix it comes from)
train_arr = train_dtm.toarray()
train_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [52]:
# total occurrences of EACH token across ALL messages in train_arr:
# collapsing axis 0 (the rows) leaves one sum per column
import numpy as np
train_arr.sum(axis=0)

array([ 5, 23,  2, ...,  1,  1,  1], dtype=int64)

In [74]:
# create a DataFrame of tokens with their counts, displayed in ascending
# order of count. DataFrame.sort() no longer exists in current pandas;
# sort_values() is the replacement for sorting by a column.
train_token_counts = pd.DataFrame({'token':train_features, 'count':np.sum(train_arr, axis=0)})
train_token_counts.sort_values('count')

Unnamed: 0,count,token
3727,1,jules
4172,1,mallika
4169,1,malarky
4165,1,makiing
4161,1,maintaining
4158,1,mails
4157,1,mailed
4151,1,magicalsongs
4150,1,maggi
4149,1,magazine


In [57]:
## MODEL BUILDING WITH NAIVE BAYES
## http://scikit-learn.org/stable/modules/naive_bayes.html

# train a Naive Bayes model using train_dtm
# (note that we're fitting on train_dtm, because we need numbers)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
# make class predictions on test data using test_dtm
# (the test messages in the same numeric form the model was fitted on)
y_pred_class = nb.predict(test_dtm)

In [61]:
# compare predictions to true labels: overall accuracy, then the confusion
# matrix. The parenthesised single-argument print works on both Python 2
# and Python 3.
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.confusion_matrix(y_test, y_pred_class))

0.988513998564
[[1203    5]
 [  11  174]]


In [63]:
# predict (poorly calibrated) probabilities and calculate AUC
# can't take them seriously as calibrated
# [:, 1] keeps only the second column of predict_proba's output
# (presumably the probability of class 1, i.e. spam -- verify via nb.classes_)
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]
y_pred_prob

array([  2.87744864e-03,   1.83488846e-05,   2.07301295e-03, ...,
         1.09026171e-06,   1.00000000e+00,   3.98279868e-09])

In [64]:
# AUC from the predicted probabilities; the parenthesised single-argument
# print works on both Python 2 and Python 3
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.986643100054


In [75]:
# show the message text for the false positives:
# true label ham (0) but predicted spam (1)
X_test[(y_test == 0) & (y_pred_class == 1)]

array(['Waiting for your call.', 'Also andros ice etc etc',
       'No calls..messages..missed calls', 'No pic. Please re-send.',
       'No calls..messages..missed calls'], dtype=object)

In [78]:
# show the message text for the false negatives:
# true label spam (1) but predicted ham (0)
X_test[(y_test == 1) & (y_pred_class == 0)]

array([ "LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.",
       "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, \xc2\xa31.50 to rcv",
       "Xmas & New Years Eve tickets are now on sale from the club, during the day from 10am till 8pm, and on Thurs, Fri & Sat night this week. They're selling fast!",
       "Hi I'm sue. I am 20 years old and work as a lapdancer. I love sex. Text me live - I'm i my bedroom now. text SUE to 89555. By TextOperator G2 1DA 150ppmsg 18+",
       'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!',
       'CALL 09090900040 & LISTEN TO EXTREME DIRTY LIVE CHAT GOING ON IN THE OFFICE RIGHT NOW TOTAL PRIVACY NO ONE KNOWS YOUR [sic] LISTENING 60P MIN 24/7MP 0870753331018+',
       'thesmszone.com lets you send free anonymous and maske

### We can still use logistic regression ###

In [79]:
## COMPARING NAIVE BAYES WITH LOGISTIC REGRESSION

# instantiate/fit/predict
from sklearn.linear_model import LogisticRegression
# NOTE(review): C=1e9 is a very large value; in scikit-learn C is the inverse
# regularization strength, so this presumably aims to switch regularization
# (nearly) off -- confirm that is intended
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm, y_train)
# NOTE: the next two lines rebind y_pred_class and y_pred_prob, which held the
# Naive Bayes predictions until now
y_pred_class = logreg.predict(test_dtm)
y_pred_prob = logreg.predict_proba(test_dtm)[:, 1]

In [80]:
# evaluate: accuracy, confusion matrix, then AUC. The parenthesised
# single-argument print works on both Python 2 and Python 3.
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.confusion_matrix(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.989231873654
[[1206    2]
 [  13  172]]
0.994144889923


Hey so it actually did a tiny bit better than Naive Bayes...

In [82]:
# show the false positives: true label ham (0) but predicted spam (1)
X_test[(y_test == 0) & (y_pred_class == 1)]

array([ 'Cheers for the message Zogtorius. I\xc2\x92ve been staring at my phone for an age deciding whether to text or not.',
       "Forgot you were working today! Wanna chat, but things are ok so drop me a text when you're free / bored etc and i'll ring. Hope all is well, nose essay and all xx"], dtype=object)

In [83]:
# show the false negatives: true label spam (1) but predicted ham (0)
X_test[(y_test == 1) & (y_pred_class == 0)]

array(['Call FREEPHONE 0800 542 0578 now!',
       'Urgent Ur \xc2\xa3500 guaranteed award is still unclaimed! Call 09066368327 NOW closingdate04/09/02 claimcode M39M51 \xc2\xa31.50pmmorefrommobile2Bremoved-MobyPOBox734LS27YF',
       "LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.",
       'Would you like to see my XXX pics they are so hot they were nearly banned in the uk!',
       'CALL 09090900040 & LISTEN TO EXTREME DIRTY LIVE CHAT GOING ON IN THE OFFICE RIGHT NOW TOTAL PRIVACY NO ONE KNOWS YOUR [sic] LISTENING 60P MIN 24/7MP 0870753331018+',
       'thesmszone.com lets you send free anonymous and masked messages..im sending this message from there..do you see the potential for abuse???',
       'RECPT 1/3. You have ordered a Ringtone. Your order is being processed...',
       'Hi this is Amy, we will be sending you a free phone number in a couple of days, which will give 

In [84]:
## BONUS CONTENT: CALCULATING THE 'SPAMMINESS' OF EACH TOKEN

# create separate DataFrames for ham and spam
sms_ham = sms[sms.label==0]
sms_spam = sms[sms.label==1]

# learn the vocabulary of ALL messages and save it
# NOTE: this re-fits the shared `vect`, so from here on it no longer holds the
# training-only vocabulary that train_dtm and test_dtm were built with
vect.fit(sms.msg)
# get_feature_names_out() is the accessor on current scikit-learn; fall back to
# get_feature_names() on old installs that predate it
if hasattr(vect, 'get_feature_names_out'):
    all_features = vect.get_feature_names_out().tolist()
else:
    all_features = vect.get_feature_names()

# create document-term matrix of ham, then convert to a regular array
ham_dtm = vect.transform(sms_ham.msg)
ham_arr = ham_dtm.toarray()

# create document-term matrix of spam, then convert to a regular array
spam_dtm = vect.transform(sms_spam.msg)
spam_arr = spam_dtm.toarray()

# count how many times EACH token appears across ALL messages in ham_arr
ham_counts = np.sum(ham_arr, axis=0)

# count how many times EACH token appears across ALL messages in spam_arr
spam_counts = np.sum(spam_arr, axis=0)

# create a DataFrame of tokens with their separate ham and spam counts
all_token_counts = pd.DataFrame({'token':all_features, 'ham':ham_counts, 'spam':spam_counts})

# add one to ham counts and spam counts so that ratio calculations (below) make more sense
# (no token ends up with a zero count, so the ratio never divides by zero)
all_token_counts['ham'] = all_token_counts.ham + 1
all_token_counts['spam'] = all_token_counts.spam + 1

# calculate ratio of spam-to-ham for each token and display the table in
# ascending order of that ratio. DataFrame.sort() no longer exists in
# current pandas; sort_values() is the replacement for sorting by a column.
all_token_counts['spam_ratio'] = all_token_counts.spam / all_token_counts.ham
all_token_counts.sort_values('spam_ratio')

Unnamed: 0,ham,spam,token,spam_ratio
3684,319,1,gt,0.003135
4793,317,1,lt,0.003155
3805,232,1,he,0.004310
6843,168,1,she,0.005952
4747,163,1,lor,0.006135
2428,151,1,da,0.006623
4550,136,1,later,0.007353
1247,90,1,ask,0.011111
6626,90,1,said,0.011111
2714,89,1,doing,0.011236
