![title](Header__0002_8.png)
___
# Chapter 8 - Basic Algorithmic Learning
## Segment 3 - Naive Bayes Classifiers

In [None]:
# We'll be building a spam filter.  
#
# Naive Bayes is a machine learning method you can use to predict the likelihood that an event will occur given 
# evidence that's present in your data. 
#
# Conditional probability   
# P(B|A) = P(A and B) / P(A)
#
# 3 types of Naive Bayes Models:
#   1 - Multinomial - Good for when your features (categorical or continuous) describe discrete frequency counts
# (Ex word counts)
#  2 - Bernoulli - good for making predictions from binary features. 
#  3 - Gaussian - good for making predictions from normally distributed features. 
#
# Use Cases:
# - Spam detection
# - Customer Classification
# - Credit risk prediction
# - health risk prediction
#
# Assumptions:
# - Predictors (features) are independent of each other, given the class. 
# - A priori assumption: this is an assumption that the past conditions still hold true. When we make predictions from 
# historical values, we will get incorrect results if present circumstances have changed. 
# - All regression models maintain an a priori assumption as well. 
#
#

In [1]:
import numpy as np
import pandas as pd

import urllib
try:
    # Python 3 keeps urlopen in urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2 exposes urlopen directly on urllib.
    from urllib import urlopen

import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
try:
    # scikit-learn >= 0.18: train_test_split lives in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

## Naive Bayes
### Using Naive Bayes to predict spam

In [2]:
# The Spambase dataset is hosted by the UCI Machine Learning Repository
# (University of California, Irvine).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Open the remote file. `urlopen` comes from the imports cell, where it is
# resolved for either Python 2 or Python 3.
raw_data = urlopen(url)
try:
    # The file is comma-separated, so NumPy can parse it straight from the
    # open handle into a 2-D float array.
    dataset = np.loadtxt(raw_data, delimiter=",")
finally:
    # Release the network connection even if parsing fails.
    raw_data.close()

# Show the first row as a quick sanity check of the load.
print(dataset[0])

[   0.       0.64     0.64     0.       0.32     0.       0.       0.       0.
    0.       0.       0.64     0.       0.       0.       0.32     0.
    1.29     1.93     0.       0.96     0.       0.       0.       0.       0.
    0.       0.       0.       0.       0.       0.       0.       0.       0.
    0.       0.       0.       0.       0.       0.       0.       0.       0.
    0.       0.       0.       0.       0.       0.       0.       0.778
    0.       0.       3.756   61.     278.       1.   ]


In [3]:
# Target vector: the last column of the dataset holds the label
# (1 = spam, 0 = not spam).
y = dataset[:, -1]

# Predictor matrix: keep only the first 48 columns, which hold the
# word-frequency features used by the models below.
X = dataset[:, :48]

In [5]:
# Split the data into training and test partitions.
# test_size=0.33 reserves 33% of the rows for testing; the remainder is used
# for training. random_state is the seed, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=17,
)

In [10]:
# The predictors are continuous word-frequency values, but BernoulliNB models
# binary features. Its `binarize` argument is a numeric threshold, not an
# on/off flag: values above the threshold become 1 and everything else 0.
# The original `binarize=True` was compared as the number 1.0, so the
# threshold is spelled out here; the fitted model is the same.
BernNB = BernoulliNB(binarize=1.0)
BernNB.fit(X_train, y_train)
print(BernNB)

# Generate predicted labels for the held-out rows and keep the true labels
# under the name the later cells use.
y_expect = y_test
y_pred = BernNB.predict(X_test)

# Assess the accuracy of the model (about 0.856 in the recorded run).
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=True, class_prior=None, fit_prior=True)
0.855826201448


In [11]:
# Fit a Multinomial Naive Bayes model on the same training split.
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)
print(MultiNB)

# Score it against the held-out labels (`y_expect` is set in the first
# BernoulliNB cell). The recorded run gives about 0.874, a bit better than
# the first Bernoulli model.
y_pred = MultiNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.873601053325


In [12]:
# Fit a Gaussian Naive Bayes model on the same training split.
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)
print(GausNB)

# Score it against the held-out labels (`y_expect` is set in the first
# BernoulliNB cell). The recorded run gives about 0.813.
y_pred = GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

GaussianNB()
0.813034891376


In [13]:
# Through trial and error the accuracy of the Bernoulli model can be improved.
# The only change from the first BernoulliNB cell is the binarization
# threshold: values above 0.1 are treated as 1, everything else as 0.
# Note that this rebinds the name `BernNB` to the newly fitted model.
BernNB = BernoulliNB(binarize=0.1)
BernNB.fit(X_train, y_train)
print(BernNB)

# Re-score on the held-out rows (about 0.895 in the recorded run).
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.1, class_prior=None, fit_prior=True)
0.895325872284
