In [8]:
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd

---
Naive Bayes for Continuous Features
=====
***

## $$p(x_{i}\text{ }|\text{ }y) = \frac{1}{\sqrt{2\pi\sigma^{2}_{y}}}\exp\left(-\frac{\left(x_{i}-\mu_{y}\right)^{2}}{2\sigma^{2}_{y}} \right)$$

- for each feature within each class, you calculate a mean and a variance, $\mu_{y}$ and $\sigma^{2}_{y}$
- for each example, you evaluate the Gaussian density of every feature value using the mean and variance of each class

#### Consider the Iris Dataset

In [68]:
# Load the Iris dataset shipped with scikit-learn. The cells below read its
# `data` attribute (feature values) and `target` attribute (class labels).
iris = datasets.load_iris()

In [129]:
def get_means_and_variances(iris):
    '''Compute the mean and the variance of every feature, separately per class.

    Parameters
    ----------
    iris : object exposing ``data`` and ``target``
        ``data`` is a 2-D array (one row per sample, one column per feature)
        and ``target`` holds one class label per row of ``data``.

    Returns
    -------
    (means, varis) : tuple of two flat lists
        Both lists are ordered feature-major, class-minor: the entry for
        feature ``f`` and the ``c``-th smallest class label is stored at
        index ``f * n_classes + c``. For Iris (4 features, 3 classes) each
        list therefore has 12 entries.
    '''
    data = np.asarray(iris.data)
    target = np.asarray(iris.target)

    # Derive the layout from the data instead of hard-coding Iris' 4 x 3 shape.
    classes = np.unique(target)  # sorted distinct labels
    n_features = data.shape[1]

    means = []
    varis = []
    for feature in range(n_features):
        for cls in classes:
            # Values of this feature restricted to the samples of this class
            values = data[target == cls, feature]

            means.append(values.mean())
            # Population variance (ddof=0), i.e. the square of values.std()
            varis.append(values.var())

    return (means, varis)

In [130]:
# mv[0] holds the means and mv[1] the variances. Both are flat lists in which
# the entry for feature i and class j sits at index i * 3 + j.
mv = get_means_and_variances(iris)

In [131]:
def calculate_L(iris, n, mv):
    '''Evaluate the Gaussian class-conditional density of each feature of record n.

    Parameters
    ----------
    iris : object exposing ``data``
        ``iris.data[n]`` must be the feature vector of the record to score.
    n : int
        Index of the record inside ``iris.data``.
    mv : pair (means, variances)
        Two flat lists of equal length, ordered feature-major, class-minor,
        so that feature ``i`` / class ``j`` is found at ``i * n_classes + j``.

    Returns
    -------
    list of lists
        ``res[i][j]`` is the density of feature ``i`` of the record under the
        Gaussian of class ``j``. No variance smoothing is applied, so a zero
        variance makes the density undefined.

    Raises
    ------
    ValueError
        If the lengths of ``mv[0]`` / ``mv[1]`` do not match a whole number
        of classes per feature.
    '''
    means, variances = mv[0], mv[1]
    record = iris.data[n]

    n_features = len(record)
    if n_features == 0:
        return []

    # Infer the class count from the flat lists rather than hard-coding 3.
    n_classes = len(means) // n_features
    if len(means) != n_features * n_classes or len(variances) != len(means):
        raise ValueError(
            'mv must hold one mean and one variance per (feature, class) pair')

    # The overall results accumulator: one inner list per feature
    res = []
    for i, feature in enumerate(record):

        # Densities of this feature value under every class
        res_in = []
        for j in range(n_classes):

            # Position of this (feature, class) pair in the flat lists
            index = i * n_classes + j
            mean = means[index]
            var = variances[index]

            # Gaussian density: exp(-(x - mu)^2 / (2 var)) / sqrt(2 pi var)
            diff = feature - mean
            p = (1.0/np.sqrt(2.0*np.pi*var))*\
            np.exp(-(diff*diff)/(2.0*var))

            res_in.append(p)

        res.append(res_in)
    return res

In [138]:
def print_res(res):
    '''Pretty-print the per-feature class densities and their per-class product.

    Parameters
    ----------
    res : list of lists
        ``res[i][j]`` is the density of feature ``i`` under class ``j``, as
        returned by ``calculate_L``. Any number of features and classes is
        accepted; the class count is taken from the first feature's row.

    The output first lists, for every feature, the value for each class on
    one line. It then prints, for every class, the product of that class's
    values over all features. No class prior is applied; the class with the
    largest product wins the classification.
    '''
    n_classes = len(res[0]) if res else 0

    # Build the whole report first and emit it with a single print(...) call,
    # which is valid under both Python 2 and Python 3.
    chunks = []

    # For each feature, show the class values on one line
    for i, per_class in enumerate(res):
        chunks.append('\n\nfeature {:d}\n'.format(i+1))
        chunks.append(' '.join('class {:d} = {:5.2f}'.format(j, p)
                               for j, p in enumerate(per_class)))

    # Now multiply BY CLASS across the features: p(x1|C)*p(x2|C)*...*p(xk|C)
    products = []
    for j in range(n_classes):
        pres = 1.0
        for per_class in res:
            pres *= per_class[j]
        products.append('class {:d} = {:5.5f}'.format(j, pres))

    chunks.append('\n\n\n')
    chunks.append(' '.join(products))
    # The biggest product wins the classification
    print(''.join(chunks))
        

In [139]:
#Record at index 1 (the second sample); with scikit-learn's Iris ordering
#its true label is expected to be 0
print(iris.target[1])
res = calculate_L(iris, 1, mv)
print_res(res)

In [140]:
#Record at index 51 (the fifty-second sample); with scikit-learn's Iris
#ordering its true label is expected to be 1
print(iris.target[51])
res = calculate_L(iris, 51, mv)
print_res(res)

In [141]:
#Record at index 101 (the one hundred and second sample); with scikit-learn's
#Iris ordering its true label is expected to be 2
print(iris.target[101])
res = calculate_L(iris, 101, mv)
print_res(res)

#### Now use the sklearn routine and run a quick classifier

In [142]:
# Hold out half of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.5,
    random_state=2
)

In [143]:
# Fit a Gaussian Naive Bayes classifier on the training half.
clf = GaussianNB()
clf.fit(X_train, y_train)

In [144]:
# Predict a class label for every sample in the held-out test half.
y_pred = clf.predict(X_test)

In [145]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
pd.crosstab(
    y_test,
    y_pred,
    rownames=["Actual"],
    colnames=["Predicted"]
)

In [None]:
---
Question
===
1. What advantages do you not