# Naive Bayes
### Dano Gillam

In [69]:
import numpy as np
import pandas as pd

In [70]:
seed_data = np.loadtxt('/home/danogillam/databases/seeds_dataset.txt')

In [71]:
print 'seed_data.shape = ', seed_data.shape
D = seed_data.shape[0] #D : dataset size


seed_data.shape =  (210, 8)


In [72]:
features = ['area','perimeter','compactness','length',
            'width','asymmetry_coefficient','groove_length']
label = ['species']
seeds = pd.DataFrame(seed_data,columns=features+label)

In [73]:
seeds.head()

Unnamed: 0,area,perimeter,compactness,length,width,asymmetry_coefficient,groove_length,species
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


## Problem 1

Randomly select a test set and training set

In [74]:
test_ind = np.random.choice(D,replace=False,size=40)
train_ind = list(set(range(D)).difference(test_ind))

train = seeds.iloc[train_ind]
test = seeds.iloc[test_ind][features]
test_labels  = seeds.iloc[test_ind][label].values.flatten()


Find the mean of each feature of each label in the training set

In [75]:
train_label_means = [train[features][train.species==i].mean() for i in xrange(1,4)]
train_label_means
print np.shape(train_label_means)

(3, 7)


Find the variance of each feature of each label in the training set

In [76]:
train_label_variances = [train[features][train.species==i].var() for i in xrange(1,4)]
print np.shape(train_label_variances)

(3, 7)


#### Naive Bayes

In [77]:
from scipy.stats import norm
prediction1 = []
#iterate datapoints
for X in test.values:
    prediction1.append(np.argmax([sum(
                                    [norm.logpdf(x,loc=mu,scale=sig) for x,mu,sig 
                                    in zip(X,train_label_means[i],train_label_variances[i])]
                                ) for i in xrange(3)])+1)
        

Compare predictions to correct labels.

In [78]:
np.sum(test_labels==prediction1)/40.

0.625

## Problem 2

Use sklearn to perform naive Bayes.

In [79]:
from sklearn.naive_bayes import GaussianNB
alg = GaussianNB()
alg.fit(train[features],train.species)
prediction2 = alg.predict(test)

Compare predictions with correct labels.

In [80]:
np.sum(test_labels==prediction2)/40.

0.94999999999999996

## Problem 3

In [122]:
from __future__ import division
from scipy.stats import norm

def naivebayes(P,X,means,variances,n_labels):
    return np.argmax([ P[i] + sum(
                [norm.logpdf(x,loc=mu,scale=sig) for x,mu,sig 
                in zip(X,means[i],variances[i])]
            ) for i in xrange(n_labels)])

class NaiveBayes():
    def __init__(self):
        
        print 'Loading SpamFeatures...',
        try: self.docs = np.load('/home/danogillam/databases/spamtext/SpamFeatures')
        except:
            feature_filename = '/home/danogillam/databases/spamtext/SpamFeatures.txt'
            self.docs = np.genfromtxt(feature_filename)
            np.save('/home/danogillam/databases/spamtext/SpamFeatures',self.docs)
        self.D = len(self.docs)
        
        print '\rLoading SpamLabels...',
        try: self.labels = np.load('/home/danogillam/databases/spamtext/SpamLabels')
        except:
            labels_filename = '/home/danogillam/databases/spamtext/SpamLabels.txt'
            self.labels = np.genfromtxt(labels_filename)
            np.save('/home/danogillam/databases/spamtext/SpamLabels',self.labels)
        self.L = len(self.labels)
        
        print '\rLoading SpamVocab...',
        try: self.vocab = np.load('/home/danogillam/databases/spamtext/SpamVocab')
        except:
            vocab_filename = '/home/danogillam/databases/spamtext/SpamVocab.txt'
            self.vocab =  np.genfromtxt(vocab_filename)
            np.save('/home/danogillam/databases/spamtext/SpamVocab',self.vocab)
        self.V = len(self.vocab)
        
        print '\rSplitting data...',
        train, test, train_labels, test_labels = self._split_data()
        self.train = train
        self.test  = test
        self.train_labels = train_labels
        self.test_labels = test_labels
        
        print '\rFinding means and variances...',
        self.train_label_means = np.array([train[train_labels==i].mean(axis=0) for i in np.unique(train_labels)])
        self.train_label_vars  = np.array([train[train_labels==i].var(axis=0)  for i in np.unique(train_labels)])
        print '\r',
        
        self.P = np.unique(self.labels,return_counts=True)[1]
        
        
    def _split_data(self,testsize=500):
        test_ind = np.random.choice(self.D,replace=False,size=testsize)
        train_ind = list(set(range(self.D)).difference(test_ind))

        train = self.docs[train_ind]
        test = self.docs[test_ind]
        
        train_labels = self.labels[train_ind]
        test_labels  = self.labels[test_ind]
        return train, test, train_labels, test_labels
    
    def naivebayes_predict(self):
        
        #iterate datapoints
        for it,X in enumerate(self.test):
            print '\r',it,
            prediction1.append(self._naivebayes(X))
        print np.sum(self.test_labels==prediction1)/len(self.test_labels)
        return prediction1
    
    def _naivebayes(self,X):
        return np.argmax([ self.P[i] + sum(
                    [norm.logpdf(x,loc=mu,scale=sig) for x,mu,sig 
                    in zip(X,self.train_label_means[i],self.train_label_vars[i])]
                ) for i in np.unique(self.train_labels)])
    
    #this does the same thing... but in parallel!!! 8 times faster!
    def naivebayes_predict_parallel(self):
        return Parallel(n_jobs=8,verbose=5)(
            delayed(naivebayes)(self.P,
                                X,
                                self.train_label_means,
                                self.train_label_vars,
                                2) 
            for X in self.test)

    def sk_naivebayes_predict(self):
        alg = GaussianNB()
        alg.fit(self.train,self.train_labels)
        return alg.predict(self.test) 
        
    
nb = NaiveBayes()
print nb.D
print nb.L
print nb.V
print nb.train_label_means.shape

5172
5172
8167
(2, 8167)


## Problem 4

My implementation

In [123]:
prediction01 = nb.naivebayes_predict_parallel()

[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   12.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   30.9s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:   56.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:  1.8min finished


In [124]:
print np.sum(nb.test_labels==prediction01)/len(nb.test_labels)

0.282


Sk_learn's implementation

In [125]:
prediction02 = nb.sk_naivebayes_predict()
print np.sum(nb.test_labels==prediction01)/len(nb.test_labels)

0.282


Awesome! Both methods get the exact same score! 

# The End