# Naive Bayes

## David Kartchner

In [21]:
from __future__ import division
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB

### Problem 1:

** Part 1:  Separate seed data into train and test sets **

In [22]:
seeds = np.loadtxt("seeds_dataset.txt")
num_rows = seeds.shape[0]
num_cols = seeds.shape[1]
test_rows = np.random.choice(num_rows, size=40, replace=False)
test_data = seeds[test_rows]
train_mask = np.ones(num_rows, dtype=bool)
train_mask[test_rows] = 0
train_data = seeds[train_mask]
row_labels = ['Area', 'Perimeter', 'Compactness', 'Length', 'Width', 'Asymmetry Coefficient', 'Groove Length', 'Species']

#Make datasets into Pandas data frames
train = pd.DataFrame(train_data, index=list(range(train_data.shape[0])), columns=row_labels)
train['Species'] = train['Species'].astype("category")
train['Species'].cat.categories = ['Kama', 'Rosa', 'Canadian']
test = pd.DataFrame(test_data, index=list(range(test_data.shape[0])), columns=row_labels)
test['Species'] = test['Species'].astype("category")
test['Species'].cat.categories = ['Kama', 'Rosa', 'Canadian']

**Part 2: Calculate mean and variance of each feature for each type of wheat using training set**

In [23]:
grouped_train = train.groupby("Species")
means = grouped_train.mean()
variances = grouped_train.var()

kama_means = np.mean(train_data[np.where(train_data[:,-1]==1)], axis=0)[:-1]
rosa_means = np.mean(train_data[np.where(train_data[:,-1]==2)], axis=0)[:-1]
canadian_means = np.mean(train_data[np.where(train_data[:,-1]==3)], axis=0)[:-1]

kama_vars = np.var(train_data[np.where(train_data[:,-1]==1)], axis=0)[:-1]
rosa_vars = np.var(train_data[np.where(train_data[:,-1]==2)], axis=0)[:-1]
canadian_vars = np.var(train_data[np.where(train_data[:,-1]==3)], axis=0)[:-1]

print "Means:"
print means
print "\n\nVariances:"
print variances

Means:
               Area  Perimeter  Compactness    Length     Width  \
Species                                                           
Kama      14.214407  14.241525     0.879208  5.491051  3.228237   
Rosa      18.532586  16.215690     0.884469  6.175690  3.700190   
Canadian  11.810943  13.217547     0.848853  5.217472  2.844189   

          Asymmetry Coefficient  Groove Length  
Species                                         
Kama                   2.650054       5.069949  
Rosa                   3.650603       6.036741  
Canadian               4.673755       5.108000  


Variances:
              Area  Perimeter  Compactness    Length     Width  \
Species                                                          
Kama      1.457263   0.335827     0.000266  0.055029  0.031026   
Rosa      1.845360   0.332053     0.000234  0.065830  0.033152   
Canadian  0.504759   0.117603     0.000449  0.021173  0.021446   

          Asymmetry Coefficient  Groove Length  
Species            

**Part 3:  Predict labels using uniform prior**

In [24]:
kama_probs = np.sum(-(test_data[:,:-1] - kama_means)**2/(2*kama_vars), axis=1) -.5*np.sum(np.log(2*np.pi*kama_vars)) + np.log(np.sum(train_data[:,-1]==1)/train_data.shape[0])
rosa_probs = np.sum(-(test_data[:,:-1] - rosa_means)**2/(2*rosa_vars), axis=1) -.5*np.sum(np.log(2*np.pi*rosa_vars)) + np.log(np.sum(train_data[:,-1]==2)/train_data.shape[0])
canadian_probs = np.sum(-(test_data[:,:-1] - canadian_means)**2/(2*canadian_vars), axis=1) -.5*np.sum(np.log(2*np.pi*canadian_vars)) + np.log(np.sum(train_data[:,-1]==3)/train_data.shape[0])
likelihoods = np.array([kama_probs, rosa_probs, canadian_probs])
predicted_vals = np.argmax(likelihoods, axis=0)+1

**Part 4: Show percent correct**

In [25]:
num_misclassified = len(np.where((test_data[:,-1]-predicted_vals)!=0)[0])
test_size = test_data.shape[0]
percent_correct = (test_size-num_misclassified)/test_size
print "Percent Correct: ", percent_correct

Percent Correct:  0.875


### Problem 2:

In [26]:
nb_classifier = GaussianNB()
nb_classifier.fit(train_data[:,:-1], train_data[:,-1])
pred_labels = nb_classifier.predict(test_data[:, :-1])
print (test_data.shape[0] - sum(pred_labels != test_data[:,-1]))/test_data.shape[0]

0.875


### Problem 3:

**Create a Naive Bayes Classifier for spam recognition**

In [27]:
class naiveBayes(object):
    """
    Perfoms naive bayes classification for word-count document features.
    """
    def __init__(self):
        print "You've been initialized!"
        
    def fit(self, X, Y):
        """
        Inputs:
            Y:  Nx1 array of labels for corresponding to each row of X.  Labels should be nonnegative integers.
            X:  NxK array of feature counts, where each entry is an integer.  Each row of X corresponds to an entry of Y.
        """
        #Make sure X and Y have same number of rows
        if X.shape[0] != Y.shape[0]:
            raise ValueError("X and Y must have same number of rows!")
            
        N = X.shape[0]
        K = X.shape[1]
        
        #Initialize vector of labels
        self.labels = np.unique(Y)
        
        #Initialize probability of each label (called self.prior_probs)
        self.num_labels = len(self.labels)
        self.prior_counts = np.empty(self.num_labels)
        for i in xrange(self.num_labels): 
            self.prior_counts[i] = len(np.where(Y==self.labels[i]))
        self.prior_probs = self.prior_counts/N
        
        #Create dict of feature probs for each category
        self.feature_probs = {}
        for label in self.labels:
            self.feature_probs[label] = np.ones(K)
            label_mask = np.where(Y==label)
            self.feature_probs[label] += np.sum(X[label_mask], axis=0)
            self.feature_probs[label] /= np.sum(self.feature_probs[label])
        print "Naive Bayes Model has been fit!"
    
    def predict(self, X):
        self.prediction_probs = np.empty((X.shape[0],self.num_labels))
        for i in xrange(self.num_labels):
            self.prediction_probs[:,i] = np.log(self.prior_probs[i])+(X.dot(np.log(self.feature_probs[i])))
            
        return np.argmax(self.prediction_probs, axis=1)
        

### Problem 4:

**Test our classifier and benchmark against sklearn implementation**

In [28]:
labels = np.loadtxt("SpamLabels.txt")
features = np.loadtxt("SpamFeatures.txt")

n = len(labels)
test_size= 500
test_mask = np.random.choice(n, size=test_size, replace=False)
train_mask = np.ones(n, dtype=bool)
train_mask[test_mask] = 0

test_labels = labels[test_mask]
test_features = features[test_mask]

train_labels = labels[train_mask]
train_features = features[train_mask]

In [29]:
spam_classifier = naiveBayes()
spam_classifier.fit(train_features, train_labels)
test_prediction = spam_classifier.predict(test_features)
print "Classification rate: ", (n-np.sum(np.abs(test_prediction-test_labels)))/n

You've been initialized!
Naive Bayes Model has been fit!
Classification rate:  0.995746326373


In [30]:
mnb = MultinomialNB()
mnb.fit(train_features, train_labels)
test_prediction = mnb.predict(test_features)
print "Classification rate: ", (n-np.sum(np.abs(test_prediction-test_labels)))/n

Classification rate:  0.996133023975
