In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
% matplotlib inline

In [2]:
# Load the digits data set from a comma-separated file into a 2-D float array.
# Downstream, test_train_split treats the last column as the class label.
digits_data=np.loadtxt("digits.csv",delimiter=",")

In [4]:
def test_train_split(X,testsize):
    target=np.copy(X[:,-1])
    classes=list(np.unique(target))
    train=np.empty((1,X.shape[1]))
    test=np.empty((1,X.shape[1]))
    for i in classes:
        rows,columns=X[target==i].shape
        seed=np.random.choice(rows,size=int(rows*(1-testsize)),replace=False)
        train=np.append(train,X[target==i][seed,:],axis=0)
        test=np.append(test,np.delete(X[target==i],seed,axis=0),axis=0)
    train=train[1:,:]
    test=test[1:,:]
    np.random.shuffle(train)
    np.random.shuffle(test)
    return train[:,:-1],train[:,-1], test[:,:-1],test[:,-1]

In [5]:
def prior_prob(target):
    """Estimate class priors as relative label frequencies.

    Parameters
    ----------
    target : 1-D ndarray
        Class label of every sample.

    Returns
    -------
    list of float
        One probability per distinct label, in the ascending order produced
        by ``np.unique(target)``.
    """
    n_samples = float(len(target))
    return [float(np.count_nonzero(target == label)) / n_samples
            for label in np.unique(target)]

In [6]:
def var_mean(X,target):
    """Per-class, per-feature mean and (population) variance.

    Parameters
    ----------
    X : 2-D ndarray, shape (n_samples, n_features)
        Feature matrix.
    target : 1-D ndarray, length n_samples
        Class label of every row of ``X``.

    Returns
    -------
    (mean, var) : tuple of 2-D ndarrays, each (n_classes, n_features)
        Row ``i`` holds the statistics of the ``i``-th label in
        ``np.unique(target)`` (ascending order). When the labels are exactly
        ``0 .. n_classes-1`` the row index therefore equals the label.
    """
    classes = np.unique(target)
    n_features = X.shape[1]
    mean = np.zeros((len(classes), n_features))
    var = np.zeros((len(classes), n_features))
    # Index rows by position, not by the label value itself: labels coming
    # from a float array are floats, and NumPy rejects float indices.
    for row, label in enumerate(classes):
        members = X[target == label]
        mean[row] = members.mean(axis=0)
        var[row] = members.var(axis=0)
    return mean, var

In [7]:
def compute_prob(mean,var,test_x,target,prior,var_floor=1e-8):
    """Unnormalised class scores: prior times product of per-feature Gaussian densities.

    Parameters
    ----------
    mean, var : 2-D ndarrays, shape (n_classes, n_features)
        Per-class feature means and variances. Row ``y`` is used for label
        ``y``, so labels are expected to be the integers ``0 .. n_classes-1``.
    test_x : 2-D ndarray, shape (n_test, n_features)
        Samples to score.
    target : 1-D ndarray
        Training labels; only the distinct values (cast to int) are used.
    prior : sequence of float
        Class priors, indexed by label.
    var_floor : float, optional
        Replacement for variances that are exactly zero, so the density stays
        finite. A fixed value keeps results reproducible.

    Returns
    -------
    2-D ndarray, shape (n_test, n_classes)
        Column ``y`` holds ``prior[y] * prod_z N(test_x[:, z]; mean[y, z], var[y, z])``.
        Columns of labels absent from ``target`` stay zero.
    """
    classes = np.unique(target.astype(int))
    # Work on a patched copy: the caller's variance array is left untouched.
    safe_var = np.where(np.asarray(var) == 0, var_floor, var)
    # One column per row of `mean`, i.e. per class, independent of any global data.
    marg_prob = np.zeros((test_x.shape[0], mean.shape[0]))
    for y in classes:
        diff = test_x - mean[y]
        density = np.exp(-(diff ** 2) / (2.0 * safe_var[y])) / np.sqrt(2.0 * np.pi * safe_var[y])
        # NOTE: a plain product of many densities can underflow to 0 for
        # high-dimensional inputs; raw probabilities are kept because callers
        # receive this array as-is.
        marg_prob[:, y] = np.prod(density, axis=1) * prior[y]
    return marg_prob

In [8]:
def accuracy(ytest,yhat):
    """Return the fraction of entries of ``yhat`` that equal ``ytest``.

    Parameters
    ----------
    ytest : ndarray
        Reference labels; its first dimension is the denominator.
    yhat : ndarray
        Predicted labels, compared element-wise with ``ytest``.

    Returns
    -------
    float
        Number of matching entries divided by ``ytest.shape[0]``.
    """
    n_correct = np.sum(yhat == ytest)
    return float(n_correct) / float(ytest.shape[0])

In [9]:
def GNB(Xtest,Xtrain,ytrain):
    """Fit per-class statistics on the training data and classify ``Xtest``.

    Class priors come from ``prior_prob``, per-class feature means and
    variances from ``var_mean``, and per-sample class scores from
    ``compute_prob``.

    Parameters
    ----------
    Xtest : 2-D ndarray
        Samples to classify.
    Xtrain : 2-D ndarray
        Training features.
    ytrain : 1-D ndarray
        Training labels.

    Returns
    -------
    1-D int ndarray
        For every test sample, the column index of its highest score.
    """
    class_mean, class_var = var_mean(Xtrain, ytrain)
    class_prior = prior_prob(ytrain)
    scores = compute_prob(class_mean, class_var, Xtest, ytrain, class_prior)
    return np.argmax(scores, axis=1).astype(int)

In [10]:
def learning_curve(Xtrain,Xtest,ytrain,func,ytest=None,fractions=(0.1,0.25,0.5,0.75,1)):
    """Test accuracy of ``func`` when trained on growing random subsets.

    For every fraction, ``int(n_train_rows * fraction)`` training rows are
    drawn without replacement, ``func(Xtest, subset_X, subset_y)`` is called
    to predict the test labels, and the accuracy against ``ytest`` is stored.

    Parameters
    ----------
    Xtrain : 2-D ndarray
        Training features.
    Xtest : 2-D ndarray
        Test features, passed unchanged to ``func``.
    ytrain : 1-D ndarray
        Training labels, aligned with the rows of ``Xtrain``.
    func : callable
        Classifier with signature ``func(Xtest, Xtrain, ytrain) -> predictions``.
    ytest : 1-D ndarray, optional
        True test labels. When omitted, the module-level variable ``ytest``
        is used so that existing four-argument calls keep working.
    fractions : sequence of float, optional
        Training-set fractions to evaluate.

    Returns
    -------
    list of float
        One accuracy per entry of ``fractions``, in the same order.

    Raises
    ------
    NameError
        If ``ytest`` is neither passed nor defined at module level.
    """
    if ytest is None:
        # Backward compatibility: older callers rely on a global `ytest`.
        try:
            ytest = globals()['ytest']
        except KeyError:
            raise NameError("ytest is not defined; pass it explicitly as ytest=...")
    n_rows = Xtrain.shape[0]
    accu = []
    for fraction in fractions:
        subset = np.random.choice(n_rows, size=int(n_rows * fraction), replace=False)
        predictions = func(Xtest, Xtrain[subset, :], ytrain[subset])
        accu.append(accuracy(ytest, predictions))
    return accu

In [12]:
# GNB on the digits data: 10 independent stratified splits (testsize=0.2),
# each producing one learning curve (list of accuracies per training fraction).
# Xtrain/ytrain/Xtest/ytest are deliberately assigned at module level:
# learning_curve is called without the test labels and picks up the global ytest.
p=[]
for i in range(0,10):
    Xtrain,ytrain,Xtest,ytest=test_train_split(digits_data,0.2)
    p.append(learning_curve(Xtrain,Xtest,ytrain,GNB))
# Rows = repetitions, columns = training fractions.
p_GNB_digits=np.asarray(p)

In [23]:
# Test error (1 - accuracy) per training fraction: mean and standard deviation
# over the repeated splits (axis 0 = repetitions).
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Digits Error GNB")
print(np.mean(1-np.asarray(p_GNB_digits),axis=0))
print("Standard Deveation")
print(np.std(1-np.asarray(p_GNB_digits),axis=0))

Digits Error GNB
[ 0.26978022  0.19010989  0.18379121  0.19587912  0.19313187]
Standard Deveation
[ 0.02511296  0.02986303  0.02452452  0.05227077  0.02686274]


In [15]:
# Boston data from CSV; the last column is the value that gets binarised below.
boston_data=np.loadtxt("boston.csv",delimiter=",")

# Boston50: label 1 when the last column is at or above its median, else 0.
boston_data_50=boston_data.copy()
boston_data_50[:,-1]=(boston_data[:,-1]>=np.median(boston_data[:,-1])).astype(int)

# Boston75: label 1 when the last column is at or above its 75th percentile, else 0.
boston_data_75=boston_data.copy()
boston_data_75[:,-1]=(boston_data[:,-1]>=np.percentile(boston_data[:,-1],75)).astype(int)

In [16]:
# GNB on Boston50 (this cell runs GNB, not logistic regression):
# 10 independent stratified splits (testsize=0.2), one learning curve each.
# Xtrain/ytrain/Xtest/ytest are deliberately assigned at module level:
# learning_curve is called without the test labels and picks up the global ytest.
p=[]
for i in range(0,10):
    Xtrain,ytrain,Xtest,ytest=test_train_split(boston_data_50,0.2)
    p.append(learning_curve(Xtrain,Xtest,ytrain,GNB))
# Rows = repetitions, columns = training fractions.
p_Boston_50_GNB=np.asarray(p)

In [24]:
# Test error (1 - accuracy) per training fraction: mean and standard deviation
# over the repeated splits (axis 0 = repetitions).
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Boston50 Error GNB")
print(np.mean(1-np.asarray(p_Boston_50_GNB),axis=0))
print("Standard Deveation")
print(np.std(1-np.asarray(p_Boston_50_GNB),axis=0))

Boston50 Error GNB
[ 0.25882353  0.24901961  0.24705882  0.24509804  0.24509804]
Standard Deveation
[ 0.0418709   0.03286873  0.02586452  0.02875074  0.02875074]


In [18]:
# GNB on Boston75 (this cell runs GNB, not logistic regression):
# 10 independent stratified splits (testsize=0.2), one learning curve each.
# Xtrain/ytrain/Xtest/ytest are deliberately assigned at module level:
# learning_curve is called without the test labels and picks up the global ytest.
p=[]
for i in range(0,10):
    Xtrain,ytrain,Xtest,ytest=test_train_split(boston_data_75,0.2)
    p.append(learning_curve(Xtrain,Xtest,ytrain,GNB))
# Rows = repetitions, columns = training fractions.
p_Boston_75_GNB=np.asarray(p)

In [25]:
# Test error (1 - accuracy) per training fraction: mean and standard deviation
# over the repeated splits (axis 0 = repetitions).
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Boston75 Error GNB")
print(np.mean(1-np.asarray(p_Boston_75_GNB),axis=0))
print("Standard Deveation")
print(np.std(1-np.asarray(p_Boston_75_GNB),axis=0))

Boston75 Error GNB
[ 0.30196078  0.30686275  0.29313725  0.30098039  0.30392157]
Standard Deveation
[ 0.08918875  0.05298655  0.05807531  0.04493791  0.04822892]
