In [217]:
import math
import numpy as np
from numpy import genfromtxt
import operator

In [218]:
# import X and y from csv file
def read_logistic_data(filename):
    my_data = genfromtxt(filename, delimiter=';')
    X = []
    y= []
    for i in range(len(my_data)):
        new_data = my_data[i]
        y.append([new_data[-1]])
        X.append(np.delete(new_data, -1))
    return X, y

X_train, y_train = read_logistic_data('digits123-1.csv')
X_test, y_test = read_logistic_data('digits123-2.csv')

In [219]:
'''
INPUT: 
- training data (features and class concatenated)

OUTPUT:
- dictionary where classes are keys, with training examples
  corresponding to that class as values.
'''
def separate_classes(data):
    classdict = dict()
    for i in range(len(data)):
        x = data[i]
        if (x[-1] not in classdict):
            classdict[x[-1]] = []
        classdict[x[-1]].append(x)
    
    return classdict

classes = separate_classes(np.concatenate((X_train, y_train), axis=1))

In [54]:
'''
INPUT:
- numpy array

OUTPUT:
- mean of values in array
- variance of values in array
'''
def mean_var(x):
    mean = sum(x)/len(x) 
    variance = sum(((x-mean)**2))/len(x)   
    return mean, variance    

In [176]:
'''
INPUT:
- class

OUTPUT:
- dictionary where classes are keys, with (mean, variance) list
  of training examples corresponding to correct class.
'''
def mean_var_class(c):
    mean_var_list = list()
    
    # loop over every attribute
    for i in range(len(classes[c][0])-1):
        mean_var_attribute = np.array([])
        # loop over every training example
        for j in range(len(classes[c])):
            mean_var_attribute = np.concatenate((mean_var_attribute, np.array([classes[c][j][i]])), axis=0)
        mean_var_list.append(mean_var(mean_var_attribute))
    
    return mean_var_list


# create dict: key = class, entry = list with mean and variance for every feature    
mean_var_dict = {1.0 : mean_var_class(1.0), 2.0: mean_var_class(2.0), 3.0 : mean_var_class(3.0)}

In [179]:
'''
INPUT:
- mean
- variance
- value for feature x

OUTPUT:
- probability density function
'''
def pdf(mean, variance, x):
    # ignore training examples with value 0 for mean and variance,
    # i.e. return p = 1, so that multiplying with p does not change value.
    if ((mean == 0) & (variance == 0)):
        p = 1
    else:
        p = (1/(math.sqrt(2*variance*math.pi)))*math.e**-(((x-mean)**2)/(2*variance))
    
    return p

In [220]:

'''
INPUT:
- list with classes
- test instance x

OUTPUT:
- dictionary where classes are keys, with p that x
  is in class as value.
'''
def class_probabilities(classes, x):
    probabilities = {}
    for c in classes:     
        p = 1
        for i in range(len(x)):
            feature_train = x[i]
            feature_mean = mean_var_dict[c][i][0]
            feature_var = mean_var_dict[c][i][1]
            p *= pdf(feature_mean, feature_var, feature_train)
        probabilities[c] = p
            
    return probabilities

'''
INPUT:
- list with classes
- test instance x

OUTPUT:
- prediction for class
'''
def prediction(classes, x):
    # return highest probability found in class_probabilities
    return max(class_probabilities(classes, x).iteritems(), key=operator.itemgetter(1))[0]

print class_probabilities([1.0, 2.0, 3.0], X_train[0])
print prediction([1.0, 2.0, 3.0], X_train[0])

{1.0: 3.952058804347397e-47, 2.0: 2.1858205950974448e-64, 3.0: 1.8196239624211918e-63}
1.0


In [224]:
'''
INPUT:


OUTPUT:
'''


def evaluation_gaussian_nb(X_test, y_test):
    true = 0
    false = 0   
    for i in range(len(X_test)):
        predicted = prediction([1.0 ,2.0 ,3.0], X_test[i])
        actual = y_train[i][0]
        if predicted == actual:
            true += 1
        if predicted != actual:
            false += 1
             
    return true, false

true, false = evaluation_gaussian_nb(X_test, y_test)

print 'Correct:   ', true
print 'False:     ', false
print 'Accurracy: ', round((float(true)/float(false + true)), 5)    

Correct:    172
False:      68
Accurracy:  0.71667
