In [3]:
#import
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold

In [4]:
'''
Function desc: Read data from csv file and load it into pandas data frame
Input: File name
returns: A data frame containing all features and classes
'''
def read_data(file):
    """Load the iris measurements from a CSV file into a pandas DataFrame.

    Input: file -- path (or file-like object) passed straight to pd.read_csv
    Returns: DataFrame with the columns sepal_length, sepal_width,
             petal_length, petal_width and classes, in that order. Because
             explicit names are supplied, the first row is read as data.
    """
    column_names = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'classes',
    ]
    return pd.read_csv(file, names=column_names)

In [5]:
# Load the iris measurements from 'iris.csv' (path is relative to the working
# directory). Later cells read this module-level frame.
nD_data = read_data('iris.csv')

In [6]:
'''
Function desc: Calculate the prior probability of each class
Input: Data frame with a 'classes' column
Output: List of (class, prior probability) pairs
'''

def calculate_prior(nD_data):
    """Estimate the prior probability of every class as its relative frequency.

    Input: nD_data -- DataFrame with a 'classes' column holding the labels
    Output: list of (class label, prior) tuples, one per distinct label, in
            order of first appearance in the 'classes' column
    """
    total = float(nD_data.shape[0])
    # Count once and look each count up BY LABEL. Indexing the grouped sizes
    # positionally pairs labels (order of appearance) with counts (sorted
    # order) and mislabels the priors whenever the two orders differ.
    counts = nD_data['classes'].value_counts()
    return [(label, counts.loc[label] / total)
            for label in pd.unique(nD_data['classes'])]

In [7]:
# Class priors for the full data set, as (class label, probability) pairs.
alpha = calculate_prior(nD_data)
print alpha

[('Iris-setosa', 0.33333333333333331), ('Iris-versicolor', 0.33333333333333331), ('Iris-virginica', 0.33333333333333331)]


In [8]:
'''
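Function desc:  Calculate the parameters mu and sigma. Split the data by class and calculate the mean vector and
                covariance matrix of the four feature columns for each class.
                
Input : Data frame holding the four feature columns followed by a 'classes' column
Output: list of per-class mean vectors and list of per-class covariance matrices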
                    
'''

def calculate_parameters(d):
    """Estimate the Gaussian parameters (mean vector, covariance matrix) per class.

    Input: d -- DataFrame whose first four columns are the features and which
           has a 'classes' column holding the labels
    Output: (mean, covar) -- two parallel lists with one entry per class, in
            order of first appearance of the label:
              mean[k]  -- Series with the column means of the four features
              covar[k] -- 4x4 DataFrame with the sample covariance matrix
    """
    mean = []
    covar = []
    for label in pd.unique(d.classes):
        # Positional slice of the first four columns. `.iloc` replaces the
        # `.ix` indexer, which no longer exists in current pandas.
        features = d[d.classes == label].iloc[:, 0:4]
        mean.append(features.mean())
        covar.append(features.cov())
    return mean, covar

In [22]:
# Fit the per-class mean vectors and covariance matrices on the full data set.
mean,covar = calculate_parameters(nD_data)
print mean
# Covariance matrix of the third class (classes are in order of first appearance).
print covar[2]

[sepal_length    5.006
sepal_width     3.418
petal_length    1.464
petal_width     0.244
dtype: float64, sepal_length    5.936
sepal_width     2.770
petal_length    4.260
petal_width     1.326
dtype: float64, sepal_length    6.588
sepal_width     2.974
petal_length    5.552
petal_width     2.026
dtype: float64]
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      0.404343     0.093763      0.303290     0.049094
sepal_width       0.093763     0.104004      0.071380     0.047629
petal_length      0.303290     0.071380      0.304588     0.048824
petal_width       0.049094     0.047629      0.048824     0.075433


In [10]:
'''
Function desc: To calculate the membership function
Input: data, mean, covariance matrix and prior probability
Returns: value of the membership function
'''

def nD_GDA_gx(x,mean,cov,prior):
    """Evaluate the Gaussian discriminant (membership) function for one sample.

    Computes
        g(x) = -0.5*log|cov| - 0.5*(x-mean)^T cov^-1 (x-mean) + log(prior)

    Input:
        x     -- feature vector of one sample (Series or 1-D array)
        mean  -- class mean vector, same length (and labels) as x
        cov   -- class covariance matrix (square DataFrame or 2-D array)
        prior -- prior probability of the class
    Returns: the scalar discriminant value; larger means more likely.
    Raises: numpy.linalg.LinAlgError if cov is singular or its determinant is
            not positive, since the Gaussian density is undefined there.
    """
    diff = np.asarray(x - mean, dtype=float)
    cov_arr = np.asarray(cov, dtype=float)
    # slogdet returns log|det| directly, so tiny or huge determinants do not
    # underflow/overflow the way log(det(cov)) does.
    sign, log_det = np.linalg.slogdet(cov_arr)
    if sign <= 0:
        raise np.linalg.LinAlgError(
            'covariance matrix must have a positive determinant')
    # Solve cov * z = diff instead of forming the explicit inverse.
    mahalanobis_sq = np.dot(diff, np.linalg.solve(cov_arr, diff))
    return -0.5 * log_det - 0.5 * mahalanobis_sq + np.log(prior)
     

In [11]:
'''
Function desc: Gaussian discriminant analysis - to predict which class each input belongs to.
Input: data, parameters of the Gaussian model (means and covariance matrices), prior probabilities
Output: Predicted class of each input
'''

def predict_labels(x,mean,sigma,prior):
    """Assign every row of x to the class with the largest discriminant value.

    Input:
        x     -- DataFrame whose first four columns are the features
        mean  -- list of per-class mean vectors
        sigma -- list of per-class covariance matrices
        prior -- list of (class label, prior probability) tuples; entry j
                 must describe the same class as mean[j] and sigma[j]
    Output: list with one predicted class label per row of x
    """
    # Take the labels from the fitted priors rather than from the global data
    # frame, so predictions always line up with the parameters that were
    # actually passed in.
    classes = [entry[0] for entry in prior]
    features = x.iloc[:, 0:4]
    predictions = []
    for row in range(len(features)):
        sample = features.iloc[row]
        scores = [nD_GDA_gx(sample, mean[j], sigma[j], prior[j][1])
                  for j in range(len(classes))]
        # Ties resolve to the first class with the maximal score.
        predictions.append(classes[scores.index(max(scores))])
    return predictions
            
        

In [12]:
'''
Function desc: To calculate the model performance measures such as precision, recall and f-measure from the confusion matrix
Input: Actual and predicted values
Output: model parameters


Classification accuracy
(TP + TN) / (TP + TN + FP + FN)
Error rate
(FP + FN) / (TP + TN + FP + FN)

Precision: (or Positive predictive value)
proportion of predicted positives which
are actual positive
TP / (TP + FP)
Recall: proportion of actual positives
which are predicted positive
TP / (TP + FN)

Error rate: 1- classification accuracy
'''

def model_eval(actual,predicted):
    """Build the confusion matrix and derive the performance measures from it.

    Input:
        actual    -- sequence of true class labels
        predicted -- sequence of predicted class labels, same length
    Output: (confusion_matrix, accuracy, precision, recall, fmeasure)
        confusion_matrix -- square DataFrame, rows = truth, columns =
                            prediction, covering every label seen in either
        accuracy         -- overall fraction of correct predictions
        precision, recall, fmeasure -- arrays with one value per label, in
                            the row order of the confusion matrix; a measure
                            whose denominator is zero is reported as 0.0
    Side effect: prints the confusion matrix.
    """
    Truth = pd.Series(actual, name='Truth')
    Predicted = pd.Series(predicted, name='Predicted')
    confusion_matrix = pd.crosstab(Truth, Predicted)
    # crosstab only emits labels that occur, so a class missing from the truth
    # or from the predictions leaves the matrix non-square and its diagonal
    # misaligned. Reindex both axes to the union of labels to fix that.
    labels = list(confusion_matrix.index.union(confusion_matrix.columns))
    confusion_matrix = confusion_matrix.reindex(index=labels, columns=labels,
                                                fill_value=0)
    print(confusion_matrix)
    arr_cm = confusion_matrix.values
    diag = arr_cm.diagonal().astype(float)
    predicted_totals = arr_cm.sum(axis=0).astype(float)  # column sums: TP + FP
    actual_totals = arr_cm.sum(axis=1).astype(float)     # row sums:    TP + FN
    accuracy = diag.sum() / arr_cm.sum()
    # Zero denominators (class never predicted / never present) yield 0.0
    # instead of NaN so that averages across folds stay meaningful.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = np.where(predicted_totals > 0, diag / predicted_totals, 0.0)
        recall = np.where(actual_totals > 0, diag / actual_totals, 0.0)
        pr_sum = precision + recall
        fmeasure = np.where(pr_sum > 0, 2 * precision * recall / pr_sum, 0.0)
    return confusion_matrix,accuracy,precision,recall,fmeasure

In [23]:
'''Function to perform 10-fold cross validation. In this method the test and train indices are split using the inbuilt
   'KFold' function.
    Input 1: data frame holding the features and the classes
    Input 2: No of folds (10 by default)
    
    Output: prints performance measures such as accuracy, precision, recall and f-measure
'''
def x_fold_validation(data,nfolds=10):
    """Run k-fold cross validation of the Gaussian classifier and print averages.

    For every fold the model parameters and priors are fitted on the training
    rows, the test rows are classified, and the resulting measures are
    collected; their averages over all folds are printed at the end.

    Input:
        data   -- DataFrame whose columns 1..5 (positionally) are the four
                  features followed by 'classes'; column 0 is skipped (the
                  caller passes a frame produced by reset_index())
        nfolds -- number of folds (10 by default)
    Output: None; the per-fold confusion matrices (printed by model_eval) and
            the averaged error rate, accuracy, precision, recall and
            f-measure are written to stdout.
    """
    confusion_matrix_list = []
    accuracy_list = []
    precision_list = []
    recall_list = []
    fmeasure_list = []
    x = data.iloc[:, 1:6]
    y = data.classes
    # Legacy sklearn.cross_validation API: KFold(n, n_folds, ...) is directly
    # iterable and yields (train, test) arrays of row positions.
    cv = KFold(len(y), nfolds,shuffle=True,random_state=23)
    for train_idx, test_idx in cv:
        # The fold indices are positions, so select rows positionally; this
        # stays correct even when the frame's index labels are not 0..n-1.
        x_train = x.iloc[train_idx]
        x_test = x.iloc[test_idx]
        y_test = y.iloc[test_idx]
        mean,sigma = calculate_parameters(x_train)
        priori = calculate_prior(x_train)
        predict = predict_labels(x_test,mean,sigma,priori)
        conf_matrix,accuracy,precision,recall,fmeasure = model_eval(y_test.tolist(),predict)
        confusion_matrix_list.append(conf_matrix)
        accuracy_list.append(accuracy.tolist())
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print('################################')
    print('Average of the model parameters')
    print('################################')
    print('Error rate: %s' % (1-np.mean(accuracy_list,axis=0),))
    print('Accuracy: %s' % (np.mean(accuracy_list),))
    print('Precision: %s' % (np.mean(precision_list,axis=0),))
    print('Recall: %s' % (np.mean(recall_list,axis=0),))
    print('F-measure: %s' % (np.mean(fmeasure_list,axis=0),))

In [24]:
# reset_index() prepends an 'index' column; x_fold_validation skips it with
# its positional 1:6 column slice.
x_fold_validation(nD_data.reset_index())

Predicted        Iris-setosa  Iris-versicolor  Iris-virginica
Truth                                                        
Iris-setosa                6                0               0
Iris-versicolor            0                4               0
Iris-virginica             0                0               5
Predicted        Iris-setosa  Iris-versicolor  Iris-virginica
Truth                                                        
Iris-setosa                6                0               0
Iris-versicolor            0                4               0
Iris-virginica             0                1               4
Predicted        Iris-setosa  Iris-versicolor  Iris-virginica
Truth                                                        
Iris-setosa                6                0               0
Iris-versicolor            0                6               0
Iris-virginica             0                0               3
Predicted        Iris-setosa  Iris-versicolor  Iris-virginica
Truth   