In [1]:
#import statements
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.preprocessing import label_binarize

In [2]:
'''
Function desc: Read data from csv file and load it into pandas data frame
Input: File name
returns: A data frame containing one feature (sepal-length) and classes (Iris-versicolor and Iris-virginica)
'''

def read_data(file):
    df = pd.read_csv(file, names=['sepal_length','sepal_width','petal_length','petal_width','classes'])
    one_df= df[['sepal_length','classes']]
    f=one_df.query('classes != "Iris-setosa"')
    return f

In [3]:
oneD_data = read_data('iris.csv')  #read data


In [4]:
'''
Funtcion des: Calculate prior probability of each class
Input: One dimensional data
Output: Prior probability of each class
'''
def calculate_prior(oneD_data):
    priors = []
    cls = pd.unique(oneD_data.classes)
    for i in range(len(cls)):
        sz=oneD_data.groupby('classes').size()[i]
        priors.append((cls[i],(sz/float(oneD_data.shape[0]))))
    return priors
        
    
    

In [5]:
alpha = calculate_prior(oneD_data)
print alpha

[('Iris-versicolor', 0.5), ('Iris-virginica', 0.5)]


In [6]:
'''
Function desc:  Calculate the parameters mu and sigma. Split the obtained data based on the class and calculate the mean and 
                standard deviation.
                
Input : One-dimensional data
Output: mean and standard deviation
                    
'''



def calculate_parameters(d):
        sd = []
        mean = []
        cls = pd.unique(d.classes)
        for i in cls:
            v = oneD_data[oneD_data.classes == i]
            mean.append(np.mean(v))
            sd.append(np.std(v))
        return mean,sd

In [7]:
mu,sigma = calculate_parameters(oneD_data)
print mu
print sigma

[sepal_length    5.936
dtype: float64, sepal_length    6.588
dtype: float64]
[sepal_length    0.510983
dtype: float64, sepal_length    0.629489
dtype: float64]


In [8]:
'''
Function desc: To calculate the membership function
Input: data, mean, standard deviation and prior probability
Returns: value of the membership function
'''
def oneD_GDA_gx(x,mean,sd,prior):
     return -np.log(sd) - 0.5*(x-mean)**2/(2*(sd)**2) +np.log(prior[1])
    

In [9]:
'''
Function desc: To determine the class of the outputs from the membership function
Input: Values from the membership function
Output: Class of  the input

'''


def oneD_GDA_det(g1,g2,cls):
    
    if float(g1) > float(g2):
        return cls[0]
    else:
        return cls[1]
    
    

In [10]:
'''
Function desc: Gaussian determinant analysis - to predict to which class the input belongs to.
Input: parameters of the Gaussian model, prior probabilities
Output: Predited class of the input
'''


def oneD_GDA(x,mu,sigma,prior):
    predicted = ['xxxxx' for i in range(x.shape[0])]
    d1 = x.ix[:,0].tolist()
    for i in range(len(d1)):
        predicted[i] = oneD_GDA_det(oneD_GDA_gx(d1[i],mu[0],sigma[0],prior[0]),oneD_GDA_gx(d1[i],mu[1],sigma[1],prior[1]),
                                    pd.unique(oneD_data.classes))
    return predicted

In [11]:
cls=oneD_GDA(oneD_data,mu,sigma,alpha)

In [12]:
'''
Function desc: To calculate the model parameters suh as precision, recall and f-measure from the confusion-matrix
Inuput: Actual and predicted values
Output: model parameters


Classification accuracy
(TP + TN) / (TP + TN + FP + FN)
Error rate
(FP + FN) / (TP + TN + FP + FN)

Precision: (or Positive predictive value)
proportion of predicted positives which
are actual positive
TP / (TP + FP)
Recall: proportion of actual positives
which are predicted positive
TP / (TP + FN)

Error rate: 1- classification accuracy

'''

def model_eval(actual,predicted):
    Truth= pd.Series(actual,name = 'Truth' )
    Predicted = pd.Series(predicted,name='Predicted')
    confusion_matrix = pd.crosstab(Truth, Predicted)
    arr_cm = confusion_matrix.as_matrix()
    diag = arr_cm.diagonal()
    accuracy = float(sum(diag))/np.sum(arr_cm)
    precision = np.divide(diag,1.0*(np.sum(arr_cm,axis=1)))
    recall = np.divide(diag,1.0*(np.sum(arr_cm,axis=0)))
    fmeasure = 2*((precision * recall)/(precision + recall))
    return confusion_matrix,accuracy,precision,recall,fmeasure
    
    

In [13]:
'''Function to perform 10-fold cross validation. In this method the test and traparams in indices are split using using the inbuit
   'KFold' function.
    Input 1: 'x' 
    Input 2: True predicted values
    Input 3: No of folds (10 by default)
    
    Performance measures such as accuracy, precision, f-measure
'''

def x_fold_validation(data,nfolds=10):
    confusion_matrix_list=[]
    accuracy_list=[]
    precision_list=[]
    recall_list=[]
    fmeasure_list=[]
    x=data.ix[:,1:3]
    y=data.classes
    cls= pd.unique(y)
    x.reset_index()
    MSE=[]
    cv = KFold(len(y), nfolds,shuffle = True,random_state=23) #inbuilt function to split the indices
    for train_idx, test_idx in cv:
        x_train = x.ix[train_idx]
        y_train = y[train_idx]
        x_test = x.ix[test_idx]
        y_test = y[test_idx]
        mean,sigma = calculate_parameters(x_train)
        prior = calculate_prior(x_train)
        predict = oneD_GDA(x_test,mean,sigma,prior)
        conf_matrix,accuracy,precision,recall,fmeasure = model_eval(y_test.tolist(),predict)
        print conf_matrix
        confusion_matrix_list.append(conf_matrix)
        accuracy_list.append(accuracy.tolist())
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    print '################################'
    print 'Average of the model parameters'
    print '################################'
    print 'Error rate:', (1-np.mean(accuracy_list,axis=0))
    print 'Accuracy:',np.mean(accuracy_list)
    print 'Precision:', np.mean(precision_list,axis=0)
    print 'Recall:', np.mean(recall_list,axis=0)
    print 'F-measure:', np.mean(fmeasure_list,axis=0)
    

        
        
        


In [14]:
x_fold_validation(oneD_data.reset_index())

Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                4               1
Iris-virginica                 2               3
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                4               0
Iris-virginica                 2               4
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                3               2
Iris-virginica                 3               2
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                5               1
Iris-virginica                 0               4
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                2               1
Iris-virginica                 4               3
Predicted        Iri