In [1]:
#import statements
import matplotlib.pyplot as plt
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_recall_curve

In [2]:
'''
Function desc: Read data from csv file and load it into pandas data frame
Input: File name
returns: A data frame containing all features and classes (Iris-versicolor and Iris-virginica)
'''

def read_data(file):
    """Load the iris CSV and keep every row whose class is not Iris-setosa.

    Input: path (or file-like object) of a header-less CSV with four
           measurement columns followed by the class label.
    Returns: data frame with columns sepal_length, sepal_width,
             petal_length, petal_width and classes.
    """
    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'classes']
    frame = pd.read_csv(file, names=column_names)
    # Boolean mask keeps the original row labels, exactly like the query did.
    not_setosa = frame['classes'] != "Iris-setosa"
    return frame[not_setosa]

In [3]:
# Load the iris measurements; read_data() drops the Iris-setosa rows.
# nD_data is the module-level frame used by the cells below.
nD_data = read_data('iris.csv')


In [4]:
'''
Function desc: Calculate the prior probability of each class
Input: Data frame with a 'classes' column
Output: List of (class, prior probability) tuples, one per class
'''

def calculate_prior(nD_data):
    """Estimate the prior probability of every class as its relative frequency.

    Input: data frame with a 'classes' column.
    Returns: list of (class label, prior) tuples, in order of first
             appearance of each class in the data.
    """
    total = float(nD_data.shape[0])
    # Count once, then look each class up BY LABEL: groupby sorts its keys,
    # whereas pd.unique keeps appearance order, so positional lookup would
    # pair a class with another class's count.
    counts = nD_data.groupby('classes').size()
    return [(label, counts.loc[label] / total) for label in pd.unique(nD_data.classes)]
        
    
    

In [5]:
# Class priors estimated from the loaded data.
alpha = calculate_prior(nD_data)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(alpha)

[('Iris-versicolor', 0.5), ('Iris-virginica', 0.5)]


In [6]:

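'''
Function desc:  Calculate the parameters mu and sigma. Split the data by class and calculate the mean vector and
                covariance matrix of the first four (feature) columns of each class.
                
Input : Data frame holding the feature columns followed by a 'classes' column
Output: List of per-class mean vectors and list of per-class covariance matrices
                    
'''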
def calculate_parameters(d):
    """Estimate the Gaussian parameters of every class.

    Input: data frame whose first four columns are the features and which
           has a 'classes' column.
    Returns: (mean, covar) - two lists with one entry per class, in order of
             first appearance: the feature mean vector and the feature
             covariance matrix of that class.
    """
    mean = []
    covar = []
    for label in pd.unique(d.classes):
        # .iloc is the positional replacement for the removed .ix indexer.
        features = d[d.classes == label].iloc[:, 0:4]
        mean.append(features.mean())
        covar.append(features.cov())
    return mean, covar

In [7]:
# Per-class mean vectors and covariance matrices of the loaded data.
mean, covar = calculate_parameters(nD_data)
print(mean)
# One covariance matrix per class.
for class_cov in covar:
    print(class_cov)

[sepal_length    5.936
sepal_width     2.770
petal_length    4.260
petal_width     1.326
dtype: float64, sepal_length    6.588
sepal_width     2.974
petal_length    5.552
petal_width     2.026
dtype: float64]
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      0.266433     0.085184      0.182898     0.055780
sepal_width       0.085184     0.098469      0.082653     0.041204
petal_length      0.182898     0.082653      0.220816     0.073102
petal_width       0.055780     0.041204      0.073102     0.039106
              sepal_length  sepal_width  petal_length  petal_width
sepal_length      0.404343     0.093763      0.303290     0.049094
sepal_width       0.093763     0.104004      0.071380     0.047629
petal_length      0.303290     0.071380      0.304588     0.048824
petal_width       0.049094     0.047629      0.048824     0.075433


In [8]:
'''
Function desc: To calculate the membership (discriminant) function
Input: data point, class mean, class covariance matrix and the (class, prior probability) tuple
Returns: value of the membership function
'''

def nD_GDA_gx(x,mean,cov,prior):
    """Evaluate the Gaussian membership function of one class at x.

    g(x) = -0.5*ln|cov| - 0.5*(x-mean)' inv(cov) (x-mean) + ln(prior)

    Input: x and mean are feature vectors, cov the class covariance matrix,
           prior a (class label, probability) pair - only the probability
           at position 1 is used.
    Returns: the value of g(x).
    """
    deviation = (x - mean).tolist()
    inverse_cov = np.linalg.inv(cov)
    # Quadratic form (x-mean)' inv(cov) (x-mean).
    quadratic = np.dot(np.dot(np.transpose(deviation), inverse_cov), deviation)
    log_det_term = -0.5 * np.log(np.linalg.det(cov))
    log_prior = np.log(prior[1])
    return log_det_term - 0.5 * quadratic + log_prior
     
    

In [9]:
'''
Function desc: To determine the class of the outputs from the membership function
Input: Values from the membership function
Output: Class of  the input

'''


def nD_GDA_det(g1,g2,cls):
    """Pick the class with the larger membership value.

    Input: g1, g2 - membership values of the first and second class;
           cls - sequence holding the two class labels in the same order.
    Returns: cls[0] when g1 is strictly greater than g2, otherwise cls[1]
             (so ties go to the second class).
    """
    first_wins = float(g1) > float(g2)
    return cls[0] if first_wins else cls[1]
    
    

In [10]:
'''
Function desc: Gaussian discriminant analysis - to predict which class each input belongs to.
Input: data, parameters of the Gaussian model (means and covariance matrices), prior probabilities
Output: Predicted class of each input
'''

def nD_GDA(x,mu,sigma,prior):
    """Predict the class of every row of x with a two-class Gaussian model.

    Input: x - data frame whose first four columns are the features;
           mu, sigma - per-class mean vectors and covariance matrices;
           prior - list of (class label, probability) tuples in the same
                   class order as mu and sigma.
    Returns: list with one predicted class label per row of x.
    """
    # Take the labels from the fitted priors so they line up with mu/sigma.
    # Deriving them from the rows being classified mislabels (or crashes on)
    # any sample whose classes appear in a different order or are incomplete.
    labels = [entry[0] for entry in prior]
    features = x.iloc[:, 0:4]
    predicted = []
    for i in range(len(features)):
        row = features.iloc[i]
        g1 = nD_GDA_gx(row, mu[0], sigma[0], prior[0])
        g2 = nD_GDA_gx(row, mu[1], sigma[1], prior[1])
        predicted.append(nD_GDA_det(g1, g2, labels))
    return predicted

In [11]:
'''
Function desc: To calculate the model evaluation measures such as accuracy, precision, recall and f-measure from the confusion matrix
Input: Actual and predicted values
Output: Confusion matrix and the evaluation measures

'''

def model_eval(actual,predicted):
    """Build the confusion matrix and derive the evaluation measures from it.

    Input: actual and predicted - equally long sequences of class labels.
    Returns: (confusion_matrix, accuracy, precision, recall, fmeasure).
             The confusion matrix has the true classes as rows and the
             predicted classes as columns; precision, recall and fmeasure
             are arrays with one value per class (NaN where undefined).
    """
    truth = pd.Series(actual, name='Truth')
    guess = pd.Series(predicted, name='Predicted')
    confusion_matrix = pd.crosstab(truth, guess)
    # Force a square matrix over all labels seen on either side; otherwise a
    # class that is never predicted (or never present) shifts the diagonal.
    labels = sorted(set(confusion_matrix.index) | set(confusion_matrix.columns))
    confusion_matrix = confusion_matrix.reindex(index=labels, columns=labels, fill_value=0)
    arr_cm = confusion_matrix.values
    diag = arr_cm.diagonal()
    accuracy = float(diag.sum()) / arr_cm.sum()
    # A class with no predictions / no samples yields 0/0 -> NaN on purpose.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Columns are predictions: precision = TP / everything predicted as the class.
        precision = diag / (1.0 * arr_cm.sum(axis=0))
        # Rows are the truth: recall = TP / everything that really is the class.
        recall = diag / (1.0 * arr_cm.sum(axis=1))
        fmeasure = 2 * ((precision * recall) / (precision + recall))
    return confusion_matrix, accuracy, precision, recall, fmeasure

In [12]:
def calc_y_score(x,mean,sigma,prior):
    """Compute the membership value of every class for every row of x.

    Input: x - data frame whose first four columns are the features;
           mean, sigma, prior - per-class parameters in matching order.
    Returns: list with one entry per row of x; each entry is the list of
             membership values, one per class.

    NOTE(review): the previous version reused a single buffer, so only the
    scores of the last row survived, and it took the class count from the
    global nD_data. Callers that relied on the flat last-row list must index
    the final entry instead - confirm against any caller outside this view.
    """
    features = x.iloc[:, 0:4]
    n_classes = len(mean)
    y_score = []
    for i in range(len(features)):
        row = features.iloc[i]
        y_score.append([nD_GDA_gx(row, mean[j], sigma[j], prior[j])
                        for j in range(n_classes)])
    return y_score

In [13]:
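'''Function to perform k-fold cross validation (10-fold by default). In this method the train and test indices are split using the inbuilt
   'KFold' function.
    Input 1: Data frame with the features and the 'classes' column
    Input 2: No of folds (10 by default)
    
    Prints the confusion matrix of every fold and the averaged performance measures such as accuracy, precision, recall and f-measure.
    Returns the per-fold precision and recall lists.
'''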

def x_fold_validation(data,nfolds=10):
    """Run k-fold cross validation of the Gaussian classifier.

    Input: data - data frame with a 'classes' column; columns at positions
                  1..5 are used as the four features plus the class label
                  (position 0 is skipped, e.g. the column added by
                  reset_index()).
           nfolds - number of folds (10 by default).
    Prints the confusion matrix of every fold followed by the averaged
    error rate, accuracy, precision, recall and f-measure.
    Returns: (precision_list, recall_list) - per-fold lists of the
             per-class precision and recall values.
    """
    accuracy_list = []
    precision_list = []
    recall_list = []
    fmeasure_list = []
    x = data.iloc[:, 1:6]
    y = data.classes
    # KFold is used through the sklearn.cross_validation interface imported
    # at the top of the notebook: (n, n_folds), iterated directly.
    cv = KFold(len(y), nfolds, shuffle=True, random_state=23)
    for train_idx, test_idx in cv:
        # KFold yields row POSITIONS, so select positionally; label-based
        # lookup only worked when the index happened to be 0..n-1.
        x_train = x.iloc[train_idx]
        x_test = x.iloc[test_idx]
        y_test = y.iloc[test_idx]
        mean, sigma = calculate_parameters(x_train)
        prior = calculate_prior(x_train)
        predict = nD_GDA(x_test, mean, sigma, prior)
        conf_matrix, accuracy, precision, recall, fmeasure = model_eval(y_test.tolist(), predict)
        print(conf_matrix)
        accuracy_list.append(accuracy)
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    mean_accuracy = np.mean(accuracy_list)
    # Single-argument print calls give the same text on Python 2 and 3.
    print('################################')
    print('Average of the model parameters')
    print('################################')
    print('Error rate: %s' % (1 - mean_accuracy,))
    print('Accuracy: %s' % (mean_accuracy,))
    print('Precision: %s' % (np.mean(precision_list, axis=0),))
    print('Recall: %s' % (np.mean(recall_list, axis=0),))
    print('F-measure: %s' % (np.mean(fmeasure_list, axis=0),))
    return precision_list, recall_list
    
        
        
        
        
        


In [17]:
def precision_recall_curve(p,r,y):
    """Plot one precision-versus-recall line per class from per-fold values.

    Input: p, r - per-fold lists of per-class precision and recall values
                  (p[fold][cls], r[fold][cls]);
           y - sequence of class labels; only its length is used.
    Returns: the matplotlib.pyplot module, with the figure prepared.

    Note: this definition shadows sklearn.metrics.precision_recall_curve,
    which is imported under the same name at the top of the notebook.
    """
    folds = range(len(r))
    for i in range(len(y)):
        # Collect this class's points only; a shared buffer across classes
        # would draw earlier classes' points into later curves.
        rec = np.array([r[j][i] for j in folds], dtype=float)
        prec = np.array([p[j][i] for j in folds], dtype=float)
        # Drop undefined (NaN) points, e.g. folds where the class was never predicted.
        defined = np.isfinite(rec) & np.isfinite(prec)
        rec, prec = rec[defined], prec[defined]
        # Order the points by recall VALUE; the sort indices are only used to
        # reorder, never plotted themselves.
        order = np.argsort(rec, kind='mergesort')
        area = prec.mean() if prec.size else float('nan')
        plt.plot(rec[order], prec[order],
                 label='Precision-recall curve of class {0} (area = {1:0.2f})' ''.format(i, area))
    plt.xlim([0.5, 1.0])
    plt.ylim([0.8, 1.02])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Extension of Precision-Recall curve to multi-class')
    plt.legend(loc="lower right")
    return plt

In [None]:
# reset_index() prepends the old index as column 0, which x_fold_validation skips.
pre, rec = x_fold_validation(nD_data.reset_index())
# The function draws on the shared pyplot state; no need to rebind `plt`.
precision_recall_curve(pre, rec, pd.unique(nD_data.classes))
plt.show()

Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                4               1
Iris-virginica                 0               5
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                4               0
Iris-virginica                 0               6
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                5               0
Iris-virginica                 0               5
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                5               1
Iris-virginica                 0               4
Predicted        Iris-versicolor  Iris-virginica
Truth                                           
Iris-versicolor                3               0
Iris-virginica                 0               7
Predicted        Iri