In [1]:
#import statements
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
%matplotlib inline

In [2]:
def load_data():
    """Load a two-class subset of the Iris data set with a bias column.

    A constant column of ones is inserted as the first feature so that the
    first model parameter acts as the intercept. Only the samples whose
    target is 0 or 1 are kept (Iris-setosa and Iris-versicolor in
    scikit-learn's label order); samples with target 2 (Iris-virginica)
    are dropped.

    Returns:
        (x, y): ``x`` is the numpy feature matrix including the leading
        column of ones, ``y`` is the numpy vector of 0/1 class labels.
    """
    iris = datasets.load_iris()
    # Prepend a column of ones (index 0, along the feature axis).
    features = np.insert(iris['data'], 0, 1, axis=1)
    labels = iris['target']
    # Build the row mask once and apply it to both arrays.
    keep = labels != 2
    return features[keep], labels[keep]

In [3]:
# Load the feature matrix and class labels used by the cells below.
x, y = load_data()

In [4]:

def h_theta(z,theta):
    """Sigmoid hypothesis used by gradient descent to predict a value.

    Computes ``1 / (1 + exp(-s))`` where ``s`` is the dot product of the
    transposed ``z`` with ``theta``.
    """
    score = np.dot(np.transpose(z), theta)
    return 1. / (1 + np.exp(-score))

In [5]:
def stochastic_grad_descent(x,y,iterations=1000,learning_rate=0.001):
    """Estimate logistic-regression parameters with stochastic gradient descent.

    The parameters are updated after every single example, visiting the
    examples in their stored order on each pass.

    Args:
        x: 2-D array of examples, one row per example, one column per feature.
        y: labels, indexable by the same row positions as ``x``.
        iterations: number of full passes over ``x`` (default 1000).
        learning_rate: step size applied to each per-example gradient
            (default 0.001).

    Returns:
        numpy array with one parameter per column of ``x``. It is all zeros
        when no update is performed (no rows, or ``iterations`` is 0).
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n_features = np.shape(x)[1]
    # Start from a float ndarray so the return type does not depend on
    # whether at least one update ran.
    theta = np.zeros(n_features)
    for _ in range(iterations):
        for j in range(len(x)):
            predicted = h_theta(x[j], theta)  # prediction with the current theta
            error = predicted - y[j]
            grad = error * x[j]  # gradient contribution of this single example
            theta = theta - learning_rate * grad
    return theta

In [6]:
# Fit the model on the full data set and show the learned parameters.
theta = stochastic_grad_descent(x, y)
print(theta)

[-0.35454108 -0.54808846 -1.99988117  3.06256203  1.35174067]


In [7]:
def disc_fn(x,y,t):
    """Classify one example from the model output.

    Args:
        x: feature vector of a single example.
        y: label of the example; accepted for call compatibility but not used.
        t: model parameter vector used to score the example.

    Returns:
        1 if the sigmoid of the dot product of ``t`` and ``x`` is greater
        than 0.5, otherwise 0.
    """
    # Score with the parameters passed in as ``t``; relying on a module-level
    # ``theta`` would classify with whatever model was fitted last.
    mod = 1. / (1 + np.exp(-np.dot(t, np.transpose(x))))
    if mod > 0.5:
        return 1
    return 0

In [8]:
def predict(x,y,theta):
    """Predict a class for every example in ``x``.

    Each row of ``x`` is passed to ``disc_fn`` together with the matching
    entry of ``y`` and the parameter vector ``theta``.

    Returns:
        A list with one slot per entry of ``y``; slot ``i`` holds the class
        returned by ``disc_fn`` for row ``i``. Slots without a matching row
        keep their initial value of 0.0.
    """
    labels = [0.0] * len(y)
    for idx, row in enumerate(x):
        labels[idx] = disc_fn(row, y[idx], theta)
    return labels

In [9]:
def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, yielding 0.0 where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = numerator / denominator
    ratio[denominator == 0] = 0.0
    return ratio


def model_eval(actual,predicted,labels=None):
    """Compute evaluation measures from the confusion matrix.

    Definitions (TP/TN/FP/FN = true/false positives/negatives):
        Classification accuracy: (TP + TN) / (TP + TN + FP + FN)
        Error rate:              (FP + FN) / (TP + TN + FP + FN)
                                 = 1 - classification accuracy
        Precision (positive predictive value): proportion of predicted
            positives which are actual positives, TP / (TP + FP)
        Recall: proportion of actual positives which are predicted
            positive, TP / (TP + FN)
        F-measure: 2 * precision * recall / (precision + recall)

    Args:
        actual: sequence of true class labels.
        predicted: sequence of predicted class labels, in the same order.
        labels: optional sequence of class labels fixing the rows and columns
            of the confusion matrix. Defaults to the sorted union of the
            labels found in ``actual`` and ``predicted``.

    Returns:
        (confusion_matrix, accuracy, precision, recall, fmeasure), where
        ``confusion_matrix`` is a DataFrame with true labels on the rows and
        predicted labels on the columns, ``accuracy`` is a scalar, and the
        remaining three are numpy arrays with one value per class. A ratio
        whose denominator is zero is reported as 0.0.
    """
    truth_values = np.asarray(actual).ravel()
    predicted_values = np.asarray(predicted).ravel()
    if labels is None:
        labels = np.union1d(truth_values, predicted_values)

    truth = pd.Series(truth_values, name='Truth')
    guess = pd.Series(predicted_values, name='Predicted')
    # Force a square matrix over a common label set; a class missing from
    # either side would otherwise drop a row/column and shift the diagonal.
    confusion_matrix = pd.crosstab(truth, guess).reindex(
        index=pd.Index(labels, name='Truth'),
        columns=pd.Index(labels, name='Predicted'),
        fill_value=0)

    arr_cm = confusion_matrix.values
    diag = arr_cm.diagonal().astype(float)
    accuracy = diag.sum() / float(arr_cm.sum())
    # Rows are true labels and columns are predicted labels, so column sums
    # count predictions per class (TP + FP) and row sums count actual
    # members per class (TP + FN).
    precision = _safe_ratio(diag, arr_cm.sum(axis=0))
    recall = _safe_ratio(diag, arr_cm.sum(axis=1))
    fmeasure = _safe_ratio(2 * precision * recall, precision + recall)
    return confusion_matrix,accuracy,precision,recall,fmeasure

In [10]:
def x_fold_validation(x,y,nfolds=10,shuffle=True,random_state=23):
    """Run k-fold cross-validation and print the performance measures.

    The train and test indices are produced by the built-in ``KFold``
    splitter. For every fold the model is trained with
    ``stochastic_grad_descent`` on the training rows, the test rows are
    classified with ``predict`` and scored with ``model_eval``. The confusion
    matrix of each fold is printed, followed by the error rate, accuracy,
    precision, recall and F-measure averaged over the folds.

    Args:
        x: feature matrix, indexable by an array of row indices.
        y: true class labels, indexable the same way as ``x``.
        nfolds: number of folds (10 by default).
        shuffle: whether ``KFold`` shuffles the indices before splitting.
        random_state: seed handed to ``KFold``.

    Returns:
        None; all results are printed.
    """
    accuracy_list = []
    precision_list = []
    recall_list = []
    fmeasure_list = []
    # Forward the caller's shuffle / random_state rather than fixed values.
    cv = KFold(len(y), nfolds, shuffle=shuffle, random_state=random_state)
    for train_idx, test_idx in cv:
        x_train, y_train = x[train_idx], y[train_idx]
        x_test, y_test = x[test_idx], y[test_idx]
        theta = stochastic_grad_descent(x_train, y_train)
        pred = predict(x_test, y_test, theta)
        conf_matrix, accuracy, precision, recall, fmeasure = model_eval(y_test.tolist(), pred)
        print(conf_matrix)
        accuracy_list.append(accuracy)
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    # Single-argument print calls emit the same text under Python 2 and 3.
    print('################################')
    print('Average of the model parameters')
    print('################################')
    print('Error rate: %s' % (1 - np.mean(accuracy_list),))
    print('Accuracy: %s' % (np.mean(accuracy_list),))
    print('Precision: %s' % (np.mean(precision_list, axis=0),))
    print('Recall: %s' % (np.mean(recall_list, axis=0),))
    print('F-measure: %s' % (np.mean(fmeasure_list, axis=0),))
            

In [11]:
# Cross-validate on the loaded data using the function's default settings.
x_fold_validation(x, y)

Predicted  0  1
Truth          
0          5  0
1          0  5
Predicted  0  1
Truth          
0          4  0
1          0  6
Predicted  0  1
Truth          
0          5  0
1          0  5
Predicted  0  1
Truth          
0          6  0
1          0  4
Predicted  0  1
Truth          
0          3  0
1          0  7
Predicted  0  1
Truth          
0          8  0
1          0  2
Predicted  0  1
Truth          
0          5  0
1          0  5
Predicted  0  1
Truth          
0          6  0
1          0  4
Predicted  0  1
Truth          
0          4  0
1          0  6
Predicted  0  1
Truth          
0          4  0
1          0  6
################################
Average of the model parameters
################################
Error rate: 0.0
Accuracy: 1.0
Precision: [ 1.  1.]
Recall: [ 1.  1.]
F-measure: [ 1.  1.]
