In [1]:
#import statements
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures

In [2]:
def load_data():
    """Load the Iris dataset shipped with scikit-learn.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the ``'data'`` entry (features) and
        ``y`` is the ``'target'`` entry (class labels) of the object returned
        by ``datasets.load_iris()``. No class filtering is applied here, so
        every class present in the dataset is returned.
    """
    dataset = datasets.load_iris()
    features, labels = dataset['data'], dataset['target']
    return features, labels

In [3]:
# Load the feature matrix (x) and the class labels (y) returned by load_data().
x,y=load_data()

In [4]:

def h_theta(z, cls_j, thetas):
    """Softmax probability of class ``cls_j`` for a single sample.

    Used by the stochastic gradient descent loop to predict the probability
    of one class under the current parameters.

    Args:
        z: feature vector of one sample.
        cls_j: key in ``thetas`` of the class whose probability is wanted.
        thetas: dict mapping class key -> weight vector (list or array with
            the same length as ``z``).

    Returns:
        exp(z . theta[cls_j]) / sum_k exp(z . theta[k])

    Raises:
        KeyError: if ``cls_j`` is not a key of ``thetas``.
    """
    scores = {label: np.dot(np.transpose(z), weights)
              for label, weights in thetas.items()}
    target = scores[cls_j]
    # Subtract the largest score before exponentiating. The ratio is
    # mathematically unchanged, but exp() can no longer overflow to inf
    # (which previously produced inf/inf = nan for large scores).
    shift = np.max(list(scores.values()))
    deno = np.sum([np.exp(score - shift) for score in scores.values()])
    return np.exp(target - shift) / deno

In [5]:

def stochastic_grad_descent(x, y, thetas, iterations=500, learning_rate=0.001):
    """Fit softmax-regression weights with per-sample gradient updates.

    For every pass, each class in ``thetas`` is visited in turn and its
    weight vector is updated once per training sample.

    Args:
        x: indexable collection of samples; ``x[j]`` is one feature vector.
        y: indexable collection of labels; ``y[j]`` is compared directly
            against the keys of ``thetas``.
        thetas: dict mapping class key -> weight vector. The dict is updated
            in place: each entry is replaced by its updated value.
        iterations: number of full passes over the data (default 500, the
            value that used to be hard-coded).
        learning_rate: step size of each update (default 0.001, the value
            that used to be hard-coded).

    Returns:
        The same ``thetas`` dict object, holding the updated weights.
    """
    for _ in range(iterations):
        for cls in thetas:
            for j in range(len(x)):
                # Probability of `cls` for sample j under the current weights.
                predicted = h_theta(x[j], cls, thetas)
                # Indicator: 1 when sample j belongs to `cls`, otherwise 0.
                indicator = 1 if y[j] == cls else 0
                error = predicted - indicator
                grad = np.dot(x[j], error)
                thetas[cls] = thetas[cls] - learning_rate * grad
    return thetas

In [6]:
def calculate_params(x, y):
    """Initialise the per-class weights and train them with SGD.

    Args:
        x: 2-D feature matrix; the second dimension is the feature count.
        y: labels, one per row of ``x``.

    Returns:
        dict mapping class key -> trained weight vector, as returned by
        ``stochastic_grad_descent``.
    """
    n_features = np.shape(x)[1]
    n_classes = len(pd.unique(y))
    # One weight vector per class, every weight starting at 0.01.
    # List repetition replaces the Python-2-only `xrange` comprehension and
    # builds the same lists on Python 2 and Python 3.
    # NOTE(review): keys are 0..n_classes-1 and are compared directly with
    # the entries of y during training, so labels are assumed to be coded
    # as 0..n_classes-1 - confirm for other datasets.
    theta_dict = {k: [0.01] * n_features for k in range(n_classes)}
    return stochastic_grad_descent(x, y, theta_dict)
    

In [7]:
# Fit the softmax parameters on the full dataset (no hold-out split here).
theta = calculate_params(x,y)

In [8]:
def arg_max(x, y, t):
    """Return the class whose linear score for sample ``x`` is largest.

    y_hat = argmax_j softmax_j(x, theta). Because exp() is strictly
    increasing and the softmax denominator is shared by all classes, the
    raw scores ``x . theta_j`` are compared directly. This avoids the
    overflow to ``inf`` that made distinct large scores tie.

    Args:
        x: feature vector of one sample.
        y: unused; kept so existing callers keep working.
        t: dict mapping class key -> weight vector.

    Returns:
        The key of ``t`` with the highest score (the first one in iteration
        order on ties). For keys ``0..K-1`` this equals the positional index
        that was returned previously.

    Raises:
        ValueError: if ``t`` is empty.
    """
    return max(t, key=lambda cls: np.dot(np.transpose(x), t[cls]))

In [9]:
def predict(x, y, theta):
    """Predict a class for every sample in ``x``.

    Args:
        x: indexable collection of samples; ``x[i]`` is one feature vector.
        y: unused; kept so existing callers keep working. The result is no
            longer sized by ``len(y)`` (a longer ``y`` used to leave trailing
            ``0.0`` placeholders that read as class 0).
        theta: dict mapping class key -> weight vector.

    Returns:
        list with one predicted class per sample of ``x``, in order.
    """
    # arg_max ignores its second argument, so no label is needed to predict.
    return [arg_max(x[i], None, theta) for i in range(len(x))]

In [10]:
def mse_calculator(pred, y):
    """Mean squared error between predictions and true values.

    mse = 1/m * sum((yhat - y) ** 2), where m = len(y).

    Args:
        pred: predicted values.
        y: true values. Pairs are formed with ``zip``, so extra elements in
            the longer input are ignored, while the divisor is always
            ``len(y)``.

    Returns:
        The mean squared error as a floating-point value.

    Raises:
        ZeroDivisionError: if ``y`` is empty.
    """
    squared_errors = [(p - t) ** 2 for p, t in zip(pred, y)]
    # Divide by a float: on Python 2 an integer sum divided by an integer
    # length would floor the result (e.g. 1/2 -> 0).
    return sum(squared_errors) / float(len(y))

In [11]:
def model_eval(actual, predicted):
    """Compute evaluation metrics from the confusion matrix.

    The confusion matrix has the true classes as rows and the predicted
    classes as columns. Per class:

        precision = TP / (TP + FP)   (diagonal / column sum)
        recall    = TP / (TP + FN)   (diagonal / row sum)
        f-measure = 2 * precision * recall / (precision + recall)

    and overall accuracy = (sum of the diagonal) / (number of samples).

    Args:
        actual: true class labels.
        predicted: predicted class labels, in the same order as ``actual``.

    Returns:
        tuple ``(confusion_matrix, accuracy, precision, recall, fmeasure)``:
        the confusion matrix as a DataFrame, the accuracy as a float and
        three arrays with one entry per class. An entry is ``nan`` when its
        denominator is zero (e.g. precision of a class never predicted).
    """
    truth = pd.Series(actual, name='Truth')
    guess = pd.Series(predicted, name='Predicted')
    confusion_matrix = pd.crosstab(truth, guess)

    # crosstab only creates rows/columns for labels it actually saw. Make the
    # matrix square over the union of labels so that the diagonal really
    # holds the true positives of each class.
    labels = confusion_matrix.index.union(confusion_matrix.columns)
    confusion_matrix = confusion_matrix.reindex(
        index=pd.Index(labels, name='Truth'),
        columns=pd.Index(labels, name='Predicted'),
        fill_value=0)

    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    arr_cm = confusion_matrix.values
    diag = arr_cm.diagonal()
    accuracy = float(np.sum(diag)) / np.sum(arr_cm)
    # 0/0 is reported as nan; silence the accompanying numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Rows are truth, so row sums are TP + FN -> recall.
        recall = np.divide(diag, 1.0 * np.sum(arr_cm, axis=1))
        # Columns are predictions, so column sums are TP + FP -> precision.
        precision = np.divide(diag, 1.0 * np.sum(arr_cm, axis=0))
        fmeasure = 2 * ((precision * recall) / (precision + recall))
    return confusion_matrix, accuracy, precision, recall, fmeasure

In [12]:
def x_fold_validation(x, y, nfolds=10, shuffle=True, random_state=23):
    """Run k-fold cross validation and print the evaluation metrics.

    The train and test indices are produced by the ``KFold`` helper imported
    at the top of the file. For every fold a model is trained on the
    training part, the test part is predicted, and the fold's confusion
    matrix is printed. After the last fold the averaged error rate,
    accuracy, precision, recall and f-measure are printed.

    Args:
        x: feature matrix, indexable by an array of row indices.
        y: true labels, indexable by an array of row indices.
        nfolds: number of folds (10 by default).
        shuffle: forwarded to ``KFold`` (previously ignored; ``True`` was
            always used).
        random_state: forwarded to ``KFold`` (previously ignored; ``23`` was
            always used).

    Returns:
        None. All results are printed.
    """
    confusion_matrix_list = []
    accuracy_list = []
    precision_list = []
    recall_list = []
    fmeasure_list = []
    mse = []  # per-fold MSE is collected but is not part of the printed summary
    # Pass the caller's settings through instead of hard-coding them.
    cv = KFold(len(y), nfolds, shuffle=shuffle, random_state=random_state)
    for train_idx, test_idx in cv:
        x_train = x[train_idx]
        y_train = y[train_idx]
        x_test = x[test_idx]
        y_test = y[test_idx]
        theta = calculate_params(x_train, y_train)
        pred = predict(x_test, y_test, theta)
        mse.append(mse_calculator(pred, y_test))
        conf_matrix, accuracy, precision, recall, fmeasure = model_eval(y_test.tolist(), pred)
        print(conf_matrix)
        confusion_matrix_list.append(conf_matrix)
        accuracy_list.append(accuracy)
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    # Each print call gets one pre-formatted string, so the text is the same
    # on Python 2 and Python 3.
    print('################################')
    print('Average of the model parameters')
    print('################################')
    print('Error rate: %s' % (1 - np.mean(accuracy_list),))
    print('Accuracy: %s' % (np.mean(accuracy_list),))
    print('Precision: %s' % (np.mean(precision_list, axis=0),))
    print('Recall: %s' % (np.mean(recall_list, axis=0),))
    print('F-measure: %s' % (np.mean(fmeasure_list, axis=0),))
    
        

In [13]:
# Run cross validation with the function's default arguments; prints each
# fold's confusion matrix followed by the averaged metrics.
x_fold_validation(x,y)

Predicted  0  1  2
Truth             
0          6  0  0
1          0  4  0
2          0  0  5
Predicted  0  1  2
Truth             
0          6  0  0
1          0  4  0
2          0  0  5
Predicted  0  1  2
Truth             
0          6  0  0
1          0  6  0
2          0  0  3
Predicted  0  1  2
Truth             
0          3  0  0
1          0  3  1
2          0  0  8
Predicted  0  1  2
Truth             
0          6  0  0
1          0  4  1
2          0  0  4
Predicted  0  1  2
Truth             
0          2  0  0
1          0  5  1
2          0  0  7
Predicted  0  1  2
Truth             
0          3  0  0
1          0  5  1
2          0  0  6
Predicted  0  1  2
Truth             
0          5  0  0
1          0  6  1
2          0  0  3
Predicted  0  1  2
Truth             
0          4  0  0
1          0  4  0
2          0  0  7
Predicted  0  1  2
Truth             
0          9  0  0
1          0  3  1
2          0  0  2
################################
Average of the mo