Question 9: Handwritten digit recognition using a Gaussian Generative Model

The solution has the following parts:

1. Load the data
2. Prepare the data by splitting the train data into training set and validation set
3. Create a classifier routine using the `multivariate_normal` function from `scipy.stats`
4. Iterate over several smoothing factors to find the one that performs best on the validation set
5. Apply the best smoothing factor on the test data set and calculate the accuracy
6. Display a few misclassified digits and their posterior probabilities


In [1]:
# Switch to the directory that the later cells read train.csv / test.csv from.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
%cd /Users/Deepthi/Documents/Personal/Kaggle/01_Digit Recognizer/Data

/Users/Deepthi/Documents/Personal/Kaggle/01_Digit Recognizer/Data


In [2]:
# Import the required libraries
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# on newer versions `train_test_split` lives in `sklearn.model_selection`.
from struct import unpack
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd
from sklearn.cross_validation import train_test_split;
import math 
from scipy.stats import multivariate_normal

### Load the data

In [3]:
# Read both CSV files into float arrays; genfromtxt stores NaN for any field
# it cannot parse as a number.
# NOTE(review): row 0 is presumably the CSV header, which would explain why
# the next cell slices it off -- confirm against the files.
from numpy import genfromtxt
train_data, test_data = (
    genfromtxt(csv_name, delimiter=',') for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Row 0 of each array is dropped.  In the training array column 0 becomes the
# flat label vector and columns 1..784 become the image features; every
# column of the test array is kept as image features.
train_labels = train_data[1:, :1].ravel()
train_images = train_data[1:, 1:785]
test_images = test_data[1:]

### Prepare the data by splitting the train data into training set and validation set

In [5]:
# Hold out 10000 labelled rows as a validation set; the fixed random_state
# makes the split reproducible from run to run.
train_set, validation_set, train_label, validation_label = train_test_split(
    train_images,
    train_labels,
    test_size=10000,
    random_state=4,
)

### Create a classifier routine using the multivariate_normal function

In [6]:
# Fit one smoothed Gaussian per digit class and score it on a labelled data set
def classifier(c, data_set, label_set, fit_set=None, fit_label=None):
    """Score a Gaussian generative classifier for smoothing factor ``c``.

    For each digit 0-9 a multivariate normal is fitted to the fitting rows
    that carry that label: the class mean, and the class covariance plus
    ``c`` times the identity matrix.  Every row of ``data_set`` is assigned
    to the class with the largest ``log(prior) + log-density`` and the
    predictions are compared with ``label_set``.

    Parameters
    ----------
    c : smoothing factor added to the diagonal of every class covariance.
    data_set : samples to classify, one per row.
    label_set : true labels of ``data_set``, in the same order.
    fit_set, fit_label : optional samples / labels used to fit the Gaussians.
        When omitted, the notebook-level ``train_set`` / ``train_label`` are
        used, which is what this function always did before these
        parameters existed.

    Returns
    -------
    tuple ``(c, error, accuracy)``; error and accuracy are percentages.

    NOTE: the class means and covariances are recomputed on every call, so a
    search over many values of ``c`` repeats that work each time.
    """
    if fit_set is None:
        fit_set = train_set
    if fit_label is None:
        fit_label = train_label
    fit_set = np.asarray(fit_set)
    fit_label = np.asarray(fit_label)

    # Size the smoothing term from the data instead of hard-coding 784.
    smoothing = c * np.identity(fit_set.shape[1])
    n_fit = float(len(fit_set))

    log_joint = []
    for digit in range(10):
        members = fit_set[fit_label == digit]
        prior = len(members) / n_fit
        cov = np.cov(members.T) + smoothing
        density = multivariate_normal(mean=members.mean(0), cov=cov)
        log_joint.append(np.log(prior) + density.logpdf(data_set))

    predicted = np.argmax(log_joint, axis=0)
    # Count the mismatches once, vectorised, and derive both figures from it.
    n_wrong = np.sum(predicted != np.asarray(label_set))
    error = n_wrong * 100 / (len(label_set) * 1.0)
    accuracy = 100 - error
    return c, error, accuracy

In [7]:
# Sanity check: (rows, features) of the training split
np.shape(train_set)

(32000, 784)

In [8]:
# Sanity check: there should be one label per training row
np.shape(train_label)

(32000,)

In [9]:
# Peek at the training labels as a flat array
np.ravel(train_label)

array([ 2.,  7.,  2., ...,  2.,  9.,  2.])

### Iterate over several smoothing factors to find the best one on the validation set

To obtain the optimum smoothing factor:

1. First search in steps of 1000 to find the region in which the best accuracy is achieved
2. Repeat the search in steps of 100, then 10, and then 1 to narrow down to the best smoothing value

In [10]:
# Coarse search: c = 1000, 2000, ..., 14000 (np.arange excludes the stop
# value 15000).
for c in np.arange(1000, 15000, 1000):
    # The call form of print emits the same tuple text under Python 2 and,
    # unlike the print statement, also parses under Python 3.
    print(classifier(c, validation_set, validation_label))

(1000, 5.3700000000000001, 94.629999999999995)
(2000, 4.8099999999999996, 95.189999999999998)
(3000, 4.7300000000000004, 95.269999999999996)
(4000, 4.7800000000000002, 95.219999999999999)
(5000, 4.8200000000000003, 95.180000000000007)
(6000, 4.9699999999999998, 95.030000000000001)
(7000, 5.1900000000000004, 94.810000000000002)
(8000, 5.3600000000000003, 94.640000000000001)
(9000, 5.4500000000000002, 94.549999999999997)
(10000, 5.5300000000000002, 94.469999999999999)
(11000, 5.7000000000000002, 94.299999999999997)
(12000, 5.8200000000000003, 94.180000000000007)
(13000, 5.9000000000000004, 94.099999999999994)
(14000, 6.0499999999999998, 93.950000000000003)


In [None]:
## Refine: c = 2000, 2100, ..., 3900 (np.arange excludes the stop value 4000)
for c in np.arange(2000, 4000, 100):
    # print(...) with a single argument behaves identically on Python 2 and 3.
    print(classifier(c, validation_set, validation_label))

In [None]:
## Refine: c = 2500, 2510, ..., 2690 (np.arange excludes the stop value 2700)
for c in np.arange(2500, 2700, 10):
    # print(...) with a single argument behaves identically on Python 2 and 3.
    print(classifier(c, validation_set, validation_label))

In [None]:
## Refine: c = 2610, 2611, ..., 2639 (np.arange excludes the stop value 2640)
for c in np.arange(2610, 2640, 1):
    # print(...) with a single argument behaves identically on Python 2 and 3.
    print(classifier(c, validation_set, validation_label))

In [None]:
## Re-check the selected smoothing value on the validation set
# print(...) with a single argument behaves identically on Python 2 and 3.
print(classifier(2613, validation_set, validation_label))

### Apply the best smoothing factor on the test data set and calculate the accuracy

In [11]:
## Predict digit labels for unlabelled data with Gaussians fitted on all of
## the labelled training data (no accuracy is computed here: no labels are
## passed in)

def classifiertest(c, data_set, fit_set=None, fit_label=None):
    """Predict a digit (0-9) for every row of ``data_set``.

    For each digit a multivariate normal is fitted to the fitting rows that
    carry that label: the class mean, and the class covariance plus ``c``
    times the identity matrix.  Each row of ``data_set`` is assigned to the
    class with the largest ``log(prior) + log-density``.

    Parameters
    ----------
    c : smoothing factor added to the diagonal of every class covariance.
    data_set : samples to classify, one per row.
    fit_set, fit_label : optional samples / labels used to fit the Gaussians.
        When omitted, the notebook-level ``train_images`` / ``train_labels``
        are used, matching the data this function was always fitted on.

    Returns
    -------
    Array of predicted class indices, one per row of ``data_set``.
    """
    if fit_set is None:
        fit_set = train_images
    if fit_label is None:
        fit_label = train_labels
    fit_set = np.asarray(fit_set)
    fit_label = np.asarray(fit_label)

    # Size the smoothing term from the data instead of hard-coding 784.
    smoothing = c * np.identity(fit_set.shape[1])
    # Priors are normalised by the size of the data the class counts come
    # from.  The previous code divided by len(train_set) -- a different,
    # smaller array -- which scaled every prior by the same factor: the
    # argmax was unaffected, but the priors did not sum to 1.
    n_fit = float(len(fit_set))

    log_joint = []
    for digit in range(10):
        members = fit_set[fit_label == digit]
        prior = len(members) / n_fit
        cov = np.cov(members.T) + smoothing
        density = multivariate_normal(mean=members.mean(0), cov=cov)
        log_joint.append(np.log(prior) + density.logpdf(data_set))

    return np.argmax(log_joint, axis=0)


# Predict the test images using the smoothing value 2613 that the validation
# cell above re-checks.
testoutput = classifiertest(c=2613, data_set=test_images)

In [12]:
# Convert every prediction to a plain Python int
testoutput = list(map(int, testoutput))

In [13]:
# Wrap the predictions in a single-column DataFrame ready for export
df = pd.DataFrame(data=testoutput)

In [14]:
# Write the predictions as comma-separated text.  With these arguments pandas
# also writes the row index and a header row.
# NOTE(review): the file name has no .csv extension and the columns are not
# named -- confirm this matches the format the submission expects.
df.to_csv(path_or_buf='final_output_2', sep=',')