In [None]:
#from supportFunctions import *
import cPickle as pickle
from __future__ import division
import numpy as np
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd 

import scipy.cluster.hierarchy as hcluster
from sklearn.metrics import confusion_matrix,classification_report



In [None]:
def extractFeatures(imageDict, dist = 5):
    """Build per-pixel patch features and finger labels for one image.

    For every pixel (i, j) with h1 <= i < h2 and v1 <= j < v2 the window
    ``img[i-dist-1:i+dist, j-dist-1:j+dist]`` is flattened into one feature
    row, and the matching label is 1 when (i, j) appears in
    ``imageDict['allFingerPoints']`` and 0 otherwise.

    Parameters
    ----------
    imageDict : dict
        Reads the keys 'numPointsInBox', 'boxEdges' (unpacked as
        v1, v2, h1, h2), 'image1bit' and 'allFingerPoints'.
    dist : int
        Half-width used to size the window around each pixel.

    Returns
    -------
    (X, y) : tuple
        X stacks one flattened window per pixel (row-major over the box);
        y is a float vector of length ``numPointsInBox``.
    """
    n = imageDict['numPointsInBox']
    # NOTE(review): assumes numPointsInBox == (h2-h1)*(v2-v1) -- confirm against the data builder.
    y = np.zeros(n)

    v1, v2, h1, h2 = imageDict['boxEdges']
    img = imageDict['image1bit']

    # Build the lookup once: the original rescanned every finger point for every
    # pixel and raised ValueError (max of an empty sequence) when there were none.
    fingerPoints = np.asarray(imageDict['allFingerPoints'])
    fingerLookup = set(tuple(point) for point in fingerPoints.tolist())

    featuresList = []
    c = 0
    for i in range(h1, h2):
        for j in range(v1, v2):
            # NOTE(review): a box closer than dist+1 to the top/left border gives
            # negative slice starts, hence wrong-sized windows -- confirm boxes are padded.
            window = img[i-dist-1:i+dist, j-dist-1:j+dist]
            featuresList.append(window.ravel())

            if (i, j) in fingerLookup:
                y[c] = 1

            c = c + 1

    X = np.vstack(featuresList)
    return (X, y)
    

In [None]:
def extractExamplesFromList(imageList, dist = 5):
    """Run extractFeatures on every image and concatenate the results.

    Returns a tuple ``(x, y)``: the per-image feature matrices stacked
    vertically and the per-image label vectors joined end to end, both in
    the order of ``imageList``.
    """
    perImage = [extractFeatures(imageDict, dist = dist) for imageDict in imageList]

    featureBlocks = [features for features, _ in perImage]
    targetBlocks = [target for _, target in perImage]

    return np.vstack(featureBlocks), np.hstack(targetBlocks)

In [None]:
def Reshape_to_yHatMatrix(testingExamples, yHat):
    """Cut a flat label vector back into one 2-D matrix per example.

    Consecutive runs of ``boxHeight * boxWidth`` entries are taken from
    ``yHat`` for each example, in order, and reshaped to
    ``(boxHeight, boxWidth)``. Returns the list of matrices.
    """
    matrices = []
    offset = 0
    for example in testingExamples:
        shape = (example['boxHeight'], example['boxWidth'])
        count = shape[0] * shape[1]
        # Slice this example's share of the vector, then advance the cursor.
        matrices.append(yHat[offset:offset + count].reshape(shape))
        offset += count

    return matrices

In [None]:
def remove_small_clusters(clusters, min_finger_pixel):
    """Drop the entries of every cluster that has too few members.

    Parameters
    ----------
    clusters : array-like, 1-D
        Cluster label of each point (a plain list is accepted too).
    min_finger_pixel : int
        Clusters with fewer than this many members are removed.

    Returns
    -------
    numpy.ndarray
        The labels of the surviving points, in their original order.
    """
    labels = np.asarray(clusters).ravel()
    # Count each label once, then map the counts back onto every point.
    _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)
    keep = counts[inverse] >= min_finger_pixel
    return labels[keep]

In [None]:
pickleFileName = 'fingerDataSet' + '.pickle'
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
# The with-block closes the file even when unpickling raises.
with open(pickleFileName, 'rb') as pickleFile:
    data = pickle.load(pickleFile)

In [None]:
# Show the 'image' entry of example 51 (index 51 is part of the test split built later).
plt.imshow(data[51]['image'])
plt.show()

In [None]:
# Training set: examples 0..44, features extracted with dist = 4.
trainingExampleIndices = np.arange(45)
trainingExamples = [data[idx] for idx in trainingExampleIndices]
trainX, trainY = extractExamplesFromList(trainingExamples, dist = 4)

# Test set: examples 45..53, same feature settings.
testingExampleIndices = list(range(45, 54))
testingExamples = [data[idx] for idx in testingExampleIndices]
testX, testY = extractExamplesFromList(testingExamples, dist = 4)


In [None]:
# Depth-limited entropy tree. random_state is pinned (as for the other models in
# this notebook) so ties between equally good splits are broken the same way on every run.
clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, random_state = 0)
clf = clf.fit(trainX, trainY)
yHat = clf.predict(testX)

In [None]:
# Score the fitted decision tree on the test split (shown as the cell output).
clf.score(testX, testY)

In [None]:
# Build the confusion matrix once; unpack its four cells and display it.
tree_confusion = confusion_matrix(testY, yHat)
tn, fp, fn, tp = tree_confusion.ravel()
tree_confusion


In [None]:
Y_hat = Reshape_to_yHatMatrix(testingExamples, yHat)

# Decision-tree predictions for the first three test images, side by side.
fig = plt.figure(0, (8, 4))
for panel in (1, 2, 3):
    plt.subplot(1, 3, panel)
    plt.imshow(Y_hat[panel - 1])
plt.show()

In [None]:
# Row / column indices of the pixels predicted as 1 in the second test image.
X_i, X_j = np.where(Y_hat[1] == 1)

# One (row, col) pair per predicted pixel.
data_cl = np.column_stack((X_i, X_j))
data_cl.shape

In [None]:
# Hierarchical clustering of the predicted pixels with a distance cut-off.
thresh = 1.5
clusters = hcluster.fclusterdata(data_cl, thresh, criterion="distance")
# Discard clusters with fewer than 15 points (treated as false positives).
new_clusters = remove_small_clusters(clusters, 15)

# Scatter of all points coloured by cluster; the title counts surviving clusters.
plt.scatter(data_cl[:, 0], data_cl[:, 1], c=clusters)
plt.axis("equal")
finger_count = len(set(new_clusters))
title = "threshold: %f, number of fingers: %d" % (thresh, finger_count)
plt.title(title)
plt.show()


In [None]:
# Same clusters with column index on x and row index on y.
col_idx, row_idx = data_cl[:, 1], data_cl[:, 0]
plt.scatter(col_idx, row_idx, c=clusters)
plt.axis("equal")
finger_count = len(set(new_clusters))
title = "threshold: %f, number of fingers: %d" % (thresh, finger_count)
plt.title(title)
plt.show()

In [None]:
def plots_estimates_and_actuals(testingExampleIndices,yHat, testY, data):
    """
    Overlay estimated and actual labels on each test image, one subplot per image.

    testingExampleIndices : indices into ``data`` of the images to draw
    yHat, testY           : flat vectors of estimated / actual labels, laid out
                            image after image in the order of the indices
    data                  : the list of image dictionaries ('croppedImage',
                            'boxHeight' and 'boxWidth' are read)
    """
    # Take the examples from the arguments; the original silently relied on the
    # notebook-global `testingExamples`.
    examples = [data[index] for index in testingExampleIndices]
    Y_hat = Reshape_to_yHatMatrix(examples, yHat)
    Y_test = Reshape_to_yHatMatrix(examples, testY)

    n_panels = len(testingExampleIndices)
    fig = plt.figure(0, (8, 6))
    for i in range(n_panels):
        fig.add_subplot(1, n_panels, i + 1)

        # Paint on copies: the original wrote the value 90 straight into the
        # dataset's stored 'croppedImage'.
        im = examples[i]['croppedImage'].copy()
        im2 = im.copy()

        #Paint with matches:
        im[Y_test[i]==1] = [90]
        im2[Y_hat[i]==1] = 0

        # Estimates underneath, actual labels blended on top.
        plt.imshow(im2, interpolation = 'none')
        plt.imshow(im, interpolation = 'none', alpha = 0.3)
    plt.show()

In [None]:
def plots_estimates_and_actuals_YGM(testingExampleIndices,yHat, testY, data):
    """
    Colour overlay of estimated (blue) and actual (red) labels, one subplot per image.

    testingExampleIndices : indices into ``data`` of the images to draw
    yHat, testY           : flat vectors of estimated / actual labels, laid out
                            image after image in the order of the indices
    data                  : the list of image dictionaries ('croppedImage',
                            'boxHeight' and 'boxWidth' are read)
    """
    # Take the examples from the arguments; the original silently relied on the
    # notebook-global `testingExamples`.
    examples = [data[index] for index in testingExampleIndices]
    Y_hat = Reshape_to_yHatMatrix(examples, yHat)
    Y_test = Reshape_to_yHatMatrix(examples, testY)

    n_panels = len(testingExampleIndices)
    fig = plt.figure(0, (n_panels + 12, n_panels + 8))
    for i in range(n_panels):
        fig.add_subplot(1, n_panels, i + 1)

        im = examples[i]['croppedImage']
        # Replicate the image scaled by 1/255 into three identical colour channels.
        scaled = 1./255*im
        grayim = np.dstack((scaled, scaled, scaled))
        grayim2 = grayim.copy()

        #Emphasizing test and estimation comparison through pixel coloring
        grayim[Y_test[i]==1] = (1, 0, 0)
        grayim2[Y_hat[i]==1] = (0, 0, 1)

        plt.imshow(grayim2, interpolation = 'none')
        plt.imshow(grayim, interpolation = 'none', alpha = 0.3)
    plt.show()

In [None]:
def plot_confusion_matrix(testY, yHat, is_normalized = True):
    """Draw the confusion matrix of ``yHat`` against ``testY`` as a heatmap.

    Parameters
    ----------
    testY : actual labels
    yHat : labels produced by the classifier
    is_normalized : bool
        When true, every cell is divided by the total number of samples.

    scikit-learn's ``confusion_matrix`` puts the actual classes on the rows and
    the predicted classes on the columns, so the rows are labelled "True-*" and
    the columns "Classifier-*" (the original had these two swapped).
    """
    counts = confusion_matrix(testY, yHat)
    if is_normalized:
        # float() keeps this a true division regardless of the division mode.
        matrix = counts / float(counts.sum())
    else:
        matrix = counts

    df_confusion = pd.DataFrame(matrix, index = ["True-0", "True-1"], columns = ["Classifier-0", "Classifier-1"])

    plt.figure(figsize = (9,6))
    sn.set(font_scale=2)
    sn.heatmap(df_confusion, annot = True, fmt='.4g')
    plt.show()
        
    

In [None]:
# Cluster scatter again: column index on x, row index on y.
points = np.array(data_cl)
plt.scatter(points[:, 1], points[:, 0], c=clusters)
plt.axis("equal")
finger_count = len(set(new_clusters))
title = "threshold: %f, number of fingers: %d" % (thresh, finger_count)
plt.title(title)
plt.show()



In [None]:
# Overlay the decision-tree estimates (yHat) and the actual labels on every test image.
plots_estimates_and_actuals(testingExampleIndices,yHat, testY, data)

In [None]:
# Same comparison using the colour-coded variant.
plots_estimates_and_actuals_YGM(testingExampleIndices,yHat, testY, data)

In [None]:
# Normalized confusion-matrix heatmap for the decision-tree predictions.
plot_confusion_matrix(testY, yHat, is_normalized = True)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression model fitted on the same training features.
classifier = LogisticRegression(random_state = 0)
classifier.fit(trainX, trainY)

In [None]:
# Logistic-regression predictions for the test features.
y_hat_logi = classifier.predict(testX)

In [None]:
Y_hat = Reshape_to_yHatMatrix(testingExamples, y_hat_logi)

# Logistic-regression predictions for the first three test images, side by side.
fig = plt.figure(0, (8, 4))
for panel in (1, 2, 3):
    plt.subplot(1, 3, panel)
    plt.imshow(Y_hat[panel - 1])
plt.show()

In [None]:
# Normalized confusion-matrix heatmap for the logistic-regression predictions.
plot_confusion_matrix(testY, y_hat_logi, is_normalized = True)

In [None]:
# Score the logistic-regression model on the test split (shown as the cell output).
classifier.score(testX, testY)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Ten-tree random forest using the entropy criterion, seeded for repeatability.
classifier_RF = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier_RF.fit(trainX, trainY)
y_hat_RF = classifier_RF.predict(testX)

In [None]:
Y_hat = Reshape_to_yHatMatrix(testingExamples, y_hat_RF)

# Random-forest predictions for the first three test images, side by side.
fig = plt.figure(0, (8, 4))
for panel in (1, 2, 3):
    plt.subplot(1, 3, panel)
    plt.imshow(Y_hat[panel - 1])
plt.show()

In [None]:
# Normalized confusion-matrix heatmap for the random-forest predictions.
plot_confusion_matrix(testY, y_hat_RF, is_normalized = True)

In [None]:
# Score the random forest on the test split (shown as the cell output).
classifier_RF.score(testX, testY)

In [None]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers (54, 27, 3). MLP training is stochastic, so the seed is
# pinned like the other models in this notebook to make re-runs repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(54, 27, 3), random_state=0)
mlp.fit(trainX , trainY)


In [None]:
# MLP predictions for the test features.
y_hat_ANN = mlp.predict(testX)


# Confusion matrix of the MLP predictions (shown as the cell output).
confusion_matrix(testY, y_hat_ANN)



In [None]:
Y_hat = Reshape_to_yHatMatrix(testingExamples, y_hat_ANN)

# MLP predictions for the first three test images, side by side.
fig = plt.figure(0, (8, 4))
for panel in (1, 2, 3):
    plt.subplot(1, 3, panel)
    plt.imshow(Y_hat[panel - 1])
plt.show()

In [None]:
# Normalized confusion-matrix heatmap for the MLP predictions.
plot_confusion_matrix(testY, y_hat_ANN, is_normalized = True)

In [None]:
# Score the MLP on the test split (shown as the cell output).
mlp.score(testX, testY)

In [None]:
# Classify using the raw data
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
# sklearn.cross_validation no longer exists in current scikit-learn;
# cross_val_score lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score


# Seed the MLP so the cross-validated score is repeatable.
pipeRAW = Pipeline([ ('mlp',MLPClassifier(hidden_layer_sizes=(54, 27, 3), random_state=0))])


# NOTE: this rebinds trainingExampleIndices / trainingExamples to all 54 examples.
trainingExampleIndices = np.arange(0,54)
trainingExamples = [data[index] for index in trainingExampleIndices]
allX, allY = extractExamplesFromList(trainingExamples, dist = 4)

print('Classification Score Without Using PCA:', cross_val_score(pipeRAW, allX, allY).mean())


In [None]:
# Classify using the raw data
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score


# Seed the MLP so the cross-validated score is repeatable.
pipeRAW = Pipeline([ ('mlp',MLPClassifier(hidden_layer_sizes=(54, 27, 3), random_state=0))])


# NOTE: this rebinds trainingExampleIndices / trainingExamples to all 54 examples.
trainingExampleIndices = np.arange(0,54)
trainingExamples = [data[index] for index in trainingExampleIndices]
allX, allY = extractExamplesFromList(trainingExamples, dist = 4)

print('Classification Score Without Using PCA:', cross_val_score(pipeRAW, allX, allY).mean())

In [None]:
# Classify using the PCA-processed data
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# sklearn.cross_validation no longer exists in current scikit-learn;
# cross_val_score lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score


# Both steps can be stochastic (randomized SVD solver, MLP initialisation),
# so both are seeded to make the score repeatable.
pipePCA = Pipeline([('pca', PCA(n_components=22, random_state=0)), ('mlp',MLPClassifier(hidden_layer_sizes=(54, 27, 3), random_state=0))])


# NOTE: this rebinds trainingExampleIndices / trainingExamples to all 54 examples.
trainingExampleIndices = np.arange(0,54)
trainingExamples = [data[index] for index in trainingExampleIndices]
allX, allY = extractExamplesFromList(trainingExamples, dist = 4)

print('Classification Score After Using PCA:',cross_val_score(pipePCA, allX, allY).mean())

In [None]:
# Classify using the PCA-processed data
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score


# Both steps can be stochastic (randomized SVD solver, MLP initialisation),
# so both are seeded to make the score repeatable.
pipePCA = Pipeline([('pca', PCA(n_components=22, random_state=0)), ('mlp',MLPClassifier(hidden_layer_sizes=(54, 27, 3), random_state=0))])


# NOTE: this rebinds trainingExampleIndices / trainingExamples to all 54 examples.
trainingExampleIndices = np.arange(0,54)
trainingExamples = [data[index] for index in trainingExampleIndices]
allX, allY = extractExamplesFromList(trainingExamples, dist = 4)

print('Classification Score After Using PCA:',cross_val_score(pipePCA, allX, allY).mean())

## PCA investigation

In [None]:
# Show the 'image' entry of example 12, which the PCA cells below work on.
plt.imshow(data[12]['image'])
plt.show()

In [None]:
# Use the image itself as the data matrix.
# NOTE(review): assumes the image is a 2-D array -- np.cov does not accept more dimensions; confirm.
X = data[12]['image']
# Covariance with np.cov's default layout (each row of X is one variable).
C = np.cov(X)
# Rank of the covariance matrix (shown as the cell output).
np.linalg.matrix_rank(C)

In [None]:
# Shape of the image used as X (shown as the cell output).
data[12]['image'].shape

$C = L P L^T$, where $L$ is the orthonormal matrix whose columns are the eigenvectors of $C$ and $P$ is the diagonal matrix of its eigenvalues.

In [None]:
P, L = np.linalg.eigh(C) # Eigendecomposition of the covariance matrix C: P = eigenvalues (1-D array), L = eigenvectors (columns)

In [None]:
# NOTE(review): P is a 1-D eigenvalue array, not a matrix, so the second rank is not
# the rank of the diagonal eigenvalue matrix -- np.diag(P) may have been intended; confirm.
np.linalg.matrix_rank(L), np.linalg.matrix_rank(P)  # P is an eigenvalues array! 

In [None]:
# np.linalg.eigh(C) returns eigenvalues in ascending order; switch to descending.
# Sorting explicitly (rather than reversing) keeps this cell idempotent:
# running it a second time no longer flips the order back to ascending.
descending = np.argsort(P)[::-1]
P = P[descending]
L = L[:, descending]

In [None]:
np.allclose(L.dot(np.diag(P)).dot(L.T), C)# True when L * diag(P) * L^T reproduces C, i.e. the decomposition is consistent

In [None]:
# Eigenvalue spectrum on a log scale. The x values are 1-based so they agree
# with the axis limits; plotting against the default 0-based positions placed
# the first (largest) eigenvalue outside xlim and hid it.
eigen_index = np.arange(1, P.shape[0] + 1)
plt.figure(figsize = (8,6))
plt.semilogy(eigen_index, P, '-o')
plt.xlim([1, P.shape[0]])
plt.xlabel('eigenvalue index')
plt.ylabel('eigenvalue in a log scale')
plt.title('Eigenvalues of Covariance Matrix');
plt.show()

In [None]:
V = L.T.dot(X) # Project X onto the eigenvector basis; V holds the resulting coefficients

In [None]:
Re_X = L.dot(V) # Rebuild X by combining all eigenvectors with their coefficients

In [None]:
np.allclose(Re_X, X)# True when the reconstruction from all eigenvectors matches X

In [None]:
# Cumulative share of the eigenvalue total, in the current order of P.
normed_cumsum = np.cumsum(P) / np.sum(P)

# Position of the first entry above the 0.99 mark (count of entries <= 0.99).
n_within_99 = len(np.where(normed_cumsum <= 0.99)[0])

plt.figure(figsize = (10,8))
plt.plot(normed_cumsum, '-o')
plt.scatter(n_within_99, normed_cumsum[n_within_99], marker=(5,1,0), color='r', s=1000)
plt.title('Cumulative Sum of the Proportion of Total Variance')
plt.xlabel('index')
plt.ylabel('Proportion');
plt.grid(True)
plt.show()

In [None]:
len(np.where(normed_cumsum <= 0.99)[0])# Number of leading components whose cumulative proportion is still <= 0.99; used to choose the approximation rank k

In [None]:
# Cumulative proportion at index 21, i.e. after 22 components.
# NOTE(review): presumably tied to n_components=22 in pipePCA -- confirm.
normed_cumsum[21]

In [None]:
# The cumulative proportions that are still <= 0.99 (shown as the cell output).
normed_cumsum[normed_cumsum <= 0.99]

In [None]:
len(normed_cumsum[normed_cumsum <= 0.99])# Same count as the np.where version above, written with a boolean mask

In [None]:
# Find the first k whose leading eigenvalues reach 99% of the total.
# A running sum replaces re-summing the prefix on every iteration; the
# additions happen in the same order, so the ratios are unchanged.
total_variance = sum(P)
explained = 0
for k in range(len(P)):
    explained = explained + P[k]
    ratio = explained / total_variance
    if ratio >= 0.99:
        break
print("Pick the largest:", k+1)

In [None]:
# One separate figure per rank: rebuild X from the first `rank` eigenvectors.
k_list = [5, 10, 20, 30, 40, 50, 60, 70]
for rank in k_list:
    X_tilda_k = L[:, :rank].dot(V[:rank, :])

    plt.figure(figsize = (6,6))
    plt.imshow(X_tilda_k)
    plt.title('Approximated Image with k ='+' '+str(rank));
    plt.show()


In [None]:
# All rank-k approximations in one 2-row grid.
k_list = [5, 10, 20, 30, 40, 50, 60, 70]
# Integer column count: `len(k_list)/2` is a float under true division, which
# add_subplot does not accept. Rounding up also covers an odd number of ranks.
n_cols = (len(k_list) + 1) // 2

# Create the figure once and show it once after the loop; showing inside the
# loop emitted each panel separately instead of building the grid.
fig = plt.figure(0, (len(k_list) + 14, len(k_list) + 10))
for i in range(len(k_list)):
    X_tilda_k = L[:,0:k_list[i]].dot(V[0:k_list[i],:])

    fig.add_subplot(2, n_cols, i+1)

    plt.imshow(X_tilda_k)
    plt.title('Approximated Image with k ='+' '+str(k_list[i]));
plt.show()