In [None]:
%pylab inline

In [1]:
import numpy as np
import matplotlib.pyplot as pt
import matplotlib.image as img
from PIL import Image
from skimage import color
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import preprocessing

from sklearn import linear_model, datasets
from sklearn import tree
from sklearn.externals.six import StringIO
import csv
from sklearn import metrics, cross_validation
from numpy import linalg as LA

#Returns dictionary of labels; key is label as string, value is a unique index
def get_category_labels_as_dict(path='./CS5785-final-data/train.txt'):
    """Map every distinct label in a training list file to a unique index.

    Each line is split on a single space and its second token is taken as
    the label. Indices are handed out as 0, 1, 2, ... in order of first
    appearance in the file.

    Parameters:
        path: file to read; defaults to the training list used by this
            notebook.

    Returns:
        dict mapping label string -> int index.

    Raises:
        IndexError: if a line has no second space-separated token.
    """
    category_dict = {}
    with open(path) as f:
        for line in f.read().splitlines():
            label = line.split(' ')[1]
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if label not in category_dict:
                # The current size is always the next unused index.
                category_dict[label] = len(category_dict)
    return category_dict

#Reads the training list and returns its label column as a list of strings
def get_train_labels():
    """Return the label of every line in './CS5785-final-data/train.txt'.

    The label is the second token obtained by splitting the line on a single
    space. Labels are returned in file order, duplicates included.
    """
    with open('./CS5785-final-data/train.txt') as label_file:
        rows = label_file.read().splitlines()
    return [row.split(' ')[1] for row in rows]

#Encodes string labels as integer indices using a label -> index dictionary
#Usage: get_train_labels_as_unique_indices(get_category_labels_as_dict(), get_train_labels())
def get_train_labels_as_unique_indices(label_dictionary, train_label_array):
    """Look up each label in `label_dictionary` and return the indices.

    Parameters:
        label_dictionary: dict mapping label string -> index.
        train_label_array: sequence of label strings.

    Returns:
        numpy array holding, position by position, the dictionary value of
        each label in `train_label_array`.

    Raises:
        KeyError: if a label is missing from `label_dictionary`.
    """
    return np.array([label_dictionary[name] for name in train_label_array])

#Translates index values back into string labels
def get_labels_from_indices(label_dictionary, Y_predicted):
    """Translate predicted indices back into their label strings.

    Parameters:
        label_dictionary: dict mapping label string -> index.
        Y_predicted: sequence of predicted indices.

    Returns:
        list of label strings, in the order of `Y_predicted`.

    Note:
        An index with no matching entry in `label_dictionary` is skipped, so
        the result can be shorter than `Y_predicted`.
    """
    # Invert the mapping once instead of scanning every key for every
    # prediction. setdefault keeps the first label seen for an index, which
    # matches a first-match linear scan should two labels share an index.
    index_to_label = {}
    for label, index in label_dictionary.items():
        index_to_label.setdefault(index, label)

    labels = []
    for target in Y_predicted:
        if target in index_to_label:
            labels.append(index_to_label[target])
    return labels

#Writes the CSV file
def print_output(Y_predicted, output_path='kaggle_submission_vote.csv',
                 test_list_path='./CS5785-final-data/test.txt'):
    """Write predictions as an 'ID,Category' CSV file.

    Row i of the output pairs line i of the test list file with
    Y_predicted[i]; a header row ['ID', 'Category'] is written first.

    Parameters:
        Y_predicted: sequence of predicted labels, in test-list order.
        output_path: CSV file to create or overwrite.
        test_list_path: file whose lines are used as the ID column.

    Raises:
        IndexError: if `Y_predicted` has more entries than the test list
            has lines.
    """
    import sys

    with open(test_list_path) as f:
        lines = f.read().splitlines()

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; "wb" alone fails on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(output_path, 'wb')
    else:
        csvfile = open(output_path, 'w', newline='')

    with csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['ID', 'Category'])
        for index in range(len(Y_predicted)):
            writer.writerow([lines[index], Y_predicted[index]])
 
#Mean k-fold accuracy of a model
def cross_validation_accuracy(X, Y, folds, model):
    """Estimate accuracy by averaging per-fold accuracy over a K-fold split.

    Parameters:
        X: feature container indexed with the index arrays yielded by
            cross_validation.KFold (presumably a numpy array -- confirm).
        Y: target container indexed the same way as X.
        folds: number of folds passed to KFold as `n_folds`; also the
            divisor of the final average.
        model: object with fit(X, Y) returning something that has
            predict(X). The same object is re-fitted once per fold.

    Returns:
        Sum of the per-fold accuracies divided by float(folds).
    """
    fold_scores = []
    splitter = cross_validation.KFold(len(X), n_folds=folds)

    for fit_idx, held_out_idx in splitter:
        fitted = model.fit(X[fit_idx], Y[fit_idx])
        guesses = fitted.predict(X[held_out_idx])
        truth = Y[held_out_idx]

        # Fraction of held-out samples predicted correctly in this fold.
        hits = (guesses == truth).sum()
        fold_scores.append(hits / float(len(truth)))

    return sum(fold_scores) / float(folds)

#Parses one attribute file into a numpy array (shared by the train/test readers)
def _read_attribute_file(path):
    """Read an attribute file and return its values as a numpy array.

    Each line is split on a single space; the second token is a
    comma-separated list of integers that becomes one row of the result.

    Raises:
        IndexError: if a line has no second space-separated token.
        ValueError: if a value cannot be parsed by int().
    """
    with open(path) as f:
        rows = f.read().splitlines()
    return np.array([[int(value) for value in row.split(' ')[1].split(',')]
                     for row in rows])

#Returns attributes of train data as numpy array
def get_train_attributes(path='./CS5785-final-data/attributes_train.txt'):
    """Return the training attribute vectors, one row per line of `path`."""
    return _read_attribute_file(path)

#Returns attributes of test data as numpy array
def get_test_attributes(path='./CS5785-final-data/attributes_test.txt'):
    """Return the test attribute vectors, one row per line of `path`."""
    return _read_attribute_file(path)

#normalizes the data by normNumber-norm
def normalize(X_train, normNumber):
    """Scale every column of `X_train` to unit `normNumber`-norm.

    Parameters:
        X_train: 2-D array-like, one sample per row. It is copied, so the
            caller's data is never modified.
        normNumber: norm order, passed as `ord` to numpy.linalg.norm
            (e.g. 1 or 2).

    Returns:
        Float numpy array of the same shape in which each column has been
        divided by its own norm. Columns whose norm is 0 are returned
        unchanged (all zeros) rather than as NaN.
    """
    # Work in floating point: writing quotients into an integer array would
    # silently truncate them.
    X = np.array(X_train, dtype=float)
    norms = LA.norm(X, axis=0, ord=normNumber)

    # Avoid 0/0 for all-zero columns.
    norms = np.where(norms == 0, 1.0, norms)

    # Broadcasting divides each column by its own norm. This covers all
    # columns; dividing column 1 by values taken from column 0 was a bug.
    return X / norms

#Loads the SIFT feature array for the training set
def get_train_SIFT():
    """Return the array stored in './CS5785-final-data/SIFTBoW_train.npy'.

    The file is read with np.load; its contents are returned as-is.
    """
    sift_train_path = './CS5785-final-data/SIFTBoW_train.npy'
    return np.load(sift_train_path)

#Loads the SIFT feature array for the test set
def get_test_SIFT():
    """Return the array stored in './CS5785-final-data/SIFTBoW_test.npy'.

    The file is read with np.load; its contents are returned as-is.
    """
    sift_test_path = './CS5785-final-data/SIFTBoW_test.npy'
    return np.load(sift_test_path)

#Loads the AlexNet feature array for the training set
def get_train_ALEX():
    """Return the array stored in './CS5785-final-data/alexnet_feat_train.npy'.

    The file is read with np.load; its contents are returned as-is.
    """
    alexnet_train_path = './CS5785-final-data/alexnet_feat_train.npy'
    return np.load(alexnet_train_path)

#Loads the AlexNet feature array from the "10k" file
def get_train_ALEX_10k():
    """Return the array stored in './CS5785-final-data/alexnet_feat_10k.npy'.

    The file is read with np.load; its contents are returned as-is.
    NOTE(review): which data split the "10k" file covers is not evident
    from this notebook -- confirm.
    """
    alexnet_10k_path = './CS5785-final-data/alexnet_feat_10k.npy'
    return np.load(alexnet_10k_path)

#Loads the SIFT feature array from the "10k" file
def get_train_SIFT_10k():
    """Return the array stored in './CS5785-final-data/SIFTBoW_10k.npy'.

    The file is read with np.load; its contents are returned as-is.
    NOTE(review): the function name says "train", but which data split the
    "10k" file covers is not evident from this notebook -- confirm.
    """
    sift_10k_path = './CS5785-final-data/SIFTBoW_10k.npy'
    return np.load(sift_10k_path)

In [2]:
# Build Y_train: the training labels encoded as integer indices.
# category_dictionary maps label string -> index, Y_train_text holds the raw
# string labels in file order, and Y_train is their index encoding. These
# names are notebook-global and are read by later cells.
category_dictionary = get_category_labels_as_dict()
Y_train_text = get_train_labels()
Y_train = get_train_labels_as_unique_indices(category_dictionary, Y_train_text)

In [22]:
# Cluster the "10k" SIFT feature array into 200 groups with k-means.
# KMeans is already imported in the import cell at the top of the notebook.
X_train = get_train_SIFT_10k()

# random_state pins the centroid initialisation so that re-running this cell
# reproduces the same clusters; without it the centroids, and every result
# derived from them, change from run to run.
km = KMeans(n_clusters = 200, max_iter = 10000, random_state = 0)

# Later cells use both names: kmout.cluster_centers_ and km.predict(...).
kmout = km.fit(X_train)


In [20]:
# Fit a logistic-regression classifier on the SIFT training features.
# Y_train comes from the label-encoding cell above and must already exist.
X_train_SIFT = get_train_SIFT()

lr = linear_model.LogisticRegression()
lr.fit(X_train_SIFT, Y_train)



In [21]:
# Ask the logistic-regression model for a class index for each k-means centroid.
clusterLabels = lr.predict(kmout.cluster_centers_)

# Assign every test feature vector to a cluster.
X_test = get_test_SIFT()
testClusterCentre = km.predict(X_test)

# Call-form print with a single argument behaves the same on Python 2 and 3.
print(clusterLabels)
print(testClusterCentre)
#predicted = []

#for i in range(len(testClusterCentre)):
#    predicted.append(clusterLabels[testClusterCentre[i]])

[ 30  83 181 133 181  29 106 199  93 133 133  31 111  59 193 165  62  72
  93  36  22 187  62  94 144 181 105 107 107  82  30   5  91 193 108 170
 197  62  49  86 151  79 125 147  97  72   5  62 140  72 174  29 107 154
 118 142 151 120 154   8  91 188 130 164 193 130  30  97 133 125 140  98
 165  29 174 197 193 141 151  97  20  62  86 157 148  20 107 154 149 154
  62  72 154 107 154 108 118 140 154  22  97  89 136 178 154  97  30 118
 154  82 197 147  83 168  97 151 106  12 107 178  29 118  22 193 133  39
  30 133  50 107  80  97  37   7 181 193   3 139  12 153 104  97  31  58
 140 197 145 107 154 151 157  50 118 115 154  29  22  99 176 118  83 151
 163 164  50 134  30 151 193  43 147  62 193 193  62 167  30 118  50 127
  72  18 118 193 118  45  50 118 118 193  37  97 115 176 157  46  58 193
 107 192]
[195 143  64 157 196 121 177   9  36  79 143  79  29 196  67  59 148  11
 115 148 148 116  25 115 157 112 173 145  14 105 116  85  38  16  73  98
  29 175  69  51 137 169  19  24   4 177 

In [None]:
# Each test row takes the class index predicted for the centroid of the
# cluster it was assigned to (clusterLabels and testClusterCentre come from
# the cluster-labelling cell).
predicted = np.asarray(clusterLabels)[testClusterCentre]

# category_dictionary (label -> index) is the mapping built when Y_train was
# created; it is used here to turn indices back into label strings.
predicted_text = get_labels_from_indices(category_dictionary, predicted)
print_output(predicted_text)


In [23]:
# Ask the logistic-regression model for a class index for each k-means centroid.
clusterLabels = lr.predict(kmout.cluster_centers_)

# Assign every test feature vector to a cluster.
X_test = get_test_SIFT()
testClusterCentre = km.predict(X_test)

# Call-form print with a single argument behaves the same on Python 2 and 3.
print(clusterLabels)
print(testClusterCentre)

[136  30 118 181 116 107 133 154  72 151 105 104 140 181  97  62  30 140
  37 107 133 134 188  62 133 132  50  62  86  72  99 107 147 151  50 105
 151   0 154 107 176 127 107 157 133  29 115  27 107 154  29  78 157  29
 118 115  29  64 153  62 197 151  30 147  72 107 197  93  80 149  97 140
  43 133 118 154 107  22 133 193 140 148 164 127 181 154 134 133  37 154
 118  98  31  30 134 132  83  97  12 188 107 132 106  31 107  62 125 134
 138  83 154  97 150 140  97  80   8 193  52 151 181 167 165 165  12 151
 118  97 125  93 130  30 193 154 107 151 174 147  58 144 133  83 178  30
 144 113 148  14  71  80 181 118  29  97 154 193 151 193   8 193  83 118
 148 178 118  30 124 132 164 140 154  57 105  11  72  79 181  71   8 118
 134  30  97 174  80  45 151 116  19  72  59  83  43 105  29 115  97  98
 134  94]
[ 30  52 123  30 138  91 164  54  60 158 121  83 192 138 149  81  49  51
 105  76  85 102   3 103  30 160 173 139 143  99 102 112 178 143  23 133
 156 121 106  45  98 192 189 170 144  54 

345209.871543
