In [None]:
%pylab inline

In [62]:
import numpy as np
import matplotlib.pyplot as pt
import matplotlib.image as img
from PIL import Image
from skimage import color
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing

from sklearn import linear_model, datasets
from sklearn import tree
from sklearn.externals.six import StringIO
import csv
from sklearn import metrics, cross_validation
from numpy import linalg as LA

#Returns dictionary of labels; key is label as string, value is a unique index
def get_category_labels_as_dict():
    category_dict = {}
    num_category = 0
    with open('./CS5785-final-data/train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            if not category_dict.has_key(tokens[1]):
                category_dict[tokens[1]] = num_category
                num_category += 1
    return category_dict

#Returns labels of training set as an array of strings
def get_train_labels():
    train_labels = []
    with open('./CS5785-final-data/train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            train_labels.append(tokens[1])
    return train_labels

#Returns train labels as numpy array, each label is unique index; params: dictionary of label to unique index, array of labels as strings
#Usage: get_train_labels_as_unique_indices(get_category_labels_as_dict(), get_train_labels())
def get_train_labels_as_unique_indices(label_dictionary, train_label_array):
    train_labels_as_index = []
    for i in range(len(train_label_array)):
        label_name = train_label_array[i]
        index = label_dictionary[label_name]
        train_labels_as_index.append(index)
    return np.array(train_labels_as_index)

#Translates index values back into string labels
def get_labels_from_indices(label_dictionary, Y_predicted):
    labels = []
    for i in range(len(Y_predicted)):
        target = Y_predicted[i]
        for key in label_dictionary.keys():
            if label_dictionary[key] == target:
                labels.append(key)
                break
    return labels

#Writes the CSV file
def print_output(Y_predicted):
    with open('./CS5785-final-data/test.txt') as f:
        lines = f.read().splitlines()
    
    with open('kaggle_submission_vote.csv', "wb") as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['ID', 'Category'])
        for index in range(len(Y_predicted)):
            writer.writerow([lines[index], Y_predicted[index]])
 
#calculate cross validation score
def cross_validation_accuracy(X, Y, folds, model):
    average = 0
    
    for train_indices, test_indices in cross_validation.KFold(len(X), n_folds=folds):
    
        X_train = X[train_indices]
        Y_train = Y[train_indices]
        X_test = X[test_indices]
        Y_test = Y[test_indices]

        Y_predicted = model.fit(X_train, Y_train).predict(X_test)
        #print (Y_predicted == Y_test).sum() / float(len(Y_test))
        #Compare Y_test and Y_predicted
        average += (Y_predicted == Y_test).sum() / float(len(Y_test))
        
    return average / float(folds)

#Returns attributes of train data as numpy array
def get_train_attributes():
    feature_vectors = []
    with open('./CS5785-final-data/attributes_train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            values = tokens[1].split(',')
            fv = []
            for value in values:
                fv.append(int(value))
            feature_vectors.append(fv)
    return np.array(feature_vectors)

#Returns attributes of test data as numpy array
def get_test_attributes():
    feature_vectors = []
    with open('./CS5785-final-data/attributes_test.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            values = tokens[1].split(',')
            fv = []
            for value in values:
                fv.append(int(value))
            feature_vectors.append(fv)
    return np.array(feature_vectors)

#normalizes the data by normNumber-norm
def normalize_column(X_train, normNumber):
    X_train= np.array(X_train)
    norm2 = LA.norm(X_train, axis = 0, ord = normNumber)
    for i in range(len(X_train)):
        for j in range(len(X_train[i])):
           X_train[i][j] = X_train[i][j]/norm2[j]
    return X_train

def normalize_row(X_train, normNumber):
    X_train= np.array(X_train)
    norm2 = LA.norm(X_train, axis = 1, ord = normNumber)
    for i in range(len(X_train)):
        for j in range(len(X_train[i])):
           X_train[i][j] = X_train[i][j]/norm2[i]
    return X_train

def mean_subtraction(X_train):
    mean = np.zeros(len(X_train[0]))
    for i in range(len(X_train)):
        mean += X_train[i]
    mean = mean / float(len(X_train))
    
    X_train_centered = np.zeros(X_train.shape)
    for i in range(len(X_train)):
        X_train_centered[i,:] = X_train[i,:] - mean
    return X_train_centered

#Returns SIFT attributes of training data as numpy array
def get_train_SIFT():
    return np.load('./CS5785-final-data/SIFTBoW_train.npy')

#Returns SIFT attributes of test data as numpy array
def get_test_SIFT():
    return np.load('./CS5785-final-data/SIFTBoW_test.npy')

#Returns ALEX attributes of train data as numpy array
def get_train_ALEX():
    return np.load('./CS5785-final-data/alexnet_feat_train.npy')

def get_test_ALEX():
    return np.load('./CS5785-final-data/alexnet_feat_test.npy')

In [63]:
category_dictionary = get_category_labels_as_dict()
Y_train_text = get_train_labels()
Y_train = get_train_labels_as_unique_indices(category_dictionary, Y_train_text)

In [64]:
X_train = get_train_attributes()
X_test = get_test_attributes()

In [65]:
X_train_ALEX = get_train_ALEX()
X_test_ALEX = get_test_ALEX()

In [66]:
X_train_SIFT = get_train_SIFT()
X_test_SIFT = get_test_SIFT()

# Testing

# ALEX

In [57]:
#models
rfc = RandomForestClassifier()
lr = linear_model.LogisticRegression()
gnb = GaussianNB()
bnb = BernoulliNB()
mnb = MultinomialNB()
knnc = KNeighborsClassifier()

In [None]:
print "rfc"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, rfc)
#print "lr"
#print cross_validation_accuracy(X_train_ALEX, Y_train, 3, lr)
print "gnb"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, bnb)
print "knnc"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, knnc)

In [None]:
X_train_ALEX_centered = mean_subtraction(X_train_ALEX)

In [None]:
print "lr ALEX centered"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, lr)

In [None]:
print "rfc ALEX centered"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, rfc)
print "gnb ALEX centered"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, gnb)
print "bnb ALEX centered"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, bnb)
print "knnc ALEX centered"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, knnc)

In [9]:
X_train_ALEX_standardized = preprocessing.scale(X_train_ALEX)

In [None]:
print "lr ALEX standardized"
print cross_validation_accuracy(X_train_ALEX_standardized, Y_train, 3, lr)

In [10]:
print "rfc ALEX standardized"
print cross_validation_accuracy(X_train_ALEX_standardized, Y_train, 3, rfc)
print "gnb ALEX standardized"
print cross_validation_accuracy(X_train_ALEX_standardized, Y_train, 3, gnb)
print "bnb ALEX standardized"
print cross_validation_accuracy(X_train_ALEX_standardized, Y_train, 3, bnb)
print "knnc ALEX standardized"
print cross_validation_accuracy(X_train_ALEX_standardized, Y_train, 3, knnc)

rfc ALEX standardized
0.0963333333333
gnb ALEX standardized
0.295666666667
bnb ALEX standardized
0.342333333333
knnc ALEX standardized
0.229


In [None]:
U_ALEX, D_ALEX, VT_ALEX = LA.svd(X_train_ALEX_centered, full_matrices=False)

In [None]:
Vqt_ALEX = VT_ALEX[:1900]
#print Vqt_ALEX.shape

X_train_ALEX_pca = np.zeros((len(X_train_ALEX_centered), 1900))
for i in range(len(X_train_ALEX_centered)):
   X_train_ALEX_pca[i,:] = Vqt_ALEX.dot(X_train_ALEX_centered[i,:])

In [None]:
print "rfc ALEX centered pca"
print cross_validation_accuracy(X_train_ALEX_pca, Y_train, 3, rfc)
#print "lr ALEX centered pca"
#print cross_validation_accuracy(X_train_ALEX_pca, Y_train, 3, lr)
print "gnb ALEX centered pca"
print cross_validation_accuracy(X_train_ALEX_pca, Y_train, 3, gnb)
print "bnb ALEX centered pca"
print cross_validation_accuracy(X_train_ALEX_pca, Y_train, 3, bnb)
print "knnc ALEX centered pca"
print cross_validation_accuracy(X_train_ALEX_pca, Y_train, 3, knnc)

In [58]:
X_train_ALEX_row_normalized = normalize_row(X_train_ALEX,1)
X_test_ALEX_row_normalized = normalize_row(X_test_ALEX,1)

In [48]:
#dual classifier

X_train_ALEX_row_normalized = normalize_row(X_train_ALEX,1)
X_test_ALEX_row_normalized = normalize_row(X_test_ALEX,1)

predicted_gnb = gnb.fit(X_train_ALEX_row_normalized, Y_train).predict(X_test_ALEX_row_normalized)
#append results
Y_train_gnb = np.append(Y_train, predicted_gnb)
#X_train_ALEX_new = np.append(X_train_ALEX_row_normalized, X_test_ALEX_row_normalized)
X_train_ALEX_gnb = np.vstack((X_train_ALEX_row_normalized, X_test_ALEX_row_normalized))
print "bnb ALEX normalized gnb -> bnb"
print cross_validation_accuracy(X_train_ALEX_gnb, Y_train_gnb, 3, bnb)

In [60]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier()
print "abc ALEX normalized"
print cross_validation_accuracy(X_train_ALEX_row_normalized, Y_train, 3, abc)




bnb ALEX normalized abc
0.00666666666667


bnb ALEX normalized gnb -> bnb
0.486023117474


In [50]:
#triple classifier

predicted_gnb_bnb = bnb.fit(X_train_ALEX_gnb, Y_train_gnb).predict(X_test_ALEX_row_normalized)
#append results
Y_train_gnb_bnb = np.append(Y_train_gnb, predicted_gnb_bnb)
#X_train_ALEX_new = np.append(X_train_ALEX_row_normalized, X_test_ALEX_row_normalized)
X_train_ALEX_gnb_bnb = np.vstack((X_train_ALEX_gnb, X_test_ALEX_row_normalized))


print "bnb ALEX normalized gnb -> bnb -> lr"
print cross_validation_accuracy(X_train_ALEX_gnb, Y_train_gnb, 3, lr)



bnb ALEX normalized gnb -> bnb -> lr
0.0382573154533


# Attributes

In [None]:
print "rfc"
print cross_validation_accuracy(X_train, Y_train, 3, rfc)
print "lr"
print cross_validation_accuracy(X_train, Y_train, 3, lr)
print "gnb"
print cross_validation_accuracy(X_train, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train, Y_train, 3, bnb)
print "mnb"
print cross_validation_accuracy(X_train, Y_train, 3, bnb)
print "knnc"
print cross_validation_accuracy(X_train, Y_train, 3, knnc)

In [None]:
for k in range(20):
    knnc = KNeighborsClassifier(n_neighbors=k+1)
    print k+1
    print cross_validation_accuracy(X_train_ALEX, Y_train, 3, knnc)

In [None]:
X_train_centered = normalize_row(X_train, 2)

In [None]:
print "rfc"
print cross_validation_accuracy(X_train_centered, Y_train, 3, rfc)
print "lr"
print cross_validation_accuracy(X_train_centered, Y_train, 3, lr)
print "gnb"
print cross_validation_accuracy(X_train_centered, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_centered, Y_train, 3, bnb)
print "knnc"
print cross_validation_accuracy(X_train_centered, Y_train, 3, knnc)

# SIFT

In [None]:
X_train_SIFT_centered = mean_subtraction(X_train_SIFT)

In [None]:
print "lr SIFT centered"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, lr)

In [None]:
print "rfc SIFT centered"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, rfc)
print "gnb SIFT centered"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, gnb)
print "bnb SIFT centered"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, bnb)
print "knnc SIFT centered"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, knnc)

In [None]:
X_train_SIFT_standardized = preprocessing.scale(X_train_SIFT)

In [None]:
print "lr SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, lr)

In [None]:
print "rfc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, rfc)
print "gnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, gnb)
print "bnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, bnb)
print "knnc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, knnc)
print "mnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_standardized, Y_train, 3, mnb)

In [19]:
X_train_SIFT_normalize_row = normalize_row(X_train_SIFT,1)

In [20]:
print "rfc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, rfc)
print "gnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, gnb)
print "bnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, bnb)
print "knnc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, knnc)
print "mnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, mnb)

rfc SIFT standardized
0.0226666666667
gnb SIFT standardized
0.0193333333333
bnb SIFT standardized
0.106333333333
knnc SIFT standardized
0.0533333333333
mnb SIFT standardized
0.001


In [76]:
X_train_SIFT_normalize_row = normalize_row(X_train_SIFT,2)
print "rfc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, rfc)
print "gnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, gnb)
print "bnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, bnb)
print "knnc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, knnc)
print "mnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_row, Y_train, 3, mnb)

rfc SIFT standardized
0.019
gnb SIFT standardized
0.00833333333333
bnb SIFT standardized
0.106333333333
knnc SIFT standardized
0.0713333333333
mnb SIFT standardized
0.0236666666667


In [77]:
X_train_SIFT_normalize_column = normalize_column(X_train_SIFT,1)

print "rfc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, rfc)
print "gnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, gnb)
print "bnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, bnb)
print "knnc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, knnc)
print "mnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, mnb)

rfc SIFT standardized
0.015
gnb SIFT standardized
0.011
bnb SIFT standardized
0.106333333333
knnc SIFT standardized
0.0276666666667
mnb SIFT standardized
0.001


In [78]:
X_train_SIFT_normalize_column = normalize_column(X_train_SIFT,2)

print "rfc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, rfc)
print "gnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, gnb)
print "bnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, bnb)
print "knnc SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, knnc)
print "mnb SIFT standardized"
print cross_validation_accuracy(X_train_SIFT_normalize_column, Y_train, 3, mnb)

rfc SIFT standardized
0.0173333333333
gnb SIFT standardized
0.00866666666667
bnb SIFT standardized
0.106333333333
knnc SIFT standardized
0.044
mnb SIFT standardized
0.034


# SuperVector

In [72]:
X_train_super = []
X_test_super = []

for i in range(len(X_train)):
    array = np.append(X_train[i], X_train_ALEX[i])
    array = np.append(array, X_train_SIFT[i]) 
    X_train_super.append(array)

for i in range(len(X_test)):
    array = np.append(X_test[i], X_test_ALEX[i])
    array = np.append(array, X_test_SIFT[i])
    X_test_super.append(array)

print len(X_test_super[0])
print len(X_train_super[0])

1000
3000


In [75]:
U_super, D_super, VT_super = LA.svd(X_train_super, full_matrices=False)

In [None]:
Vqt_super = VT_super[:1900]
#print Vqt_ALEX.shape

X_train_super_pca = np.zeros((len(X_train_super), 1900))
for i in range(len(X_train_super)):
   X_train_super_pca[i,:] = Vqt_super.dot(X_train_super[i,:])