In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [9]:
import numpy as np
import matplotlib.pyplot as pt
import matplotlib.image as img
from PIL import Image
from skimage import color
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import preprocessing

from sklearn import linear_model, datasets
from sklearn import tree
from sklearn.externals.six import StringIO
import csv
from sklearn import metrics, cross_validation
from numpy import linalg as LA

#Returns dictionary of labels; key is label as string, value is a unique index
def get_category_labels_as_dict():
    category_dict = {}
    num_category = 0
    with open('./CS5785-final-data/train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            if not category_dict.has_key(tokens[1]):
                category_dict[tokens[1]] = num_category
                num_category += 1
    return category_dict

#Returns labels of training set as an array of strings
def get_train_labels():
    train_labels = []
    with open('./CS5785-final-data/train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            train_labels.append(tokens[1])
    return train_labels

#Returns train labels as numpy array, each label is unique index; params: dictionary of label to unique index, array of labels as strings
#Usage: get_train_labels_as_unique_indices(get_category_labels_as_dict(), get_train_labels())
def get_train_labels_as_unique_indices(label_dictionary, train_label_array):
    train_labels_as_index = []
    for i in range(len(train_label_array)):
        label_name = train_label_array[i]
        index = label_dictionary[label_name]
        train_labels_as_index.append(index)
    return np.array(train_labels_as_index)

#Translates index values back into string labels
def get_labels_from_indices(label_dictionary, Y_predicted):
    labels = []
    for i in range(len(Y_predicted)):
        target = Y_predicted[i]
        for key in label_dictionary.keys():
            if label_dictionary[key] == target:
                labels.append(key)
                break
    return labels

#Writes the CSV file
def print_output(Y_predicted):
    with open('./CS5785-final-data/test.txt') as f:
        lines = f.read().splitlines()
    
    with open('kaggle_submission_vote.csv', "wb") as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['ID', 'Category'])
        for index in range(len(Y_predicted)):
            writer.writerow([lines[index], Y_predicted[index]])
 
#calculate cross validation score
def cross_validation_accuracy(X, Y, folds, model):
    average = 0
    
    for train_indices, test_indices in cross_validation.KFold(len(X), n_folds=folds):
    
        X_train = X[train_indices]
        Y_train = Y[train_indices]
        X_test = X[test_indices]
        Y_test = Y[test_indices]

        Y_predicted = model.fit(X_train, Y_train).predict(X_test)
    
        #Compare Y_test and Y_predicted
        average += (Y_predicted == Y_test).sum() / float(len(Y_test))
        
    return average / float(folds)

#Returns attributes of train data as numpy array
def get_train_attributes():
    feature_vectors = []
    with open('./CS5785-final-data/attributes_train.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            values = tokens[1].split(',')
            fv = []
            for value in values:
                fv.append(int(value))
            feature_vectors.append(fv)
    return np.array(feature_vectors)

#Returns attributes of test data as numpy array
def get_test_attributes():
    feature_vectors = []
    with open('./CS5785-final-data/attributes_test.txt') as f:
        lines = f.read().splitlines()
        for line in lines:
            tokens = line.split(' ')
            values = tokens[1].split(',')
            fv = []
            for value in values:
                fv.append(int(value))
            feature_vectors.append(fv)
    return np.array(feature_vectors)

#normalizes the data by normNumber-norm
def normalize(X_train, normNumber):
    X_train= np.array(X_train)
    norm2 = LA.norm(X_train, axis = 0, ord = normNumber)
    for i in range(len(X_train)):
       X_train[i][0] = X_train[i][0]/norm2[0]
       X_train[i][1] = X_train[i][0]/norm2[1]
        
    return X_train

#Returns SIFT attributes of training data as numpy array
def get_train_SIFT():
    return np.load('./CS5785-final-data/SIFTBoW_train.npy')

#Returns SIFT attributes of test data as numpy array
def get_test_SIFT():
    return np.load('./CS5785-final-data/SIFTBoW_test.npy')

#Returns ALEX attributes of train data as numpy array
def get_train_ALEX():
    return np.load('./CS5785-final-data/alexnet_feat_train.npy')

def get_test_ALEX():
    return np.load('./CS5785-final-data/alexnet_feat_test.npy')

In [10]:
#get Y_train
category_dictionary = get_category_labels_as_dict()
Y_train_text = get_train_labels()
Y_train = get_train_labels_as_unique_indices(category_dictionary, Y_train_text)

In [19]:
#get X_train and X_test
X_train = get_train_attributes()
X_test = get_test_attributes()

In [21]:
lr = linear_model.LogisticRegression()
#Y_predicted_1 = lr.fit(X_train, Y_train).predict(X_test)

In [11]:
X_train_ALEX = get_train_ALEX()
X_test_ALEX = get_test_ALEX()
#Y_predicted_2 = lr.fit(X_train_ALEX, Y_train).predict(X_test_ALEX)

In [26]:
X_train_SIFT = get_train_SIFT()
X_test_SIFT = get_test_SIFT()
#Y_predicted_3 = lr.fit(X_train_SIFT, Y_train).predict(X_test_SIFT)

In [23]:
#Get 3 axes for Y_predicted
Y_predicted = []
for i in range (0, len(Y_predicted_1)):
    vote = []
    vote.append(Y_predicted_1[i])
    vote.append(Y_predicted_2[i])
    vote.append(Y_predicted_3[i])
    if(vote.count(vote[1])>=2):
        Y_predicted.append(vote[1])
    else: Y_predicted.append(vote[0])
Y_predicted = np.array(Y_predicted)

In [25]:
#Convert Y_predicted to string
Y_predicted_text = get_labels_from_indices(category_dictionary, Y_predicted)
print_output(Y_predicted_text)

In [7]:
#models
rfc = RandomForestClassifier()
lr = linear_model.LogisticRegression()
gnb = GaussianNB()
bnb = BernoulliNB()
knnc = KNeighborsClassifier()

In [46]:
X_train_ALEX_centered = preprocessing.scale(X_train_ALEX)

print "rfc"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, rfc)
print "lr"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, lr)
print "gnb"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, bnb)
print "knnc"
print cross_validation_accuracy(X_train_ALEX, Y_train, 3, knnc)

rfc
0.0903333333333
lr
0.340333333333
gnb
0.295666666667
bnb
0.341333333333
knnc
0.236333333333


In [52]:
#Preprocess X_train_ALEX
mean = np.zeros(len(X_train_ALEX[0]))
for i in range(len(X_train_ALEX)):
    mean += X_train_ALEX[i]
mean = mean / float(len(X_train_ALEX))

[-3.42435863 -3.46553791 -1.81228862 ..., -1.10022763 -1.89527777
 -2.52920115]


In [53]:
X_train_ALEX_centered = np.zeros(X_train_ALEX.shape)
for i in range(len(X_train_ALEX)):
    X_train_ALEX_centered[i,:] = X_train_ALEX[i,:] - mean

In [54]:
print "gnb"
print cross_validation_accuracy(X_train_ALEX_centered, Y_train, 3, gnb)

gnb
0.295666666667


In [49]:
print "rfc"
print cross_validation_accuracy(X_train, Y_train, 3, rfc)
print "lr"
print cross_validation_accuracy(X_train, Y_train, 3, lr)
print "gnb"
print cross_validation_accuracy(X_train, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train, Y_train, 3, bnb)
print "knnc"
print cross_validation_accuracy(X_train, Y_train, 3, knnc)

rfc
0.172
lr
0.261
gnb
0.165666666667
bnb
0.249666666667
knnc
0.171666666667


In [25]:
X_train_ALEX_normalized = normalize(X_train_ALEX, 1)
print "gnb"
print cross_validation_accuracy(X_train_ALEX_normalized, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_ALEX_normalized, Y_train, 3, bnb)

gnb
0.295666666667
bnb
0.341333333333


In [28]:
X_train_SIFT_normalized = normalize(X_train_SIFT, 2)
print "gnb"
print cross_validation_accuracy(X_train_SIFT_normalized, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_SIFT_normalized, Y_train, 3, bnb)

gnb
0.00833333333333
bnb
0.106


In [31]:
X_train_SIFT_centered = preprocessing.scale(X_train_SIFT)
print "gnb"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_SIFT_centered, Y_train, 3, bnb)

gnb
0.00866666666667
bnb
0.106


In [32]:

print "gnb"
print cross_validation_accuracy(X_train_SIFT, Y_train, 3, gnb)
print "bnb"
print cross_validation_accuracy(X_train_SIFT, Y_train, 3, bnb)

gnb
0.00833333333333
bnb
0.106333333333


In [12]:
X_train_ALEX_centered = preprocessing.scale(X_train_ALEX)
X_test_ALEX_centered = preprocessing.scale(X_test_ALEX)

lr = linear_model.LogisticRegression()
predicted = lr.fit(X_train_ALEX_centered, Y_train).predict(X_test_ALEX_centered)

predicted_text = get_labels_from_indices(category_dictionary, predicted)
print_output(predicted_text)

