In [1]:
import caffe
import lmdb
import os
import caffe.proto.caffe_pb2
from caffe.io import datum_to_array
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy.spatial import distance

In [2]:
def read_from_db(db_dir):
    """Load every record of a Caffe-style LMDB database as a flat array.

    Each value stored in the database is parsed as a ``caffe_pb2.Datum``,
    converted to a numpy array with ``caffe.io.datum_to_array`` and flattened
    to one dimension. Keys are ignored.

    Args:
        db_dir: Path of the LMDB environment to read.

    Returns:
        A list with one flattened numpy array per record, in cursor order.
    """
    # Opened read-only because this function never writes. create=False is
    # intended to stop a mistyped path from being created as a new, empty
    # database -- NOTE(review): confirm against the installed py-lmdb version.
    lmdb_env = lmdb.open(db_dir, readonly=True, create=False)
    try:
        # One Datum is reused for all records; ParseFromString overwrites it.
        datum = caffe.proto.caffe_pb2.Datum()
        features = []
        with lmdb_env.begin() as lmdb_txn:
            for _key, value in lmdb_txn.cursor():
                datum.ParseFromString(value)
                features.append(caffe.io.datum_to_array(datum).flatten())
    finally:
        # Release the environment even if a record fails to parse.
        lmdb_env.close()
    return features

In [3]:
def read_labels(label_file):
    """Read the integer labels from a whitespace-separated listing file.

    Every non-blank line is expected to end with an integer label, e.g.
    ``some/image.jpg 3``. The label is taken from the *last* field so that
    tabs, repeated spaces, or a path that itself contains spaces do not shift
    the column being parsed.

    Args:
        label_file: Path of the text file to read.

    Returns:
        A list of ints, one per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than two fields or its last
            field is not an integer.
    """
    labels = []
    # Text mode: splitting on str separators fails on bytes under Python 3.
    with open(label_file, 'r') as f:
        for line_no, line in enumerate(f, 1):
            fields = line.rsplit(None, 1)
            if not fields:
                # Blank line (commonly a trailing newline at end of file).
                continue
            if len(fields) < 2:
                raise ValueError('%s:%d: expected "<path> <label>", got %r'
                                 % (label_file, line_no, line))
            labels.append(int(fields[1]))
    return labels

In [4]:
# Directory holding the train/test LMDB databases and their label listings.
# Defaults to the location originally used by this notebook; set the
# WIKIART_DIR environment variable to run it from another machine.
WIKIART_DIR = os.environ.get(
    'WIKIART_DIR', '/Users/ecsark/Documents/visualdb/project/wikiart')

# X_* are lists of flattened arrays read from LMDB; y_* are lists of int labels.
X_train = read_from_db(os.path.join(WIKIART_DIR, 'train_vgg_feature_fc7'))
y_train = read_labels(os.path.join(WIKIART_DIR, 'train.txt'))
X_test = read_from_db(os.path.join(WIKIART_DIR, 'test_vgg_feature_fc7'))
y_test = read_labels(os.path.join(WIKIART_DIR, 'test.txt'))

In [5]:
# Train a support-vector classifier with scikit-learn's default settings on the
# training arrays and labels loaded in the previous cell.
from sklearn import svm
clf = svm.SVC()
# Left as a bare expression (not assigned) so the notebook echoes the fitted
# estimator's repr as this cell's output.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [6]:
# Score the SVM on the test set: count predictions that disagree with the true
# label, then print accuracy as 1 - (mismatches / number of test labels).
y_pred = clf.predict(X_test)
error = sum(1 for p, g in zip(y_pred, y_test) if p != g)
print(1.0 - float(error) / len(y_test))

0.5005


In [7]:
# Eyeball the first 20 SVM predictions (first line) against the corresponding
# true test labels (second line).
print(list(y_pred[:20]))
print(list(y_test[:20]))

[2, 5, 1, 5, 4, 1, 4, 8, 5, 8, 3, 3, 0, 4, 9, 4, 3, 2, 5, 2]
[2, 2, 7, 5, 4, 6, 4, 8, 6, 8, 5, 3, 0, 4, 0, 4, 3, 7, 5, 4]


In [8]:
def findMostSimilar(X, x_query, k=5):
    """Return the indices of the rows of X nearest to x_query.

    Distances are Euclidean, computed with ``scipy.spatial.distance.cdist``.

    Args:
        X: Sequence of equal-length vectors to search.
        x_query: A single vector of the same length as the rows of X.
        k: Maximum number of indices to return (default 5).

    Returns:
        A numpy array of row indices into X ordered from nearest to farthest,
        holding at most k entries.
    """
    # cdist needs a 2-D second argument, so wrap the query as a one-row matrix.
    query_row = np.array([x_query])
    dist_to_query = distance.cdist(X, query_row, 'euclidean')
    ranking = np.argsort(dist_to_query.flatten())
    return ranking[:k]

In [9]:
# Single-query sanity check: find the 10 training samples nearest to test
# sample 4, then show their indices followed by their labels.
k_min_idx = findMostSimilar(X_train, X_test[4], 10)
print(k_min_idx)
print([y_train[i] for i in k_min_idx])

[7146 5474  818 7384 6578 6699 3300 2315 7498 5900]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 1]


In [10]:
# Spot-check the nearest-neighbour vote on the first 20 test samples.
# Each printed row is: true label, most frequent label among the 10 nearest
# training samples, and those 10 neighbour labels (nearest first).
# When several labels tie for most frequent, max() keeps whichever of them the
# set yields first.
for i in range(20):
    k_min_idx = findMostSimilar(X_train, X_test[i], 10)
    labels = [y_train[j] for j in k_min_idx]
    print('%s %s %s' % (y_test[i], max(set(labels), key=labels.count), labels))

2 1 [7, 1, 1, 1, 1, 8, 9, 3, 8, 2]
2 5 [5, 2, 2, 5, 5, 5, 5, 6, 5, 2]
7 1 [4, 7, 1, 7, 4, 2, 1, 1, 5, 2]
5 3 [3, 3, 6, 3, 3, 9, 6, 6, 1, 5]
4 4 [4, 4, 4, 4, 4, 4, 4, 4, 4, 1]
6 1 [1, 1, 2, 7, 7, 7, 1, 1, 1, 5]
4 4 [4, 1, 4, 5, 4, 7, 4, 4, 4, 5]
8 8 [7, 8, 0, 9, 4, 2, 8, 8, 5, 8]
6 5 [3, 0, 4, 7, 5, 5, 7, 9, 7, 5]
8 1 [9, 1, 9, 1, 6, 1, 5, 2, 1, 9]
5 3 [3, 3, 3, 5, 5, 3, 3, 3, 3, 6]
3 3 [3, 3, 5, 3, 0, 2, 9, 3, 6, 6]
0 0 [0, 0, 1, 0, 5, 8, 5, 5, 0, 1]
4 4 [4, 4, 4, 8, 8, 4, 4, 4, 0, 2]
0 1 [8, 1, 2, 0, 9, 1, 1, 1, 0, 3]
4 6 [5, 6, 4, 7, 0, 2, 6, 6, 1, 4]
3 3 [5, 3, 5, 5, 3, 3, 7, 5, 3, 3]
7 2 [7, 2, 1, 9, 7, 9, 2, 2, 3, 1]
5 5 [5, 5, 5, 0, 3, 0, 5, 2, 5, 3]
4 1 [6, 0, 2, 4, 1, 1, 7, 6, 9, 9]


In [12]:
# Classify every test sample by majority vote over the labels of its 10
# nearest training samples, then print the resulting accuracy.
y_pred_max = []

for x in X_test:
    k_min_idx = findMostSimilar(X_train, x, 10)
    labels = [y_train[i] for i in k_min_idx]
    # Most frequent neighbour label; ties go to whichever tied label the set
    # yields first.
    y_pred_max.append(max(set(labels), key=labels.count))

# Accuracy = 1 - (mismatches / number of predictions).
error_max = sum(1 for p, g in zip(y_pred_max, y_test) if p != g)
print(1.0 - float(error_max) / len(y_pred_max))

0.3675
