In [22]:
import caffe
import lmdb
import os
import caffe.proto.caffe_pb2
from caffe.io import datum_to_array
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from scipy.spatial import distance

In [2]:
def read_from_db(db_dir):
    """Read every record of an LMDB of Caffe ``Datum`` messages.

    Parameters
    ----------
    db_dir : str
        Path of the LMDB directory to read.

    Returns
    -------
    list of numpy.ndarray
        One flattened (1-D) array per record, in cursor order.

    Notes
    -----
    The environment is opened read-only so that a mistyped path raises an
    error instead of silently creating a new, empty database (which would
    make this function return an empty list).
    """
    lmdb_env = lmdb.open(db_dir, readonly=True)
    try:
        features = []
        # A single Datum is reused; ParseFromString overwrites its content.
        datum = caffe.proto.caffe_pb2.Datum()
        # The transaction is released on exit from the `with`, even on error.
        with lmdb_env.begin() as lmdb_txn:
            for _key, value in lmdb_txn.cursor():
                datum.ParseFromString(value)
                data = caffe.io.datum_to_array(datum)
                features.append(data.flatten())
        return features
    finally:
        # Always release the environment, including when parsing fails.
        lmdb_env.close()

In [3]:
def read_labels(label_file):
    """Read integer class labels from a whitespace-separated listing file.

    Each non-blank line is split on whitespace and its second field is
    converted to ``int``.

    Parameters
    ----------
    label_file : str
        Path of the text file to read.

    Returns
    -------
    list of int
        One label per non-blank line, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields.
    ValueError
        If the second field is not an integer.
    """
    labels = []
    # Text mode keeps each line a `str` on both Python 2 and Python 3; with
    # 'rb' the str separator below would raise TypeError on Python 3.
    with open(label_file, 'r') as f:
        for line in f:
            # split() with no argument tolerates tabs and repeated spaces.
            fields = line.split()
            if not fields:
                continue  # skip blank lines, e.g. a trailing newline at EOF
            labels.append(int(fields[1]))
    return labels

In [4]:
# Root folder holding the feature databases and label listings. Defined once
# so the notebook can be pointed at another location by setting the
# WIKIART_DIR environment variable; the default is the original path.
WIKIART_DIR = os.environ.get(
    'WIKIART_DIR', '/Users/ecsark/Documents/visualdb/project/wikiart')

# Training split: feature vectors from the LMDB, labels from the text listing.
X_train = read_from_db(os.path.join(WIKIART_DIR, 'train_feature'))
y_train = read_labels(os.path.join(WIKIART_DIR, 'train.txt'))
# Test split, loaded the same way.
X_test = read_from_db(os.path.join(WIKIART_DIR, 'test_feature'))
y_test = read_labels(os.path.join(WIKIART_DIR, 'test.txt'))

In [7]:
# Train a support-vector classifier on the training features and labels,
# using SVC's default settings (the repr printed below shows kernel='rbf').
from sklearn import svm
clf = svm.SVC()
# Kept as the cell's last bare expression so the fitted estimator is displayed.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [9]:
# Classify the test features with the fitted SVM, count the predictions that
# disagree with the true labels, and print that count as a fraction of the
# number of test labels.
y_pred = clf.predict(X_test)
error = sum(1 for p, g in zip(y_pred, y_test) if p != g)
print(1.0 * error / len(y_test))

0.5225


In [21]:
# Eyeball check: the first 20 SVM predictions, then the matching true labels.
print(list(y_pred[:20]))
print(list(y_test[:20]))

[0, 2, 4, 9, 4, 7, 4, 8, 6, 8, 5, 9, 0, 4, 2, 4, 3, 1, 5, 6]
[2, 2, 7, 5, 4, 6, 4, 8, 6, 8, 5, 3, 0, 4, 0, 4, 3, 7, 5, 4]


In [38]:
def findMostSimilar(X, x_query, k=5):
    """Return the indices of the k rows of X closest to x_query.

    Closeness is the Euclidean distance computed by scipy's ``cdist``.
    The returned indices are ordered from nearest to farthest; if X has
    fewer than k rows, all row indices are returned.
    """
    # cdist expects two 2-D inputs, so wrap the query as a one-row array.
    query_row = np.array([x_query])
    distances = distance.cdist(X, query_row, 'euclidean')
    nearest_first = np.argsort(distances.flatten())
    return nearest_first[:k]

In [51]:
# Find the 10 training samples nearest to test sample 4, then print their
# indices followed by their class labels.
k_min_idx = findMostSimilar(X_train, X_test[4], k=10)
print(k_min_idx)
print([y_train[idx] for idx in k_min_idx])

[6502 2315 3300 4147 1808 5194 3021 7146 5874 3725]
[4, 4, 4, 7, 4, 7, 7, 4, 0, 1]


In [50]:
# For each of the first 20 test samples print, space-separated: the true
# label, the most frequent label among its 10 nearest training samples, and
# the full list of those neighbour labels.
for i in range(20):
    k_min_idx = findMostSimilar(X_train, X_test[i], k=10)
    labels = [y_train[j] for j in k_min_idx]
    majority = max(set(labels), key=labels.count)
    print(' '.join(str(v) for v in (y_test[i], majority, labels)))

2 0 [5, 0, 2, 0, 0, 0, 2, 8, 8, 5]
2 5 [2, 0, 5, 8, 5, 5, 5, 5, 2, 2]
7 7 [9, 6, 1, 1, 7, 3, 4, 7, 4, 7]
5 5 [8, 3, 2, 5, 5, 5, 5, 6, 7, 5]
4 4 [4, 4, 4, 7, 4, 7, 7, 4, 0, 1]
6 1 [1, 1, 4, 6, 2, 1, 6, 1, 4, 1]
4 4 [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
8 2 [4, 2, 2, 7, 8, 0, 5, 2, 2, 9]
6 6 [6, 5, 6, 2, 3, 6, 6, 6, 6, 6]
8 1 [9, 5, 1, 2, 1, 5, 6, 0, 2, 3]
5 3 [3, 3, 3, 5, 3, 3, 3, 5, 3, 2]
3 3 [0, 9, 3, 9, 3, 3, 7, 3, 5, 3]
0 0 [3, 9, 0, 9, 2, 1, 0, 2, 0, 0]
4 7 [0, 7, 4, 7, 0, 5, 7, 7, 9, 4]
0 1 [2, 7, 5, 1, 4, 0, 1, 1, 9, 2]
4 4 [4, 4, 9, 7, 7, 6, 7, 4, 9, 4]
3 3 [3, 6, 3, 3, 3, 5, 3, 3, 3, 6]
7 3 [9, 1, 3, 7, 9, 2, 1, 1, 3, 3]
5 5 [5, 3, 2, 5, 5, 5, 5, 5, 5, 0]
4 7 [6, 7, 1, 7, 5, 9, 6, 9, 7, 8]


In [46]:
# Nearest-neighbour classification of the test set: each test sample gets the
# most frequent label among its 10 nearest training samples.
y_pred_max = []

# NOTE(review): the saved cell read `X_test[]`, which is a syntax error --
# presumably a leftover from a deleted slice. Iterating the whole test set is
# assumed to be the intent; confirm against the recorded 0.6055 output.
for x in X_test:
    k_min_idx = findMostSimilar(X_train, x, 10)
    labels = [y_train[i] for i in k_min_idx]
    # On a tie, max() keeps the first top-count label met while iterating the set.
    y_pred_max.append(max(set(labels), key=labels.count))

# Fraction of predictions that disagree with the true labels.
error_max = sum(1 for p, g in zip(y_pred_max, y_test) if p != g)
print(1.0 * error_max / len(y_pred_max))

0.6055
