## Data Loading

In [1]:
import h5py
import numpy as np
import pickle

data = h5py.File('fashion-mnist-784-euclidean.hdf5', 'r')
dataset = np.array(data['train'])
queries = np.array(data['test'])
with open('clusters_fashion_mnist_784_euclidean.pkl', 'rb') as f:
    clusters = pickle.load(f)
with open('ground_truth_fashion_mnist_784_normalized_euclidean_0_0_0_5.pkl', 'rb') as f:
    ground_truth_total = pickle.load(f)

In [2]:
ground_truth_total_level = [[[] for _ in range(10000)] for _ in range(100)]
for clus in range(100):
    for t in ground_truth_total[clus]:
        ground_truth_total_level[t[0]][t[1]].append(t)

In [3]:
centroids = []
for cluster in clusters:
    centroids.append(np.mean(cluster))

## Prepare Inputs

In [5]:
def euclidean_dist_normalized(x1, x2=None, eps=1e-8):
    if np.isnan(x2):
        return 1.0
    left = x1 / 255.0
    right = x2 / 255.0
    return np.sqrt(((left - right) ** 2).mean())

train_features = []
train_thresholds = []
train_targets = []
slot = 0.01
for query_id in range(8000):
    cardinality = [0 for _ in range(100)]
    distances2centroids = []
    for cc in centroids:
        distances2centroids.append(euclidean_dist_normalized(queries[query_id], cc))
    for threshold_id, threshold in enumerate(np.arange(0.0, 0.5, slot)):
        indicator = []
        for cluster_id in range(100):
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
            if cardinality[cluster_id] > 0:
                indicator.append(1)
            else:
                indicator.append(0)
        feature = queries[query_id] / 255.0
        train_features.append(feature)
        train_thresholds.append([threshold+slot])
        train_targets.append(indicator)
                
test_features = []
test_thresholds = []
test_targets = []
slot = 0.01
for query_id in range(8000,10000):
    cardinality = [0 for _ in range(100)]
    distances2centroids = []
    for cc in centroids:d
        distances2centroids.append(euclidean_dist_normalized(queries[query_id], cc))
    for threshold_id, threshold in enumerate(np.arange(0.0, 0.5, slot)):
        indicator = []
        for cluster_id in range(100):
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
            if cardinality[cluster_id] > 0:
                indicator.append(1)
            else:
                indicator.append(0)
        feature = queries[query_id] / 255.0
        test_features.append(feature)
        test_thresholds.append([threshold+slot])
        test_targets.append(indicator)
        
        

In [6]:
len(test_features)

100000

In [8]:
import torch
batch_size = 128
train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(torch.FloatTensor(train_features), torch.FloatTensor(train_thresholds), torch.FloatTensor(train_targets)), batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(torch.FloatTensor(test_features), torch.FloatTensor(test_thresholds), torch.FloatTensor(test_targets)), batch_size=batch_size, shuffle=True)

## Mixture Density Networks

In [180]:
from __future__ import print_function
import argparse
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# dimensionality of hidden layer
h = 64
# K mixing components (PRML p. 274)
# Can also formulate as a K-dimensional, one-hot
# encoded, latent variable $z$, and have the model
# produce values for $\mu_k = p(z_k = 1)$, i.e., the
# prob of each possible state of $z$. (PRML p. 430)
k = 10  # 3
# We specialize to the case of isotropic covariances (PRML p. 273),
# so the covariance matrix is diagonal with equal diagonal elements,
# i.e., the variances for each dimension of y are equivalent.
# therefore, the MDN outputs pi & sigma scalars for each mixture
# component, and a mu vector for each mixture component containing
# means for each target variable.
# NOTE: we could use the shorthand `d_out = 3*k`, since our target
# variable for this project only has a dimensionality of 1, but
# the following is more general.
# d_out = (t + 2) * k  # t is L from PRML p. 274
# NOTE: actually cleaner to just separate pi, sigma^2, & mu into
# separate functions.
t = 1
d_pi = k
d_sigmasq = k
d_mu = t * k

d = 784
d_out = 100
# n = len(training_features)
# dimention of input features

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.nn1 = nn.Linear(d+1, h)
        self.nn2 = nn.Linear(h, h)
        self.nn_pi = nn.Linear(h, d_pi)
        self.nn_sigmasq = nn.Linear(h, d_sigmasq)
        self.nn_mu = nn.Linear(h, d_mu)
#         self.nn_threshold_1 = nn.Linear(1, h)
#         self.nn_threshold_2 = nn.Linear(h, 1)
        
#         self.w1 = Variable(torch.randn(d, h) * np.sqrt(2/(d+h)), requires_grad=True)
#         self.b1 = Variable(torch.zeros(1, h), requires_grad=True)
#         self.w_pi = Variable(torch.randn(h, d_pi) * np.sqrt(2/(d+h)), requires_grad=True)
#         self.b_pi = Variable(torch.zeros(1, d_pi), requires_grad=True)
#         self.w_sigmasq = Variable(torch.randn(h, d_sigmasq) * np.sqrt(2/(d+h)), requires_grad=True)
#         self.b_sigmasq = Variable(torch.zeros(1, d_sigmasq), requires_grad=True)
#         self.w_mu = Variable(torch.randn(h, d_mu) * np.sqrt(2/(d+h)), requires_grad=True)
#         self.b_mu = Variable(torch.zeros(1, d_mu), requires_grad=True)

    def forward(self, x, threshold):
#         print ('input: ', x)
        out = F.leaky_relu(self.nn1(torch.cat((x, threshold), dim=1)))  # shape (n, h)
        out = F.leaky_relu(self.nn2(out))  # shape (n, h)
#         print ('out: ', out)
        #out = F.leaky_relu(x.mm(w1) + b1)  # interesting possibility
        pi = F.softmax(self.nn_pi(out), dim=1)  # p(z_k = 1) for all k; K mixing components that sum to 1; shape (n, k)
        sigmasq = torch.exp(self.nn_sigmasq(out))  # K gaussian variances, which must be >= 0; shape (n, k)
#         sigmasq = self.nn_sigmasq(out)
        mu = torch.exp(self.nn_mu(out))  # K * L gaussian means; shape (n, k*t)
#         print ('pi: ', pi)
#         print ('sigmasq: ', sigmasq)
#         print ('mu: ', mu)
        outputs_y = torch.zeros(x.size()[0], d_out)  # p(y|x)
        outputs_x = Variable( torch.FloatTensor([[e for e in range(d_out)] for _ in range(x.size()[0])]))
        for i in range(k):  # marginalize over z
            likelihood_z_x = gaussian_pdf(outputs_x, mu[:, i*t:(i+1)*t], sigmasq[:, i:(i+1)])
#             print (likelihood_z_x)
            prior_z = pi[:, i:(i+1)]
#             print (prior_z.shape)
    #         print ('likelihood_z_x: ', likelihood_z_x)
#             outputs_y += prior_z * likelihood_z_x
            outputs_y += likelihood_z_x
#         hid_threshold_1 = F.relu(self.nn_threshold_1(threshold))
#         hid_threshold_2 = F.relu(self.nn_threshold_2(hid_threshold_1))
#         return F.sigmoid(outputs_y - threshold)
#         print ('outputs_y: ', outputs_y)
#         print ('threshold: ', threshold)
        return F.sigmoid(outputs_y - threshold)
#         return outputs_y


def gaussian_pdf(x, mu, sigmasq):
    # NOTE: we could use the new `torch.distributions` package for this now
#     print (x.shape, mu.shape, sigmasq.shape)
#     return (1/torch.sqrt(2*np.pi*sigmasq)) * torch.exp((-1/(2*sigmasq)) * torch.norm((x-mu), 2, 1)**2)
    return (1/torch.sqrt(2*np.pi*sigmasq)) * torch.exp((-1/(2*sigmasq)) * (x-mu)**2)   

def loss_fn(estimates, targets):
    return F.mse_loss(estimates, targets)

def print_loss(estimates, targets):
    true_positive = 0.0
    true_negative = 0.0
    false_positive = 0.0
    false_negative = 0.0
    num_elements = estimates.shape[1]
    for est, tar in zip(estimates, targets):
        for i in range(num_elements):
            if est[i] < 0.5 and tar[i] == 0:
                true_negative += 1
            elif est[i] < 0.5 and tar[i] == 1:
                false_positive += 1
            elif est[i] >= 0.5 and tar[i] == 0:
                false_negative += 1
            else:
                true_positive += 1
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    return precision, recall

In [181]:
model = Model()
opt = optim.Adam(model.parameters(), lr=0.0001)
for e in range(1):
    model.train()
    for batch_idx, (features, thresholds, targets) in enumerate(train_loader):
        x = Variable(features)
        z = Variable(thresholds)
        opt.zero_grad()
        estimates = model(x, z)
#         print ('estimates: ', estimates)
#         print ('targets: ', targets)
        loss = loss_fn(estimates, targets)
        if batch_idx % 100 == 0:
            print('Training: Iteration {0}, Batch {1}, Loss {2}'.format(e, batch_idx, loss.item()))
        loss.backward()
        opt.step()

    model.eval()
    test_loss = 0.0
    precision = 0.0
    recall = 0.0
    for batch_idx, (features, thresholds, targets) in enumerate(test_loader):
        x = Variable(features)
        z = Variable(thresholds)
        estimates = model(x, z)
        loss = loss_fn(estimates, targets)
        test_loss += loss.item()
        prec, rec = print_loss(estimates, targets)
        precision += prec
        recall += rec
        if batch_idx % 100 == 0:
            print ('Testing: Iteration {0}, Batch {1}, Loss {2}, Precision {3}, Recall {4}'.format(e, batch_idx, loss.item(), prec, rec))
    test_loss /= len(test_loader)
    precision /= len(test_loader)
    recall /= len(test_loader)
    print ('Testing: Loss {0}, Precision {1}, Recall {2}'.format(test_loss, precision, recall))
    
    

Training: Iteration 0, Batch 0, Loss 0.27266162633895874
Training: Iteration 0, Batch 100, Loss 0.26595133543014526
Training: Iteration 0, Batch 200, Loss 0.2623739242553711
Training: Iteration 0, Batch 300, Loss 0.25652915239334106
Training: Iteration 0, Batch 400, Loss 0.2568981349468231
Training: Iteration 0, Batch 500, Loss 0.2606426179409027
Training: Iteration 0, Batch 600, Loss 0.2609802186489105
Training: Iteration 0, Batch 700, Loss 0.2599741220474243
Training: Iteration 0, Batch 800, Loss 0.2596185505390167
Training: Iteration 0, Batch 900, Loss 0.2558780610561371
Training: Iteration 0, Batch 1000, Loss 0.257240355014801
Training: Iteration 0, Batch 1100, Loss 0.2593942880630493
Training: Iteration 0, Batch 1200, Loss 0.26098722219467163
Training: Iteration 0, Batch 1300, Loss 0.2582543194293976
Training: Iteration 0, Batch 1400, Loss 0.2639160454273224
Training: Iteration 0, Batch 1500, Loss 0.26550450921058655
Training: Iteration 0, Batch 1600, Loss 0.2590753436088562
Train

KeyboardInterrupt: 

In [170]:
sigmasq = torch.exp(model.nn_sigmasq(F.leaky_relu(model.nn2(F.leaky_relu(model.nn1(x))))))

In [174]:
mu = torch.exp(model.nn_mu(F.leaky_relu(model.nn2(F.leaky_relu(model.nn1(x))))))

In [147]:
outputs_x = torch.FloatTensor([[e for e in range(d_out)] for _ in range(x.size()[0])])

In [172]:
sigmasq

tensor([[2.7365e+01, 1.5958e-04, 6.1235e-05,  ..., 2.1868e-04, 2.2945e+03,
         3.7754e-04],
        [1.0974e+01, 2.3538e-03, 1.1906e-03,  ..., 2.8051e-03, 2.3124e+02,
         4.1515e-03],
        [2.2014e+00, 1.2320e-01, 9.9651e-02,  ..., 1.3594e-01, 5.7992e+00,
         1.6005e-01],
        ...,
        [1.6128e+00, 2.7502e-01, 2.4669e-01,  ..., 3.1526e-01, 2.8909e+00,
         3.1943e-01],
        [1.1359e+02, 5.7741e-06, 1.4284e-06,  ..., 8.5517e-06, 5.6936e+04,
         1.7601e-05],
        [7.1575e+00, 3.2246e-03, 1.7251e-03,  ..., 3.0635e-03, 1.3437e+02,
         4.9532e-03]], grad_fn=<ExpBackward>)

In [175]:
mu

tensor([[8.7011e-04, 3.3179e-01, 5.1786e+00,  ..., 8.8829e+00, 5.1800e-03,
         2.1166e-01],
        [8.0833e-03, 4.9907e-01, 3.5145e+00,  ..., 5.3116e+00, 2.6121e-02,
         3.5835e-01],
        [2.0573e-01, 7.0084e-01, 1.4859e+00,  ..., 1.5829e+00, 2.5451e-01,
         7.5853e-01],
        ...,
        [3.9598e-01, 7.6149e-01, 1.2221e+00,  ..., 1.2408e+00, 4.3881e-01,
         8.5934e-01],
        [6.4007e-05, 2.3189e-01, 1.0889e+01,  ..., 2.2670e+01, 7.5969e-04,
         1.1293e-01],
        [8.5521e-03, 4.5572e-01, 3.2901e+00,  ..., 4.7649e+00, 2.0697e-02,
         3.7116e-01]], grad_fn=<ExpBackward>)

In [164]:
outputs_x

tensor([[ 0.,  1.,  2.,  ..., 97., 98., 99.],
        [ 0.,  1.,  2.,  ..., 97., 98., 99.],
        [ 0.,  1.,  2.,  ..., 97., 98., 99.],
        ...,
        [ 0.,  1.,  2.,  ..., 97., 98., 99.],
        [ 0.,  1.,  2.,  ..., 97., 98., 99.],
        [ 0.,  1.,  2.,  ..., 97., 98., 99.]])

In [161]:
gaussian_pdf(outputs_x, mu[:,0:1], sigmasq[:, 0:1])[9]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.], grad_fn=<SelectBackward>)

In [163]:
sigmasq[:, 0:1]

tensor([[4.8654e-02],
        [3.5490e-02],
        [3.8888e-02],
        [4.1196e-03],
        [3.2926e-04],
        [1.0046e-02],
        [1.0499e-02],
        [2.8966e-03],
        [7.8504e-05],
        [4.5916e-06],
        [1.1338e-03],
        [2.0935e-02],
        [3.9623e-05],
        [4.0923e-02],
        [2.6231e-03],
        [1.2288e-04],
        [1.9012e-03],
        [7.5938e-04],
        [3.9229e-02],
        [8.3168e-04],
        [2.4525e-04],
        [2.0062e-04],
        [1.2317e-04],
        [2.2823e-03],
        [5.6996e-02],
        [6.4427e-03],
        [4.5723e-04],
        [1.1053e-02],
        [9.6249e-03],
        [1.3122e-03],
        [1.9814e-05],
        [1.6714e-04],
        [6.7120e-03],
        [1.0815e-01],
        [5.6914e-04],
        [2.6231e-03],
        [5.8650e-02],
        [4.1184e-02],
        [8.4417e-04],
        [3.0058e-04],
        [1.3660e-02],
        [3.2268e-03],
        [2.0867e-04],
        [6.4934e-04],
        [1.4248e-03],
        [4

In [162]:
mu[:,0:1]

tensor([[ -4.1213],
        [ -4.8135],
        [ -4.5288],
        [ -8.8915],
        [-13.2455],
        [ -7.5993],
        [ -6.7905],
        [-10.3428],
        [-15.4605],
        [-19.9681],
        [-10.9578],
        [ -5.7970],
        [-17.0164],
        [ -4.5311],
        [ -8.9885],
        [-14.4900],
        [ -9.8827],
        [-11.7616],
        [ -4.7673],
        [-11.4704],
        [-13.9492],
        [-15.0327],
        [-15.5136],
        [ -9.3282],
        [ -3.8861],
        [ -7.9119],
        [-13.2706],
        [ -7.6553],
        [ -8.1451],
        [-10.8335],
        [-17.9672],
        [-15.1711],
        [ -7.6878],
        [ -3.1328],
        [-12.1910],
        [ -8.9885],
        [ -3.6594],
        [ -4.6261],
        [-11.2487],
        [-13.5288],
        [ -7.6578],
        [-10.2899],
        [-14.8749],
        [-13.0333],
        [-10.7198],
        [ -5.3329],
        [-11.7752],
        [ -8.6176],
        [ -6.1946],
        [ -4.3192],


In [None]:
torch.save(model.state_dict(), 'MDN_model')

## Model Usage

In [4]:
model = Model()
model.load_state_dict(torch.load('MDN_model'))
model.eval()

Model(
  (nn1): Linear(in_features=201, out_features=256, bias=True)
  (nn_pi): Linear(in_features=256, out_features=50, bias=True)
  (nn_sigmasq): Linear(in_features=256, out_features=50, bias=True)
  (nn_mu): Linear(in_features=256, out_features=50, bias=True)
)

In [12]:
def predict_probablity_for_single(pi, sigmasq, mu, targets):
    # rather than sample the single conditional mode at each
    # point, we could sample many points from the GMM produced
    # by the model for each point, yielding a dense set of
    # predictions
    N, K = pi.shape
    _, KT = mu.shape
    T = int(KT / K)
    class_num = targets.shape[1]
    out = torch.zeros(N, class_num)
    print ('here')
    for c in range(class_num):
        print ('c: {}, k: {}'.format(c, k))
        target = targets[:, c]
        prob = torch.zeros(N)
        for i in range(K):  # marginalize over z
            likelihood_z_x = gaussian_pdf(target, mu[:, i*T:(i+1)*T], sigmasq[:, i])
            prior_z = pi[:, i]
            prob += (likelihood_z_x * prior_z)
        out[:, c] += prob
    return out

In [8]:
print (torch.FloatTensor(test_navigate_input).shape[1])

201


In [13]:
def test_navigation(features):
    model.eval()
    test_loss = 0.0
    x = features
    pi, sigmasq, mu = model(x)
    targets = torch.FloatTensor([np.arange(0.0, 1.0, 0.01) for _ in range(x.shape[0])])
    output = predict_probablity_for_single(pi, sigmasq, mu, targets)
    return output

test_navigate_input = []
test_navigate_cards = []
slot = 0.01
for query_id in range(8000,8010):
    cardinality = [0 for _ in range(100)]
    for threshold_id, threshold in enumerate(np.arange(0.0, 0.5, slot)):
        for cluster_id in range(100):
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
        feature = np.append(queries[query_id], [threshold+slot])
        test_navigate_input.append(feature)
        test_navigate_cards.append(cardinality.copy())
print ('data prepared')
estimate = test_navigation(torch.FloatTensor(test_navigate_input))


data prepared
here
c: 0, k: 50
c: 1, k: 50
c: 2, k: 50
c: 3, k: 50
c: 4, k: 50
c: 5, k: 50
c: 6, k: 50
c: 7, k: 50
c: 8, k: 50
c: 9, k: 50
c: 10, k: 50
c: 11, k: 50
c: 12, k: 50
c: 13, k: 50
c: 14, k: 50
c: 15, k: 50
c: 16, k: 50
c: 17, k: 50
c: 18, k: 50
c: 19, k: 50
c: 20, k: 50
c: 21, k: 50
c: 22, k: 50
c: 23, k: 50
c: 24, k: 50
c: 25, k: 50
c: 26, k: 50
c: 27, k: 50
c: 28, k: 50
c: 29, k: 50
c: 30, k: 50
c: 31, k: 50
c: 32, k: 50
c: 33, k: 50
c: 34, k: 50
c: 35, k: 50
c: 36, k: 50
c: 37, k: 50
c: 38, k: 50
c: 39, k: 50
c: 40, k: 50
c: 41, k: 50
c: 42, k: 50
c: 43, k: 50
c: 44, k: 50
c: 45, k: 50
c: 46, k: 50
c: 47, k: 50
c: 48, k: 50
c: 49, k: 50
c: 50, k: 50
c: 51, k: 50
c: 52, k: 50
c: 53, k: 50
c: 54, k: 50
c: 55, k: 50
c: 56, k: 50
c: 57, k: 50
c: 58, k: 50
c: 59, k: 50
c: 60, k: 50
c: 61, k: 50
c: 62, k: 50
c: 63, k: 50
c: 64, k: 50
c: 65, k: 50
c: 66, k: 50
c: 67, k: 50
c: 68, k: 50
c: 69, k: 50
c: 70, k: 50
c: 71, k: 50
c: 72, k: 50
c: 73, k: 50
c: 74, k: 50
c: 75, k: 50
c: 

In [71]:
print ((estimate[41] > 5e-32).nonzero())
print ((torch.FloatTensor(test_navigate_cards)[41] > 2).nonzero())
# test_navigate_cards[1]

tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [11],
        [12],
        [16],
        [17],
        [18],
        [19],
        [20],
        [21],
        [22],
        [23],
        [24],
        [25],
        [26],
        [27],
        [28],
        [29],
        [30],
        [31],
        [32],
        [33],
        [34],
        [35],
        [36],
        [37],
        [38],
        [39],
        [40],
        [41],
        [42],
        [43],
        [44],
        [45],
        [46],
        [47],
        [48],
        [49],
        [50],
        [51],
        [52],
        [53],
        [54],
        [55],
        [56],
        [57],
        [58],
        [59],
        [60],
        [61],
        [62],
        [63],
        [64],
        [65],
        [67],
        [68],
        [69],
        [70],
        [71],
        [72],
        [73],
        [74],
        [75],
        [76],
        [77],
        [78],
      

In [None]:
use_cuda = torch.cuda.is_available()

#     torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")
train_dataset = np.array(f['train'])
test_dataset = np.array(f['test'])
train_lefts, train_rights, test_lefts, test_rights = prepare_dataset(train_dataset, test_dataset, train_num, test_num)

train_loader = torch.utils.data.DataLoader(
    (train_lefts, train_rights), batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    (test_lefts, test_rights), batch_size=batch_size, shuffle=True)


In [None]:
# hash_distances, input_distances = test(model, device, train_loader)
hash_distances, input_distances = test(model, device, test_loader)

In [None]:
lefts = torch.FloatTensor([f['train'][0] for x in range(999)])
rights = torch.FloatTensor(f['train'][1:1000])
inputdistance = angular_distance(lefts, rights).detach().numpy()
hashdistance = l1_distance(model(lefts), model(rights)).detach().numpy()


In [None]:
# for xx in zip(inputdistance, hashdistance):
#     print (xx[0], xx[1])
index_1 = np.argsort(hashdistance, 0)
index_2 = np.argsort(inputdistance, 0)
# np.random.shuffle(index_2)

input_index = {}
for pos, idx in enumerate(index_2):
    input_index[idx] = pos
sum = 0.0
for pos, idx in enumerate(index_1):
    sum += np.abs(pos - input_index[idx])
sum / len(index_1)

In [None]:
xxx = np.sort(inputdistance, 0)
plt.plot(xxx)
plt.show()

In [None]:
import math
distances = []
for i in index_1:
    distances.append(math.floor(inputdistance[i].item()* 40))

In [None]:
import matplotlib.pyplot as plt

plt.plot(distances)
plt.show()

In [None]:
F.cosine_similarity(torch.FloatTensor(f['train'][0]).unsqueeze(0), torch.FloatTensor(f['train'][6]).unsqueeze(0), dim=1, eps=1e-8)

In [None]:
for x, y in zip(hash_distances[0][0:30], input_distances[0][0:30]):
    print (x, y)

In [None]:
dataset_vector = model(torch.FloatTensor(f['train']))

In [None]:
query_vector = model(torch.FloatTensor(f['test']))

In [None]:
def binarization(vector):
    query_codes = []
    for v in vector:
        binary_code = []
        for e in v:
            if e < 0.5:
                binary_code.append(0)
            else:
                binary_code.append(1)
        query_codes.append(binary_code)
    return np.array(query_codes)
dataset_binary = binarization(dataset_vector.detach().numpy())
query_binary = binarization(query_vector.detach().numpy())

In [None]:
len(dataset_binary)

In [None]:
len(query_binary)

In [None]:
import math
hash_table = {}
for idx, point in enumerate(dataset_binary):
    pos = 0
    key = 0
    for d in point:
        key += d * math.pow(2, pos)
        pos += 1
    if key in hash_table:
        hash_table[key].append(idx)
    else:
        hash_table[key] = [idx]

In [None]:
f['neighbors'][:]

In [None]:
def find_candidate_distance(vector, hash_table, candidate_num):
    candidate = []
    for point in query_binary:
        cand = []
        dis = 0
        while len(cand) < 100:
            pos = 0
            key = 0
            for d in point:
                key += d * math.pow(2, pos)
                pos += 1
            if key in hash_table:
                candidate.append(hash_table[key])
    return candidate
find_candidate_0_distance(query_binary, hash_table)

In [None]:
class Node(object):
    def __init__(hash_code, data_index_set):
        self.hash_code = hash_code
        self.data_index_set = data_index_set
        self.children = []
        
    def isLeaf():
        return len(self.children) == 0
    
    def train(dataset):
        train_data = dataset[self.data_index_set]
        self.model = train(dataset)
        
    def partition():
        points = dataset[self.data_index_set]
        hash_table = {}
        codes = self.model(points)
        for idx, code in enumerate(codes):
            if code in hash_table:
                hash_table[code].append(self.data_index_set[idx])
            else:
                hash_table[code] = [self.data_index_set[idx]]
        for key,value in d.items():
            self.children.append(Node(key, value))
    
    def search(query, dataset):
        if self.isLeaf():
            return validate(dataset[self.data_index_set])
        else:
            children_idxes = select_children(query)
            result = []
            for idx in children_idxes:
                result += self.children[idx].search(query, dataset)
            return result
    
    
    

def index_construction(dataset):
    model = train(dataset)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets

data = np.array(f['train'])

tsne = manifold.TSNE(n_components=2, init='pca', random_state=501)
X_tsne = tsne.fit_transform(data[np.random.choice(data.shape[0], 100000, replace=False)])