## Data Loading

In [1]:
import h5py
import numpy as np
import pickle

data = h5py.File('/home/sunji/ANN/nytimes_256_angular/nytimes-256-angular.hdf5', 'r')
data_train = np.array(data['train'])
data_test = np.array(data['test'])
with open('/home/sunji/ANN/nytimes_256_angular/clusters_nytimes_256_angular.pkl', 'rb') as f:
    clusters = pickle.load(f)
with open('/home/sunji/ANN/nytimes_256_angular/ground_truth_nytimes_256_angular_0_4_0_5.pkl', 'rb') as f:
    ground_truth_total = pickle.load(f)

In [2]:
cluster_size = len(clusters)

In [3]:
ground_truth_total_level = [[[] for _ in range(10000)] for _ in range(cluster_size)]
for clus in range(cluster_size):
    for t in ground_truth_total[clus]:
        ground_truth_total_level[t[0]][t[1]].append(t)

In [4]:
centroids = []
for cluster in clusters:
    centroids.append(np.mean(cluster))

In [15]:
ground_truth_total_level[0]

[[(0, 0, 0.4, 0.402, 0),
  (0, 0, 0.402, 0.404, 0),
  (0, 0, 0.404, 0.406, 0),
  (0, 0, 0.406, 0.40800000000000003, 0),
  (0, 0, 0.40800000000000003, 0.41000000000000003, 0),
  (0, 0, 0.41000000000000003, 0.41200000000000003, 0),
  (0, 0, 0.41200000000000003, 0.41400000000000003, 0),
  (0, 0, 0.41400000000000003, 0.41600000000000004, 0),
  (0, 0, 0.41600000000000004, 0.41800000000000004, 0),
  (0, 0, 0.41800000000000004, 0.42000000000000004, 0),
  (0, 0, 0.42000000000000004, 0.42200000000000004, 0),
  (0, 0, 0.42200000000000004, 0.42400000000000004, 0),
  (0, 0, 0.42400000000000004, 0.42600000000000005, 0),
  (0, 0, 0.42600000000000005, 0.42800000000000005, 0),
  (0, 0, 0.42800000000000005, 0.43000000000000005, 0),
  (0, 0, 0.43000000000000005, 0.43200000000000005, 0),
  (0, 0, 0.43200000000000005, 0.43400000000000005, 0),
  (0, 0, 0.43400000000000005, 0.43600000000000005, 0),
  (0, 0, 0.43600000000000005, 0.43800000000000006, 0),
  (0, 0, 0.43800000000000006, 0.44000000000000006, 0),


## Prepare Inputs

In [10]:
from numpy import dot
from numpy.linalg import norm
from scipy import spatial


def jaccard(x1, x2=None, eps=1e-8):
    x1 = x1.astype(bool)
    x2 = x2.astype(bool)
    return 1.0 - np.double(np.bitwise_and(x1, x2).sum()) / np.double(np.bitwise_or(x1, x2).sum())

def euclidean_dist_normalized(x1, x2=None, eps=1e-8):
    if np.isnan(x2):
        return 1.0
    left = x1 / 255.0
    right = x2 / 255.0
    return np.sqrt(((left - right) ** 2).mean())

def angular_dist(x1, x2=None, eps=1e-8):
    cosine_sim = 1 - spatial.distance.cosine(x1, x2)
    distance = np.arccos(cosine_sim) / 3.14159267
    return distance 

train_features = []
train_thresholds = []
train_distances = []
train_targets = []
train_cards = []
slot = 0.002
for query_id in range(8000):
    cardinality = [0 for _ in range(cluster_size)]
    distances2centroids = []
    for cc in centroids:
        distances2centroids.append(angular_dist(data_test[query_id], cc))
    for threshold_id, threshold in enumerate(np.arange(0.4, 0.5, slot)):
        indicator = []
        cards = []
        for cluster_id in range(cluster_size):
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
            if cardinality[cluster_id] > 0:
                indicator.append(1)
            else:
                indicator.append(0)
            cards.append(cardinality[cluster_id])
        feature = data_test[query_id]
        train_features.append(feature)
        train_distances.append(distances2centroids)
        train_thresholds.append([threshold+slot])
        train_targets.append(indicator)
        train_cards.append(cards)

test_features = []
test_thresholds = []
test_distances = []
test_targets = []
test_cards = []
slot = 0.002
for query_id in range(8000,10000):
    cardinality = [0 for _ in range(cluster_size)]
    distances2centroids = []
    for cc in centroids:
        distances2centroids.append(angular_dist(data_test[query_id], cc))
    for threshold_id, threshold in enumerate(np.arange(0.4, 0.5, slot)):
        indicator = []
        cards = []
        for cluster_id in range(cluster_size):
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
            if cardinality[cluster_id] > 0:
                indicator.append(1)
            else:
                indicator.append(0)
            cards.append(cardinality[cluster_id])
        feature = data_test[query_id]
        test_features.append(feature)
        test_distances.append(distances2centroids)
        test_thresholds.append([threshold+slot])
        test_targets.append(indicator)
        test_cards.append(cards)
        
        

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [11]:
test_distances[0][0]

0.5355004546363034

In [12]:
import torch
import torch.utils.data
batch_size = 128
train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(torch.FloatTensor(train_features), torch.FloatTensor(train_thresholds), torch.FloatTensor(train_distances), torch.FloatTensor(train_targets), torch.FloatTensor(train_cards)), batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(torch.FloatTensor(test_features), torch.FloatTensor(test_thresholds), torch.FloatTensor(test_distances), torch.FloatTensor(test_targets), torch.FloatTensor(test_cards)), batch_size=batch_size, shuffle=True)


## Multi-label Networks

In [16]:
from __future__ import print_function
import argparse
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

input_dimension = 256
cluster_dimension = cluster_size
hidden_num = 256
output_num = cluster_size

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.nn1 = nn.Linear(input_dimension, hidden_num)
        self.nn2 = nn.Linear(hidden_num, hidden_num)
#         self.nn3 = nn.Linear(hidden_num, hidden_num)
        
        self.dist1 = nn.Linear(cluster_dimension, hidden_num)
        self.dist2 = nn.Linear(hidden_num, hidden_num)
        
        self.nn4 = nn.Linear(hidden_num, hidden_num)
        self.nn5 = nn.Linear(hidden_num, output_num)
        
        self.thres1 = nn.Linear(1, hidden_num)
        self.thres2 = nn.Linear(hidden_num, 1)

    def forward(self, x, distances, thresholds):
        out1 = F.relu(self.nn1(x))
        out2 = F.relu(self.nn2(out1))
#         out3 = F.relu(self.nn3(out2))
#         print (distances.shape)
        distance1 = F.relu(self.dist1(distances))
        distance2 = F.relu(self.dist2(distance1))
        
        thresholds_1 = F.relu(self.thres1(thresholds))
        thresholds_2 = self.thres2(thresholds_1)

        out4 = F.relu(self.nn4((out2 + distance2) / 2))
        out5 = self.nn5(out2)
        
        probability = F.sigmoid(out5 + thresholds_2)
        return probability

def loss_fn(estimates, targets, cards):
    punish_idx = (estimates < 0.5).float()
    return F.mse_loss(estimates, targets) + 0.0 * torch.log(((0.5 - estimates) * cards * punish_idx).mean() + 1.0)

def print_loss(estimates, targets, cards):
    true_positive = 0.0
    true_negative = 0.0
    false_positive = 0.0
    false_negative = 0.0
    num_elements = estimates.shape[1]
    for est, tar in zip(estimates, targets):
        for i in range(num_elements):
            if est[i] < 0.5 and tar[i] == 0:
                true_negative += 1
            elif est[i] < 0.5 and tar[i] == 1:
                false_negative += 1
            elif est[i] >= 0.5 and tar[i] == 0:
                false_positive += 1
            else:
                true_positive += 1
    if true_positive + false_positive > 0:
        precision = true_positive / (true_positive + false_positive)
    else:
        precision = 1.0
    if true_positive + false_negative > 0:
        recall = true_positive / (true_positive + false_negative)
    else:
        recall = 1.0
    total_card = cards.sum(dim=1)
#     print ('total_card: ', total_card.shape)
    miss_card = torch.FloatTensor([cards[i][((estimates[i] < 0.5).nonzero())].sum() for i in range(cards.shape[0])])
#     print ('miss_card: ', miss_card.shape)
    miss_rate = (miss_card / (total_card + 0.1)).mean()
    return precision, recall, miss_rate

In [17]:
model = Model()
opt = optim.Adam(model.parameters(), lr=0.001)
for e in range(5):
    model.train()
    for batch_idx, (features, thresholds, distances, targets, cards) in enumerate(train_loader):
        x = Variable(features)
        y = Variable(targets.unsqueeze(1))
        z = Variable(thresholds)
        dists = Variable(distances)
        opt.zero_grad()
        estimates = model(x, dists, z)
        loss = loss_fn(estimates, targets, cards)
        if batch_idx % 100 == 0:
            print('Training: Iteration {0}, Batch {1}, Loss {2}'.format(e, batch_idx, loss.item()))
        loss.backward()
        opt.step()
        next(model.thres1.parameters()).data.clamp_(0)
        next(model.thres2.parameters()).data.clamp_(0)

    model.eval()    
    test_loss = 0.0
    precision = 0.0
    recall = 0.0
    miss_rate = 0.0
    for batch_idx, (features, thresholds, distances, targets, cards) in enumerate(test_loader):
        x = Variable(features)
        y = Variable(targets.unsqueeze(1))
        z = Variable(thresholds)
        dists = Variable(distances)
        estimates = model(x, dists, z)
        loss = loss_fn(estimates, targets, cards)
        test_loss += loss.item()
        prec, rec, miss = print_loss(estimates, targets, cards)
        precision += prec
        recall += rec
        miss_rate += miss
        if batch_idx % 100 == 0:
            print ('Testing: Iteration {0}, Batch {1}, Loss {2}, Precision {3}, Recall {4}, Miss {5}'.format(e, batch_idx, loss.item(), prec, rec, miss))
    test_loss /= len(test_loader)
    precision /= len(test_loader)
    recall /= len(test_loader)
    miss_rate /= len(test_loader)
    print ('Testing: Loss {0}, Precision {1}, Recall {2}, Miss {3}'.format(test_loss, precision, recall, miss_rate))
    
    

Training: Iteration 0, Batch 0, Loss 0.25881287455558777
Training: Iteration 0, Batch 100, Loss 0.10124887526035309
Training: Iteration 0, Batch 200, Loss 0.08501636981964111
Training: Iteration 0, Batch 300, Loss 0.08989463001489639
Training: Iteration 0, Batch 400, Loss 0.062349364161491394
Training: Iteration 0, Batch 500, Loss 0.07847148180007935
Training: Iteration 0, Batch 600, Loss 0.06649202108383179
Training: Iteration 0, Batch 700, Loss 0.05057813227176666
Training: Iteration 0, Batch 800, Loss 0.07049056887626648
Training: Iteration 0, Batch 900, Loss 0.07940644770860672
Training: Iteration 0, Batch 1000, Loss 0.0465458519756794
Training: Iteration 0, Batch 1100, Loss 0.05627273768186569
Training: Iteration 0, Batch 1200, Loss 0.05367976054549217
Training: Iteration 0, Batch 1300, Loss 0.0653342604637146
Training: Iteration 0, Batch 1400, Loss 0.05922418087720871
Training: Iteration 0, Batch 1500, Loss 0.07737191766500473
Training: Iteration 0, Batch 1600, Loss 0.06352695822

Testing: Iteration 2, Batch 200, Loss 0.034647416323423386, Precision 1.0, Recall 0.7290909090909091, Miss 0.04656883329153061
Testing: Iteration 2, Batch 300, Loss 0.016546763479709625, Precision 0.9, Recall 0.9, Miss 0.0078124976716935635
Testing: Iteration 2, Batch 400, Loss 0.030030356720089912, Precision 1.0, Recall 0.668, Miss 0.038863617926836014
Testing: Iteration 2, Batch 500, Loss 0.03278117626905441, Precision 0.9090909090909091, Recall 0.7142857142857143, Miss 0.031249990686774254
Testing: Iteration 2, Batch 600, Loss 0.025998026132583618, Precision 0.875, Recall 0.6363636363636364, Miss 0.031249990686774254
Testing: Iteration 2, Batch 700, Loss 0.015564859844744205, Precision 1.0, Recall 0.7475, Miss 0.01581747643649578
Testing: Loss 0.019782228061876943, Precision 0.9735975306648508, Recall 0.8167610839484714, Miss 0.01701875776052475
Training: Iteration 3, Batch 0, Loss 0.022595936432480812
Training: Iteration 3, Batch 100, Loss 0.013123693875968456
Training: Iteration 3

In [None]:
(distances[2], features[2])

In [None]:
for name, param in model.thres1.named_parameters():
    print (name, param.data)

In [18]:
torch.save(model.state_dict(), '/home/sunji/ANN/nytimes_256_angular/saved_models/global_nytimes_256_angular_punish_query_threshold_monotonic.model')

## Model Usage

In [70]:
model = Model()
model.load_state_dict(torch.load('global_fashion_mnist_784_euclidean_binary_query_threshold.model'))
model.eval()

Model(
  (nn1): Linear(in_features=785, out_features=256, bias=True)
  (nn2): Linear(in_features=256, out_features=256, bias=True)
  (nn3): Linear(in_features=256, out_features=256, bias=True)
  (nn4): Linear(in_features=256, out_features=100, bias=True)
)

In [39]:
for batch_idx, (features, thresholds, distances, targets, cards) in enumerate(test_loader):
    x = Variable(features)
    y = Variable(targets.unsqueeze(1))
    z = Variable(thresholds)
    dist = Variable(distances)
    estimates = model(x, dist, z)
    loss = loss_fn(estimates, targets, cards)
    prec, rec, miss_rate = print_loss(estimates, targets, cards)
    print (loss.item(), prec, rec, miss_rate)
#     print (targets[0])

total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.011616033501923084 0.9865168539325843 0.9653655854865311 tensor(0.0307)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.013758893124759197 0.9868929952728835 0.9659305993690852 tensor(0.0149)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.010944121517241001 0.9889834752128193 0.9657701711491442 tensor(0.0170)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.012130827642977238 0.9878915504080021 0.9596011250319612 tensor(0.0179)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.011950699612498283 0.9855285749325484 0.972645848462842 tensor(0.0386)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.014175038784742355 0.9893148962916405 0.9622987568779295 tensor(0.0226)
total_card:  torch.Size([128])
miss_card:  torch.Size([128])
0.014790528453886509 0.9867578900904878 0.9652417962003454 tensor(0.0361)
total_card:  torch.Size([128])
miss_card:  torch.Size([1

KeyboardInterrupt: 

In [None]:
use_cuda = torch.cuda.is_available()

#     torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")
train_dataset = np.array(f['train'])
test_dataset = np.array(f['test'])
train_lefts, train_rights, test_lefts, test_rights = prepare_dataset(train_dataset, test_dataset, train_num, test_num)

train_loader = torch.utils.data.DataLoader(
    (train_lefts, train_rights), batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    (test_lefts, test_rights), batch_size=batch_size, shuffle=True)


In [None]:
# hash_distances, input_distances = test(model, device, train_loader)
hash_distances, input_distances = test(model, device, test_loader)

In [None]:
lefts = torch.FloatTensor([f['train'][0] for x in range(999)])
rights = torch.FloatTensor(f['train'][1:1000])
inputdistance = angular_distance(lefts, rights).detach().numpy()
hashdistance = l1_distance(model(lefts), model(rights)).detach().numpy()


In [None]:
# for xx in zip(inputdistance, hashdistance):
#     print (xx[0], xx[1])
index_1 = np.argsort(hashdistance, 0)
index_2 = np.argsort(inputdistance, 0)
# np.random.shuffle(index_2)

input_index = {}
for pos, idx in enumerate(index_2):
    input_index[idx] = pos
sum = 0.0
for pos, idx in enumerate(index_1):
    sum += np.abs(pos - input_index[idx])
sum / len(index_1)

In [None]:
xxx = np.sort(inputdistance, 0)
plt.plot(xxx)
plt.show()

In [None]:
import math
distances = []
for i in index_1:
    distances.append(math.floor(inputdistance[i].item()* 40))

In [None]:
import matplotlib.pyplot as plt

plt.plot(distances)
plt.show()

In [None]:
F.cosine_similarity(torch.FloatTensor(f['train'][0]).unsqueeze(0), torch.FloatTensor(f['train'][6]).unsqueeze(0), dim=1, eps=1e-8)

In [None]:
for x, y in zip(hash_distances[0][0:30], input_distances[0][0:30]):
    print (x, y)

In [None]:
dataset_vector = model(torch.FloatTensor(f['train']))

In [None]:
query_vector = model(torch.FloatTensor(f['test']))

In [None]:
def binarization(vector):
    query_codes = []
    for v in vector:
        binary_code = []
        for e in v:
            if e < 0.5:
                binary_code.append(0)
            else:
                binary_code.append(1)
        query_codes.append(binary_code)
    return np.array(query_codes)
dataset_binary = binarization(dataset_vector.detach().numpy())
query_binary = binarization(query_vector.detach().numpy())

In [None]:
len(dataset_binary)

In [None]:
len(query_binary)

In [None]:
import math
hash_table = {}
for idx, point in enumerate(dataset_binary):
    pos = 0
    key = 0
    for d in point:
        key += d * math.pow(2, pos)
        pos += 1
    if key in hash_table:
        hash_table[key].append(idx)
    else:
        hash_table[key] = [idx]

In [None]:
f['neighbors'][:]

In [None]:
def find_candidate_distance(vector, hash_table, candidate_num):
    candidate = []
    for point in query_binary:
        cand = []
        dis = 0
        while len(cand) < 100:
            pos = 0
            key = 0
            for d in point:
                key += d * math.pow(2, pos)
                pos += 1
            if key in hash_table:
                candidate.append(hash_table[key])
    return candidate
find_candidate_0_distance(query_binary, hash_table)

In [None]:
class Node(object):
    def __init__(hash_code, data_index_set):
        self.hash_code = hash_code
        self.data_index_set = data_index_set
        self.children = []
        
    def isLeaf():
        return len(self.children) == 0
    
    def train(dataset):
        train_data = dataset[self.data_index_set]
        self.model = train(dataset)
        
    def partition():
        points = dataset[self.data_index_set]
        hash_table = {}
        codes = self.model(points)
        for idx, code in enumerate(codes):
            if code in hash_table:
                hash_table[code].append(self.data_index_set[idx])
            else:
                hash_table[code] = [self.data_index_set[idx]]
        for key,value in d.items():
            self.children.append(Node(key, value))
    
    def search(query, dataset):
        if self.isLeaf():
            return validate(dataset[self.data_index_set])
        else:
            children_idxes = select_children(query)
            result = []
            for idx in children_idxes:
                result += self.children[idx].search(query, dataset)
            return result
    
    
    

def index_construction(dataset):
    model = train(dataset)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets

data = np.array(f['train'])

tsne = manifold.TSNE(n_components=2, init='pca', random_state=501)
X_tsne = tsne.fit_transform(data[np.random.choice(data.shape[0], 100000, replace=False)])