## Data Preparation

In [1]:
import h5py
import numpy as np
import pickle

# Load the train/test vectors from the HDF5 file. The datasets are copied into
# in-memory NumPy arrays inside the `with` block so the file handle can be
# closed as soon as the copies exist (the original left it open).
with h5py.File('/home/sunji/ANN/nytimes_256_angular/nytimes-256-angular.hdf5', 'r') as data:
    data_train = np.array(data['train'])
    data_test = np.array(data['test'])

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load these pickles if they come from a trusted source.
with open('/home/sunji/ANN/nytimes_256_angular/clusters_nytimes_256_angular.pkl', 'rb') as f:
    clusters = pickle.load(f)
with open('/home/sunji/ANN/nytimes_256_angular/ground_truth_nytimes_256_angular_0_4_0_5.pkl', 'rb') as f:
    ground_truth_total = pickle.load(f)

In [2]:
# Number of clusters; reused below to size the per-cluster lists and the
# global model's output layer.
cluster_size = len(clusters)

In [3]:
# Regroup the ground-truth records into a nested table:
# an outer list with `cluster_size` entries, each holding 10000 empty buckets.
# Every record `t` is filed under [t[0]][t[1]], in the order it is encountered.
ground_truth_total_level = [
    [[] for _ in range(10000)]
    for _ in range(cluster_size)
]
for clus in range(cluster_size):
    for t in ground_truth_total[clus]:
        bucket = ground_truth_total_level[t[0]][t[1]]
        bucket.append(t)

In [4]:
# One "centroid" value per cluster.
# NOTE(review): np.mean without an `axis` argument averages over *all*
# elements and returns a single scalar. If each cluster is an array of
# vectors, a per-dimension centroid would need `axis=0` -- confirm against the
# code that produced the saved models before changing it, since the models
# consume distances derived from these values.
centroids = [np.mean(cluster) for cluster in clusters]

In [5]:
from numpy import dot
from numpy.linalg import norm
from scipy import spatial

def jaccard(x1, x2=None, eps=1e-8):
    """Jaccard distance between two vectors interpreted as boolean masks.

    Both inputs are cast to bool, so any non-zero entry counts as "present".

    Args:
        x1: NumPy array.
        x2: NumPy array of the same shape as `x1`. The `None` default is kept
            for signature compatibility only; a value must be supplied.
        eps: Unused; kept for signature compatibility.

    Returns:
        1 - |x1 AND x2| / |x1 OR x2| as a float64. When neither vector has a
        non-zero entry (empty union) the distance is defined as 0 instead of
        the NaN produced by a 0/0 division.
    """
    x1 = x1.astype(bool)
    x2 = x2.astype(bool)
    union = np.double(np.bitwise_or(x1, x2).sum())
    if union == 0:
        # Two empty sets are identical; avoid the 0/0 -> NaN of the naive formula.
        return np.double(0.0)
    intersection = np.double(np.bitwise_and(x1, x2).sum())
    return 1.0 - intersection / union

def euclidean_dist_normalized(x1, x2=None, eps=1e-8):
    """Root-mean-square difference between two vectors after scaling by 1/255.

    Args:
        x1: NumPy array (or scalar).
        x2: NumPy array (or scalar) to compare against. If it contains any
            NaN, the distance 1.0 is returned.
        eps: Unused; kept for signature compatibility.

    Returns:
        sqrt(mean(((x1 - x2) / 255) ** 2)), or 1.0 when `x2` contains NaN.
    """
    # `if np.isnan(x2)` is ambiguous (raises ValueError) for arrays with more
    # than one element, so reduce the element-wise result explicitly.
    if np.any(np.isnan(x2)):
        return 1.0
    left = x1 / 255.0
    right = x2 / 255.0
    return np.sqrt(((left - right) ** 2).mean())

def angular_dist(x1, x2=None, eps=1e-8):
    """Angular distance between two vectors, normalized to [0, 1].

    Args:
        x1: 1-D vector.
        x2: 1-D vector to compare against.
        eps: Unused; kept for signature compatibility.

    Returns:
        arccos(cosine_similarity(x1, x2)) / pi, i.e. 0 for vectors pointing in
        the same direction, 0.5 for orthogonal vectors and 1 for opposite ones.
    """
    cosine_sim = 1 - spatial.distance.cosine(x1, x2)
    # Floating-point rounding can push the similarity marginally outside
    # [-1, 1], which would make arccos return NaN.
    cosine_sim = np.clip(cosine_sim, -1.0, 1.0)
    # Use the exact constant; the previous literal 3.14159267 was not pi.
    distance = np.arccos(cosine_sim) / np.pi
    return distance

# Build the evaluation set: one sample per (query, threshold slot) pair.
# Each sample consists of the raw query vector, the query's distances to every
# centroid, the threshold, a per-cluster 0/1 indicator and per-cluster counts.
test_features = []
test_thresholds = []
test_distances = []
test_targets = []
test_cards = []
# Width of one threshold slot; also the step of the threshold grid below.
slot = 0.002
# Only queries 8000..9999 are used here (presumably the held-out split -- confirm).
for query_id in range(8000,10000):
    # Running per-cluster totals; they keep accumulating across the threshold
    # slots of this query and are reset for the next query.
    cardinality = [0 for _ in range(cluster_size)]
    distances2centroids = []
    for cc in centroids:
        distances2centroids.append(angular_dist(data_test[query_id], cc))
    for threshold_id, threshold in enumerate(np.arange(0.4, 0.5, slot)):
        indicator = []
        cards = []
        for cluster_id in range(cluster_size):
            # Add the last field of this slot's ground-truth record to the
            # running total (presumably the per-slot match count -- confirm).
            cardinality[cluster_id] += ground_truth_total_level[cluster_id][query_id][threshold_id][-1]
            # Indicator is 1 once the running total for the cluster is positive.
            if cardinality[cluster_id] > 0:
                indicator.append(1)
            else:
                indicator.append(0)
            cards.append(cardinality[cluster_id])
        feature = data_test[query_id]
        test_features.append(feature)
        # The same list object is appended for every slot of this query.
        test_distances.append(distances2centroids)
        # The stored threshold is the upper edge of the slot.
        test_thresholds.append([threshold+slot])
        test_targets.append(indicator)
        test_cards.append(cards)

In [6]:
# Count the samples whose per-cluster counts sum to a positive value.
cnt = sum(1 for sample_cards in test_cards if np.sum(sample_cards) > 0)
print (cnt)

9408


In [7]:
import torch
import torch.utils.data
# Wrap the evaluation samples in a DataLoader. Tensor order inside the dataset
# is: features, thresholds, distances, targets, cards. Shuffling is disabled.
batch_size = 128
test_dataset = torch.utils.data.TensorDataset(
    torch.FloatTensor(test_features),
    torch.FloatTensor(test_thresholds),
    torch.FloatTensor(test_distances),
    torch.FloatTensor(test_targets),
    torch.FloatTensor(test_cards),
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Global Model

In [8]:
from __future__ import print_function
import argparse
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Layer sizes of the global model defined below.
input_dimension = 256             # width of the query-vector input (nn1)
cluster_dimension = cluster_size  # width of the distances input (dist1)
hidden_num = 256                  # width of every hidden layer
output_num = cluster_size         # one output per cluster (nn5)

class Model(nn.Module):
    """Global model: maps a query vector and a threshold to one probability per cluster.

    The output has `output_num` entries in (0, 1), produced by a sigmoid.

    The `dist1`, `dist2` and `nn4` layers are registered but do not influence
    the output: the final layer `nn5` is applied to the query branch (`out2`)
    only. They are kept so that state dicts saved from this architecture still
    load with `load_state_dict`.

    NOTE(review): if the distance branch was meant to feed `nn5`, that is a
    modelling change that alters predictions of already-saved weights; it is
    deliberately not made here.
    """

    def __init__(self):
        super(Model, self).__init__()
        # Query branch.
        self.nn1 = nn.Linear(input_dimension, hidden_num)
        self.nn2 = nn.Linear(hidden_num, hidden_num)

        # Distance branch (parameters only; unused by forward, see class docstring).
        self.dist1 = nn.Linear(cluster_dimension, hidden_num)
        self.dist2 = nn.Linear(hidden_num, hidden_num)

        # nn4 is unused by forward; nn5 is the output layer.
        self.nn4 = nn.Linear(hidden_num, hidden_num)
        self.nn5 = nn.Linear(hidden_num, output_num)

        # Threshold branch: scalar threshold -> scalar offset added to every logit.
        self.thres1 = nn.Linear(1, hidden_num)
        self.thres2 = nn.Linear(hidden_num, 1)

    def forward(self, x, distances, thresholds):
        """Compute per-cluster probabilities.

        Args:
            x: query features, last dimension `input_dimension`.
            distances: accepted for interface compatibility; not used in the
                computation (see class docstring).
            thresholds: thresholds, last dimension 1.

        Returns:
            Tensor of probabilities with last dimension `output_num`.
        """
        out1 = F.relu(self.nn1(x))
        out2 = F.relu(self.nn2(out1))

        thresholds_1 = F.relu(self.thres1(thresholds))
        thresholds_2 = self.thres2(thresholds_1)

        out5 = self.nn5(out2)

        # torch.sigmoid replaces the deprecated F.sigmoid; the threshold offset
        # (last dimension 1) broadcasts over all cluster logits.
        probability = torch.sigmoid(out5 + thresholds_2)
        return probability

def loss_fn(estimates, targets, cards):
    """Loss for the global model: MSE plus a penalty for missed cardinality.

    Entries predicted below 0.5 contribute (0.5 - estimate) * cards to the
    penalty term; entries at or above 0.5 contribute zero. The mean of these
    contributions is passed through log(1 + .) and weighted by 1.4.
    """
    below_half = (estimates < 0.5).float()
    missed_mass = (0.5 - estimates) * cards * below_half
    penalty = torch.log(missed_mass.mean() + 1.0)
    return F.mse_loss(estimates, targets) + 1.4 * penalty

def print_loss(estimates, targets, cards):
    """Precision, recall and missed-cardinality rate of the global model.

    A prediction counts as positive when the estimate is >= 0.5. Targets are
    expected to be binary (0 or 1).

    Args:
        estimates: (batch, clusters) tensor of predicted probabilities.
        targets: (batch, clusters) tensor of 0/1 labels.
        cards: (batch, clusters) tensor of per-cluster counts.

    Returns:
        (precision, recall, miss_rate). precision and recall are Python floats
        and are reported as 0.0 when their denominator is zero (no positive
        predictions / no positive targets) instead of raising
        ZeroDivisionError. miss_rate is a scalar tensor: the mean over rows of
        (count in clusters predicted negative) / (total count + 0.1).
    """
    predicted_negative = estimates < 0.5
    predicted_positive = estimates >= 0.5
    actual_positive = targets == 1
    actual_negative = targets == 0

    true_positive = float((predicted_positive & actual_positive).sum().item())
    false_positive = float((predicted_positive & actual_negative).sum().item())
    false_negative = float((predicted_negative & actual_positive).sum().item())

    predicted_total = true_positive + false_positive
    actual_total = true_positive + false_negative
    precision = true_positive / predicted_total if predicted_total > 0 else 0.0
    recall = true_positive / actual_total if actual_total > 0 else 0.0

    total_card = cards.sum(dim=1)
    # Count that falls into clusters the model would skip (predicted negative).
    miss_card = (cards * predicted_negative.type_as(cards)).sum(dim=1)
    # The +0.1 keeps rows without any matches from dividing by zero.
    miss_rate = (miss_card / (total_card + 0.1)).mean()
    return precision, recall, miss_rate

## Local Models

In [9]:
from __future__ import print_function
import argparse
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Sizes used by the local (per-cluster) models below.
queries_dimension = 256  # initial input length when sizing the CNN stacks
hidden_num_2 = 128       # hidden width of Threshold_Model and Output_Model

class Threshold_Model(nn.Module):
    """Two-layer perceptron mapping a scalar threshold to a scalar value.

    Architecture: Linear(1 -> hidden_num_2), ReLU, Linear(hidden_num_2 -> 1).
    """

    def __init__(self):
        super(Threshold_Model, self).__init__()
        self.fc1 = nn.Linear(1, hidden_num_2)
        self.fc2 = nn.Linear(hidden_num_2, 1)

    def forward(self, threshold):
        """Return fc2(relu(fc1(threshold))); no activation on the output."""
        return self.fc2(F.relu(self.fc1(threshold)))

class CNN_Model(nn.Module):
    """One convolutional stage: Conv1d -> BatchNorm1d -> ReLU -> pooling.

    Args:
        in_channel: number of input channels of the convolution.
        out_channel: number of output channels of the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        pool_type: 0 for max pooling, 1 for average pooling.
        pool_size: pooling window; also used as the pooling stride.

    Raises:
        ValueError: if `pool_type` is neither 0 nor 1. Previously a message was
            printed and a module without its `layer` attribute was returned,
            which only failed later when the model was used.
    """

    def __init__(self, in_channel, out_channel, kernel_size, stride, padding, pool_type, pool_size):
        super(CNN_Model, self).__init__()
        if pool_type == 0:
            pool_layer = nn.MaxPool1d(kernel_size=pool_size, stride=pool_size)
        elif pool_type == 1:
            pool_layer = nn.AvgPool1d(kernel_size=pool_size, stride=pool_size)
        else:
            raise ValueError('CNN_Model Init Error, invalid pool_type {}'.format(pool_type))
        # The order inside the Sequential determines the state-dict keys
        # (layer.0.*, layer.1.*), so it must not change.
        self.layer = nn.Sequential(
            nn.Conv1d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=padding),
            nn.BatchNorm1d(out_channel),
            nn.ReLU(),
            pool_layer)

    def forward(self, inputs):
        """Apply the stage to a (batch, in_channel, length) tensor."""
        hid = self.layer(inputs)
        return hid

class Output_Model(nn.Module):
    """Output head combining flattened query features with a threshold value.

    Architecture: Linear(inputs_dim -> hidden_num_2), ReLU, add the threshold
    value, Linear(hidden_num_2 -> 1).
    """

    def __init__(self, inputs_dim):
        super(Output_Model, self).__init__()
        self.fc1 = nn.Linear(inputs_dim, hidden_num_2)
        self.fc2 = nn.Linear(hidden_num_2, 1)

    def forward(self, queries, threshold):
        """Return fc2(relu(fc1(queries)) + threshold).

        `threshold` is added to the hidden activations before the final layer
        and is broadcast against them by the `+` operator.
        """
        shifted_hidden = F.relu(self.fc1(queries)) + threshold
        return self.fc2(shifted_hidden)

# class Model(nn.Module):
    
#     def __init__(self):
#         super(Model, self).__init__()
#         self.nn1 = nn.Linear(queries_dimension+1, hidden_num)
#         self.n1 = nn.Linear(hidden_num, hidden_num)
#         self.n2 = nn.Linear(hidden_num, hidden_num)
# #         self.n3 = nn.Linear(hidden_num, hidden_num)
# #         self.n4 = nn.Linear(hidden_num, hidden_num)
#         self.nn2 = nn.Linear(hidden_num, 1)
        
#     def forward(self, queries, threshold):
#         out1 = F.relu(self.nn1(torch.cat([queries, threshold],1)))
#         hid = out1
#         hid = F.relu(self.n1(hid))
#         hid = F.relu(self.n2(hid))
# #         hid = F.relu(self.n3(hid))
# #         hid = F.relu(self.n4(hid))
# #         hid = self.norm2(hid)
#         out2 = self.nn2(hid)
#         return out2

def loss_fn(estimates, targets, mini, maxi):
    """MSE between un-normalized estimates and targets.

    The estimates are first passed through `unnormalize(estimates, mini, maxi)`
    (helper not defined in the visible part of this notebook). The restored
    estimates and the targets are printed side by side before the loss is
    returned.

    NOTE(review): this redefines the `loss_fn` of the Global section with a
    different signature; whichever cell ran last wins.
    """
    restored = unnormalize(estimates, mini, maxi)
    side_by_side = torch.cat((restored, targets), 1)
    print (side_by_side)
    return F.mse_loss(restored, targets)

def l1_loss(estimates, targets, eps=1e-5):
    """Smooth-L1 loss between estimates and the natural log of the targets.

    Args:
        estimates: predicted log-values.
        targets: true values (not log-transformed).
        eps: lower bound applied to `targets` before taking the log, so that a
            zero target yields a finite loss instead of log(0) = -inf. Targets
            at or above `eps` are unaffected.
    """
    return F.smooth_l1_loss(estimates, torch.log(torch.clamp(targets, min=eps)))

def mse_loss(estimates, targets, eps=1e-5):
    """Mean squared error between estimates and the natural log of the targets.

    Args:
        estimates: predicted log-values.
        targets: true values (not log-transformed).
        eps: lower bound applied to `targets` before taking the log, so that a
            zero target yields a finite loss instead of log(0) = -inf. Targets
            at or above `eps` are unaffected.
    """
    return F.mse_loss(estimates, torch.log(torch.clamp(targets, min=eps)))

def qerror_loss(preds, targets, mini, maxi):
    """Mean squared q-error between un-normalized predictions and targets.

    For each sample the ratio larger/smaller of (prediction, target) is taken;
    the mean of the squared ratios is returned.
    """
    qerror = []
    # NOTE(review): `unnormal1ize_label` (with a digit "1") is not defined in the
    # visible part of this notebook and looks like a typo; calling this function
    # would raise NameError unless that exact name exists elsewhere. Confirm the
    # intended helper before use.
    preds = unnormal1ize_label(preds, mini, maxi)
    for i in range(len(targets)):
        # The comparison result is pulled to NumPy to get a plain boolean.
        if (preds[i] > targets[i]).cpu().data.numpy()[0]:
            qerror.append(preds[i]/targets[i])
        else:
            # Only this branch adds 0.1 to the denominator.
            qerror.append(targets[i]/(preds[i] + 0.1))
    return torch.mean(torch.cat(qerror) ** 2)

def print_loss(estimates, targets):
    """MSE and q-error statistics for log-space estimates.

    The estimates are exponentiated first. For every row the q-error is the
    larger of (estimate, target + 0.1) divided by the smaller.

    Returns:
        (mse between exp(estimates) and targets, mean q-error, max q-error).

    NOTE(review): this redefines the `print_loss` of the Global section with a
    different signature; whichever cell ran last wins.
    """
    predicted = torch.exp(estimates)
    ratios = []
    for row in range(predicted.shape[0]):
        guess = predicted[row]
        truth = targets[row] + 0.1
        ratio = guess / truth if guess > truth else truth / guess
        ratios.append(ratio.item())

    return F.mse_loss(predicted, targets), np.mean(ratios), np.max(ratios)

In [10]:
class TunableParameters():
    """Hyper-parameters of one CNN stage.

    Holds out_channel, kernel_size, stride, padding, pool_size and pool_type.
    Both repr() and str() give the six values separated by single spaces, in
    that order.
    """

    def __init__(self, out_channel, kernel_size, stride, padding, pool_size, pool_type):
        (self.out_channel, self.kernel_size, self.stride,
         self.padding, self.pool_size, self.pool_type) = (
            out_channel, kernel_size, stride, padding, pool_size, pool_type)

    def __repr__(self):
        values = (self.out_channel, self.kernel_size, self.stride,
                  self.padding, self.pool_size, self.pool_type)
        return ' '.join(str(value) for value in values)

    def __str__(self):
        values = (self.out_channel, self.kernel_size, self.stride,
                  self.padding, self.pool_size, self.pool_type)
        return ' '.join(map(str, values))

In [11]:
import pickle
# Parse the CNN hyper-parameter file: one text line per model, stages separated
# by ';', and six space-separated integers per stage, passed positionally to
# TunableParameters.
hyper_parameterss = []
with open('/home/sunji/ANN/nytimes_256_angular/saved_models/cnn_hyper_parameters.hyperpara', 'r') as handle:
    for paras in handle:
        hyper_parameters = []
        for stage_text in paras.split(';'):
            fields = stage_text.split(' ')
            stage_values = [int(fields[position]) for position in range(6)]
            hyper_parameters.append(TunableParameters(*stage_values))
        hyper_parameterss.append(hyper_parameters)
            

In [12]:
# Rebuild the local model of every cluster from its checkpoint:
# a stack of CNN stages, a threshold model and an output model.
# All sub-models are switched to eval mode after loading.
cnn_modelss = []
threshold_models = []
output_models = []
for idx in range(cluster_size):
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    states = torch.load('/home/sunji/ANN/nytimes_256_angular/saved_models/local_nytimes_256_angular_cluster_' + str(idx) + '.model')
    hyper_para = hyper_parameterss[idx]
    cnn_models = []
    # One slot per CNN stage; filled from the checkpoint entries below.
    weights = [None for _ in range(len(hyper_para))]
    for key, value in states.items():
        # Every key other than the two named state dicts is treated as a CNN
        # stage whose index is the integer after the last underscore.
        if key != 'threshold_model_state_dict' and key != 'output_model_state_dict':
#             print (key)
            layer_id = int(key.split('_')[-1])
#             print (layer_id)
            weights[layer_id] = value
    # Track the channel count and sequence length through the stack so the
    # output model can be sized to the flattened CNN output.
    in_channel = 1
    in_size = queries_dimension
    for weight_idx, weight in enumerate(weights):
        hyper = hyper_para[weight_idx]
        cnn_model = CNN_Model(in_channel, hyper.out_channel, hyper.kernel_size,
                              hyper.stride, hyper.padding, hyper.pool_type, hyper.pool_size)
        # Length after the convolution, (in + 2*padding - kernel) / stride + 1,
        # then divided by the pooling size; both divisions are truncated.
        in_size = int((int((in_size - hyper.kernel_size + 2*(hyper.padding)) / hyper.stride) + 1) / hyper.pool_size)
        in_channel = hyper.out_channel
        cnn_model.load_state_dict(weight)
        cnn_model.eval()
        cnn_models.append(cnn_model)
    cnn_modelss.append(cnn_models)
        
    threshold_model_state_dict = states['threshold_model_state_dict']
    threshold_model = Threshold_Model()
    threshold_model.load_state_dict(threshold_model_state_dict)
    threshold_model.eval()
    threshold_models.append(threshold_model)
    
    output_model_state_dict = states['output_model_state_dict']
    # Input width = flattened output of the last CNN stage.
    output_model = Output_Model(in_size * in_channel)
    output_model.load_state_dict(output_model_state_dict)
    output_model.eval()
    output_models.append(output_model)
    

In [13]:
# Sanity check: number of output models that were loaded.
len(output_models)

100

In [65]:
# Evaluate each cluster's local model separately and collect its estimates.
# NOTE(review): `only_test` and `test` are not defined anywhere in the visible
# notebook, and this cell's execution counter (65) is out of sequence, so it
# would raise NameError on a fresh Restart & Run All. Confirm whether the cell
# is stale or its definitions are missing.
estimatess = []
for idx in range(cluster_size):
    estimates = only_test(cnn_modelss[idx], threshold_models[idx], output_models[idx], test[idx])
    estimatess.append(estimates)

Testing: Mean Error 1.9588894144359676, Median Error 1.4675414562225342, 90 Percent 3.059294509887695, 95 Percent 4.0605128765106, 99 Percent 8.400643901824921, Max Percent 78.74772644042969
Testing: Mean Error 1.37362078340473, Median Error 1.292150855064392, 90 Percent 1.769090723991394, 95 Percent 1.9449544489383697, 99 Percent 2.4839744043350214, Max Percent 12.294903755187988
Testing: Mean Error 3.0506578658623402, Median Error 1.6479474306106567, 90 Percent 5.164201259613038, 95 Percent 8.503339195251467, 99 Percent 27.358099460601938, Max Percent 193.28009033203125
Testing: Mean Error 1.9044559912085743, Median Error 1.4728668928146362, 90 Percent 3.0150680065155036, 95 Percent 4.205971813201894, 99 Percent 8.188479423522905, Max Percent 43.20423889160156
Testing: Mean Error 5.249203454945848, Median Error 2.010266423225403, 90 Percent 9.792761421203618, 95 Percent 16.637843704223634, 99 Percent 47.39627223968509, Max Percent 571.5121459960938
Testing: Mean Error 2.7756644566996

Testing: Mean Error 3.134069788897215, Median Error 1.7549961805343628, 90 Percent 6.299139738082886, 95 Percent 9.211748361587524, 99 Percent 18.193819332122793, Max Percent 407.9886169433594
Testing: Mean Error 3.1528304337608524, Median Error 1.9670038223266602, 90 Percent 5.9683879852294925, 95 Percent 9.186656761169438, 99 Percent 19.502544403076175, Max Percent 135.13424682617188
Testing: Mean Error 1.348260089117693, Median Error 1.2683959603309631, 90 Percent 1.7460334300994873, 95 Percent 1.9386850893497465, 99 Percent 2.316835489273073, Max Percent 3.231677293777466
Testing: Mean Error 1.3478630466794055, Median Error 1.2706505060195923, 90 Percent 1.7370400428771973, 95 Percent 1.9112213611602784, 99 Percent 2.276931533813477, Max Percent 3.4060893058776855
Testing: Mean Error 2.6486924485247645, Median Error 1.5675183534622192, 90 Percent 3.8284571170806885, 95 Percent 5.7774276971816985, 99 Percent 15.209475097656243, Max Percent 1061.518798828125
Testing: Mean Error 1.401

Testing: Mean Error 1.2962083752642597, Median Error 1.2766467332839966, 90 Percent 1.550905132293701, 95 Percent 1.64510338306427, 99 Percent 1.8499685096740723, Max Percent 2.2510898113250732
Testing: Mean Error 2.153763660930784, Median Error 1.5469430685043335, 90 Percent 3.194102168083191, 95 Percent 4.513320565223694, 99 Percent 10.909398555755615, Max Percent 229.42977905273438
Testing: Mean Error 1.2735772903950016, Median Error 1.2350033521652222, 90 Percent 1.5543779373168944, 95 Percent 1.659059262275696, 99 Percent 1.8603610992431647, Max Percent 2.4291951656341553
Testing: Mean Error 1.287910802459417, Median Error 1.2743593454360962, 90 Percent 1.5114535808563234, 95 Percent 1.5687596797943115, 99 Percent 1.7160401916503907, Max Percent 1.9992518424987793
Testing: Mean Error 1.3359843349983034, Median Error 1.317501187324524, 90 Percent 1.6175954341888428, 95 Percent 1.7080742835998532, 99 Percent 1.8734824657440186, Max Percent 2.310460329055786
Testing: Mean Error 1.099

In [70]:
# Number of estimates collected for the entry at index 3 of `estimatess`.
len(estimatess[3])

19759

## End-to-End Testing

In [14]:
def get_local_cardinality(cnn_models, threshold_model, output_model, queries, thresholds):
    """Run one cluster's local model and return its estimates.

    Args:
        cnn_models: iterable of stages applied in order to the queries.
        threshold_model: maps `thresholds` to the value passed to `output_model`.
        output_model: called as output_model(flattened_features, threshold_value).
        queries: 2-D tensor (batch, features); a channel axis of size 1 is
            inserted before the CNN stages.
        thresholds: tensor passed to `threshold_model` unchanged.

    Returns:
        exp(output_model(...)), detached from the autograd graph.
    """
    hidden = queries.unsqueeze(2).permute(0, 2, 1)  # (batch, features) -> (batch, 1, features)
    for stage in cnn_models:
        hidden = stage(hidden)
    threshold_value = threshold_model(thresholds)
    flattened = hidden.view(hidden.shape[0], -1)
    log_estimates = output_model(flattened, threshold_value)
    return torch.exp(log_estimates).detach()

In [20]:
def print_qerror(estimates, targets):
    """Per-row q-error between estimates and targets, as a list of floats.

    Both values are shifted by +3000 before the ratio larger/smaller is taken,
    so every result is >= 1.
    """
    ratios = []
    for row in range(estimates.shape[0]):
        shifted_estimate = estimates[row] + 3000
        shifted_target = targets[row] + 3000
        if shifted_estimate > shifted_target:
            larger, smaller = shifted_estimate, shifted_target
        else:
            larger, smaller = shifted_target, shifted_estimate
        ratios.append((larger / smaller).item())
    return ratios

In [21]:
# Evaluate every local model on the whole test loader and print, per cluster,
# the mean q-error of each batch followed by the average over all batches.
# The loop runs over `cluster_size` (instead of a hard-coded 100) so it stays
# in step with the number of loaded models, and inference is done under
# torch.no_grad() because no gradients are needed.
with torch.no_grad():
    for cluster_id in range(cluster_size):
        print (cluster_id)
        total = 0.0
        for batch_idx, (features, thresholds, distances, targets, cards) in enumerate(test_loader):
            estimates = get_local_cardinality(cnn_modelss[cluster_id], threshold_models[cluster_id],
                                              output_models[cluster_id], features, thresholds)
            # Compare against the true counts of this cluster only.
            errors = print_qerror(estimates, cards[:,cluster_id])
            total += np.mean(errors)
            print (np.mean(errors))
        print (total / len(test_loader))

0
2960.6335887042806
3137.14608001709
2900.3735548509285
2291.9361393842846
2950.540810663253
2574.9271658407524
2488.1301158908755
2704.8580279219896
2819.7560826912522
2860.053068837151
2834.34151869826
2881.961838444695
2731.610900664702
2747.4556887717918
2904.6955729108304
2694.241140129976
2939.85243552085
2845.686027823016
2867.2749468889087
2598.649263237603
2969.995610645972
2861.2907220833004
3130.2000427246094
2925.3855982609093
2909.4569304436445
3015.399523003958
3137.14608001709
3037.0660574277863
3119.842819213867
3114.8453540802
2827.9557469645515
2843.5781836360693
2890.474924257025
2821.103249135427
2174.6018271502107
2679.7834470300004
2744.6623841645196
3031.108577984385
3011.724102465436
2743.3767667235807
2448.9156136484817
3021.7739279111847
2413.4663630528376
3085.141405105591
2961.37026508525
2810.194286642596
2617.463942905888
3130.2000427246094
2546.051467518322
2676.960922688246
2773.6767066102475
2517.115901223384
2645.979770707898
2367.3728767847642
2714.1

2768.8069461341947
2422.2291984176263
2745.1886338619515
2702.1275523277
3142.7868061065674
3085.141405105591
2851.226368895732
2914.3141385819763
2863.557607933879
3130.2000427246094
2952.390687397681
3126.1158996289596
3070.777484893799
3026.5431288126856
3092.444019317627
2441.762094784528
3087.079433030449
2525.3766228035092
2771.2293763728812
2727.6795751452446
2696.7063195733353
3077.919527053833
3021.2984007922933
2547.1157326502725
2683.543308775872
2965.322146988474
1931.0120081137866
3145.899990081787
2832.726999565959
3032.642909917049
2467.725540536456
2511.454456231557
2836.459014969878
2588.8122097766027
2677.0887215724215
2517.2500940244645
3016.5823663985357
3070.777484893799
2810.9320681709796
3092.444019317627
2640.4180482123047
2468.259986680001
2507.896800359711
2542.0231038769707
2973.311954411678
2235.33002155181
2570.882715212181
2912.516441945918
2989.6844004718587


KeyboardInterrupt: 

In [21]:
# End-to-end evaluation: the global model decides which clusters to include
# (probability >= 0.5), the local models estimate the cardinality inside each
# cluster, and the sum over included clusters is compared with the true total.
model = Model()
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load('/home/sunji/ANN/nytimes_256_angular/saved_models/global_nytimes_256_angular_punish_query_threshold_monotonic.model'))
model.eval()
# NOTE(review): the next five variables are initialized but never used in this cell.
test_loss = 0.0
precision = 0.0
recall = 0.0
miss_rate = 0.0
estimatesss = []
q_errors = []
for batch_idx, (features, thresholds, distances, targets, cards) in enumerate(test_loader):
    # Progress indicator every 100 batches.
    if batch_idx % 100 == 0:
        print (batch_idx)
    estimates = model(features, distances, thresholds)
    # 1.0 for clusters the global model includes, 0.0 otherwise.
    global_indicator = (estimates >= 0.5).float()
    local_estimates = []
    for cluster_id in range(cluster_size):
        local_estimates.append(get_local_cardinality(cnn_modelss[cluster_id], threshold_models[cluster_id],
                                                     output_models[cluster_id], features, thresholds))
    # Concatenate the per-cluster estimates along dim 1, one column per cluster.
    localss = torch.cat(local_estimates, dim = 1)
#     print (localss.shape, global_indicator.shape)
    # Sum the local estimates of the included clusters only.
    cards_estimates = (localss * global_indicator).sum(dim=1).unsqueeze(1)
    # Rebinds the loop variable: from here on `cards` is the true total per sample.
    cards = cards.sum(dim=1).unsqueeze(1)
#     print (cards_estimates.shape)
#     print (cards.shape)
#     print (torch.cat((cards_estimates, cards), dim=1))
    q_errors += print_qerror(cards_estimates, cards)
#     print (q_errors)
# Summary statistics over the q-errors of all samples.
mean = np.mean(q_errors)
percent90 = np.percentile(q_errors, 90)
percent95 = np.percentile(q_errors, 95)
percent99 = np.percentile(q_errors, 99)
median = np.median(q_errors)
maxi = np.max(q_errors)
print ('Testing: Mean Error {}, Median Error {}, 90 Percent {}, 95 Percent {}, 99 Percent {}, Max Percent {}'
       .format(mean, median, percent90, percent95, percent99, maxi))
    
    
        
    

0
100
200
300
400
500
600
700
Testing: Mean Error 2.725188450744152, Median Error 1.0, 90 Percent 1.0, 95 Percent 1.02164626121521, 99 Percent 97.58699798583984, Max Percent 98.8680648803711
