In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import pandas as pd
import numpy as np
import os
from collections import Counter, defaultdict
import pickle

In [2]:
import sys
sys.path.insert(0, "/data3/muntean/DRhard")

# Load Query - Doc distances

In **nearest_neighbors_ra[0]** we have the **top-10000** docs in the **cache**, determined by **qa**.
They are ordered according to their distance from qa.

This also gives us the order in which the distances are kept in the distance dict.

In [3]:
%store -r all_dist_dict  
%store -r nearest_neighbors_ra 

In [4]:
# Number of query ids that have a stored distance list, followed by the ids themselves.
print(len(all_dist_dict))
all_dist_dict.keys()

194


dict_keys(['31_1', '31_2', '31_3', '31_4', '31_5', '31_6', '31_7', '31_8', '31_9', '32_1', '32_2', '32_3', '32_4', '32_5', '32_6', '32_7', '32_8', '32_9', '32_10', '32_11', '33_1', '33_2', '33_3', '33_4', '33_5', '33_6', '33_7', '33_8', '33_9', '33_10', '34_1', '34_2', '34_3', '34_4', '34_5', '34_6', '34_7', '34_8', '34_9', '37_1', '37_2', '37_3', '37_4', '37_5', '37_6', '37_7', '37_8', '37_9', '37_10', '37_11', '37_12', '40_1', '40_2', '40_3', '40_4', '40_5', '40_6', '40_7', '40_8', '40_9', '40_10', '49_1', '49_2', '49_3', '49_4', '49_5', '49_6', '49_7', '49_8', '49_9', '49_10', '50_1', '50_2', '50_3', '50_4', '50_5', '50_6', '50_7', '50_8', '50_9', '50_10', '54_1', '54_2', '54_3', '54_4', '54_5', '54_6', '54_7', '54_8', '54_9', '56_1', '56_2', '56_3', '56_4', '56_5', '56_6', '56_7', '56_8', '58_1', '58_2', '58_3', '58_4', '58_5', '58_6', '58_7', '58_8', '59_1', '59_2', '59_3', '59_4', '59_5', '59_6', '59_7', '59_8', '61_1', '61_2', '61_3', '61_4', '61_5', '61_6', '61_7', '61_8', '61_

In [5]:
len(nearest_neighbors_ra[0]) # number of doc ids in the first entry of nearest_neighbors_ra

10000

In [6]:
# NOTE(review): the 20 values displayed below are not monotonically sorted — confirm what "ordered" refers to.
all_dist_dict["31_1"][:20] # ordered because they were retrieved from the index

[8.185292,
 8.152467,
 8.02967,
 8.259123,
 8.07297,
 8.266944,
 8.225096,
 8.43523,
 7.924303,
 8.371743,
 7.9806976,
 7.9725966,
 8.388752,
 8.236596,
 8.148874,
 7.96484,
 7.935154,
 8.139231,
 8.703874,
 8.407185]

In [7]:
all_dist_dict["31_2"][:20] # shuffled: these distances are computed outside the index, following the element order of nearest_neighbors_ra

[4.802757,
 4.8222275,
 4.3992887,
 4.379764,
 4.53187,
 4.515611,
 4.413016,
 4.3586264,
 4.735081,
 4.3028064,
 4.7684774,
 4.845764,
 4.4350486,
 3.7372952,
 4.551476,
 4.796568,
 4.837861,
 4.299589,
 4.2723017,
 4.3629227]

# Load Query - Query distances

In [None]:
%store -r query_distance_dict
# query_distance_dict

# Compute distances

In [None]:
# Conversation ids as integers, and the same ids as strings; the string form is
# used below to build the "<conv>_" prefix that selects a conversation's query ids.
conv_qrel_int = [
    31, 32, 33, 34, 37, 40, 49, 50, 54, 56,
    58, 59, 61, 67, 68, 69, 75, 77, 78, 79,
]
conv_qrel = list(map(str, conv_qrel_int))

In [None]:
# Cut-off depths k to evaluate. The cells below use each k as an index (k-1) into
# the "_1" query's distance list and as the slice size ([:k]) of a query's distance list.
top_k = [1000,2000,5000,10000]

## Check triangle inequality - OK!

In [None]:
# Sanity check: for every non-"_1" query qb of a conversation and every position
# in the distance lists, the qb distance must not exceed the corresponding "_1"
# (qa) distance plus query_distance_dict[qb] (presumably the qa-qb distance).
for convid in conv_qrel:
    prefix = convid + "_"
    conv_qids = [q for q in all_dist_dict if q.startswith(prefix) and not q.endswith("_1")]
    distances_qa = all_dist_dict[prefix + "1"]
    for qid in conv_qids:
        distances_qb = all_dist_dict[qid]
        # Both lists must describe the same documents, position by position.
        assert len(distances_qa) == len(distances_qb)
        for dist_a, dist_b in zip(distances_qa, distances_qb):
            assert dist_b <= dist_a + query_distance_dict[qid]

## Rb_hat

In [None]:
#check rb_hat
def compute_rb_hat(conv_qrel, all_dist_dict, query_distance_dict, top_k):
    """Compute rb_hat for every non-"_1" query of the given conversations.

    For a query id ``qid`` belonging to conversation ``c``:

        rb_hat[qid] = all_dist_dict[c + "_1"][top_k - 1] - query_distance_dict[qid]

    Args:
        conv_qrel: iterable of conversation ids as strings (e.g. ``"31"``).
        all_dist_dict: maps query ids of the form ``"<conv>_<n>"`` to an
            indexable sequence of distances.
        query_distance_dict: maps each non-"_1" query id to a number that is
            subtracted from the selected distance.
        top_k: a single integer cut-off (1-based); the distance at position
            ``top_k - 1`` of the conversation's "_1" list is used.

    Returns:
        dict mapping each selected query id to its rb_hat value.

    Raises:
        KeyError: if a required id is missing from either dictionary.
        IndexError: if the "_1" distance list has fewer than ``top_k`` items.
    """
    rb_hat_by_qid = {}
    for conv_id in conv_qrel:
        prefix = conv_id + "_"
        first_query_id = prefix + "1"
        for query_id in all_dist_dict:
            # Only queries of this conversation, excluding ids ending in "_1".
            if not query_id.startswith(prefix) or query_id.endswith("_1"):
                continue
            kth_distance = all_dist_dict[first_query_id][top_k - 1]
            rb_hat_by_qid[query_id] = kth_distance - query_distance_dict[query_id]
    return rb_hat_by_qid

In [None]:
# Compute rb_hat for every cut-off in top_k and print, per cut-off, how many
# queries end up with a non-positive rb_hat.
rb_hat_dict = {}
for k in top_k:
    rb_hat = compute_rb_hat(conv_qrel, all_dist_dict, query_distance_dict, k)
    rb_hat_dict[k] = rb_hat
    non_positive_count = sum(1 for value in rb_hat.values() if value <= 0)
    print(f"top-{k}", non_positive_count)

In [None]:
rb_hat_dict.keys()

In [None]:
rb_hat_dict[2000]

## COV1

In [None]:
# COV1: for every non-"_1" query, collect the (position, distance) pairs of its
# distance list that lie strictly below the query's rb_hat at cut-off 10000.
# The per-query result used to be overwritten on each iteration and never kept;
# it is now stored in `cov1_safe_docs` so the cell has an inspectable outcome.
rb_hat_10000 = rb_hat_dict[10000]
cov1_safe_docs = {}  # qid -> [(position in the distance list, distance), ...]
for convid in conv_qrel:
    conv_qids = [qid for qid in all_dist_dict.keys() if qid.startswith(convid+"_") and not qid.endswith("_1")]
    for qid in conv_qids:
        distances = all_dist_dict[qid]
        threshold = rb_hat_10000[qid]
        safe_dist_docs = [(i, dist) for (i, dist) in enumerate(distances) if dist < threshold]
        cov1_safe_docs[qid] = safe_dist_docs

## COV2

Compute COVERAGE          
- Cov2 - intersection between the results of query qb on the cache and of query qb on the index, for k=3,5,10
- Cov2 - how many docs in cache have distance < rb_3, rb_5, rb_10


In [None]:
# Radius between current utterance (q) and last retrieved doc from the big index for top-k = 3,5,10
%store -r query_radius_dict
query_radius_dict

In [None]:
# For every cut-off k and every query in query_radius_dict, count how many of the
# query's first k distances are within rb_3, rb_5 and rb_10, and store
# (count_rb_3, count_rb_5, count_rb_10, rb_hat) per query.
# Fix: the stored tuple used to be (a, c, c, rb_hat), which dropped the rb_5
# count and duplicated the rb_10 count.
gcov_dict = {}

top_k = [1000 ,2000,5000,10000]
for k in top_k:
    print("top-k", k)
    print()
    gcov = {}
    for qid, (rb_3, rb_5, rb_10) in query_radius_dict.items():
        # print(qid, (rb_3, rb_5, rb_10))
        distances = all_dist_dict[qid][:k] # this is where the cache top-k is changed
        within_rb_3 = [x for x in distances if x <= rb_3]
        a = len(within_rb_3)
        b = len([x for x in distances if x <= rb_5])
        c = len([x for x in distances if x <= rb_10])
        print(within_rb_3)
        print(qid, (a,b,c), rb_hat_dict[k][qid])
        gcov[qid] = (a,b,c,rb_hat_dict[k][qid])
    gcov_dict[k] = gcov
    print()
    print()
# NOTE: `gcov` and `k` stay bound to the last iteration's values after this loop;
# later cells read them before reassigning them.

In [None]:
# top_k = [1000,2000,5000,10000]
# for k in top_k:
#     with open('../data/star-ranking/approximated-coverage-star-L2-ranking-top1000-cache-top'+str(k)+'_first_utt.tsv', 'w+') as fout:
#         for qid, (rb_3, rb_5, rb_10) in query_radius_dict.items():
#             distances = all_dist_dict[qid][:k] # qua si cambia il topk della cache
#             a=b=c=0
#             a = len([x for x in distances if x <= rb_3])
#             b = len([x for x in distances if x <= rb_5])
#             c = len([x for x in distances if x <= rb_10])   
#             fout.write(str(qid)+"\t"+str((a,b,c))+"\t"+str(rb_hat_dict[k][qid])+"\n")

# Plots

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in Matplotlib 3.6 and
# the old alias was later removed; use whichever name this installation provides.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither name is registered: make the original call so the failure stays explicit.
    plt.style.use('seaborn-whitegrid')
import numpy as np

In [None]:
%store -r NDCG3

In [None]:
# Keep the NDCG3 values whose qid has a coverage entry, and derive marker sizes.
# NOTE(review): `gcov` is whatever the coverage loop left bound (it is only
# reassigned in the next cell), and the result follows NDCG3's row order rather
# than gcov's key order — confirm both are intended before zipping with gcov.
filtered_NDCG = [
    ndcg
    for (qid, ndcg) in zip(NDCG3["qid"], NDCG3["value"])
    if qid in gcov
]
sizes = [300 * ndcg for ndcg in filtered_NDCG]

In [None]:
# Scatter plot of rb_hat (x) against the third stored count divided by 10 (y),
# using the coverage entries computed with cut-off 5000.
gcov = gcov_dict[5000]

rng = np.random.RandomState(0)

coverage_rows = list(gcov.values())
x = [row[3] for row in coverage_rows]       # rb_hat
y = [row[2] / 10 for row in coverage_rows]  # top-10 count from gcov, as a fraction
print(x,y, filtered_NDCG)
for qid_label, rb_hat_value, cov10_fraction, ndcg_value in zip(gcov.keys(), x, y, filtered_NDCG):
    print(qid_label, rb_hat_value, cov10_fraction, ndcg_value)
assert len(x)==len(y)

colors = rng.rand(len(x))
# sizes = 1000 * rng.rand(len(x))

# Alternative kept for reference: colour and size the points by NDCG.
# plt.scatter(x, y, c=filtered_NDCG, s=sizes, alpha=0.5, cmap='viridis')
# NOTE(review): with the fixed colour c='r' there is no scalar data for `cmap`
# or for the colorbar to represent — confirm whether colouring by filtered_NDCG
# was intended.
plt.scatter(x, y, c='r', alpha=0.5, cmap='viridis')
plt.colorbar();  # show color scale
plt.savefig("ndcg3rbhattop10.pdf")

In [None]:
# Parse the "with_update" coverage TSV for cut-off `k`: tab-separated column 1
# holds a bracketed, comma-separated list whose third item is collected as an int;
# column 3 is parsed as a float after stripping a literal " UPDATE \n" if present.
# NOTE(review): `k` is not set in this cell — it is whatever value an earlier
# loop left behind; confirm which cut-off is meant.
intop10_list = []
rb_hat_list = []
coverage_path = '../data/star-ranking/approximated-coverage-star-L2-ranking-top1000-cache-top'+str(k)+'_with_update.tsv'
with open(coverage_path, 'r') as fout:
    for line in fout:
        columns = line.split("\t")
        counts = columns[1].replace("[","").replace("]", "").split(",")
        intop10_text = counts[2]
        rb_hat_text = columns[3].replace(" UPDATE \n","")
        intop10_list.append(int(intop10_text))
        rb_hat_list.append(float(rb_hat_text))

In [None]:
print(intop10_list)
print(rb_hat_list)

# Average acov

In [None]:
# For each cut-off, read the "with_update" coverage TSV, take the first three
# integers of the bracketed list in column 1 of every line, and print the
# column-wise mean.
top_k = [1000 ,2000,5000,10000]
for k in top_k:
    print("top-k", k)
    acovs_list = []
    tsv_path = '../data/star-ranking/approximated-coverage-star-L2-ranking-top1000-cache-top'+str(k)+'_with_update.tsv'
    with open(tsv_path, 'r') as fout:
        for line in fout:
            fields = line.split("\t")[1].replace("[","").replace("]", "").split(",")
            acovs_list.append(tuple(int(fields[pos]) for pos in range(3)))
    acov_cache_with_update_df = pd.DataFrame(acovs_list)
    print(acov_cache_with_update_df.mean(axis=0))
    print()

In [None]:
# For each cut-off, read the "first_utt" coverage TSV, take the first three
# integers of the parenthesised tuple in column 1 of every line, and print the
# column-wise mean.
# NOTE(review): the frame is stored under the name `acov_cache_with_update_df`
# although it holds the first_utt data — confirm whether a separate name is wanted.
top_k = [1000 ,2000,5000,10000]
for k in top_k:
    print("top-k", k)
    acovs_list = []
    tsv_path = '../data/star-ranking/approximated-coverage-star-L2-ranking-top1000-cache-top'+str(k)+'_first_utt.tsv'
    with open(tsv_path, 'r') as fout:
        for line in fout:
            fields = line.split("\t")[1].replace("(","").replace(")", "").split(",")
            acovs_list.append(tuple(int(fields[pos]) for pos in range(3)))
    acov_cache_with_update_df = pd.DataFrame(acovs_list)
    print(acov_cache_with_update_df.mean(axis=0))
    print()