In [118]:
import json

In [119]:
model_name = ["bge-large-en-v1.5", "contriever", "gte-base", "multilingual-e5-large"][3]
dataset_name = "nq-train"
mode = "train"
with open(f"{model_name}/{dataset_name}.jsonl") as f:
    results = [json.loads(i) for i in f.readlines()]

In [120]:
model_name, dataset_name, mode

('multilingual-e5-large', 'nq-train', 'train')

In [121]:
# Get retrieval results as corpus id
retrieval_results = {}
for res in results:
    qid = res["_id"]
    for retrieval in res["retrieval"]:
        doc_id = retrieval["_id"]
        if doc_id in retrieval_results:
            retrieval_results[doc_id].append(qid)
        else:
            retrieval_results[doc_id] = [qid]

In [122]:
retrieval_results

{'doc80465': ['train616'],
 'doc11610454': ['train616'],
 'doc6748406': ['train616'],
 'doc16611385': ['train616'],
 'doc14727343': ['train616'],
 'doc13822259': ['train616'],
 'doc15858380': ['train616'],
 'doc990463': ['train616'],
 'doc14753466': ['train616'],
 'doc16543256': ['train616'],
 'doc4813421': ['train616'],
 'doc2684113': ['train616'],
 'doc16897068': ['train616'],
 'doc2943842': ['train616'],
 'doc8372130': ['train616'],
 'doc4955609': ['train616'],
 'doc5491583': ['train616'],
 'doc14642436': ['train616'],
 'doc1031085': ['train616'],
 'doc16200444': ['train616'],
 'doc8505855': ['train616'],
 'doc4215552': ['train616'],
 'doc13518108': ['train616'],
 'doc8952011': ['train616'],
 'doc17674362': ['train616'],
 'doc1907362': ['train616'],
 'doc4917486': ['train616'],
 'doc10475585': ['train616'],
 'doc8189666': ['train616'],
 'doc12851897': ['train616'],
 'doc4968500': ['train616'],
 'doc6124958': ['train616'],
 'doc32238': ['train616'],
 'doc15190521': ['train616'],
 'do

## Load Qrels

In [123]:
import pandas as pd
qrels = pd.read_csv(f"/gallery_louvre/dayoon.ko/research/sds/eval_retrieval/retrieval/results/{model_name}/{dataset_name}.csv")

In [124]:
qrels

Unnamed: 0.1,Unnamed: 0,query-id,corpus-id,score
0,21564,train21564,doc2962294,1
1,24011,train24011,doc3299462,1
2,30983,train30983,doc4244915,1
3,63770,train63770,doc8679304,1
4,119508,train119508,doc16274486,1
...,...,...,...,...
95,58459,train58459,doc7967953,1
96,44526,train44526,doc6060946,1
97,45128,train45128,doc6143865,1
98,126307,train126307,doc17192634,1


In [125]:
qrels = qrels[qrels["score"] >= 1].sort_values("corpus-id")
qrels

Unnamed: 0.1,Unnamed: 0,query-id,corpus-id,score
32,73982,train73982,doc10076560,1
57,7547,train7547,doc1027980,1
22,76059,train76059,doc10348980,1
92,76858,train76858,doc10453264,1
77,8130,train8130,doc1107119,1
...,...,...,...,...
79,67354,train67354,doc9171934,1
54,69713,train69713,doc9495049,1
61,69858,train69858,doc9512970,1
46,70501,train70501,doc9598508,1


In [126]:
len(qrels["corpus-id"].unique()) - len(qrels["corpus-id"])

0

# Generate Dataframe of Retrieval Results

In [127]:
# Time consuming
gt = {}
for k in qrels["corpus-id"].unique():
    qrels_ = qrels[qrels["corpus-id"] == k]
    v = qrels_["query-id"].tolist()
    gt[k] = v

In [128]:
gt

{'doc10076560': ['train73982'],
 'doc1027980': ['train7547'],
 'doc10348980': ['train76059'],
 'doc10453264': ['train76858'],
 'doc1107119': ['train8130'],
 'doc11664171': ['train85858'],
 'doc11666586': ['train85878'],
 'doc118376': ['train886'],
 'doc12062765': ['train88785'],
 'doc12138285': ['train89384'],
 'doc12193870': ['train89815'],
 'doc12737007': ['train93765'],
 'doc1275918': ['train9255'],
 'doc13123301': ['train96514'],
 'doc13244232': ['train97405'],
 'doc13771051': ['train101148'],
 'doc1383778': ['train9980'],
 'doc13850815': ['train101719'],
 'doc13894574': ['train102042'],
 'doc14100425': ['train103476'],
 'doc1424969': ['train10288'],
 'doc14379254': ['train105501'],
 'doc14573023': ['train106939'],
 'doc14574972': ['train106951'],
 'doc14661155': ['train107547'],
 'doc15002534': ['train110131'],
 'doc15085036': ['train110718'],
 'doc15443727': ['train113465'],
 'doc15490420': ['train113810'],
 'doc15967696': ['train117311'],
 'doc16033835': ['train117799'],
 'doc16

In [129]:
print("Total", len(qrels["corpus-id"].unique()), "documents!")

Total 100 documents!


In [130]:
corpus_ids = []
num_gts = []
recall = [] # retrieval 돼야하는데 된 경우 / retrieval 돼야하는 경우 (gt)
precision = [] # retrieval 돼야하는데 된 경우 / retrieval 이 된 경우 (ret)
for k, gt_v in gt.items():
    gt_v = [str(i) for i in gt_v]
    v = retrieval_results[k] if k in retrieval_results else []
    # gt_v: document 가 retrieval 되어야하는 모든 쿼리 / v: document 가 실제 retrieval 된 쿼리
    tp_t = len(set(gt_v).intersection(set(v))) / len(gt_v) # True Positive / # True
    tp_p = len(set(gt_v).intersection(set(v))) / len(v) if len(v) > 0 else 0 # True Positive / # Positive
    corpus_ids.append(k)
    num_gts.append(len(gt_v))
    recall.append(tp_t)
    precision.append(tp_p)

In [131]:
print(len(retrieval_results))

8840


In [132]:
data = {
        "corpus-id": corpus_ids, 
        "n-query": num_gts,
        "recall": recall,
        "precision": precision
       }
df = pd.DataFrame(data)
df.sort_values("recall")

Unnamed: 0,corpus-id,n-query,recall,precision
90,doc7930672,1,0.0,0.0
84,doc679465,1,0.0,0.0
70,doc4116273,1,0.0,0.0
39,doc17130536,1,0.0,0.0
58,doc2999713,1,0.0,0.0
...,...,...,...,...
28,doc15490420,1,1.0,1.0
27,doc15443727,1,1.0,1.0
26,doc15085036,1,1.0,1.0
24,doc14661155,1,1.0,1.0


In [133]:
df["f1"] = [0 if i < 0.0001 and j < 0.0001 else 2 * i * j / (i + j) for i, j in zip(df.recall, df.precision)]
df = df.sort_values("f1", ascending=False)
df

Unnamed: 0,corpus-id,n-query,recall,precision,f1
0,doc10076560,1,1.0,1.0,1.0
61,doc3299462,1,1.0,1.0,1.0
72,doc4253841,1,1.0,1.0,1.0
69,doc4006010,1,1.0,1.0,1.0
68,doc3737442,1,1.0,1.0,1.0
...,...,...,...,...,...
54,doc2785448,1,0.0,0.0,0.0
55,doc2863252,1,0.0,0.0,0.0
84,doc679465,1,0.0,0.0,0.0
58,doc2999713,1,0.0,0.0,0.0


In [104]:
df[df["recall"] > 0.2].sort_values("recall", ascending=False)

Unnamed: 0,corpus-id,n-query,recall,precision,f1
0,doc10076560,1,1.0,1.0,1.000000
13,doc13123301,1,1.0,1.0,1.000000
4,doc1107119,1,1.0,1.0,1.000000
5,doc11664171,1,1.0,1.0,1.000000
6,doc11666586,1,1.0,1.0,1.000000
...,...,...,...,...,...
94,doc8679304,1,1.0,1.0,1.000000
95,doc9171934,1,1.0,1.0,1.000000
96,doc9495049,1,1.0,1.0,1.000000
97,doc9512970,1,1.0,1.0,1.000000


In [105]:
#cid = "MED-2662"
#qrels[qrels["corpus-id"] == cid]

In [106]:
#print("Corpus")
#print(corpus_dict[cid])
#print("\n")
#print("Queries")
#for i in qrels[qrels["corpus-id"] == cid]["query-id"].tolist():
#    print(queries_dict[str(i)])


In [107]:
#corpus_dict["3h1o0oz3"]

# Generate dataset

In [108]:
for i in range(1, len(df["n-query"].unique())+1):
    print(f"n_query=={i} : total {len(df[df['n-query'] == i])} datapoints")

n_query==1 : total 100 datapoints


In [109]:
for i in range(11):
    i = round(i / 10, 2)
    j = round(i + 0.1, 2)
    print(f"{i}<=recall<{j} : total {len(df[(df['recall'] >= i) & (df['recall'] < j)])} datapoints")

0.0<=recall<0.1 : total 11 datapoints
0.1<=recall<0.2 : total 0 datapoints
0.2<=recall<0.3 : total 0 datapoints
0.3<=recall<0.4 : total 0 datapoints
0.4<=recall<0.5 : total 0 datapoints
0.5<=recall<0.6 : total 0 datapoints
0.6<=recall<0.7 : total 0 datapoints
0.7<=recall<0.8 : total 0 datapoints
0.8<=recall<0.9 : total 0 datapoints
0.9<=recall<1.0 : total 0 datapoints
1.0<=recall<1.1 : total 89 datapoints


In [110]:
df.to_csv(f"{model_name}/{dataset_name}-n-query-mt-2.csv")

In [111]:
f"{model_name}/{dataset_name}-n-query-mt-2.csv"

'multilingual-e5-large/nq-train-n-query-mt-2.csv'

In [112]:
# hotpotqa: 5233329
# trec-covid: 173???
# np: 2681468

In [113]:
465 rows × 3 columns / 1983

SyntaxError: invalid syntax (2594108381.py, line 1)