In [1]:
import json

In [2]:
# Retrieval output (top-100 hits per query): parse every line, keep the first 50 records.
with open("trec-covid.jsonl") as f:
    results = list(map(json.loads, f))[:50]

In [3]:
# Read the full corpus (one JSON document per line) and index each text by its `_id`.
with open("/gallery_louvre/dayoon.ko/research/sds/src/datasets/trec-covid/corpus.jsonl") as f:
    corpus = list(map(json.loads, f))
    # If an `_id` occurs more than once, the last occurrence wins.
    corpus_dict = {entry["_id"]: entry["text"] for entry in corpus}

In [4]:
# Map each document's 1-based position in `corpus` to its corpus `_id`.
seq_num_to_id = {
    position: entry["_id"] for position, entry in enumerate(corpus, start=1)
}

In [5]:
# Attach the corpus `_id` to every retrieved hit, resolved from the hit's `seq_num`.
# Note: this mutates the records inside `results` in place.
for query_result in results:
    for hit in query_result["retrieval"]:
        hit["_id"] = seq_num_to_id[hit["seq_num"]]

In [6]:
# Invert the per-query results: corpus id -> ids of the queries that retrieved it.
retrieval_results = {}
for query_result in results:
    query_id = query_result["_id"]
    for hit in query_result["retrieval"]:
        # setdefault creates the list the first time a document id is seen.
        retrieval_results.setdefault(hit["_id"], []).append(query_id)

In [7]:
retrieval_results # for each document: the queries whose top-100 retrieval contains it

{'n15i01tn': ['1', '9', '43', '48'],
 'hmvo5b0q': ['1'],
 'dv9m19yk': ['1'],
 'm4y8tf6u': ['1'],
 'utsr0zv7': ['1', '33'],
 'k9lcpjyo': ['1'],
 'jbtrdvhe': ['1'],
 'r2ynbnxx': ['1'],
 'jxbk30gh': ['1'],
 'rtw8m00s': ['1', '4'],
 'juz9jnfk': ['1', '26'],
 'lj2iu7z0': ['1'],
 't1iagum7': ['1'],
 '4dtk1kyh': ['1'],
 'jwxt4ygt': ['1'],
 'wco27nop': ['1'],
 'dnxhtbxn': ['1'],
 '8qfwat2v': ['1'],
 'deee71uw': ['1'],
 'gjigvjgv': ['1', '4', '8', '9', '26'],
 'xndojxqk': ['1'],
 'ccxj4s6j': ['1'],
 'rugijb30': ['1'],
 'p3681yit': ['1'],
 '2lxs9laj': ['1'],
 '1ybj2p1n': ['1'],
 '80c471y3': ['1', '35'],
 'w3avnbpu': ['1'],
 'fsy00ngb': ['1', '15', '18', '48'],
 '75773gwg': ['1'],
 'ptizke03': ['1'],
 'bbae1qyx': ['1'],
 'k9yus2sv': ['1'],
 'qp4efhwq': ['1'],
 '7v5aln90': ['1'],
 '6ck2ntid': ['1'],
 '8a6flxl6': ['1'],
 'f6p9i951': ['1'],
 '8vi60e0a': ['1'],
 'can1e8ro': ['1'],
 '82sc928x': ['1'],
 'dtv7to3l': ['1'],
 'hqzkzupi': ['1', '4', '39'],
 'sqrn6kjy': ['1'],
 '9griuata': ['1'],
 'pfv7q4v6

## Load Qrels

In [8]:
import pandas as pd

# Relevance judgments (qrels) read from a tab-separated file.
qrels = pd.read_csv(
    "/gallery_louvre/dayoon.ko/research/sds/src/datasets/trec-covid/qrels/test.tsv",
    sep="\t",
)

In [9]:
# Preview the first 10 relevance judgments (columns: query-id, corpus-id, score).
qrels.head(10)

Unnamed: 0,query-id,corpus-id,score
0,1,005b2j4b,2
1,1,00fmeepz,1
2,1,g7dhmyyo,2
3,1,0194oljo,1
4,1,021q9884,1
5,1,02f0opkr,1
6,1,047xpt2c,0
7,1,04ftw7k9,0
8,1,pl9ht0d0,0
9,1,05vx82oo,0


In [10]:
# Keep only the judgments whose score equals 2, ordered by corpus id.
# Alternative (any positive score): qrels[qrels["score"] > 0].sort_values("corpus-id")
has_score_two = qrels["score"] == 2
qrels = qrels.loc[has_score_two].sort_values("corpus-id")
qrels.head(10)

Unnamed: 0,query-id,corpus-id,score
0,1,005b2j4b,2
61089,45,006k39tj,2
43576,31,00fmeepz,2
53369,38,00qk10im,2
18656,13,00rq0ggi,2
51077,36,00z7x46i,2
17083,12,011k6mm0,2
37288,26,011k6mm0,2
63298,47,012f0g4y,2
59950,44,019lj813,2


In [11]:
# gt: corpus id -> list of query ids judged for that document in the filtered `qrels`.
# A single groupby pass replaces re-filtering the whole frame once per unique
# corpus id, which scaled quadratically with the number of judgments.
# sort=False keeps the keys in first-appearance order, as `.unique()` did.
gt = {
    corpus_id: query_ids.tolist()
    for corpus_id, query_ids in qrels.groupby("corpus-id", sort=False)["query-id"]
}

In [12]:
# retrieval_results: for each document, the queries whose top-100 retrieval contains it
# gt: for each document, the queries for which it is an actual evidence text

In [13]:
# Per-document comparison of "queries that retrieved the document" against
# "queries the document is gold evidence for".
corpus_ids = []
num_gts = []
tp_over_t = []  # recall: TP / #gold queries (low when the document was missed by queries that should retrieve it)
tp_over_p = []  # precision: TP / #retrieving queries (low when queries retrieved the document that should not have)
for k, v in retrieval_results.items():
    # k: corpus id; v: ids of the queries that actually retrieved this document.
    if k not in gt:
        # No gold judgment for this document, so there is nothing to score against.
        # This explicit check replaces a bare `except: continue`, which also hid
        # unrelated errors (typos, KeyboardInterrupt, ...).
        continue
    # gt_v: every query that should retrieve this document. Cast to str because
    # the qrels query ids are numeric while the retrieval query ids are strings.
    gt_v = [str(i) for i in gt[k]]
    if not gt_v or not v:
        # Either ratio would divide by zero; the old bare except skipped these too.
        continue
    true_positives = len(set(gt_v).intersection(v))
    tp_t = true_positives / len(gt_v)  # True Positive / # True
    tp_p = true_positives / len(v)  # True Positive / # Positive
    corpus_ids.append(k)
    num_gts.append(len(gt_v))
    tp_over_t.append(tp_t)
    tp_over_p.append(tp_p)

In [14]:
# Assemble the per-document metrics into a table, one row per scored document.
metric_columns = (
    ("corpus-id", corpus_ids),
    ("num-positive-query-gt", num_gts),
    ("tp_over_t", tp_over_t),
    ("tp_over_p", tp_over_p),
)
data = dict(metric_columns)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,corpus-id,num-positive-query-gt,tp_over_t,tp_over_p
0,n15i01tn,1,0.0,0.0
1,k9lcpjyo,2,0.5,1.0
2,jbtrdvhe,2,0.0,0.0
3,juz9jnfk,2,1.0,1.0
4,t1iagum7,1,1.0,1.0


In [15]:
# "Success" documents: more than two gold queries, and over half of them
# retrieved the document (tp_over_t > 0.5).
mostly_retrieved = df["tp_over_t"] > 0.5
many_gold_queries = df["num-positive-query-gt"] > 2
df_suc = df[mostly_retrieved & many_gold_queries].sort_values("tp_over_t")
print(len(df_suc))
df_suc.head(10)

15


Unnamed: 0,corpus-id,num-positive-query-gt,tp_over_t,tp_over_p
47,rbzl0txp,3,0.666667,0.666667
124,dptgg05n,3,0.666667,1.0
657,k9xhphpl,3,0.666667,1.0
794,gey0nidn,3,0.666667,1.0
801,5wsj003j,3,0.666667,1.0
872,n4dgqo73,3,0.666667,1.0
1505,1sbnewog,3,0.666667,1.0
1509,iljmhdd1,3,0.666667,1.0
1519,02cfyuf4,3,0.666667,1.0
1527,qopcs6jy,3,0.666667,0.666667


In [16]:
# Show the text of one document, looked up by its corpus id.
corpus_dict["i1lyno9g"]

'SARS-CoV-2 has caused tens of thousands of infections and more than one thousand deaths. There are currently no registered therapies for treating coronavirus infections. Because of time consuming process of new drug development, drug repositioning may be the only solution to the epidemic of sudden infectious diseases. We systematically analyzed all the proteins encoded by SARS-CoV-2 genes, compared them with proteins from other coronaviruses, predicted their structures, and built 19 structures that could be done by homology modeling. By performing target-based virtual ligand screening, a total of 21 targets (including two human targets) were screened against compound libraries including ZINC drug database and our own database of natural products. Structure and screening results of important targets such as 3-chymotrypsin-like protease (3CLpro), Spike, RNA-dependent RNA polymerase (RdRp), and papain like protease (PLpro) were discussed in detail. In addition, a database of 78 commonly 

In [17]:
# "Failure" documents: more than two gold queries, and (almost) none of them
# retrieved the document (tp_over_t < 0.01).
enough_gold_queries = df["num-positive-query-gt"] > 2
barely_retrieved = df["tp_over_t"] < 0.01
df_fail = df[enough_gold_queries & barely_retrieved].sort_values("tp_over_t")
print(len(df_fail))

21


In [18]:
import random

# Draw up to SAMPLE_SIZE example documents from each group for manual inspection.
# A dedicated, seeded generator makes the sample identical on every re-run
# (the unseeded module-level `random.sample` picked different documents each time)
# and leaves the global random state untouched.
SAMPLE_SIZE = 5
SAMPLE_SEED = 0
_sampler = random.Random(SAMPLE_SEED)

_fail_ids = df_fail['corpus-id'].tolist()
_succ_ids = df_suc['corpus-id'].tolist()

# min(...) keeps the cell working when a group has fewer than SAMPLE_SIZE rows;
# `sample` raises ValueError when asked for more items than are available.
fail_id_list = _sampler.sample(_fail_ids, min(SAMPLE_SIZE, len(_fail_ids)))
succ_id_list = _sampler.sample(_succ_ids, min(SAMPLE_SIZE, len(_succ_ids)))

In [19]:
# Print id and text of every sampled document, failures first, then successes.
for heading, sampled_ids in (("Failure list", fail_id_list), ("Success list", succ_id_list)):
    print(heading)
    for id_ in sampled_ids:
        print(id_)
        print(corpus_dict[id_])
        print()

Failure list
lnys6iuu
The ongoing global pandemic of infection disease COVID-19 caused by the 2019 novel coronavirus (SARS-COV-2, formerly 2019-nCoV) presents critical threats to public health and the economy since it was identified in China, December 2019. The genome of SARS-CoV-2 had been sequenced and structurally annotated, yet little is known of the intrinsic organization and evolution of the genome. To this end, we present a mathematical method for the genomic spectrum, a kind of barcode, of SARS-CoV-2 and common human coronaviruses. The genomic spectrum is constructed according to the periodic distributions of nucleotides, and therefore reflects the unique characteristics of the genome. The results demonstrate that coronavirus SARS-CoV-2 exhibits dinucleotide TT islands in the non-structural proteins 3, 4, 5, and 6. Further analysis of the dinucleotide regions suggests that the dinucleotide repeats are increased during evolution and may confer the evolutionary fitness of the vir

In [20]:
# Corpus id of the first sampled failure-case document.
fail_id_list[0]

'lnys6iuu'

# Check Queries

In [21]:
def check_matching_queries(_document_id):
    """Print the question texts of the gold queries for one document.

    Reads the notebook-level `gt` (corpus id -> gold query ids) and `results`
    (query records with "_id" and "question" keys).

    Questions are looked up by the record's "_id" rather than by list position
    (`results[i-1]`), so the output stays correct if `results` is reordered or
    does not start at query 1. Ids are compared as strings because the qrels
    ids are numeric while the ids in `results` are strings.

    Raises:
        KeyError: if `_document_id` is not in `gt`, or a gold query id has no
            record in `results`.
    """
    question_by_id = {str(record["_id"]): record["question"] for record in results}
    questions = [question_by_id[str(query_id)] for query_id in gt[_document_id]]
    print(questions)

# Print the gold questions for each sampled failure document, then a separator
# line, then the same for each sampled success document.
for group_index, sampled_ids in enumerate((fail_id_list, succ_id_list)):
    if group_index:
        print('---------')
    for __id in sampled_ids:
        check_matching_queries(__id)
        print()

['What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?', 'what is the origin of COVID-19', 'What is the protein structure of the SARS-CoV-2 spike?']

['Does SARS-CoV-2 have any subtypes, and if so what are they?', 'What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?', 'What are the observed mutations in the SARS-CoV-2 genome and how often do the mutations occur?']

['what are the initial symptoms of Covid-19?', 'which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that can be repurposed based on this information?', 'what are the transmission routes of coronavirus?', 'what is the origin of COVID-19']

['What is the protein structure of the SARS-CoV-2 spike?', 'will SARS-CoV2 infected people develop immunity? Is cross protection possible?', 'which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that c

In [75]:
# results[0]['_id']
# results[0]['question']

# Inspect the `_id` of the record at index 49 of `results`.
results[49]['_id']

'50'

# Check retrieved queries

In [None]:
# hotpotqa: 5233329
# trec-covid: 173???
# np: 2681468

In [28]:
import json

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the file; each non-blank line must be one JSON value.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale default.
    with open(path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped;
        # json.loads would otherwise raise on them.
        return [json.loads(line) for line in f if line.strip()]

def get_matching_queries(document_id):
    """Return the question texts of the gold queries for one document.

    Reads the notebook-level `gt` (corpus id -> gold query ids) and `results`
    (query records with "_id" and "question" keys).

    Questions are looked up by the record's "_id" rather than by list position
    (`results[i-1]`), so the result stays correct if `results` is reordered or
    does not start at query 1. Ids are compared as strings because the qrels
    ids are numeric while the ids in `results` are strings.

    Args:
        document_id: Corpus id of the document.

    Returns:
        List of question strings, in the order of `gt[document_id]`.

    Raises:
        KeyError: if `document_id` is not in `gt`, or a gold query id has no
            record in `results`.
    """
    question_by_id = {str(record["_id"]): record["question"] for record in results}
    return [question_by_id[str(query_id)] for query_id in gt[document_id]]



In [23]:
# Load the query-retrieval result files for the "suc" and "fail" document sets.
# Later cells read each record's "_id" and "retrieval" keys.
ret_suc = load_jsonl("/gallery_louvre/dayoon.ko/research/sds/retrieval/results/trec-covid-suc-query-retrieval.jsonl")
ret_fail = load_jsonl("/gallery_louvre/dayoon.ko/research/sds/retrieval/results/trec-covid-fail-query-retrieval.jsonl")

{'_id': 'i1lyno9g',
 'title': 'Analysis of therapeutic targets for SARS-CoV-2 and discovery of potential drugs by computational methods',
 'text': 'SARS-CoV-2 has caused tens of thousands of infections and more than one thousand deaths. There are currently no registered therapies for treating coronavirus infections. Because of time consuming process of new drug development, drug repositioning may be the only solution to the epidemic of sudden infectious diseases. We systematically analyzed all the proteins encoded by SARS-CoV-2 genes, compared them with proteins from other coronaviruses, predicted their structures, and built 19 structures that could be done by homology modeling. By performing target-based virtual ligand screening, a total of 21 targets (including two human targets) were screened against compound libraries including ZINC drug database and our own database of natural products. Structure and screening results of important targets such as 3-chymotrypsin-like protease (3CLp

In [34]:
# For every record in `ret_suc`, print its gold questions next to the queries
# that were retrieved for it.
for res in ret_suc:
    try:
        # Despite the name, `gt_docs` holds question strings (the gold queries).
        gt_docs = get_matching_queries(res["_id"])
    except (KeyError, IndexError):
        # The id has no entry in `gt`, or a gold query id is not in `results`:
        # skip it. Catching only lookup errors (instead of a bare `except`) lets
        # real bugs and KeyboardInterrupt surface.
        continue
    print("ID:", res["_id"])
    print("\nGT queries\n-----------")
    for doc in gt_docs:
        print(doc)
    print("\nRetrieved queries\n-----------")
    for ret in res["retrieval"]:
        print(ret["document"])
    print("\n\n\n")

ID: i1lyno9g

GT queries
-----------
which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that can be repurposed based on this information?

Retrieved queries
-----------
which SARS-CoV-2 proteins-human proteins interactions indicate potential for drug targets. Are there approved drugs that can be repurposed based on this information?
what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
What is the protein structure of the SARS-CoV-2 spike?
What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?
what evidence is there for the value of hydroxychloroquine in treating Covid-19?
which biomarkers predict the severe clinical course of 2019-nCOV infection?
what evidence is there for dexamethasone as a treatment for COVID-19?
are there any clinical trials available for the coronavirus
Does SARS-CoV-2 have any subtypes, and if so what are they?
will SARS-CoV2 infected people develop immuni

In [35]:
# For every record in `ret_fail`, print its gold questions next to the queries
# that were retrieved for it.
for res in ret_fail:
    try:
        # Despite the name, `gt_docs` holds question strings (the gold queries).
        gt_docs = get_matching_queries(res["_id"])
    except (KeyError, IndexError):
        # The id has no entry in `gt`, or a gold query id is not in `results`:
        # skip it. Catching only lookup errors (instead of a bare `except`) lets
        # real bugs and KeyboardInterrupt surface.
        continue
    print("ID:", res["_id"])
    print("\nGT queries\n-----------")
    for doc in gt_docs:
        print(doc)
    print("\nRetrieved queries\n-----------")
    for ret in res["retrieval"]:
        print(ret["document"])
    print("\n\n\n")

ID: sqrn6kjy

GT queries
-----------
what are the initial symptoms of Covid-19?
How does the coronavirus differ from seasonal flu?

Retrieved queries
-----------
what is the origin of COVID-19
what evidence is there for dexamethasone as a treatment for COVID-19?
which biomarkers predict the severe clinical course of 2019-nCOV infection?
are cardiac complications likely in patients with COVID-19?
what causes death from Covid-19?
has social distancing had an impact on slowing the spread of COVID-19?
What is the mechanism of inflammatory response and pathogenesis of COVID-19 cases?
what kinds of complications related to COVID-19 are associated with diabetes
what evidence is there for the value of hydroxychloroquine in treating Covid-19?
What is the mechanism of cytokine storm syndrome on the COVID-19?




ID: qvz63m93

GT queries
-----------
Does SARS-CoV-2 have any subtypes, and if so what are they?
What is the result of phylogenetic analysis of SARS-CoV-2 genome sequence?
What are the o

# NDCG metric for "Retrieved Query" vs "Gold Query"

In [40]:
# True positive documents
accs = []
for res in ret_suc:
    try:
        gt_docs = get_matching_queries(res["_id"])
    except:
        continue
    acc = 0 
    for ret in res["retrieval"]:
        if ret["document"] in gt_docs:
            acc += 1
    accs.append(acc / len(gt_docs))
print(sum(accs) / len(accs))

0.45178197064989517


In [41]:
# True negative documents
accs = []
for res in ret_fail:
    try:
        gt_docs = get_matching_queries(res["_id"])
    except:
        continue
    acc = 0 
    for ret in res["retrieval"]:
        if ret["document"] in gt_docs:
            acc += 1
    accs.append(acc / len(gt_docs))
print(sum(accs) / len(accs))

0.3817204301075268
