In [6]:
import json

In [35]:
# Retrieval output for the selected dataset, stored as one JSON object per line.
dataset_name = "nfcorpus"
with open(f"{dataset_name}.jsonl") as results_file:
    results = [json.loads(line) for line in results_file]

In [36]:
# Inspect the first record to see which fields each result carries.
results[0]

{'_id': 'PLAIN-3',
 'text': 'Breast Cancer Cells Feed on Cholesterol',
 'metadata': {'url': 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'},
 'question': 'Breast Cancer Cells Feed on Cholesterol',
 'retrieval': [{'source': '/gallery_louvre/dayoon.ko/research/sds/src/datasets/nfcorpus/corpus.jsonl',
   'seq_num': 1383,
   'document': 'While many factors are involved in the etiology of cancer, it has been clearly established that diet significantly impacts one’s risk for this disease. More recently, specific food components have been identified which are uniquely beneficial in mitigating the risk of specific cancer subtypes. Plant sterols are well known for their effects on blood cholesterol levels, however research into their potential role in mitigating cancer risk remains in its infancy. As outlined in this review, the cholesterol modulating actions of plant sterols may overlap with their anti-cancer actions. Breast cancer is the most common malignancy

In [37]:
# Read the corpus: one JSON document per line.
corpus_path = f"/gallery_louvre/dayoon.ko/research/sds/src/datasets/{dataset_name}/corpus.jsonl"
with open(corpus_path) as corpus_file:
    corpus = [json.loads(line) for line in corpus_file]

# Lookup table: document id -> document text.
corpus_dict = {doc["_id"]: doc["text"] for doc in corpus}

In [38]:
# Map each 1-based position in the corpus file to the id of the document at that position.
seq_num_to_id = {
    position: entry["_id"] for position, entry in enumerate(corpus, start=1)
}

In [107]:
# Attach the corpus document id to every retrieved entry, resolved from its `seq_num`.
for record in results:
    for hit in record["retrieval"]:
        hit["_id"] = seq_num_to_id[hit["seq_num"]]

In [108]:
# Invert the results: document id -> list of query ids whose retrieval contains
# that document (one entry per retrieved occurrence).
retrieval_results = {}
for record in results:
    query_id = record["_id"]
    for hit in record["retrieval"]:
        retrieval_results.setdefault(hit["_id"], []).append(query_id)

In [49]:
# Distinct query ids that appear anywhere in the inverted retrieval results.
qids = {
    query_id
    for query_ids in retrieval_results.values()
    for query_id in query_ids
}

len(qids)

3237

## Load Qrels

In [98]:
import pandas as pd
# Tab-separated test qrels for the selected dataset.
qrels_path = f"/gallery_louvre/dayoon.ko/research/sds/src/datasets/{dataset_name}/qrels/test.tsv"
qrels = pd.read_csv(qrels_path, sep="\t")

In [99]:
# Preview the first ten rows of the qrels table.
qrels.head(10)

Unnamed: 0,query-id,corpus-id,score
0,PLAIN-2,MED-2427,2
1,PLAIN-2,MED-10,2
2,PLAIN-2,MED-2429,2
3,PLAIN-2,MED-2430,2
4,PLAIN-2,MED-2431,2
5,PLAIN-2,MED-14,2
6,PLAIN-2,MED-2432,2
7,PLAIN-2,MED-2428,1
8,PLAIN-2,MED-2440,1
9,PLAIN-2,MED-2434,1


In [100]:
# Keep only rows with a score of at least 1, ordered by corpus id.
has_positive_score = qrels["score"] >= 1
qrels = qrels.loc[has_positive_score].sort_values("corpus-id")
qrels

Unnamed: 0,query-id,corpus-id,score
1,PLAIN-2,MED-10,2
1469,PLAIN-531,MED-1097,1
5813,PLAIN-1601,MED-1098,1
3743,PLAIN-1109,MED-1098,1
4667,PLAIN-1409,MED-1098,1
...,...,...,...
9846,PLAIN-2430,MED-990,2
9847,PLAIN-2430,MED-991,2
9848,PLAIN-2430,MED-992,2
9849,PLAIN-2430,MED-993,2


In [55]:
# Ground truth: corpus id -> list of query ids that have a qrels row for it.
# Built in a single pass over the rows instead of re-filtering the whole frame
# once per unique corpus id; keys keep their order of first appearance and each
# list keeps the row order of `qrels`.
gt = {}
for corpus_id, query_id in zip(qrels["corpus-id"], qrels["query-id"]):
    gt.setdefault(corpus_id, []).append(query_id)

In [109]:
# Per-document retrieval quality: for every retrieved document, compare the
# queries that retrieved it with the queries listed for it in `gt`.
# Documents that never appear in `retrieval_results` are not scored here.
corpus_ids = []
num_gts = []
recall = []     # true positives / queries the document should be retrieved for
precision = []  # true positives / queries the document was actually retrieved for
for k, v in retrieval_results.items():
    # expected: every query the document should be retrieved for (ids compared as strings)
    # retrieved: every query the document was actually retrieved for
    expected = {str(i) for i in gt.get(k, ())}
    if not expected:
        # No ground truth for this document, so neither metric is defined; skip it.
        continue
    # `v` can list the same query more than once; count distinct queries so the
    # denominator is consistent with the set-based numerator.
    retrieved = set(v)
    true_positives = len(expected & retrieved)
    corpus_ids.append(k)
    num_gts.append(len(expected))
    recall.append(true_positives / len(expected))
    precision.append(true_positives / len(retrieved))

In [115]:
# Assemble the per-document metrics into one table, one column per metric list.
metric_columns = ("corpus-id", "num-positive-query-gt", "recall", "precision")
data = dict(zip(metric_columns, (corpus_ids, num_gts, recall, precision)))
df = pd.DataFrame(data)
df

Unnamed: 0,corpus-id,num-positive-query-gt,recall,precision
0,MED-2427,1,1.0,0.007937
1,MED-14,1,1.0,0.009346
2,MED-2430,1,1.0,0.031250
3,MED-2431,1,1.0,0.041667
4,MED-3551,2,0.5,0.009901
...,...,...,...,...
532,MED-3023,1,0.0,0.000000
533,MED-5108,1,0.0,0.000000
534,MED-1171,1,0.0,0.000000
535,MED-2568,1,0.0,0.000000


In [116]:
# F1 is the harmonic mean of recall and precision. It is defined as 0 only when
# both are exactly 0 (the one case where the formula would divide by zero), so
# small but non-zero scores are kept rather than rounded down to 0.
df["f1"] = [
    0.0 if r + p == 0 else 2 * r * p / (r + p)
    for r, p in zip(df["recall"], df["precision"])
]
df.head()

Unnamed: 0,corpus-id,num-positive-query-gt,recall,precision,f1
0,MED-2427,1,1.0,0.007937,0.015748
1,MED-14,1,1.0,0.009346,0.018519
2,MED-2430,1,1.0,0.03125,0.060606
3,MED-2431,1,1.0,0.041667,0.08
4,MED-3551,2,0.5,0.009901,0.019417


In [20]:
# Well-retrieved documents: recall above 0.2 among documents that have more
# than 3 ground-truth queries, ordered by ascending recall.
# The metric column in `df` is named "recall".
is_success = (df["recall"] > 0.2) & (df["num-positive-query-gt"] > 3)
df_suc = df[is_success].sort_values("recall")
print(len(df_suc))
# Name the export after the dataset being analysed so that runs on different
# datasets do not overwrite each other's files.
df_suc.to_csv(f"{dataset_name}-suc.csv")
# Last expression of the cell, so the preview is actually rendered.
df_suc.head(10)

89


In [146]:
# Show the text of one document by its corpus id.
# NOTE(review): this id does not follow the "MED-*" pattern of the corpus ids shown
# in the qrels output; it presumably comes from a run on a different dataset —
# confirm that `corpus_dict` holds the matching corpus before running.
corpus_dict["i1lyno9g"]

'SARS-CoV-2 has caused tens of thousands of infections and more than one thousand deaths. There are currently no registered therapies for treating coronavirus infections. Because of time consuming process of new drug development, drug repositioning may be the only solution to the epidemic of sudden infectious diseases. We systematically analyzed all the proteins encoded by SARS-CoV-2 genes, compared them with proteins from other coronaviruses, predicted their structures, and built 19 structures that could be done by homology modeling. By performing target-based virtual ligand screening, a total of 21 targets (including two human targets) were screened against compound libraries including ZINC drug database and our own database of natural products. Structure and screening results of important targets such as 3-chymotrypsin-like protease (3CLpro), Spike, RNA-dependent RNA polymerase (RdRp), and papain like protease (PLpro) were discussed in detail. In addition, a database of 78 commonly 

In [161]:
# Poorly-retrieved documents: recall below 0.01 among documents that have more
# than 2 ground-truth queries, ordered by ascending recall.
# The metric column in `df` is named "recall".
is_failure = (df["num-positive-query-gt"] > 2) & (df["recall"] < 0.01)
df_fail = df[is_failure].sort_values("recall")
print(len(df_fail))
# Name the export after the dataset being analysed so that runs on different
# datasets do not overwrite each other's files.
df_fail.to_csv(f"{dataset_name}-fail.csv")
# Last expression of the cell, so the preview is actually rendered.
df_fail.tail(10)

38


In [115]:
# Show the text of one document by its corpus id.
# NOTE(review): this id does not follow the "MED-*" pattern of the corpus ids shown
# in the qrels output; it presumably comes from a run on a different dataset —
# confirm that `corpus_dict` holds the matching corpus before running.
corpus_dict["3h1o0oz3"]

'The novel coronavirus SARS-CoV-2 (2019-nCoV) is a member of the family coronaviridae and contains a single-stranded RNA genome with positive-polarity. To reveal the evolution mechanism of SARS-CoV-2 genome, we performed comprehensive genomic analysis with newly sequenced SARS-CoV-2 strains and 20 closely related coronavirus strains. Among 98 nucleotide mutations at 93 sites of the genome among different SARS-CoV-2 strains, 58 of them caused amino acid change, indicating a result of neutral evolution. However, the ratio of nucleotide substitutions to amino acid substitutions of spike gene (9.07) between SARS-CoV-2 WIV04 and Bat-SARSr-CoV RaTG13 was extensively higher than those from comparisons between other coronaviruses (range 1.29 - 4.81). The elevated synonymous mutations between SARS-CoV-2 and RaTG13, suggesting they underwent stronger purifying selection. Moreover, their nucleotide substitutions are enriched with T:C transition, which is consistent with the mutation signature cau

# Check Queries

# Document Length

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader, DirectoryLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import fire
import torch
from tqdm import tqdm
from glob import glob 

# Location of the corpus file: <data_dir>/<dataset_name>/<glob_dir>.
data_dir = "/gallery_louvre/dayoon.ko/research/sds/src/datasets"
dataset_name = "nq"
glob_dir = "corpus.jsonl"
corpus_file_path = f"{data_dir}/{dataset_name}/{glob_dir}"

# Load the ".text" field of every JSON line in the corpus file.
loader = JSONLoader(
    corpus_file_path,
    jq_schema=".text",
    json_lines=True,
)
documents = loader.load()
print(f'Document count: {len(documents)}')

# Split the loaded documents into chunks and report how many chunks result.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=10,
)
splits = text_splitter.split_documents(documents)
print(len(splits))

Document count: 2681468


KeyboardInterrupt: 

In [None]:
# hotpotqa: 5233329
# trec-covid: 173???
# nq: 2681468