In [1]:
import json

In [186]:
dataset_name = "nfcorpus"
with open(f"{dataset_name}.jsonl") as f:
    results = [json.loads(i) for i in f.readlines()]

In [187]:
results[0]

{'_id': 'PLAIN-3',
 'text': 'Breast Cancer Cells Feed on Cholesterol',
 'metadata': {'url': 'http://nutritionfacts.org/2015/07/14/breast-cancer-cells-feed-on-cholesterol/'},
 'question': 'Breast Cancer Cells Feed on Cholesterol',
 'retrieval': [{'source': '/gallery_louvre/dayoon.ko/research/sds/src/datasets/nfcorpus/corpus.jsonl',
   'seq_num': 1383,
   'document': 'While many factors are involved in the etiology of cancer, it has been clearly established that diet significantly impacts one’s risk for this disease. More recently, specific food components have been identified which are uniquely beneficial in mitigating the risk of specific cancer subtypes. Plant sterols are well known for their effects on blood cholesterol levels, however research into their potential role in mitigating cancer risk remains in its infancy. As outlined in this review, the cholesterol modulating actions of plant sterols may overlap with their anti-cancer actions. Breast cancer is the most common malignancy

In [214]:
import json
with open(f"/gallery_louvre/dayoon.ko/research/sds/src/datasets/{dataset_name}/queries.jsonl") as f:
    queries = [json.loads(i) for i in f.readlines()]
    queries_dict = {}
    for q in queries:
        queries_dict[q["_id"]] = q["text"]
        
import json
with open(f"/gallery_louvre/dayoon.ko/research/sds/src/datasets/{dataset_name}/corpus.jsonl") as f:
    corpus = [json.loads(i) for i in f.readlines()]
    corpus_dict = {}
    for c in corpus:
        corpus_dict[c["_id"]] = c["text"]

In [215]:
seq_num_to_id = {}
for i, doc in zip(range(1, len(corpus) + 1), corpus):
    seq_num_to_id[i] = doc["_id"] 

In [216]:
# Add document id to each retrieved document
for res in results:
    res["retrieval"] = res["retrieval"][:100]
    for retrieval in res["retrieval"]:
        seq_num = retrieval["seq_num"]
        retrieval["_id"] = seq_num_to_id[seq_num]

In [217]:
# Get retrieval results as corpus id
retrieval_results = {}
for res in results:
    qid = res["_id"]
    for retrieval in res["retrieval"]:
        doc_id = retrieval["_id"]
        if doc_id in retrieval_results:
            retrieval_results[doc_id].append(qid)
        else:
            retrieval_results[doc_id] = [qid]
            

In [218]:
retrieval_results

{'MED-2439': ['PLAIN-3',
  'PLAIN-29',
  'PLAIN-84',
  'PLAIN-100',
  'PLAIN-119',
  'PLAIN-134',
  'PLAIN-135',
  'PLAIN-160',
  'PLAIN-171',
  'PLAIN-178',
  'PLAIN-208',
  'PLAIN-213',
  'PLAIN-214',
  'PLAIN-215',
  'PLAIN-231',
  'PLAIN-292',
  'PLAIN-294',
  'PLAIN-329',
  'PLAIN-339',
  'PLAIN-349',
  'PLAIN-462',
  'PLAIN-755',
  'PLAIN-758',
  'PLAIN-812',
  'PLAIN-813',
  'PLAIN-1137',
  'PLAIN-1858',
  'PLAIN-1860',
  'PLAIN-2149',
  'PLAIN-2384',
  'PLAIN-2501',
  'PLAIN-2505',
  'PLAIN-2506',
  'PLAIN-2521',
  'PLAIN-2532',
  'PLAIN-2566',
  'PLAIN-2572',
  'PLAIN-2578',
  'PLAIN-2586',
  'PLAIN-2607',
  'PLAIN-2608',
  'PLAIN-2635',
  'PLAIN-2684',
  'PLAIN-2697',
  'PLAIN-2716',
  'PLAIN-2763',
  'PLAIN-2764',
  'PLAIN-2771',
  'PLAIN-2772',
  'PLAIN-2773',
  'PLAIN-2804',
  'PLAIN-2813',
  'PLAIN-2814',
  'PLAIN-2815',
  'PLAIN-2816',
  'PLAIN-2828',
  'PLAIN-2845',
  'PLAIN-2911',
  'PLAIN-3005',
  'PLAIN-3007',
  'PLAIN-3008',
  'PLAIN-3009',
  'PLAIN-3020',
  'PLAIN-

## Load Qrels

In [197]:
import pandas as pd
qrels = pd.read_csv(f"/gallery_louvre/dayoon.ko/research/sds/src/datasets/{dataset_name}/qrels/test.tsv", sep="\t")

In [198]:
qrels.head(10)

Unnamed: 0,query-id,corpus-id,score
0,PLAIN-2,MED-2427,2
1,PLAIN-2,MED-10,2
2,PLAIN-2,MED-2429,2
3,PLAIN-2,MED-2430,2
4,PLAIN-2,MED-2431,2
5,PLAIN-2,MED-14,2
6,PLAIN-2,MED-2432,2
7,PLAIN-2,MED-2428,1
8,PLAIN-2,MED-2440,1
9,PLAIN-2,MED-2434,1


In [243]:
qrels = qrels[qrels["score"] == 2].sort_values("corpus-id")
qrels

Unnamed: 0,query-id,corpus-id,score
1,PLAIN-2,MED-10,2
9903,PLAIN-2450,MED-1166,2
9904,PLAIN-2450,MED-1167,2
9906,PLAIN-2450,MED-1169,2
9907,PLAIN-2450,MED-1170,2
...,...,...,...
9846,PLAIN-2430,MED-990,2
9847,PLAIN-2430,MED-991,2
9848,PLAIN-2430,MED-992,2
9849,PLAIN-2430,MED-993,2


# Generate Dataframe of Retrieval Results

In [244]:
# Time consuming
gt = {}
for k in qrels["corpus-id"].unique():
    qrels_ = qrels[qrels["corpus-id"] == k]
    v = qrels_["query-id"].tolist()
    gt[k] = v

In [245]:
print("Total", len(qrels["corpus-id"].unique()), "documents!")

Total 537 documents!


In [246]:
corpus_ids = []
num_gts = []
recall = [] # retrieval 돼야하는데 된 경우 / retrieval 돼야하는 경우 (gt)
precision = [] # retrieval 돼야하는데 된 경우 / retrieval 이 된 경우 (ret)
for k, v in retrieval_results.items():
    try:
        gt_v = [str(i) for i in gt[k]]
        # gt_v: document 가 retrieval 되어야하는 모든 쿼리 / v: document 가 실제 retrieval 된 쿼리
        tp_t = len(set(gt_v).intersection(set(v))) / len(gt_v) # True Positive / # True
        tp_p = len(set(gt_v).intersection(set(v))) / len(v) # True Positive / # Positive
        corpus_ids.append(k)
        num_gts.append(len(gt_v))
        recall.append(tp_t)
        precision.append(tp_p)
    except:
        continue

In [247]:
data = {
        "corpus-id": corpus_ids, 
        "n-query": num_gts,
        "recall": recall,
        "precision": precision
       }
df = pd.DataFrame(data)
df = df[df["n-query"] > 1]
df

Unnamed: 0,corpus-id,n-query,recall,precision
4,MED-3551,2,0.5,0.009901
26,MED-2646,2,0.5,0.002632
30,MED-2655,2,0.0,0.0
45,MED-3964,2,1.0,0.006431
57,MED-3474,2,0.5,0.004219
70,MED-3137,2,0.0,0.0
91,MED-5062,2,1.0,0.006135
92,MED-2653,2,0.0,0.0
125,MED-4319,2,0.5,0.043478
129,MED-3555,2,0.0,0.0


In [248]:
df["f1"] = [0 if i < 0.01 and j < 0.01 else 2 * i * j / (i + j) for i, j in zip(df.recall, df.precision)]
#df = df.sort_values("f1", ascending=False)
df.head()

Unnamed: 0,corpus-id,n-query,recall,precision,f1
4,MED-3551,2,0.5,0.009901,0.019417
26,MED-2646,2,0.5,0.002632,0.005236
30,MED-2655,2,0.0,0.0,0.0
45,MED-3964,2,1.0,0.006431,0.01278
57,MED-3474,2,0.5,0.004219,0.008368


In [260]:
df[df["recall"] < 0.2].sort_values("n-query", ascending=False)

Unnamed: 0,corpus-id,n-query,recall,precision,f1
30,MED-2655,2,0.0,0.0,0.0
70,MED-3137,2,0.0,0.0,0.0
92,MED-2653,2,0.0,0.0,0.0
129,MED-3555,2,0.0,0.0,0.0
130,MED-3485,2,0.0,0.0,0.0
212,MED-2659,2,0.0,0.0,0.0
402,MED-3484,2,0.0,0.0,0.0
418,MED-3378,2,0.0,0.0,0.0
444,MED-3176,2,0.0,0.0,0.0
496,MED-3174,2,0.0,0.0,0.0


In [263]:
cid = "MED-2662"
qrels[qrels["corpus-id"] == cid]

Unnamed: 0,query-id,corpus-id,score
10623,PLAIN-2630,MED-2662,2
63,PLAIN-23,MED-2662,2


In [264]:
print("Corpus")
print(corpus_dict[cid])
print("\n")
print("Queries")
for i in qrels[qrels["corpus-id"] == cid]["query-id"].tolist():
    print(queries_dict[str(i)])


Corpus
A human breast cancer cell line (MCF-7) was used to develop an in vitro screening assay for the detection of xenoestrogenic environmental pollutants. MCF-7 cells were cultured in DMEM containing 5% fetal bovine serum (FBS). An estrogenic response was defined as an increase in the frequency of proliferating MCF-7 cells, and was measured using a thymidine analog, bromodeoxyuridine, and flow cytometry. Di-2-ethylhexyl phthalate (DEHP) and 4-n-nonylphenol (4-n-NP) were used as model chemicals. The proliferation rate of S-phase cells after 24 h of exposure to various concentrations of 17beta-estradiol and to model compounds was compared with a positive and a negative control, containing 1 nM 17beta-estradiol and 0.1% ethanol, respectively. DEHP and 4-n-NP increased the frequency of proliferating MCF-7 cells in a dose-dependent manner. The lowest concentration that significantly increased the proliferation of MCF-7 cells was 10 microM for DEHP and 1 microM for 4-n-NP. The results show

In [115]:
corpus_dict["3h1o0oz3"]

'The novel coronavirus SARS-CoV-2 (2019-nCoV) is a member of the family coronaviridae and contains a single-stranded RNA genome with positive-polarity. To reveal the evolution mechanism of SARS-CoV-2 genome, we performed comprehensive genomic analysis with newly sequenced SARS-CoV-2 strains and 20 closely related coronavirus strains. Among 98 nucleotide mutations at 93 sites of the genome among different SARS-CoV-2 strains, 58 of them caused amino acid change, indicating a result of neutral evolution. However, the ratio of nucleotide substitutions to amino acid substitutions of spike gene (9.07) between SARS-CoV-2 WIV04 and Bat-SARSr-CoV RaTG13 was extensively higher than those from comparisons between other coronaviruses (range 1.29 - 4.81). The elevated synonymous mutations between SARS-CoV-2 and RaTG13, suggesting they underwent stronger purifying selection. Moreover, their nucleotide substitutions are enriched with T:C transition, which is consistent with the mutation signature cau

# Generate random sampled dataset

In [171]:
for i in range(len(df["n-query"].unique())):
    print(f"n_query=={i} : total {len(df[df['n-query'] == i])} datapoints")

n_query==0 : total 0 datapoints
n_query==1 : total 0 datapoints
n_query==2 : total 781 datapoints
n_query==3 : total 254 datapoints
n_query==4 : total 91 datapoints
n_query==5 : total 27 datapoints
n_query==6 : total 10 datapoints
n_query==7 : total 6 datapoints
n_query==8 : total 6 datapoints


In [182]:
for i in range(10):
    i = round(i / 10, 2)
    j = round(i + 0.1, 2)
    print(f"{i}<=recall<{j} : total {len(df[(df['recall'] >= i) & (df['recall'] < j)])} datapoints")

0.0<=recall<0.1 : total 137 datapoints
0.1<=recall<0.2 : total 21 datapoints
0.2<=recall<0.3 : total 88 datapoints
0.3<=recall<0.4 : total 196 datapoints
0.4<=recall<0.5 : total 6 datapoints
0.5<=recall<0.6 : total 604 datapoints
0.6<=recall<0.7 : total 31 datapoints
0.7<=recall<0.8 : total 0 datapoints
0.8<=recall<0.9 : total 0 datapoints
0.9<=recall<1.0 : total 0 datapoints


In [184]:
df.to_csv("trec-covid-n-query-mt-2.csv")

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader, DirectoryLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import fire
import torch
from tqdm import tqdm
from glob import glob 

data_dir = "/gallery_louvre/dayoon.ko/research/sds/src/datasets"
dataset_name = "nq"
glob_dir = "corpus.jsonl"
loader = JSONLoader(f"{data_dir}/{dataset_name}/{glob_dir}", jq_schema=".text", json_lines=True)
documents = loader.load()
print(f'Document count: {len(documents)}')

# Split document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
splits = text_splitter.split_documents(documents)
print(len(splits))

Document count: 2681468


KeyboardInterrupt: 

In [None]:
# hotpotqa: 5233329
# trec-covid: 173???
# np: 2681468