# Prepare the TREC COVID data for the pyserini BM25 processing

In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
EXPANSION_FILES=[
    "tmp_generated_expansion_from_000000_to_050000.pkl",
    "tmp_generated_expansion_from_050000_to_100000.pkl",
    "tmp_generated_expansion_from_100000_to_129186.pkl"
]

In [59]:
TREC_COVID_ORIGINAL_DATA_FILENAME="trec_covid_corpus.tsv"

TREC_COVID_QUERIES_FILENAME="trec_covid_merged_data.tsv"
TREC_COVID_QRELS_FILENAME="test.tsv"

### Merge the T5-base document expansions in a single dataframe

In [7]:
expansion_dicts = []

for which_file in EXPANSION_FILES:
    with open(which_file, "rb") as inputFile:
        expansion_dicts.append(pickle.load(inputFile))

In [8]:
len(expansion_dicts)

3

In [13]:
expansion_df = pd.concat([pd.DataFrame(expansion_dict) for expansion_dict in expansion_dicts]).reset_index(drop=True)

In [14]:
expansion_df

Unnamed: 0,doc_indexes,doc_expansion
0,0,what were the symptoms of m pneumoniae pneumon...
1,1,what microbial activities can occur in respira...
2,2,what cells in your lungs produce spd what do c...
3,3,what is a cuticle protein called what are the ...
4,4,what are respiratory virus and pvm what are re...
...,...,...
129181,171325,what is determinant of health and disease? is ...
129182,171326,what is the role of the business owners what i...
129183,171328,what is the function of mm2 cells in pim what ...
129184,171329,what can we use to localize a phenylalanine wh...


In [16]:
trec_covid_original_df = pd.read_csv(TREC_COVID_ORIGINAL_DATA_FILENAME, sep='\t')

In [17]:
trec_covid_original_df

Unnamed: 0,_id,title,text,metadata
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
...,...,...,...,...
171327,7e8r61e7,Can Pediatric COVID-19 Testing Sensitivity Be ...,,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...
171328,6jittbis,Heterogeneity and plasticity of porcine alveol...,This study investigated the heterogeneity and ...,{'url': 'https://doi.org/10.1242/bio.046342; h...
171329,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,Abstract We report on an anomalous X-ray refle...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...
171330,ma3ndg41,Italian Society of Interventional Cardiology (...,COVID‐19 pandemic raised the issue to guarante...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...


### Merge the title and the text fields of the original dataset to be able to index the documents which have only title

In [20]:
trec_covid_original_df['title_text'] = trec_covid_original_df['title'] + trec_covid_original_df['text'].fillna("")

In [21]:
trec_covid_original_df

Unnamed: 0,_id,title,text,metadata,title_text
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Clinical features of culture-proven Mycoplasma...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Nitric oxide: a pro-inflammatory mediator in l...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Surfactant protein-D and pulmonary host defens...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Role of endothelin-1 in lung diseaseEndothelin...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Gene expression in epithelial cells in respons...
...,...,...,...,...,...
171327,7e8r61e7,Can Pediatric COVID-19 Testing Sensitivity Be ...,,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,Can Pediatric COVID-19 Testing Sensitivity Be ...
171328,6jittbis,Heterogeneity and plasticity of porcine alveol...,This study investigated the heterogeneity and ...,{'url': 'https://doi.org/10.1242/bio.046342; h...,Heterogeneity and plasticity of porcine alveol...
171329,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,Abstract We report on an anomalous X-ray refle...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,SARS E protein in phospholipid bilayers: an an...
171330,ma3ndg41,Italian Society of Interventional Cardiology (...,COVID‐19 pandemic raised the issue to guarante...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,Italian Society of Interventional Cardiology (...


In [23]:
expansion_df.set_index(expansion_df['doc_indexes'])

Unnamed: 0_level_0,doc_indexes,doc_expansion
doc_indexes,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,what were the symptoms of m pneumoniae pneumon...
1,1,what microbial activities can occur in respira...
2,2,what cells in your lungs produce spd what do c...
3,3,what is a cuticle protein called what are the ...
4,4,what are respiratory virus and pvm what are re...
...,...,...
171325,171325,what is determinant of health and disease? is ...
171326,171326,what is the role of the business owners what i...
171328,171328,what is the function of mm2 cells in pim what ...
171329,171329,what can we use to localize a phenylalanine wh...


### Merge the original dataset with the document expansions using the document index as key

In [27]:
trec_covid_expanded_df = trec_covid_original_df.merge(expansion_df.set_index(expansion_df['doc_indexes']), left_index=True, right_index=True, how='left')

In [31]:
trec_covid_expanded_df.iloc[171325]['title_text']

'Migrants and emerging public health issues in a globalized world: threats, risks and challenges, an evidence-based frameworkInternational population mobility is an underlying factor in the emergence of public health threats and risks that must be managed globally. These risks are often related, but not limited, to transmissible pathogens. Mobile populations can link zones of disease emergence to lowprevalence or nonendemic areas through rapid or high-volume international movements, or both. Against this background of human movement, other global processes such as economics, trade, transportation, environment and climate change, as well as civil security influence the health impacts of disease emergence. Concurrently, global information systems, together with regulatory frameworks for disease surveillance and reporting, affect organizational and public awareness of events of potential public health significance. International regulations directed at disease mitigation and control have 

In [32]:
trec_covid_expanded_df.iloc[171325]['doc_expansion']

'what is determinant of health and disease? is population mobility a major determinant of health how does global population movement influence human health what is the importance of population mobility in health a determinant of health how is the global population moving how are people moving in the world what is the relation between population mobility and health in general who does international migration influence? what is population mobility and health what determinants of global health varies among human populations which causes are at a key contributor to public health? what does global migration impact what is a global migration where do population mobility mean what causes human population mobility underlying causes of global population mobility and health challenges what is global population mobility what causes disease emergence and risk which international factors are responsible for global health is mobile population a determinant of global health?'

### Create a single field with the title + text + expasion data

In [None]:
trec_covid_expanded_df['text'] = trec_covid_expanded_df['title_text'] + trec_covid_expanded_df['doc_expansion'].fillna("")

In [34]:
trec_covid_expanded_df

Unnamed: 0,_id,title,text,metadata,title_text,doc_indexes,doc_expansion
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,Clinical features of culture-proven Mycoplasma...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Clinical features of culture-proven Mycoplasma...,0.0,what were the symptoms of m pneumoniae pneumon...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Nitric oxide: a pro-inflammatory mediator in l...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Nitric oxide: a pro-inflammatory mediator in l...,1.0,what microbial activities can occur in respira...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D and pulmonary host defens...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Surfactant protein-D and pulmonary host defens...,2.0,what cells in your lungs produce spd what do c...
3,2b73a28n,Role of endothelin-1 in lung disease,Role of endothelin-1 in lung diseaseEndothelin...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Role of endothelin-1 in lung diseaseEndothelin...,3.0,what is a cuticle protein called what are the ...
4,9785vg6d,Gene expression in epithelial cells in respons...,Gene expression in epithelial cells in respons...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...,Gene expression in epithelial cells in respons...,4.0,what are respiratory virus and pvm what are re...
...,...,...,...,...,...,...,...
171327,7e8r61e7,Can Pediatric COVID-19 Testing Sensitivity Be ...,Can Pediatric COVID-19 Testing Sensitivity Be ...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,Can Pediatric COVID-19 Testing Sensitivity Be ...,,
171328,6jittbis,Heterogeneity and plasticity of porcine alveol...,Heterogeneity and plasticity of porcine alveol...,{'url': 'https://doi.org/10.1242/bio.046342; h...,Heterogeneity and plasticity of porcine alveol...,171328.0,what is the function of mm2 cells in pim what ...
171329,hi8k8wvb,SARS E protein in phospholipid bilayers: an an...,SARS E protein in phospholipid bilayers: an an...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,SARS E protein in phospholipid bilayers: an an...,171329.0,what can we use to localize a phenylalanine wh...
171330,ma3ndg41,Italian Society of Interventional Cardiology (...,Italian Society of Interventional Cardiology (...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/3...,Italian Society of Interventional Cardiology (...,171330.0,what is a position paper for the gisse ii what...


### Save the datasets in a pyserini-friendly format

In [35]:
trec_covid_expanded_df.to_csv("trec_covid_expanded_full.tsv", sep="\t", index="False")

In [57]:
trec_covid_expanded_df[['_id', 'text']].dropna().to_csv("trec_covid_expanded.tsv", sep="\t", index=False, header=False)

In [58]:
trec_covid_expanded_df[['_id', 'title_text']].dropna().to_csv("trec_covid_original_tile_text_merged.tsv", sep="\t", index=False, header=False)

In [42]:
"ug7v899j	Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi ArabiaOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high.what were the symptoms of m pneumoniae pneumoniae what is m pneumoniae comorbidity most common infections where are mycoplasma pneumoniae infections what are the symptoms of pneumonia in children most common infections involving children what is the comorbidity and symptoms of pneumonia how many people have pneumonia m pneumoniae causes respiratory infections most common infections were for newborns m pneumoniae symptoms in the fall how many people die of pneumonia what is the epidemiology of pneumonia when was the most common pneumonia etiologies used which infection type were detected what is the mortality of pneumonia what is the most common age to contract pneumonia what are some common signs of pneumonia which clinical characteristics is most common in children what was the comorbidity of pneumonia in children".rstrip().split('\t')

['ug7v899j',
 'Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi ArabiaOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pne

### Save the queries in a pyserini-friendly format

In [60]:
trec_covid_queries_df = pd.read_csv(TREC_COVID_QUERIES_FILENAME, sep='\t')

In [61]:
trec_covid_queries_df

Unnamed: 0,query-id,corpus-id,score,query-text,corpus-title,corpus-text,query-metadata,corpus-metadata
0,1,005b2j4b,2,what is the origin of COVID-19,Monophyletic Relationship between Severe Acute...,Although primary genomic analysis has revealed...,"{'query': 'coronavirus origin', 'narrative': ""...",{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/1...
1,16,005b2j4b,0,how long does coronavirus remain stable on su...,Monophyletic Relationship between Severe Acute...,Although primary genomic analysis has revealed...,{'query': 'how long does coronavirus survive o...,{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/1...
2,32,005b2j4b,0,"Does SARS-CoV-2 have any subtypes, and if so w...",Monophyletic Relationship between Severe Acute...,Although primary genomic analysis has revealed...,"{'query': 'coronavirus subtypes', 'narrative':...",{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/1...
3,37,005b2j4b,0,What is the result of phylogenetic analysis of...,Monophyletic Relationship between Severe Acute...,Although primary genomic analysis has revealed...,"{'query': 'SARS-CoV-2 phylogenetic analysis', ...",{'url': 'https://www.ncbi.nlm.nih.gov/pubmed/1...
4,1,00fmeepz,1,what is the origin of COVID-19,Comprehensive overview of COVID-19 based on cu...,"In December 2019, twenty-seven pneumonia patie...","{'query': 'coronavirus origin', 'narrative': ""...","{'url': '', 'pubmed_id': ''}"
...,...,...,...,...,...,...,...,...
66331,50,zn10rnrm,1,what is known about an mRNA vaccine for the SA...,Characterization of RNA in Saliva,Background: We have previously shown that huma...,"{'query': 'mRNA vaccine coronavirus', 'narrati...",{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
66332,50,zstmdt4n,0,what is known about an mRNA vaccine for the SA...,Coordinate induction of IFN-α and -γ by SARS-C...,Abstract Background: Severe acute respiratory ...,"{'query': 'mRNA vaccine coronavirus', 'narrati...",{'url': 'https://api.elsevier.com/content/arti...
66333,50,zth8ffy3,0,what is known about an mRNA vaccine for the SA...,Vasculopathy and Coagulopathy Associated with ...,The emergence of severe acute respiratory synd...,"{'query': 'mRNA vaccine coronavirus', 'narrati...","{'url': '', 'pubmed_id': ''}"
66334,50,zv4nbz9p,2,what is known about an mRNA vaccine for the SA...,"Emerging Technologies for Use in the Study, Di...",INTRODUCTION: The COVID-19 pandemic has caused...,"{'query': 'mRNA vaccine coronavirus', 'narrati...",{'url': 'https://doi.org/10.1007/s12195-020-00...


In [66]:
trec_covid_queries_df[['query-id', 'query-text']].drop_duplicates('query-id').sort_values('query-id').to_csv("trec_covid_queries.tsv", sep='\t', index=False, header=False)

### Also save the qrels in a pyserini friendly format

In [67]:
trec_covid_qrels_df = pd.read_csv(TREC_COVID_QRELS_FILENAME, sep='\t')

In [69]:
trec_covid_qrels_df['fixed'] = 0

In [71]:
trec_covid_qrels_df[['query-id', 'fixed', 'corpus-id', 'score']].to_csv("trec_covid_qrels.tsv", sep='\t', index=False, header=False)