In [12]:
from datasets import load_dataset
import pandas as pd
import os
import json

In [13]:
repository_directory = os.path.abspath('') + "/repository"
models_jsons = os.listdir(repository_directory)

In [14]:
dataset_model_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        for dataset in data['dataset']:
            if dataset not in dataset_model_dict:
                dataset_model_dict[dataset] = []
            
            dataset_model_dict[dataset].append(data['model_name'])

In [26]:
dataset_model_dict

{'squad_v2': ['mrm8488/longformer-base-4096-finetuned-squadv2',
  'allenai/unifiedqa-t5-base',
  'ixa-ehu/SciBERT-SQuAD-QuAC'],
 'hotpot_qa': ['AdapterHub/roberta-base-pf-hotpotqa'],
 'cuad': ['Rakib/roberta-base-on-cuad', 'akdeniz27/deberta-v2-xlarge-cuad'],
 'trivia_qa': ['allenai/longformer-large-4096-finetuned-triviaqa'],
 'squad': ['ozcangundes/T5-base-for-BioQA',
  'MaRiOrOsSi/t5-base-finetuned-question-answering',
  'vanadhi/roberta-base-fiqa-flm-sq-flit'],
 'BeIR/bioasq-generated-queries': ['ozcangundes/T5-base-for-BioQA'],
 'duorc': ['MaRiOrOsSi/t5-base-finetuned-question-answering',
  'MaRiOrOsSi/t5-base-finetuned-question-answering'],
 'pubmed_qa': ['razent/SciFive-base-Pubmed_PMC', 'microsoft/biogpt'],
 'zhengyun21/PMC-Patients': ['razent/SciFive-base-Pubmed_PMC'],
 'boolq': ['allenai/unifiedqa-t5-base'],
 'race': ['allenai/unifiedqa-t5-base'],
 'quoref': ['allenai/unifiedqa-t5-base'],
 'ropes': ['allenai/unifiedqa-t5-base'],
 'drop': ['allenai/unifiedqa-t5-base'],
 'sagnik

In [16]:
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 250,
                             seed: int = 121,
                             **kwargs) -> pd.DataFrame:    
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, split="test")
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e
    
    
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 2000,
                             seed: int = 121,
                             **kwargs) -> pd.DataFrame:
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, **kwargs)
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e

### Squad Dataset

In [17]:
dataset_name = "squad"
configs = None
column_tuple = ("question", "context", "answers")

squad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="validation")

In [18]:
squad_qa_dataset = squad_qa_dataset.head(250)

In [19]:
answers = []

for i in range(len(squad_qa_dataset)):
    curr_ans_list = squad_qa_dataset['answers'][i]['text']
    curr_ans = max(curr_ans_list, key = len)
    answers.append(curr_ans)
    
squad_qa_dataset['answers'] = answers

### Pubmed Biology Dataset

In [20]:
dataset_name = "pubmed_qa"
config = "pqa_labeled"
column_tuple = ("question", "context", "long_answer")

pubmed_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

contexts_strings = []

for i in range(len(pubmed_qa_dataset)):
    contexts_strings.append(' '.join(pubmed_qa_dataset["context"][i]['contexts']))
    
pubmed_qa_dataset['context'] = contexts_strings
pubmed_qa_dataset = pubmed_qa_dataset.rename(columns={"long_answer": "answers"})

In [21]:
pubmed_qa_dataset = pubmed_qa_dataset.head(250)

### BioASQ dataset

In [30]:
dataset_name = "BeIR/bioasq-generated-queries"
column_tuple = ("text", "query")

bioasq_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")
bioasq_qa_dataset = bioasq_qa_dataset.rename(columns={"text": "context", "query": "question"})
bioasq_qa_dataset = bioasq_qa_dataset[["question", "context"]]

Downloading readme: 100%|██████████| 14.0k/14.0k [00:00<00:00, 40.1MB/s]
Downloading data: 100%|██████████| 7.12G/7.12G [09:22<00:00, 12.7MB/s]
Downloading data files: 100%|██████████| 1/1 [09:22<00:00, 562.45s/it]
Extracting data files: 100%|██████████| 1/1 [01:16<00:00, 76.97s/it]
Generating train split: 14100000 examples [01:29, 158196.47 examples/s]


### cuad (legal) dataset

In [23]:
dataset_name = "cuad"
column_tuple = ("question", "context", "answers")

cuad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

In [24]:
answers = []

for i in range(len(cuad_qa_dataset)):
    curr_ans_list = cuad_qa_dataset['answers'][i]['text']
    if len(curr_ans_list)!=0:
        curr_ans = max(curr_ans_list, key = len)
    else:
        curr_ans = ""
    answers.append(curr_ans)
    
cuad_qa_dataset['answers'] = answers

In [25]:
cuad_qa_dataset = cuad_qa_dataset[cuad_qa_dataset["answers"]!=""][:250]

In [None]:
dataset_name = "cuad"
column_tuple = ("question", "context", "answers")

cuad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

### SciQ

In [27]:
dataset_name = "sciq"
column_tuple = ("question", "support", "correct_answer")

sciq_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="test")

Downloading builder script: 100%|██████████| 3.56k/3.56k [00:00<00:00, 11.9MB/s]
Downloading metadata: 100%|██████████| 1.81k/1.81k [00:00<00:00, 16.6MB/s]
Downloading readme: 100%|██████████| 6.84k/6.84k [00:00<00:00, 14.9MB/s]
Downloading data: 100%|██████████| 2.82M/2.82M [00:00<00:00, 4.20MB/s]
Generating train split: 100%|██████████| 11679/11679 [00:00<00:00, 42873.94 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 52097.33 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 52061.12 examples/s]


In [28]:
sciq_qa_dataset = sciq_qa_dataset.rename(columns={"support": "context", "correct_answer": "answers"})


In [30]:
sciq_qa_dataset = sciq_qa_dataset.head(250)

### CovidQA Bio

In [31]:
dataset_name = "covid_qa_deepset"
column_tuple = ("question", "context", "answers")

covidqa_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

Downloading builder script: 100%|██████████| 4.80k/4.80k [00:00<00:00, 12.4MB/s]
Downloading metadata: 100%|██████████| 1.95k/1.95k [00:00<00:00, 12.9MB/s]
Downloading readme: 100%|██████████| 5.61k/5.61k [00:00<00:00, 16.9MB/s]
Downloading data: 4.42MB [00:01, 4.23MB/s]                            
Generating train split: 100%|██████████| 2019/2019 [00:00<00:00, 6526.84 examples/s]


In [33]:
covidqa_qa_dataset['answers'][0]

{'text': ['zanamivir (Relenza) and oseltamivir (Tamiflu)'],
 'answer_start': [2356]}

In [34]:
answers = []

for i in range(len(covidqa_qa_dataset)):
    curr_ans_list = covidqa_qa_dataset['answers'][i]['text']
    if len(curr_ans_list)!=0:
        curr_ans = max(curr_ans_list, key = len)
    else:
        curr_ans = ""
    answers.append(curr_ans)
    
covidqa_qa_dataset['answers'] = answers

In [35]:
covidqa_qa_dataset = covidqa_qa_dataset.head(250)

### Combining datasets

In [37]:
cuad_qa_dataset["domain"] = "legal"
#bioasq_qa_dataset["domain"] = "bio"
pubmed_qa_dataset["domain"] = "bio"
squad_qa_dataset["domain"] = "None"
sciq_qa_dataset["domain"] = "science"
covidqa_qa_dataset["domain"] = "bio"

In [38]:
dataset_model_dict

{'squad_v2': ['mrm8488/longformer-base-4096-finetuned-squadv2',
  'allenai/unifiedqa-t5-base',
  'ixa-ehu/SciBERT-SQuAD-QuAC'],
 'hotpot_qa': ['AdapterHub/roberta-base-pf-hotpotqa'],
 'cuad': ['Rakib/roberta-base-on-cuad', 'akdeniz27/deberta-v2-xlarge-cuad'],
 'trivia_qa': ['allenai/longformer-large-4096-finetuned-triviaqa'],
 'squad': ['ozcangundes/T5-base-for-BioQA',
  'MaRiOrOsSi/t5-base-finetuned-question-answering',
  'vanadhi/roberta-base-fiqa-flm-sq-flit'],
 'BeIR/bioasq-generated-queries': ['ozcangundes/T5-base-for-BioQA'],
 'duorc': ['MaRiOrOsSi/t5-base-finetuned-question-answering',
  'MaRiOrOsSi/t5-base-finetuned-question-answering'],
 'pubmed_qa': ['razent/SciFive-base-Pubmed_PMC', 'microsoft/biogpt'],
 'zhengyun21/PMC-Patients': ['razent/SciFive-base-Pubmed_PMC'],
 'boolq': ['allenai/unifiedqa-t5-base'],
 'race': ['allenai/unifiedqa-t5-base'],
 'quoref': ['allenai/unifiedqa-t5-base'],
 'ropes': ['allenai/unifiedqa-t5-base'],
 'drop': ['allenai/unifiedqa-t5-base'],
 'sagnik

In [41]:
science_list = dataset_model_dict['quac'] + dataset_model_dict['zhengyun21/PMC-Patients']

In [43]:
bio_list = dataset_model_dict['BeIR/bioasq-generated-queries'] + dataset_model_dict['pubmed_qa'] + dataset_model_dict['covid_qa_deepset']

In [53]:
cuad_qa_dataset['models'] = ""
pubmed_qa_dataset['models'] = ""
squad_qa_dataset['models'] = ""
sciq_qa_dataset['models'] = ""
covidqa_qa_dataset['models'] = "" 

In [54]:
cuad_qa_dataset['models'] = cuad_qa_dataset['models'].apply(lambda x: dataset_model_dict['cuad'])
pubmed_qa_dataset['models'] = pubmed_qa_dataset['models'].apply(lambda x: dataset_model_dict['pubmed_qa'])
squad_qa_dataset['models'] = squad_qa_dataset['models'].apply(lambda x: dataset_model_dict['squad'])
sciq_qa_dataset['models'] = sciq_qa_dataset['models'].apply(lambda x: science_list)
covidqa_qa_dataset['models'] = covidqa_qa_dataset['models'].apply(lambda x: bio_list)


In [55]:
eval_dataset = pd.concat([cuad_qa_dataset, pubmed_qa_dataset, squad_qa_dataset], ignore_index=True)

In [57]:
eval_dataset.head(10)

Unnamed: 0,question,context,answers,domain,models
0,Highlight the parts (if any) of this contract ...,"Exhibit 99.1\n\nEXECUTION VERSION\n\nMETLIFE, ...",Deutsche Bank,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
1,Highlight the parts (if any) of this contract ...,Execution version\n\n\n\n Amendment n° 01 t...,"""Repairer'",legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
2,Highlight the parts (if any) of this contract ...,Exhibit 10.1\n\n\n\nPROMOTION AGREEMENT\n\nby ...,King,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
3,Highlight the parts (if any) of this contract ...,Exhibit 10.1 MASTER FRANCHISE AGREEMENT This M...,If third party franchisees are operating the S...,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
4,Highlight the parts (if any) of this contract ...,Exhibit 10.18 MAINTENANCE AGREEMENT MAINTE...,The Provider will obtain and maintain appropri...,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
5,Highlight the parts (if any) of this contract ...,[ * ] = Certain confidential information conta...,Each of Astellas and FG shall retain its recor...,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
6,Highlight the parts (if any) of this contract ...,CONTENT LICENSE AGREEMENT\n\n\n\nTHIS AGREEMEN...,CONTENT LICENSE AGREEMENT,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
7,Highlight the parts (if any) of this contract ...,JOINT DEVELOPMENT AND MARKETING AGREEMENT\n\nB...,"Subject to Article 22.2, this Agreement shall ...",legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
8,Highlight the parts (if any) of this contract ...,Exhibit 10.1 COLLABORATION AGREEMENT THIS ...,CAPSUGEL shall pay to CARDAX a royalty equal t...,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."
9,Highlight the parts (if any) of this contract ...,EXHIBIT 10.3\n\n INTEL...,Neither of the parties hereto may assign its r...,legal,"[Rakib/roberta-base-on-cuad, akdeniz27/deberta..."


In [58]:
eval_dataset.to_csv("eval_dataset_v2.csv")