In [1]:
from datasets import load_dataset
import pandas as pd
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
repository_directory = os.path.abspath('') + "/repository"
models_jsons = os.listdir(repository_directory)

In [3]:
dataset_model_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        for dataset in data['dataset']:
            if dataset not in dataset_model_dict:
                dataset_model_dict[dataset] = []
            
            dataset_model_dict[dataset].append(data['model_name'])

In [4]:
domain_model_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        for domain in data['domain']:
            if domain not in domain_model_dict:
                domain_model_dict[domain] = []
            
            domain_model_dict[domain].append(data['model_name'])

In [5]:
domain_dataset_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        domain = data['domain'][0]
        for dt in data['dataset']:
            if domain not in domain_dataset_dict:
                domain_dataset_dict[domain] = []
            
            domain_dataset_dict[domain].append(dt)

In [6]:
domain_model_dict

{'None': ['mrm8488/longformer-base-4096-finetuned-squadv2',
  'allenai/longformer-large-4096-finetuned-triviaqa',
  'allenai/unifiedqa-t5-base'],
 'math': ['AdapterHub/roberta-base-pf-hotpotqa',
  'AlexWortega/taskGPT2-xl-v0.2a',
  'vanadhi/roberta-base-fiqa-flm-sq-flit'],
 'legal': ['Rakib/roberta-base-on-cuad', 'akdeniz27/deberta-v2-xlarge-cuad'],
 'bio': ['ozcangundes/T5-base-for-BioQA',
  'microsoft/biogpt',
  'Sarmila/pubmed-bert-squad-covidqa'],
 'narrative': ['MaRiOrOsSi/t5-base-finetuned-question-answering'],
 'science': ['razent/SciFive-base-Pubmed_PMC', 'ixa-ehu/SciBERT-SQuAD-QuAC'],
 'finance': ['vanadhi/roberta-base-fiqa-flm-sq-flit']}

In [7]:
dataset_model_dict

{'squad_v2': ['mrm8488/longformer-base-4096-finetuned-squadv2',
  'allenai/unifiedqa-t5-base',
  'ixa-ehu/SciBERT-SQuAD-QuAC'],
 'hotpot_qa': ['AdapterHub/roberta-base-pf-hotpotqa'],
 'cuad': ['Rakib/roberta-base-on-cuad', 'akdeniz27/deberta-v2-xlarge-cuad'],
 'trivia_qa': ['allenai/longformer-large-4096-finetuned-triviaqa'],
 'squad': ['ozcangundes/T5-base-for-BioQA',
  'MaRiOrOsSi/t5-base-finetuned-question-answering',
  'vanadhi/roberta-base-fiqa-flm-sq-flit'],
 'BeIR/bioasq-generated-queries': ['ozcangundes/T5-base-for-BioQA'],
 'duorc': ['MaRiOrOsSi/t5-base-finetuned-question-answering',
  'MaRiOrOsSi/t5-base-finetuned-question-answering'],
 'pubmed_qa': ['razent/SciFive-base-Pubmed_PMC', 'microsoft/biogpt'],
 'zhengyun21/PMC-Patients': ['razent/SciFive-base-Pubmed_PMC'],
 'boolq': ['allenai/unifiedqa-t5-base'],
 'race': ['allenai/unifiedqa-t5-base'],
 'quoref': ['allenai/unifiedqa-t5-base'],
 'ropes': ['allenai/unifiedqa-t5-base'],
 'drop': ['allenai/unifiedqa-t5-base'],
 'sagnik

In [8]:
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 250,
                             seed: int = 121,
                             **kwargs) -> pd.DataFrame:    
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, split="test")
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e
    
    
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 2000,
                             seed: int = 121,
                             **kwargs) -> pd.DataFrame:
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, **kwargs)
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e

### Squad Dataset

In [9]:
dataset_name = "squad"
configs = None
column_tuple = ("question", "context", "answers")

squad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="validation")

In [10]:
squad_qa_dataset = squad_qa_dataset.head(250)

In [11]:
answers = []

for i in range(len(squad_qa_dataset)):
    curr_ans_list = squad_qa_dataset['answers'][i]['text']
    curr_ans = max(curr_ans_list, key = len)
    answers.append(curr_ans)
    
squad_qa_dataset['answers'] = answers

### Pubmed Biology Dataset

In [12]:
dataset_name = "pubmed_qa"
config = "pqa_labeled"
column_tuple = ("question", "context", "long_answer")

pubmed_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

contexts_strings = []

for i in range(len(pubmed_qa_dataset)):
    contexts_strings.append(' '.join(pubmed_qa_dataset["context"][i]['contexts']))
    
pubmed_qa_dataset['context'] = contexts_strings
pubmed_qa_dataset = pubmed_qa_dataset.rename(columns={"long_answer": "answers"})

In [13]:
pubmed_qa_dataset = pubmed_qa_dataset.head(250)

### BioASQ dataset

In [30]:
dataset_name = "BeIR/bioasq-generated-queries"
column_tuple = ("text", "query")

bioasq_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")
bioasq_qa_dataset = bioasq_qa_dataset.rename(columns={"text": "context", "query": "question"})
bioasq_qa_dataset = bioasq_qa_dataset[["question", "context"]]

Downloading readme: 100%|██████████| 14.0k/14.0k [00:00<00:00, 40.1MB/s]
Downloading data: 100%|██████████| 7.12G/7.12G [09:22<00:00, 12.7MB/s]
Downloading data files: 100%|██████████| 1/1 [09:22<00:00, 562.45s/it]
Extracting data files: 100%|██████████| 1/1 [01:16<00:00, 76.97s/it]
Generating train split: 14100000 examples [01:29, 158196.47 examples/s]


### cuad (legal) dataset

In [14]:
dataset_name = "cuad"
column_tuple = ("question", "context", "answers")

cuad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

In [15]:
answers = []

for i in range(len(cuad_qa_dataset)):
    curr_ans_list = cuad_qa_dataset['answers'][i]['text']
    if len(curr_ans_list)!=0:
        curr_ans = max(curr_ans_list, key = len)
    else:
        curr_ans = ""
    answers.append(curr_ans)
    
cuad_qa_dataset['answers'] = answers

In [16]:
cuad_qa_dataset = cuad_qa_dataset[cuad_qa_dataset["answers"]!=""][:250]

### SciQ

In [18]:
dataset_name = "sciq"
column_tuple = ("question", "support", "correct_answer")

sciq_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="test")

In [19]:
sciq_qa_dataset = sciq_qa_dataset.rename(columns={"support": "context", "correct_answer": "answers"})


In [20]:
sciq_qa_dataset = sciq_qa_dataset.head(250)

### CovidQA Bio

In [21]:
dataset_name = "covid_qa_deepset"
column_tuple = ("question", "context", "answers")

covidqa_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

In [22]:
covidqa_qa_dataset['answers'][0]

{'text': ['zanamivir (Relenza) and oseltamivir (Tamiflu)'],
 'answer_start': [2356]}

In [23]:
answers = []

for i in range(len(covidqa_qa_dataset)):
    curr_ans_list = covidqa_qa_dataset['answers'][i]['text']
    if len(curr_ans_list)!=0:
        curr_ans = max(curr_ans_list, key = len)
    else:
        curr_ans = ""
    answers.append(curr_ans)
    
covidqa_qa_dataset['answers'] = answers

In [24]:
covidqa_qa_dataset = covidqa_qa_dataset.head(250)

### Combining datasets

In [25]:
cuad_qa_dataset["domain"] = "legal"
#bioasq_qa_dataset["domain"] = "bio"
pubmed_qa_dataset["domain"] = "bio"
squad_qa_dataset["domain"] = "None"
sciq_qa_dataset["domain"] = "science"
covidqa_qa_dataset["domain"] = "bio"

In [26]:
science_list = dataset_model_dict['quac'] + dataset_model_dict['zhengyun21/PMC-Patients']

In [27]:
bio_list = dataset_model_dict['BeIR/bioasq-generated-queries'] + dataset_model_dict['pubmed_qa'] + dataset_model_dict['covid_qa_deepset']

In [28]:
cuad_qa_dataset['models'] = ""
pubmed_qa_dataset['models'] = ""
squad_qa_dataset['models'] = ""
sciq_qa_dataset['models'] = ""
covidqa_qa_dataset['models'] = "" 

In [29]:
cuad_qa_dataset['models'] = cuad_qa_dataset['models'].apply(lambda x: domain_model_dict['legal'])
pubmed_qa_dataset['models'] = pubmed_qa_dataset['models'].apply(lambda x: domain_model_dict['bio'])
squad_qa_dataset['models'] = squad_qa_dataset['models'].apply(lambda x: domain_model_dict['None'])
sciq_qa_dataset['models'] = sciq_qa_dataset['models'].apply(lambda x: domain_model_dict['science'])
covidqa_qa_dataset['models'] = covidqa_qa_dataset['models'].apply(lambda x: domain_model_dict['bio'])


In [30]:
eval_dataset = pd.concat([cuad_qa_dataset, pubmed_qa_dataset, squad_qa_dataset, sciq_qa_dataset, covidqa_qa_dataset], ignore_index=True)

In [32]:
eval_dataset = eval_dataset.sample(frac=1).reset_index(drop=True)

In [37]:
eval_dataset.to_csv("eval_dataset_v2.csv")

In [35]:
df = pd.read_csv("eval_dataset_v2.csv")