In [117]:
from datasets import load_dataset
import pandas as pd
import os
import json

In [118]:
repository_directory = os.path.abspath('') + "/repository"
models_jsons = os.listdir(repository_directory)

In [119]:
dataset_model_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        for dataset in data['dataset']:
            if dataset not in dataset_model_dict:
                dataset_model_dict[dataset] = []
            
            dataset_model_dict[dataset].append(data['model_name'])

In [120]:
domain_model_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        for domain in data['domain']:
            if domain not in domain_model_dict:
                domain_model_dict[domain] = []
            
            domain_model_dict[domain].append(data['model_name'])

In [121]:
domain_dataset_dict = {}

for model_file in models_jsons:
    with open(repository_directory + "/" + model_file) as model_json:
        data = json.load(model_json)
        domain = data['domain'][0]
        for dt in data['dataset']:
            if domain not in domain_dataset_dict:
                domain_dataset_dict[domain] = []
            
            domain_dataset_dict[domain].append(dt)

In [122]:
domain_model_dict

{'None': ['mrm8488/longformer-base-4096-finetuned-squadv2',
  'allenai/longformer-large-4096-finetuned-triviaqa',
  'allenai/unifiedqa-t5-base'],
 'math': ['AdapterHub/roberta-base-pf-hotpotqa',
  'AlexWortega/taskGPT2-xl-v0.2a',
  'vanadhi/roberta-base-fiqa-flm-sq-flit'],
 'legal': ['Rakib/roberta-base-on-cuad', 'akdeniz27/deberta-v2-xlarge-cuad'],
 'bio': ['ozcangundes/T5-base-for-BioQA',
  'microsoft/biogpt',
  'Sarmila/pubmed-bert-squad-covidqa'],
 'narrative': ['MaRiOrOsSi/t5-base-finetuned-question-answering'],
 'science': ['razent/SciFive-base-Pubmed_PMC', 'ixa-ehu/SciBERT-SQuAD-QuAC'],
 'finance': ['vanadhi/roberta-base-fiqa-flm-sq-flit']}

In [123]:
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 250,
                             seed: int = 1001,
                             **kwargs) -> pd.DataFrame:    
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, split="test")
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e
    
    
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 2000,
                             seed: int = 1001,
                             **kwargs) -> pd.DataFrame:
    if not isinstance(column_names, tuple):
        raise Exception("Column names need to be a list of column names as strings.")
    try:
        dataset = load_dataset(dataset, *args, **kwargs)
    except Exception as e:
        print("Could NOT load dataset for {0}".format(dataset))
        raise Exception("Error while loading dataset {}".format(e))
    shuffled_dataset = dataset.shuffle(seed=seed)
    df = pd.DataFrame(shuffled_dataset[:num_samples])
    try:
        return df[list(column_names)]
    except KeyError as e:
        raise e
    
def extract_answer_from_list(dataset, answer_column, answer_column_json_key):
    answers = []
    
    for i in range(len(dataset)):
        curr_ans_list = dataset[answer_column][i][answer_column_json_key]
        if len(curr_ans_list)==0:
            curr_ans = ""
        else:
            curr_ans = max(curr_ans_list, key = len)
            
        answers.append(curr_ans)
        
    dataset["answer"] = answers
    return dataset
    
def extract_relevant_columns(dataset, columns):
    return dataset[columns]

### Squad Dataset

In [124]:
dataset_name = "squad"
configs = None
column_tuple = ("question", "context", "answers")

squad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="validation")

In [125]:
extract_answer_from_list(squad_qa_dataset, "answers", "text")

Unnamed: 0,question,context,answers,answer
0,What Western was a flagship program for ABC ar...,At the same time he made attempts to help grow...,"{'text': ['The Lone Ranger', 'The Lone Ranger'...",The Lone Ranger
1,What types of scientists looks for signs of ma...,"In the laboratory, biostratigraphers analyze r...","{'text': ['Magnetic stratigraphers', 'Magnetic...",Magnetic stratigraphers
2,What happened in 1901?,"In December 1901, Marconi successfully transmi...",{'text': ['Marconi successfully transmitted th...,Marconi successfully transmitted the letter S ...
3,What was the name of France's primary colony i...,The exodus of Huguenots from France created a ...,"{'text': ['New France', 'New France', 'New Fra...",New France
4,What is not considered appropriate disclipine?,A modern example of school discipline in North...,{'text': ['sarcasm and attempts to humiliate p...,sarcasm and attempts to humiliate pupils
...,...,...,...,...
1995,What type of treaty was the Lisbon Treaty?,"Following the Nice Treaty, there was an attemp...","{'text': ['an amending treaty', 'an amending t...",an amending treaty
1996,What is the term that describes the difference...,Neoclassical economics views inequalities in t...,"{'text': ['productivity gap', 'productivity ga...",productivity gap
1997,In which region tribe were large settlements d...,"Terra preta (black earth), which is distribute...","{'text': ['Xingu tribe', 'Xingu', 'Xingu'], 'a...",Xingu tribe
1998,Where are some physicians permitted to prescri...,"In some rural areas in the United Kingdom, the...",{'text': ['In some rural areas in the United K...,prescribe and dispense prescription-only medic...


### Pubmed Biology Dataset

In [126]:
dataset_name = "pubmed_qa"
config = "pqa_labeled"
column_tuple = ("question", "context", "long_answer")

pubmed_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

contexts_strings = []

for i in range(len(pubmed_qa_dataset)):
    contexts_strings.append(' '.join(pubmed_qa_dataset["context"][i]['contexts']))
    
pubmed_qa_dataset['context'] = contexts_strings
pubmed_qa_dataset = pubmed_qa_dataset.rename(columns={"long_answer": "answer"})

### BioASQ dataset

In [127]:
bioasq_dataset = pd.read_csv("bioasq.csv")

In [128]:
bioasq_dataset = bioasq_dataset.rename(columns={"Question": "question", "Context":"context", "Answer":"answer"})

### SciQ

In [129]:
dataset_name = "sciq"
column_tuple = ("question", "support", "correct_answer")

sciq_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="test")

In [130]:
sciq_qa_dataset = sciq_qa_dataset.rename(columns={"support": "context", "correct_answer": "answer"})


### CovidQA Bio

In [131]:
dataset_name = "covid_qa_deepset"
column_tuple = ("question", "context", "answers")

covidqa_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

In [132]:
extract_answer_from_list(covidqa_qa_dataset, "answers", "text")

Unnamed: 0,question,context,answers,answer
0,Why is this approach significant?,Respiratory Viral Infections in Exacerbation o...,{'text': ['due to the current scarcity of anti...,due to the current scarcity of antiviral drugs...
1,When was SARS-CoV first identified?,Host resilience to emerging coronaviruses\n\nh...,"{'text': ['2003'], 'answer_start': [1375]}",2003
2,What is the amino acid similarity between IFIT...,Role of S-Palmitoylation on IFITM5 for the Int...,"{'text': ['~ 65% similarity'], 'answer_start':...",~ 65% similarity
3,What family of virus does MERS reside in?,Host resilience to emerging coronaviruses\n\nh...,"{'text': ['coronavirus'], 'answer_start': [1018]}",coronavirus
4,How can a semi-mechanistic Bayesian hierarchic...,Estimating the number of infections and the im...,{'text': ['calculating backwards from the deat...,calculating backwards from the deaths observed...
...,...,...,...,...
1995,Patients from how many medical centers were st...,Which Kind of Provider’s Operation Volumes Mat...,"{'text': ['19'], 'answer_start': [9132]}",19
1996,What domembers of the Roquin and Regnase famil...,Frontiers in antiviral therapy and immunothera...,{'text': ['promote or effect degradation of mR...,promote or effect degradation of mRNAs harbour...
1997,What did the study report?,Chikungunya: A Potentially Emerging Epidemic?\...,{'text': ['neonatal infection associated with ...,neonatal infection associated with intrapartum...
1998,Why is additional research needed?,First cases of coronavirus disease 2019 (COVID...,{'text': ['to complement surveillance data to ...,to complement surveillance data to build knowl...


### MLQA

In [133]:
dataset_name = "mlqa"
column_tuple = ("question", "context", "answers")
config = "mlqa-translate-test.ar"

mlqa_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="test")

In [134]:
extract_answer_from_list(mlqa_qa_dataset, "answers", "text")

Unnamed: 0,question,context,answers,answer
0,Where is your look district located?,Donetsk () is a district (Province) in the Sou...,"{'answer_start': [-1], 'text': ['South East Uk...",South East Ukraine
1,"What does the term ""hrs"" mean?",Consultancy (33 %) - the consultancy services ...,"{'answer_start': [-1], 'text': ['Human resourc...",Human resources services
2,When did james land,James traveled to Ireland - with the help of F...,"{'answer_start': [64], 'text': ['March 1689..']}",March 1689..
3,What is the degree that has been achieved?,"Houston replied (Houston): "" thank you Apollo ...","{'answer_start': [1259], 'text': ['master of s...",master of science
4,What is the date of Modric's first goal in 201...,"On 11 September 2010, Modric scored his first ...","{'answer_start': [-1], 'text': ['September 11,...","September 11, 2010"
...,...,...,...,...
1995,What happened to the vaccine market when the t...,"In the late th century, vaccines were produced...","{'answer_start': [359], 'text': ['improved']}",improved
1996,What is the nationality of Laura Christine K?,"Christine Kreuk (born December 30, 1982 in Van...","{'answer_start': [72], 'text': ['Canada']}",Canada
1997,What is the available content?,"Founded on February 14, 2005, three former emp...","{'answer_start': [-1], 'text': ['Movie Clips, ...","Movie Clips, TV, music, as well as video produ..."
1998,What is Roger's mother's first name in Federer?,"Federer was born at Canton hospital in Basel, ...","{'answer_start': [174], 'text': ['Lynette']}",Lynette


### MRQA

In [135]:
dataset_name = "mrqa"
column_tuple = ("question", "context", "answers")

mrqa_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="test")

In [136]:
answers = []

for i in range(len(mrqa_qa_dataset)):
    curr_ans_list = mrqa_qa_dataset['answers'][i]
    curr_ans = max(curr_ans_list, key = len)
    answers.append(curr_ans)
    
mrqa_qa_dataset['answer'] = answers

### logi_qa

In [137]:
dataset_name = "lucasmccabe/logiqa"
column_tuple = ("query", "context", "options", "correct_option")

logi_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="train")

In [138]:
answers = []

for i in range(len(logi_qa_dataset)):
    curr_options = logi_qa_dataset['options'][i]
    correct_answer = int(logi_qa_dataset['correct_option'][i])
    answers.append(curr_options[correct_answer])
    
logi_qa_dataset['answer'] = answers

In [139]:
logi_qa_dataset = logi_qa_dataset.rename(columns={"query": "question"})

### subjqa - grocery

In [140]:
dataset_name = "subjqa"
column_tuple = ("question", "context", "answers")
config = "grocery"

subjqa_grocery_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

In [141]:
extract_answer_from_list(subjqa_grocery_qa_dataset, "answers", "text")

Unnamed: 0,question,context,answers,answer
0,Where i can find a new kernel?,I made this on the stovetop using an older sty...,"{'text': [], 'answer_start': [], 'answer_subj_...",
1,How is amount?,These Peeled Snacks Organic Apple Clusters are...,{'text': ['the serving size is extremely small...,the serving size is extremely small
2,How is the calorie?,I was attracted to this product because I've b...,"{'text': ['120 calories'], 'answer_start': [26...",120 calories
3,How is the sauce?,My whole family really enjoyed this kit. The s...,"{'text': ['The sauce was fresh', 'and super ea...",and super easy to make
4,How did you think about the selection?,I have to say that I was surprisingly impresse...,"{'text': [], 'answer_start': [], 'answer_subj_...",
...,...,...,...,...
1119,How natural does it taste?,I can't say this tastes like apples but it doe...,{'text': ['I can't say this tastes like apples...,I can't say this tastes like apples
1120,What is the quality of the blend?,Vanilla to me is like licorice: one of those f...,"{'text': [], 'answer_start': [], 'answer_subj_...",
1121,How do you like the texture?,I grew up on homemade andamaranth graham crack...,"{'text': ['andamaranth', 'The flavor and textu...",The flavor and texture more than makes up for ...
1122,Is that a cup of coffee?,"As in, I think I need to double the amount of ...",{'text': ['I think I need to double the amount...,I think I need to double the amount of coffee ...


### subjqa - restaurants

In [142]:
dataset_name = "subjqa"
column_tuple = ("question", "context", "answers")
config = "restaurants"

subjqa_restaurants_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

In [143]:
extract_answer_from_list(subjqa_restaurants_qa_dataset, "answers", "text")

Unnamed: 0,question,context,answers,answer
0,Do you have a good working atmosphere?,"I've been wanting to go for the longest time, ...","{'text': ['It is a loud and fun environment', ...",It is a loud and fun environment
1,What are the burgers like in this spot?,A few foodie coworkers of mine read the glowin...,"{'text': ['which tasted perfectly fine, but it...","which tasted perfectly fine, but it wasn't as ..."
2,How is the seat?,Gluten free fried chicken and waffles!?! Yes p...,"{'text': [], 'answer_start': [], 'answer_subj_...",
3,Is the poutine in the menu?,$28 for a dozen wings?!! Seriously!!!!!? Their...,"{'text': [], 'answer_start': [], 'answer_subj_...",
4,Do they have a variety of salad?,I went on a Sunday night for dinner. They have...,{'text': ['but it's a great place for a date o...,but it's a great place for a date or a fancy d...
...,...,...,...,...
1395,How is the vibe for this game?,"I'm not much of a sports guy, but even I can s...","{'text': [], 'answer_start': [], 'answer_subj_...",
1396,How is the atmosphere?,"Always a good to time at Nome! Great food, gre...","{'text': ['the environment is amazing'], 'answ...",the environment is amazing
1397,How is the price?,Delish! We get a variety of dishes all the tim...,"{'text': [], 'answer_start': [], 'answer_subj_...",
1398,How is the sauce in that restaurant?,It's really diffcult for me to figure out how ...,"{'text': [], 'answer_start': [], 'answer_subj_...",


## Dataset List

In [144]:
dataset_list = [bioasq_dataset, pubmed_qa_dataset, squad_qa_dataset, sciq_qa_dataset, covidqa_qa_dataset, 
                mlqa_qa_dataset, mrqa_qa_dataset, logi_qa_dataset, subjqa_grocery_qa_dataset, subjqa_restaurants_qa_dataset]

dataset_list_names = ["bioasq_dataset", "pubmed_qa_dataset", "squad_qa_dataset", "sciq_qa_dataset", "covidqa_qa_dataset", 
                "mlqa_qa_dataset", "mrqa_qa_dataset", "logi_qa_dataset", "subjqa_grocery_qa_dataset", "subjqa_restaurants_qa_dataset"]

# Filtering Dataset Columns

In [145]:
column_list = ["context", "question", "answer"]

for i in range(len(dataset_list)):
    dataset_list[i] = extract_relevant_columns(dataset_list[i], column_list)

### Combining datasets

#### Adding domains of the dataset

In [146]:
dataset_list[0]["domain"] = "bio"
dataset_list[1]["domain"] = "bio"
dataset_list[2]["domain"] = "general"
dataset_list[3]["domain"] = "science"
dataset_list[4]["domain"] = "bio"
dataset_list[5]["domain"] = "general"
dataset_list[6]["domain"] = "general"
dataset_list[7]["domain"] = "general"
dataset_list[8]["domain"] = "general"
dataset_list[9]["domain"] = "general"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[0]["domain"] = "bio"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[1]["domain"] = "bio"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[3]["domain"] = "science"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

#### Adding names of the dataset

In [147]:
for i in range(len(dataset_list)):
    dataset_list[i]["models"] = ""
    dataset_list[i]["dataset"] = dataset_list_names[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[i]["models"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[i]["dataset"] = dataset_list_names[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_list[i]["models"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexe

#### Adding models to the dataset

In [148]:
for i in range(len(dataset_list)):
    dataset = dataset_list[i]
    domain = dataset['domain'][0]
    
    if domain=="general":
        dataset['models'] = dataset['models'].apply(lambda x: domain_model_dict["None"])
    else:
        dataset['models'] = dataset['models'].apply(lambda x: domain_model_dict[domain])
    
    dataset_list[i] = dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['models'] = dataset['models'].apply(lambda x: domain_model_dict[domain])


#### Filter empty rows

In [149]:
def filter_empty_rows(dataset):
    dataset = dataset[dataset['context']!='']
    dataset = dataset[dataset['question']!='']
    dataset = dataset[dataset['answer']!='']
    return dataset
    
for i in range(len(dataset_list)):
    dataset_list[i] = filter_empty_rows(dataset_list[i])

In [150]:
for i in range(len(dataset_list)):
    print(len(dataset_list[i]))

4719
1000
2000
884
2000
2000
2000
2000
523
598


#### Convert to lower case

In [151]:
def convert_to_lower_case(dataset):
    dataset['question'] = dataset['question'].str.lower() 
    dataset['context'] = dataset['context'].str.lower() 
    dataset['answer'] = dataset['answer'].str.lower() 
    return dataset
    
for i in range(len(dataset_list)):
    dataset_list[i] = convert_to_lower_case(dataset_list[i])

In [154]:
for i in range(len(dataset_list)):
    print(len(dataset_list[i]))

1000
1000
1000
884
1000
1000
1000
1000
523
598


#### Filtering 1000 from each dataset

In [153]:
def get_top_k(dataset, k):
    dataset = dataset.sample(frac=1).reset_index(drop=True)
    dataset = dataset.head(k)
    return dataset

for i in range(len(dataset_list)):
    dataset_list[i] = get_top_k(dataset_list[i], 1000)

#### Combining datasets

In [155]:
eval_dataset = pd.concat(dataset_list, ignore_index=True)

In [156]:
len(eval_dataset)

9005

In [157]:
eval_dataset = eval_dataset.sample(frac=1).reset_index(drop=True)

In [158]:
eval_dataset.to_csv("eval_dataset.csv")