In [3]:
from datasets import load_dataset
import pandas as pd

In [25]:
def sample_rows_from_dataset(dataset: str,
                             column_names: tuple,
                             *args,
                             num_samples: int = 250,
                             seed: int = 42,
                             **kwargs) -> pd.DataFrame:
    """Load a dataset, shuffle it with a fixed seed and return a column-filtered sample.

    Args:
        dataset: Identifier handed to ``load_dataset`` as its first argument.
        column_names: Tuple of column names to keep, in the order they should
            appear in the returned frame.
        *args: Extra positional arguments forwarded unchanged to
            ``load_dataset`` (e.g. a configuration name).
        num_samples: Upper bound on the number of rows, taken from the front
            of the shuffled dataset.
        seed: Seed passed to the dataset's ``shuffle`` so the sample is
            repeatable.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``load_dataset``.
            NOTE(review): callers should pass ``split=...``; the loaded object
            is sliced by row below, which presumably needs a single split
            rather than a mapping of splits -- confirm against the
            ``datasets`` documentation.

    Returns:
        A DataFrame holding at most ``num_samples`` rows and exactly the
        columns listed in ``column_names``.

    Raises:
        TypeError: If ``column_names`` is not a tuple.
        RuntimeError: If ``load_dataset`` fails; the original error is chained
            as ``__cause__``.
        KeyError: If a requested column is not present in the loaded data.
    """
    if not isinstance(column_names, tuple):
        raise TypeError(
            "column_names must be a tuple of column names as strings, "
            "got {}.".format(type(column_names).__name__)
        )

    try:
        loaded = load_dataset(dataset, *args, **kwargs)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise RuntimeError(
            "Error while loading dataset {!r}: {}".format(dataset, e)
        ) from e

    # Shuffle first, then slice, so the sample is random but seed-reproducible.
    df = pd.DataFrame(loaded.shuffle(seed=seed)[:num_samples])

    missing = [name for name in column_names if name not in df.columns]
    if missing:
        raise KeyError(
            "Columns {} not found in dataset {!r}; available columns: {}".format(
                missing, dataset, list(df.columns)
            )
        )
    return df[list(column_names)]

### SQuAD dataset

In [26]:
# Sample question/context pairs from the "validation" split of "squad".
# No configuration name is passed for this dataset.
dataset_name = "squad"
column_tuple = ("question", "context")

squad_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, split="validation")

### PubMedQA (biomedical) dataset

In [29]:
# Sample question/context pairs from the "train" split of "pubmed_qa",
# using the "pqa_labeled" configuration.
dataset_name = "pubmed_qa"
config = "pqa_labeled"
column_tuple = ("question", "context")

pubmed_qa_dataset = sample_rows_from_dataset(dataset_name, column_tuple, config, split="train")

# Each "context" cell is looked up under its 'contexts' key and the pieces are
# joined with single spaces, so the column ends up holding one string per row.
# Iterating the column directly avoids the label-based `series[i]` lookup,
# which only matches row positions while the frame keeps a default 0..n-1 index.
contexts_strings = [' '.join(entry['contexts']) for entry in pubmed_qa_dataset["context"]]

pubmed_qa_dataset['context'] = contexts_strings

Downloading builder script: 100%|██████████| 11.1k/11.1k [00:00<00:00, 22.4MB/s]
Downloading metadata: 100%|██████████| 12.7k/12.7k [00:00<00:00, 27.1MB/s]
Downloading readme: 100%|██████████| 4.59k/4.59k [00:00<00:00, 15.5MB/s]
Downloading data: 2.58MB [00:00, 5.03MB/s]/3 [00:00<?, ?it/s]
Downloading data: 100%|██████████| 152M/152M [00:05<00:00, 28.5MB/s]]
Downloading data: 100%|██████████| 533M/533M [00:14<00:00, 37.6MB/s]]
Downloading data files: 100%|██████████| 3/3 [00:22<00:00,  7.47s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 229.01it/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 4525.42 examples/s]


### BioASQ dataset

In [30]:
# Sample text/query pairs from the "train" split of the BeIR generated-queries set.
dataset_name = "BeIR/bioasq-generated-queries"
column_tuple = ("text", "query")

# Rename to the question/context column names used by the other samples and
# put the columns in the same order ("question" first).
bioasq_qa_dataset = (
    sample_rows_from_dataset(dataset_name, column_tuple, split="train")
    .rename(columns={"text": "context", "query": "question"})
    [["question", "context"]]
)

Downloading readme: 100%|██████████| 14.0k/14.0k [00:00<00:00, 40.1MB/s]
Downloading data: 100%|██████████| 7.12G/7.12G [09:22<00:00, 12.7MB/s]
Downloading data files: 100%|██████████| 1/1 [09:22<00:00, 562.45s/it]
Extracting data files: 100%|██████████| 1/1 [01:16<00:00, 76.97s/it]
Generating train split: 14100000 examples [01:29, 158196.47 examples/s]


### CUAD (legal) dataset

In [31]:
# Sample question/context pairs from the "train" split of "cuad".
dataset_name = "cuad"
column_tuple = ("question", "context")

cuad_qa_dataset = sample_rows_from_dataset(
    dataset=dataset_name,
    column_names=column_tuple,
    split="train",
)

Downloading builder script: 100%|██████████| 5.19k/5.19k [00:00<00:00, 21.7MB/s]
Downloading metadata: 100%|██████████| 1.91k/1.91k [00:00<00:00, 21.8MB/s]
Downloading readme: 100%|██████████| 15.5k/15.5k [00:00<00:00, 21.3MB/s]
Downloading data: 100%|██████████| 18.3M/18.3M [00:05<00:00, 3.26MB/s]
Generating train split: 100%|██████████| 22450/22450 [00:01<00:00, 12300.53 examples/s]
Generating test split: 100%|██████████| 4182/4182 [00:00<00:00, 15362.74 examples/s]


In [49]:
# Tag every sampled frame in place with a constant "domain" column.
# The SQuAD label is the four-character string "None", not Python's None.
# NOTE(review): pandas.read_csv may treat the text "None" as a missing value by
# default -- confirm how eval_dataset.csv is read back downstream.
for frame, domain_label in (
    (cuad_qa_dataset, "legal"),
    (bioasq_qa_dataset, "bio"),
    (pubmed_qa_dataset, "bio"),
    (squad_qa_dataset, "None"),
):
    frame["domain"] = domain_label

In [50]:
# Stack the four per-source samples into one frame; ignore_index=True replaces
# the per-frame row labels with a fresh 0..n-1 index.
eval_dataset = pd.concat(
    (cuad_qa_dataset, bioasq_qa_dataset, pubmed_qa_dataset, squad_qa_dataset),
    ignore_index=True,
)

In [52]:
# Write the combined evaluation set to a CSV file at a relative path.
# index=True is the default spelled out: the row index is written as well.
eval_dataset.to_csv("eval_dataset.csv", index=True)