In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from promptsource.templates import DatasetTemplates

  from .autonotebook import tqdm as notebook_tqdm


### Data used in the preprint "Maybe Only 0.5% Data is Needed: A Preliminary Exploration of Low Training Data Instruction Tuning"

In [62]:
# Randomly pick an instruction template, following the settings in Section 4.1 of the paper -- 0.5% dataset

# Task category -> datasets in that category. The key is what ends up in the
# 'category' column of every row, and the iteration order of this dict (and of
# each list) is the order in which the datasets are loaded and concatenated.
#
# Fix: Winogrande and WSC are coreference-resolution tasks and WiC is the
# word-sense-disambiguation task; the two keys used to be attached to the
# wrong lists. Only the keys changed -- the list order is untouched, so row
# order is the same as before. Datasets built before this fix (including any
# copy already pushed to the Hub) still carry the old, swapped labels.
p3_datasets = {
    'nli': ['rte', 'cb', 'anli_r1', 'anli_r2', 'anli_r3'], # Natural Language Inference
    'sc': ['copa', 'hellaswag', 'storycloze'], # Sentence Complement
    'cr': ['winogrande', 'wsc'], # Coreference Resolution
    'wsd': ['wic'], # Word Sense Disambiguation
}

prompt_mapping = { # Mapping from dataset name to prompt template in bigscience/P3
    'rte': 'super_glue_rte_does_it_follow_that', # RTE
    'cb': 'super_glue_cb_does_it_follow_that', # CB
    'anli_r1': 'anli_does_it_follow_that_r1', # ANLI R1
    'anli_r2': 'anli_does_it_follow_that_r2', # ANLI R2
    'anli_r3': 'anli_does_it_follow_that_r3', # ANLI R3
    'copa': 'super_glue_copa_cause_effect', # COPA
    'hellaswag': 'hellaswag_complete_first_then', # HellaSwag
    'storycloze': 'storycloze_choose_story_ending', # Story Cloze
    'winogrande': 'winogrande_winogrande_xl_fill_in_the_blank', # Winogrande
    'wsc': 'super_glue_wsc.fixed_replaced_with', # WSC
    'wic': 'super_glue_wic_question_context', # WIC
}

# Datasets that are read from local files instead of bigscience/P3:
# dataset name -> directory holding its CSV files.
dataset_path = {
    'storycloze': './data/storycloze/'
}

In [63]:
import os
import random
# Fix the seed of Python's `random` module, which the StoryCloze wrong-ending
# sampling further down draws from.
random.seed(42)

# Accumulators filled by the loading loop: one processed train split and one
# processed test split per dataset.
all_train_datasets, all_test_datasets = [], []

def process_p3_datasets(dataset):
    """Trim a bigscience/P3 split to the shared schema and tag each row.

    Only the 'answer_choices', 'inputs_pretokenized' and 'targets_pretokenized'
    columns are kept; they are renamed to 'choices', 'input' and 'label'.
    Every row then receives 'dataset', 'category' and 'prompt_template' fields.

    NOTE: the tag values are read from the module-level names ``dataset_name``,
    ``category`` and ``prompt_template`` at call time, so those must already be
    bound (the loading loop sets them) before this function is called.

    Args:
        dataset: Dataset-like object exposing ``column_names``,
            ``remove_columns``, ``rename_column`` and ``map``.

    Returns:
        The trimmed, renamed and tagged dataset.
    """
    renames = (
        ('answer_choices', 'choices'),
        ('inputs_pretokenized', 'input'),
        ('targets_pretokenized', 'label'),
    )
    source_columns = [old for old, _ in renames]

    # Drop every column that is not one of the three source columns.
    extra_columns = [col for col in dataset.column_names if col not in source_columns]
    dataset = dataset.remove_columns(extra_columns)

    # Rename to the schema shared with the StoryCloze rows.
    for old, new in renames:
        dataset = dataset.rename_column(old, new)

    def tag(example):
        # Provenance fields first, then the row's own fields.
        return {'dataset': dataset_name, 'category': category, 'prompt_template': prompt_template, **example}

    return dataset.map(tag)

def process_storycloze_datasets(dataset, prompt_name, prompt_template):
    """Render StoryCloze rows with a prompt template and reduce them to the shared schema.

    Args:
        dataset: Dataset-like object exposing ``map``, ``remove_columns`` and
            ``column_names``. Each row must contain 'sentence_quiz1' and
            'sentence_quiz2' plus whatever fields ``prompt_template`` reads.
        prompt_name: Value stored in the 'prompt_template' column.
        prompt_template: Object whose ``apply(example)`` returns a sequence
            with the rendered input at index 0 and the target at index 1.

    Returns:
        The mapped dataset restricted to the columns 'dataset', 'category',
        'prompt_template', 'input', 'label' and 'choices'.

    Raises:
        IndexError: if ``prompt_template.apply`` returns fewer than two items.
    """
    dataset_name = 'storycloze'
    category = 'sc'
    kept_columns = ('dataset', 'category', 'prompt_template', 'input', 'label', 'choices')

    def render(example):
        # Apply the template once per row (it used to be applied twice, once
        # for 'input' and once for 'label'), so both fields are guaranteed to
        # come from the same rendering and the work is halved.
        rendered = prompt_template.apply(example)
        return {
            'dataset': dataset_name,
            'category': category,
            'prompt_template': prompt_name,
            'input': rendered[0],
            'label': rendered[1],
            'choices': [example['sentence_quiz1'], example['sentence_quiz2']],
        }

    dataset = dataset.map(render)
    # Drop the raw story/quiz columns; only the shared schema remains.
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in kept_columns])

    return dataset
    
# Build one processed train split and one processed test split for every dataset
# listed in p3_datasets and append them to all_train_datasets / all_test_datasets.
# NOTE: process_p3_datasets reads the loop variables `category`, `dataset_name`
# and `prompt_template` as globals, so these names must not be renamed here.
for category, datasets in p3_datasets.items():
    for dataset_name in datasets:
        prompt_template = prompt_mapping[dataset_name]
        if dataset_name == 'storycloze':
            # StoryCloze is read from local CSV files rather than from bigscience/P3.
            path = dataset_path[dataset_name]

            # Turn the mapping entry into a template name: drop the leading
            # 'storycloze' token, join the rest with spaces and title-case it
            # ('storycloze_choose_story_ending' -> 'Choose Story Ending').
            prompt_name = ' '.join(prompt_template.split('_')[1:]).title()
            # From here on prompt_template is the template object, not the string.
            prompt_template = DatasetTemplates('story_cloze/2016')[prompt_name]
            def sample_random_wrong_answer(example):
                """Pair the row's true ending with a random different ending.

                The true ending is example['sentence5']; the distractor is drawn
                from sentence5_collections (bound below, before the first call)
                and redrawn until it differs from the true ending. Returns the
                two endings as 'sentence_quiz1' / 'sentence_quiz2' together with
                'answer_right_ending', the 1-based position of the true ending.
                """
                sentence_quiz1 = example['sentence5']
                sentence_quiz2 = random.choice(sentence5_collections)
                answer_right_ending = 1
                # NOTE(review): never terminates if every entry of
                # sentence5_collections equals this row's ending.
                while sentence_quiz2 == sentence_quiz1:
                    sentence_quiz2 = random.choice(sentence5_collections)
                
                # With probability 0.5 swap the two endings, and record that the
                # true ending now sits in position 2 instead of position 1.
                if random.random() < 0.5:
                    sentence_quiz1, sentence_quiz2 = sentence_quiz2, sentence_quiz1
                    answer_right_ending = 2
                return {'sentence_quiz1': sentence_quiz1, 'sentence_quiz2': sentence_quiz2, 'answer_right_ending': answer_right_ending}
            
            # Load the local CSV files; the loaded data is read back under the
            # 'train' key for both files, including validation.csv.
            train_dataset = load_dataset("csv", data_files=os.path.join(path, 'train.csv'))
            test_dataset = load_dataset("csv", data_files=os.path.join(path, 'validation.csv'))
            # Pool of all training endings, used as the source of distractors.
            sentence5_collections = train_dataset['train']['sentence5']

            # Bring both files to the same field names: train rows get a sampled
            # wrong ending, validation rows already carry two candidate endings.
            train_dataset = train_dataset['train'].map(lambda example: {**sample_random_wrong_answer(example), 'input_sentence_1': example['sentence1'], 'input_sentence_2': example['sentence2'], 'input_sentence_3': example['sentence3'], 'input_sentence_4': example['sentence4']})
            test_dataset = test_dataset['train'].map(lambda example: {'input_sentence_1': example['InputSentence1'], 'input_sentence_2': example['InputSentence2'], 'input_sentence_3': example['InputSentence3'], 'input_sentence_4': example['InputSentence4'], 'sentence_quiz1': example['RandomFifthSentenceQuiz1'], 'sentence_quiz2': example['RandomFifthSentenceQuiz2'], 'answer_right_ending': example['AnswerRightEnding']})

            # Render the prompt and keep only the shared output columns.
            train_dataset = process_storycloze_datasets(train_dataset, prompt_name, prompt_template)
            test_dataset = process_storycloze_datasets(test_dataset, prompt_name, prompt_template)
            
        else:
            # Here prompt_template is the bigscience/P3 configuration name.
            train_dataset = load_dataset("bigscience/P3", prompt_template, split='train')
            test_dataset = load_dataset("bigscience/P3", prompt_template, split='test')
            
            # Label information does not exist in some test splits: if the first
            # test example's target is not one of its answer choices, use the
            # validation split as the test set instead.
            # NOTE(review): only the first example is inspected.
            answer_choices = test_dataset['answer_choices'][0]
            if test_dataset['targets_pretokenized'][0] not in answer_choices:
                test_dataset = load_dataset("bigscience/P3", prompt_template, split='validation')

            # Keep only choices / input / label and tag every row with
            # dataset, category and prompt_template.
            train_dataset = process_p3_datasets(train_dataset)
            test_dataset = process_p3_datasets(test_dataset)
            
        all_train_datasets.append(train_dataset)
        all_test_datasets.append(test_dataset)

In [65]:
# Merge all datasets
from datasets import concatenate_datasets

# Combine the per-dataset splits collected in all_train_datasets /
# all_test_datasets into a single train set and a single test set.
# NOTE: this rebinds train_dataset / test_dataset, which the loading loop
# also used as its per-dataset temporaries.
train_dataset = concatenate_datasets(all_train_datasets)
test_dataset = concatenate_datasets(all_test_datasets)

# combine train_dataset and test_dataset as 'train' and 'test' split

In [66]:
from datasets import DatasetDict
# Bundle the merged sets under the split names 'train' and 'test'.
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})
# Bare expression so the notebook displays the resulting DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['choices', 'input', 'label', 'dataset', 'category', 'prompt_template'],
        num_rows: 304955
    })
    test: Dataset({
        features: ['choices', 'input', 'label', 'dataset', 'category', 'prompt_template'],
        num_rows: 17255
    })
})

In [68]:
# Push both splits to the Hugging Face datasets hub as a private repository.
# NOTE(review): presumably requires an authenticated Hub session with write
# access to the 'simonycl' namespace -- confirm before re-running.
dataset.push_to_hub('simonycl/p3_0.5_dataset', private=True)

Creating parquet from Arrow format: 100%|██████████| 305/305 [00:00<00:00, 419.69ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:05<00:00,  5.48s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 202.17ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it]


In [3]:
# Load only the 'train' split of the repository that the push cell above wrote to.
p3_subset_dataset = load_dataset('simonycl/p3_0.5_dataset', split='train')
# Bare expression so the notebook displays the loaded Dataset.
p3_subset_dataset

Downloading readme: 100%|██████████| 741/741 [00:00<00:00, 4.30MB/s]
Downloading data: 100%|██████████| 76.2M/76.2M [00:07<00:00, 9.67MB/s]
Downloading data: 100%|██████████| 10.7M/10.7M [00:00<00:00, 12.3MB/s]
Downloading data files: 100%|██████████| 2/2 [00:08<00:00,  4.38s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 1142.40it/s]
Generating train split: 100%|██████████| 304955/304955 [00:00<00:00, 688231.80 examples/s]
Generating test split: 100%|██████████| 17255/17255 [00:00<00:00, 439264.11 examples/s]


Dataset({
    features: ['choices', 'input', 'label', 'dataset', 'category', 'prompt_template'],
    num_rows: 304955
})