Notebook for preprocessing induction datasets.

In [10]:
import os
import json
from os.path import join
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

Download the raw instruction induction data

In [35]:
%%bash
git clone https://github.com/orhonovich/instruction-induction.git
mkdir induction_processed

fatal: destination path 'instruction-induction' already exists and is not an empty directory.


In [1]:
raw_dir = 'instruction-induction/data/raw/induce'
annotations_dir = 'instruction-induction/data/annotations'
task_names = [task_name.replace('.json', '') for task_name in os.listdir(raw_dir)]
out_dir = 'induction_processed'
task_names

['cause_and_effect',
 'sum',
 'num_to_verbal',
 'diff',
 'first_word_letter',
 'singular_to_plural',
 'synonyms',
 'letters_list',
 'sentence_similarity',
 'informal_to_formal',
 'rhymes',
 'common_concept',
 'second_word_letter',
 'translation_en-fr',
 'taxonomy_animal',
 'sentiment',
 'active_to_passive',
 'word_in_context',
 'orthography_starts_with',
 'antonyms',
 'negation',
 'translation_en-de',
 'larger_animal',
 'translation_en-es']

In [45]:
task_defs = {}
for task_name in sorted(task_names):
    print('task_name', task_name)

    # load json file
    task_json_file = join(raw_dir, task_name + '.json')
    task = json.load(open(task_json_file, 'r'))
    vals = list(task['examples'].values())
    input_key = 'input'
    output_key = 'output'
    if task_name == 'cause_and_effect':
        input_key = 'cause'
        output_key = 'effect'
    elif task_name == 'common_concept':
        input_key = 'concept'
        output_key = 'items'
    inputs = [val[input_key] for val in vals]
    outputs = [val[output_key] for val in vals]
    if task_name == 'common_concept':
        outputs = [' '.join(out) for out in outputs]


    task_def = json.load(open(join(annotations_dir, task_name + '.json'), 'r'))['annotations'][0]
    # print('examples', inputs[:5], outputs[:5])
    df = pd.DataFrame.from_dict({
        'input': inputs,
        'output': outputs,
    })

    df['text'] = 'Input: ' + df['input'] + ' Answer: ' + df['output'] + '\n'
    # print(task_name, '\n' + task_def)
    # print('brief:', task_defs_brief[task_name])
    print(df.iloc[0].text)
    # print(df['output'].value_counts())
    task_defs[task_name] = task_def
    # print(df.head())
    df.to_csv(join(out_dir, task_name + '.csv'), index=False)
    
json.dump(task_defs, open(join(out_dir, 'task_defs.json'), 'w'), indent=4)

task_name active_to_passive
Input: The tourist supported the authors. Answer: The authors were supported by the tourist.

task_name antonyms
Input: sane Answer: insane

task_name cause_and_effect
Input: It started raining. Answer: The woman who was walking on the street opened her umbrella.

task_name common_concept
Input: involve oscillations. Answer: guitars pendulums neutrinos

task_name diff
Input: 0 0 Answer: 0

task_name first_word_letter
Input: time Answer: t

task_name informal_to_formal
Input: I think that this is interesting. Answer: It is my opinion that this is interesting.

task_name larger_animal
Input: rabbit, snail Answer: rabbit

task_name letters_list
Input: time Answer: t i m e

task_name negation
Input: To emphasize the 50th anniversary of the Super Bowl the gold color was used. Answer: To emphasize the 50th anniversary of the Super Bowl the gold color was not used.

task_name num_to_verbal
Input: 0 Answer: zero

task_name orthography_starts_with
Input: I prefer for