## Notebook for generating inverse-nli datasets

In [1]:
import os
import json
from os.path import join as oj
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

Download the raw NLI data

In [1]:
!git clone https://github.com/allenai/natural-instructions.git

fatal: destination path 'natural-instructions' already exists and is not an empty directory.


List of tasks is available here: https://github.com/allenai/natural-instructions/blob/master/tasks/README.md

# Get some tasks to invert

- task429_senteval_tense: Given a sentence, specify the tense of the main verb (Linguistic Probing; Narrative; English → English)
- task430_senteval_subject_count: Given a sentence, specify singularity or plurality of the subject
- task1146_country_capital: Given a country, return its capital city (Misc.; Countries; English → English)
- task1147_country_currency: Given a country, return its currency
- task1149_item_check_edible: Given an item, check if it is edible or not

In [10]:
# Convert a handful of natural-instructions tasks into flat CSV files (one per
# task) and store each task's first definition string in `task_defs.json`.
task_names = [
    'task1146_country_capital',
    'task1147_country_currency',
    'task1149_item_check_edible',
    'task429_senteval_tense',
    'task430_senteval_subject_count',
    'task609_sbic_potentially_offense_binary_classification'
]
task_defs = {}
# NOTE(review): `metadata` is not used by the cells visible here; kept because
# other cells may rely on it.
metadata = defaultdict(list)
nli_tasks_dir = '/home/chansingh/interpretable-autoprompting/data_utils/natural-instructions/tasks'
out_dir = '/home/chansingh/interpretable-autoprompting/data_utils/nli_processed'
os.makedirs(out_dir, exist_ok=True)
for task_name in tqdm(task_names):
    task_json_file = oj(nli_tasks_dir, task_name + '.json')
    # Context manager closes the handle right after parsing instead of leaving
    # it to the garbage collector; JSON is read as UTF-8 regardless of locale.
    with open(task_json_file, 'r', encoding='utf-8') as f:
        task = json.load(f)
    task_def = task['Definition'][0]
    df = pd.DataFrame.from_dict(task['Instances'])
    df = df.drop(columns='id')
    # Each instance's 'output' is indexable; keep only its first element.
    df['output'] = df['output'].apply(lambda x: x[0])
    # Single prompt-style string per example: "Input: ... Answer: ...\n".
    df['text'] = 'Input: ' + df['input'] + ' Answer: ' + df['output'] + '\n'
    print(task_name, task_def)
    task_defs[task_name] = task_def
    df.to_csv(oj(out_dir, task_name + '.csv'), index=False)
# Closing the file explicitly guarantees the JSON is flushed to disk.
with open(oj(out_dir, 'task_defs.json'), 'w') as f:
    json.dump(task_defs, f, indent=4)

100%|██████████| 6/6 [00:00<00:00, 35.57it/s]

task1146_country_capital In this task, you are given a country name and you need to return the capital city of the given country
task1147_country_currency You are given a country name and you need to return the currency of the given country.
task1149_item_check_edible In this task, you are given an item and you need to check whether it is edible or not, return 1 if it is edible, else return 2.
task429_senteval_tense In this task you are given a sentence. You must judge whether the main verb of the sentence is in present or past tense. Label the instances as "Present" or "Past" based on your judgment. If there is no verb in the given text, answer "Present".
task430_senteval_subject_count In this task you are given a sentence. You must judge whether subject of the main clause is singular or plural. Label the instances as "Singular" or "Plural" based on your judgment.





# Metadata: brief task summaries from the README table

In [11]:
# Parse every HTML table on the rendered task README; `pd.read_html` returns a
# list of DataFrames, and the first one is kept as the task index.
readme_url = 'https://github.com/allenai/natural-instructions/blob/master/tasks/README.md'
tabs = pd.read_html(readme_url)
tab = tabs[0]

In [22]:
# Look up each selected task's `Summary` in the README table (matched on the
# `Name` column) and save the mapping as `task_defs_brief.json`.
task_defs_brief = {}
for task_name in task_names:
    summaries = tab.loc[tab['Name'] == task_name, 'Summary']
    if summaries.empty:
        # Same exception type as indexing an empty result, but names the task.
        raise IndexError(f'task {task_name!r} not found in the README task table')
    # If several rows match, the first one wins.
    task_defs_brief[task_name] = summaries.values[0]
# Context manager ensures the file is flushed and closed.
with open(oj(out_dir, 'task_defs_brief.json'), 'w') as f:
    json.dump(task_defs_brief, f, indent=4)