This notebook generates a prompt prefix for every task. A prompt prefix consists of an instruction followed by demonstrations.

The instruction is obtained from [PromptSource](https://github.com/bigscience-workshop/promptsource) using the notebook `get_instructions.ipynb`, and then manually filtered.

The demonstrations are sampled from the `dev` split.

# See statistics of each task

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from transformers import T5Tokenizer


# Tasks whose input-length statistics are computed by the loop at the end of this cell.
# Each name is also the name of the task's sub-directory under "data/crossfit".
TASK_NAMES = [
    "ade_corpus_v2-classification", "circa", "discovery", "emotion", "ethos-directed_vs_generalized",
    "ethos-disability", "ethos-gender", "ethos-sexual_orientation", "glue-cola", "glue-mnli", "glue-mrpc",
    "glue-qnli", "glue-qqp", "glue-rte", "glue-sst2", "glue-wnli", "google_wellformed_query", "hate_speech_offensive",
    "hatexplain", "health_fact", "imdb", "kilt_fever", "liar", "onestop_english", "paws", "rotten_tomatoes", "scicite",
    "scitail", "sick", "sms_spam", "superglue-rte", "superglue-wic", "superglue-wsc", "trec", "trec-finegrained",
    "tweet_eval-emoji", "tweet_eval-emotion", "tweet_eval-irony", "tweet_eval-offensive", "tweet_eval-sentiment",
    "tweet_eval-stance_abortion", "tweet_eval-stance_climate", "tweet_eval-stance_hillary", "wiki_auto", "yahoo_answers_topics", 
    "anli", "dbpedia_14", "emo", "ethos-race", "ethos-religion", "financial_phrasebank",
    "superglue-cb", "tab_fact", "wiki_qa", "yelp_polarity"]
# Checkpoint name handed to `T5Tokenizer.from_pretrained` below.
T5_MODEL = "t5-base"
# Handed to the tokenizer as `model_max_length`.
MAX_INPUT_LEN = 1024


def get_task_prefixes(data_path: str, task_name: str) -> list:
    """Lists the distinct task prefixes (e.g., adversarialqa_32_13) found for a task.

    Looks at the `.tsv` files in `<data_path>/<task_name>` and drops the last
    underscore-separated piece of each filename (e.g. `dev.tsv`). Prefixes are
    returned without duplicates, following the sorted order of the filenames.
    """
    task_dir = os.path.join(data_path, task_name)
    tsv_names = [name for name in sorted(os.listdir(task_dir)) if name.endswith(".tsv")]
    # `rpartition` keeps everything before the last underscore (an empty string
    # when there is none); dict keys de-duplicate while keeping first-seen order.
    unique_prefixes = dict.fromkeys(name.rpartition("_")[0] for name in tsv_names)
    return list(unique_prefixes)

def get_all_examples(task_name: str, data_path: str = "data/crossfit") -> list:
    """Loads every example of a task from its train, dev and test files.

    Only the files of the first prefix returned by `get_task_prefixes` are read.

    Args:
        task_name: Name of the task's sub-directory under `data_path`.
        data_path: Root directory holding one sub-directory per task.

    Returns:
        A list of `[input_text, targets]` pairs; `input_text` is the first
        tab-separated field of a line and `targets` the list of remaining fields.

    Raises:
        IndexError: If the task directory contains no `.tsv` file.
    """
    examples = []
    prefix = get_task_prefixes(data_path, task_name)[0]
    for suffix in ["_train.tsv", "_dev.tsv", "_test.tsv"]:
        split_path = os.path.join(data_path, task_name, prefix + suffix)
        # Explicit encoding so decoding does not depend on the platform default.
        # NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
        with open(split_path, encoding="utf-8") as fin:
            for line in fin:
                fields = line.strip().split("\t")
                if fields == [""]:
                    # Blank line: neither input nor target, so it is not an example.
                    continue
                examples.append([fields[0], fields[1:]])
    return examples

# Tokenizer used to count tokens; it is reused by the prompt-generation cell.
tokenizer = T5Tokenizer.from_pretrained(T5_MODEL, model_max_length=MAX_INPUT_LEN)

# One row of length statistics per task.
data = []
for task_name in TASK_NAMES:
    examples = get_all_examples(task_name)
    # Only the input text of each example is tokenized. No truncation is requested,
    # so a length can exceed MAX_INPUT_LEN (the tokenizer then only emits a warning).
    tokenized_input = tokenizer([example[0] for example in examples])
    lengths = list(map(len, tokenized_input["input_ids"]))
    quartile1, median, quartile3 = np.percentile(lengths, [25, 50, 75])

    row = [task_name, len(examples), np.min(lengths), np.max(lengths), quartile1, median, quartile3, lengths]
    data.append(row)

stats_df = pd.DataFrame(
    data,
    columns=[
        "task_name", "n_examples", "min_len", "max_len",
        "percentile25", "percentile50", "percentile75", "all_lengths",
    ],
)
stats_df

  from .autonotebook import tqdm as notebook_tqdm
2022-12-11 12:06:26.610105: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Token indices sequence length is longer than the specified maximum sequence length for this model (1398 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,task_name,n_examples,min_len,max_len,percentile25,percentile50,percentile75,all_lengths
0,ade_corpus_v2-classification,4768,5,176,25.0,34.0,45.0,"[48, 23, 72, 15, 25, 27, 45, 11, 37, 11, 34, 1..."
1,circa,6860,31,69,41.0,45.0,49.0,"[48, 38, 57, 50, 45, 48, 50, 40, 47, 45, 44, 5..."
2,discovery,14268,14,140,42.0,53.0,64.0,"[43, 38, 45, 63, 83, 39, 60, 58, 34, 61, 68, 4..."
3,emotion,2192,5,90,16.0,24.0,34.0,"[27, 43, 12, 56, 13, 26, 20, 18, 20, 27, 15, 1..."
4,ethos-directed_vs_generalized,151,7,1398,17.0,24.0,37.5,"[25, 13, 17, 28, 16, 17, 51, 21, 25, 26, 22, 1..."
5,ethos-disability,151,7,1398,16.5,22.0,33.5,"[25, 16, 11, 25, 26, 18, 12, 24, 58, 12, 12, 2..."
6,ethos-gender,151,7,1398,17.0,24.0,36.0,"[25, 31, 13, 47, 28, 16, 31, 38, 17, 51, 11, 2..."
7,ethos-sexual_orientation,151,7,1398,17.0,25.0,35.0,"[25, 13, 17, 47, 28, 16, 22, 19, 31, 38, 17, 3..."
8,glue-cola,1107,6,40,10.0,12.0,16.0,"[22, 12, 13, 8, 13, 14, 14, 11, 12, 9, 12, 12,..."
9,glue-mnli,9911,11,267,31.0,42.0,57.0,"[18, 67, 25, 98, 30, 65, 78, 44, 60, 25, 37, 6..."


In [None]:
def plot_hist(task_name, n_bins=40):
    """Plots a histogram of the tokenized input lengths of one task.

    The lengths are read from the `all_lengths` entry of the first `stats_df`
    row whose `task_name` column equals `task_name`.

    Args:
        task_name: Task to plot; must appear in `stats_df.task_name`.
        n_bins: Number of histogram bins.

    Raises:
        IndexError: If no row of `stats_df` matches `task_name`.
    """
    lengths = stats_df[stats_df.task_name == task_name].iloc[0]["all_lengths"]
    fig, ax = plt.subplots()
    ax.hist(lengths, bins=n_bins)
    # Label the figure so it can be read on its own when skimming the notebook.
    ax.set(
        title="Input length distribution: {}".format(task_name),
        xlabel="Input length (tokens)",
        ylabel="Number of examples",
    )
    plt.show()

plot_hist("tab_fact")

# Generate prompt prefix

In [2]:
# Tasks for which a prompt prefix is generated. This rebinds the TASK_NAMES used for the
# statistics above to a shorter list that leaves out imdb, kilt_fever, onestop_english,
# tweet_eval-emoji, yahoo_answers_topics, tab_fact and yelp_polarity
# (the reason for leaving them out is not recorded in this notebook).
TASK_NAMES = [
    "ade_corpus_v2-classification", "circa", "discovery", "emotion", "ethos-directed_vs_generalized",
    "ethos-disability", "ethos-gender", "ethos-sexual_orientation", "glue-cola", "glue-mnli", "glue-mrpc",
    "glue-qnli", "glue-qqp", "glue-rte", "glue-sst2", "glue-wnli", "google_wellformed_query", "hate_speech_offensive",
    "hatexplain", "health_fact", "liar", "paws", "rotten_tomatoes", "scicite",
    "scitail", "sick", "sms_spam", "superglue-rte", "superglue-wic", "superglue-wsc", "trec", "trec-finegrained",
    "tweet_eval-emotion", "tweet_eval-irony", "tweet_eval-offensive", "tweet_eval-sentiment",
    "tweet_eval-stance_abortion", "tweet_eval-stance_climate", "tweet_eval-stance_hillary", "wiki_auto",
    "anli", "dbpedia_14", "emo", "ethos-race", "ethos-religion", "financial_phrasebank",
    "superglue-cb", "wiki_qa"]
# Same values as defined in the statistics cell.
T5_MODEL = "t5-base"
MAX_INPUT_LEN = 1024
K = 8  # Number of demonstrations.
# Tab-separated file with one (task_name, instruction, input_output_separator) row per line.
INSTRUCTIONS_FILE = "data/prompt/instructions_iosep.tsv"
# Tab-separated file the generated prompts are written to.
OUTPUT_FILE = "data/prompt/prompt.tsv"


# Read instructions data.
# Maps task_name -> (instruction, input_output_separator).
INSTRUCTIONS_DICT = {}
# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the instructions file is UTF-8 encoded -- confirm.
with open(INSTRUCTIONS_FILE, encoding="utf-8") as fin:
    for line_number, line in enumerate(fin, start=1):
        if not line.strip():
            continue  # Tolerate blank lines, e.g. at the end of the file.
        splits = line.strip().split("\t")  # Splits into (task_name, instruction, input_output_separator).
        if len(splits) < 3:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(
                "{}:{}: expected 3 tab-separated fields, found {}.".format(
                    INSTRUCTIONS_FILE, line_number, len(splits)))
        INSTRUCTIONS_DICT[splits[0]] = (splits[1], splits[2])

In [3]:
import random

# Fixed seed and a dedicated generator so the sampled demonstrations (and hence the
# saved prompts) are the same on every "Restart & Run All". The global `random`
# state is left untouched.
DEMO_SEED = 42
demo_rng = random.Random(DEMO_SEED)

# One row per (task, task prefix): the prompt text, its token length and the separator.
data = []
for task_name in TASK_NAMES:
    for prefix in get_task_prefixes("data/crossfit", task_name):
        # Get dev examples as [input_text, targets] pairs.
        dev_path = os.path.join("data/crossfit", task_name, prefix + "_dev.tsv")
        dev_examples = []
        # NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
        with open(dev_path, encoding="utf-8") as fin:
            for line in fin:
                d = line.strip().split("\t")
                if d == [""]:
                    # Blank line: it would yield an example without targets and
                    # make the target choice below fail if it were sampled.
                    continue
                dev_examples.append([d[0], d[1:]])
        if len(dev_examples) < K:
            # Same exception type `sample` would raise, but naming the file at fault.
            raise ValueError(
                "{} has {} example(s), fewer than the K={} demonstrations requested.".format(
                    dev_path, len(dev_examples), K))

        # Construct prompt with demos and instructions:
        # "<instruction> <input> <separator> <target> <input> <separator> <target> ..."
        demos = demo_rng.sample(dev_examples, K)
        instructions, iosep = INSTRUCTIONS_DICT[task_name]
        # When an example lists several targets, one of them is picked at random.
        demos_text = " ".join(
            "{} {} {}".format(ex[0], iosep, demo_rng.choice(ex[1])) for ex in demos)
        prompt = instructions + " " + demos_text

        data.append([
            task_name, prefix, prompt, len(tokenizer(prompt)["input_ids"]), iosep
        ])

prompt_df = pd.DataFrame(
    data, columns=["task_name", "task_prefix", "prompt", "prompt_len", "io_sep"])
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(prompt_df)

Unnamed: 0,task_name,task_prefix,prompt,prompt_len,io_sep
0,ade_corpus_v2-classification,ade_corpus_v2-classification_16_100,Is the following text related to adverse drug ...,333,answer:
1,ade_corpus_v2-classification,ade_corpus_v2-classification_16_13,Is the following text related to adverse drug ...,315,answer:
2,ade_corpus_v2-classification,ade_corpus_v2-classification_16_21,Is the following text related to adverse drug ...,399,answer:
3,ade_corpus_v2-classification,ade_corpus_v2-classification_16_42,Is the following text related to adverse drug ...,306,answer:
4,ade_corpus_v2-classification,ade_corpus_v2-classification_16_87,Is the following text related to adverse drug ...,365,answer:
5,circa,circa_16_100,Given the question-answer pair of X and Y in a...,403,answer:
6,circa,circa_16_13,Given the question-answer pair of X and Y in a...,418,answer:
7,circa,circa_16_21,Given the question-answer pair of X and Y in a...,425,answer:
8,circa,circa_16_42,Given the question-answer pair of X and Y in a...,395,answer:
9,circa,circa_16_87,Given the question-answer pair of X and Y in a...,439,answer:


In [4]:
# Save results.
# Columns are written in the order task_name, task_prefix, prompt, prompt_len, io_sep,
# tab-separated, with neither an index column nor a header row. `header=False` is the
# documented way to omit the header.
# NOTE(review): default CSV quoting applies, so a prompt containing a tab, quote or
# newline is written quoted -- confirm the consumer parses the file with a CSV reader.
prompt_df.to_csv(OUTPUT_FILE, index=False, sep="\t", header=False)