In [None]:
import functools
import pandas as pd
import seqio
import t5
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

# Location of the tab-separated data file for each split, keyed by split name.
tsv_path = dict(
    train="data/train-train_clf_test_clf.tsv",
    test="data/test-train_clf_test_clf.tsv",
)

def read_prompt_dict(filename: str) -> dict:
    """Load the per-task prompt table from a headerless TSV file.

    The file is read as five tab-separated columns: task_name, task_prefix,
    prompt, prompt_len and io_sep.

    Args:
        filename: Path to the tab-separated prompt file.

    Returns:
        Dict mapping each task_prefix to a ``(prompt, io_sep)`` tuple of
        strings. If a prefix appears on several rows, the last row wins.
    """
    df = pd.read_csv(
        filename,
        header=None,
        sep="\t",
        names=["task_name", "task_prefix", "prompt", "prompt_len", "io_sep"],
        # Keep the prompt text verbatim: without these two options pandas
        # turns an empty io_sep (or a prompt that literally reads "null",
        # "NA", "None", ...) into float NaN, which later breaks string
        # concatenation when the prompt is assembled.
        dtype={"prompt": str, "io_sep": str},
        keep_default_na=False,
    )
    # Rows shorter than five fields still get NaN for the missing trailing
    # columns; treat those as empty strings as well.
    prompts = df["prompt"].fillna("")
    io_seps = df["io_sep"].fillna("")
    return {
        task_prefix: (prompt, io_sep)
        for task_prefix, prompt, io_sep in zip(df["task_prefix"], prompts, io_seps)
    }

# Prompt table loaded once when this cell runs: task_prefix -> (prompt, io_sep).
PROMPT_DICT = read_prompt_dict("data/prompt/prompt.tsv")

def dataset_fn(split, shuffle_files=False):
    """Build the raw text dataset for one split.

    Reads the split's TSV file, keeps its first four columns (task_name,
    task_prefix, input, target) and wraps every input with the prompt and
    input/output separator registered in ``PROMPT_DICT`` for the row's task
    prefix: ``"<prompt> <input> <io_sep>"``.

    Args:
        split: Key into ``tsv_path`` (e.g. "train" or "test").
        shuffle_files: Ignored; accepted only for interface compatibility
            because each split is a single file.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``{"input": <string tensor>, "target": <string tensor>}`` dicts, one
        per TSV row, in file order.

    Raises:
        KeyError: If ``split`` is not in ``tsv_path``, the file has fewer than
            four columns, or a row's task prefix is missing from
            ``PROMPT_DICT``.
    """
    del shuffle_files  # We only have one file for each split.

    df = pd.read_csv(
        tsv_path[split],
        header=None,
        sep="\t",
        # Read the text columns verbatim. With pandas' defaults an input or
        # target such as "null"/"NA"/"None" (or an empty field) becomes NaN
        # and integer-like targets may be re-rendered (e.g. "1" -> "1.0").
        dtype={2: str, 3: str},
        keep_default_na=False,
    )
    df = df[list(range(4))]  # Only take the first 4 columns.
    df.columns = ["task_name", "task_prefix", "input", "target"]

    inputs = []
    for task_prefix, text in zip(df["task_prefix"], df["input"]):
        prompt_prefix, io_sep = PROMPT_DICT[task_prefix]
        inputs.append(prompt_prefix + " " + text + " " + io_sep)
    targets = [str(target) for target in df["target"]]

    # Build the {"input", "target"} examples directly instead of joining the
    # two fields with a tab and re-splitting them with decode_csv: that round
    # trip fails on any embedded tab. The explicit string dtype keeps an
    # empty split from being inferred as float32.
    return tf.data.Dataset.from_tensor_slices({
        "input": tf.constant(inputs, dtype=tf.string),
        "target": tf.constant(targets, dtype=tf.string),
    })

def preprocessor_fn(ds):
    """Convert raw {"input", "target"} examples into model-ready text features.

    Both fields are lowercased and passed through a single-quote stripping
    regex, then emitted under the keys "inputs" and "targets".

    Args:
        ds: Dataset of dicts with string entries "input" and "target".

    Returns:
        The mapped dataset of {"inputs": ..., "targets": ...} dicts.
    """
    def _clean(raw):
        # Lowercase first, then replace a single-quoted span with its
        # contents. `.*` is greedy, so the span runs from the first to the
        # last single quote that the pattern can match.
        lowered = tf.strings.lower(raw)
        return tf.strings.regex_replace(lowered, "'(.*)'", r"\1")

    def _rename_and_clean(example):
        cleaned_input = _clean(example["input"])
        cleaned_target = _clean(example["target"])
        return {"inputs": cleaned_input, "targets": cleaned_target}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(_rename_and_clean, num_parallel_calls=autotune)

In [None]:
# Build the preprocessed (lowercased, prompt-wrapped) training split.
train_ds = dataset_fn("train")
train_ds = preprocessor_fn(train_ds)

# Tokenized inputs longer than this are reported below.
# NOTE(review): presumably the model's maximum input length -- confirm.
MAX_INPUT_TOKENS = 1024

DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(vocabulary=t5.data.get_default_vocabulary(), add_eos=True),
    "targets": seqio.Feature(vocabulary=t5.data.get_default_vocabulary(), add_eos=True)
}

tokenized_train_ds = seqio.preprocessors.tokenize(
    train_ds,
    DEFAULT_OUTPUT_FEATURES,
    copy_pretokenized=True, with_eos=True)

# Scan every example rather than a hard-coded number of them, so the check
# stays complete when the training file grows or is swapped out.
for x in tokenized_train_ds:
    n_tokens = x["inputs"].shape[0]
    if n_tokens > MAX_INPUT_TOKENS:
        print(n_tokens)

In [None]:
# Sanity check: number of token ids the default vocabulary produces for a
# short sample sentence (displayed as the cell output).
vocab = t5.data.get_default_vocabulary()
sample_ids = vocab.encode_tf(tf.constant("the quick brown fox"))
sample_ids.shape[0]

In [None]:
import re

# List every distinct character in the data file that is neither whitespace
# nor printable ASCII (the range ' ' to '~').
# The encoding is explicit so the result does not depend on the platform's
# default locale.
with open("data/train-train_non_mrc_qa_test_mrc.tsv", encoding="utf-8") as fin:
    text = fin.read()
# Raw strings: '\s' in a normal string literal is an invalid escape sequence.
text = re.sub(r'\s+', '', text)
text = re.sub(r'[ -~]', '', text)

set1 = set(text)
# Sort so the printed characters come out in the same order on every run
# (set iteration order for strings varies between interpreter sessions).
weirdtext = ''.join(sorted(set1))
print(weirdtext)