In [25]:
# load_hans_dataset, load_mnli_mismatched_dataset, load_paws_qqp_dataset, and load_cola_ood_dataset

#PAWS-QQP is not available in huggingface due to license of QQP. It must be reconstructed by downloading the original data and then running our scripts to produce the data and attach the labels.

In [37]:
import logging

# Step 2: Configure the logging level
logging.basicConfig(level=logging.WARNING)

# Step 3: Optionally, you can customize the logging format
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.WARNING)

# Step 4: Create a logger instance
logger = logging.getLogger(__name__)

# Now you can use the logger in your code
logger.warning('This is a warning message')
logger.error('This is an error message')

ERROR:__main__:This is an error message


### Input arguments

In [38]:
task_name="rte"  #["rte", "mnli", "mnli-original", "qqp", "cola"]
pad_to_max_length=False

### Load In-domain evaluation dataset (training data also included in the huggingface api call)

In [39]:
#function almost exact copy from paper's github repo
def load_glue_datasets(task_name, use_auth_token,cache_dir=None):
    # Get the datasets: specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.

    if task_name is not None:
        if task_name == "mnli":
            # convert to binary format (remove neutral class)
            raw_datasets = load_dataset(
                "glue", task_name, cache_dir=cache_dir)

            raw_datasets = raw_datasets.filter(
                lambda example: example["label"] != 1)

            # change labels of contradiction examples from 2 to 1
            def change_label(example):
                example["label"] = 1 if example["label"] == 2 else example["label"]
                return example
            raw_datasets = raw_datasets.map(change_label)

            # change features to reflect the new labels
            features = raw_datasets["train"].features.copy()
            features["label"] = ClassLabel(
                num_classes=2, names=['entailment', 'contradiction'], id=None)
            raw_datasets = raw_datasets.cast(
                features)  # overwrite old features
        
        elif task_name == "mnli-original":
            # convert to binary format (merge neutral and contradiction class)
            raw_datasets = load_dataset(
                path="glue", name="mnli", cache_dir=cache_dir)

            # change labels of contradiction examples from 2 to 1
            def change_label(example):
                example["label"] = 1 if example["label"] == 2 else example["label"]
                return example
            raw_datasets = raw_datasets.map(change_label)

            # change features to reflect the new labels
            features = raw_datasets["train"].features.copy()
            features["label"] = ClassLabel(
                num_classes=2, names=['entailment', 'contradiction'], id=None)
            raw_datasets = raw_datasets.cast(
                features)  # overwrite old features
            
        else:
            # Downloading and loading a dataset from the hub.
            raw_datasets = load_dataset(
                "glue",
                task_name,
                cache_dir=cache_dir,
                use_auth_token=True if use_auth_token else None,
            )

            if task_name == "qqp":
                # we subsample qqp already here because its really big
                # make sure we fix the seed here
                np.random.seed(123)
                for split in raw_datasets.keys():
                    raw_datasets[split] = raw_datasets[split].select(np.random.choice(
                        np.arange(len(raw_datasets[split])), size=1000, replace=False
                    ))
                    
    # Determine number of labels
    is_regression = task_name == "stsb"
    if not is_regression:
        label_list = raw_datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    return raw_datasets, label_list, num_labels, is_regression


In [40]:
raw_datasets, label_list, num_labels, is_regression = load_glue_datasets(task_name, use_auth_token=True, cache_dir=None)
#raw datasets contains train, validation and test datasets



### Load Out-of-domain evaluation dataset

In [41]:
#function almost exact copy from paper's github repo
def load_hans_dataset(cache_dir=None, heuristic=None, subcase=None, label=None):
    # heuristic = {lexical_overlap, subsequence, constituent}
    # subcase = see HANS_SUBCASES
    # label = {0 (entailment), 1 (contradiction)}

    subset = "hans"
    dataset = load_dataset(
        "hans", cache_dir=cache_dir, split="validation")

    # hans comes without indices, so we add them
    indices = list(range(len(dataset)))
    dataset = dataset.add_column(name="idx", column=indices)

    if heuristic is not None:  # filter dataset based on heuristic
        dataset = dataset.filter(
            lambda example: example["heuristic"] == heuristic)
        subset = f"{subset}-{heuristic}"

    if subcase is not None:  # filter dataset based on subcase
        dataset = dataset.filter(
            lambda example: example["subcase"] == subcase)
        subset = f"{subset}-{subcase}"

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'entailment' if label == 0 else 'contradiction'}"

    return dataset, subset

def load_mnli_mismatched_dataset(label=None,cache_dir=None, merge=False):
    subset = "mnli_mm"

    dataset = load_dataset(
        "glue", "mnli", split=f"validation_mismatched", cache_dir=cache_dir)

    if not merge:
        # remove neutral class
        dataset = dataset.filter(
            lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset.features.copy()
    features["label"] = ClassLabel(
        num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(
        features)  # overwrite old features

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'entailment' if label == 0 else 'contradiction'}"

    return dataset, subset

def load_paws_qqp_dataset(path, label=None, cache_dir=None):
    # TODO(mm): there's probably a better way of doing this
    data_files = {"validation": path}
    dataset = load_dataset("csv", data_files=data_files,
                           sep="\t", cache_dir=cache_dir)
    dataset = dataset["validation"]

    subset = "paws-qqp"

    def _clean_data(sample):
        # the paws-qqp dataset was created as a stream of bytes. So every sentence starts with "b and ends with ".
        # we remove these
        sample["sentence1"] = sample["sentence1"][2:-1]
        sample["sentence2"] = sample["sentence2"][2:-1]
        return sample

    dataset = dataset.map(_clean_data, batched=False)
    dataset = dataset.rename_column("id", "idx")

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'paraphrase' if label == 1 else 'not-paraphrase'}"

    return dataset, subset

def load_cola_ood_dataset(path, label=None, cache_dir=None):
    # TODO(mm): there's probably a better way of doing this
    data_files = {"validation": path}
    dataset = load_dataset("csv", data_files=data_files, sep="\t", column_names=[
                           'code', 'label', 'annotation', 'sentence'], cache_dir=cache_dir)
    dataset = dataset["validation"]

    # cola-ood comes without indices, so we add them
    indices = list(range(len(dataset)))
    dataset = dataset.add_column(name="idx", column=indices)

    subset = "cola-ood"

    if label is not None:  # filter dataset based on label
        dataset = dataset.filter(
            lambda example: example["label"] == label)
        subset = f"{subset}-{'acceptable' if label == 1 else 'unacceptable'}"

    return dataset, subset

In [42]:
def load_ood_eval_datasets():
        out_of_domain_eval_datasets = {}

        for heuristic in ["lexical_overlap"]:
                for label in [0, 1]:
                        dataset, subset = load_hans_dataset(
                                cache_dir="data", heuristic=heuristic, subcase=None, label=label)
                        print(f"{subset}: {len(dataset)} examples")
                        out_of_domain_eval_datasets[subset] = dataset

        for label in [0, 1]:
                mnli_mm_subset, subset_name = load_mnli_mismatched_dataset(label=label)
                out_of_domain_eval_datasets[subset_name] = mnli_mm_subset

        for label in [0, 1]:
                paws_qqp_subset, subset_name = load_paws_qqp_dataset(
                path="/Users/zijie-machine/Documents/DeepLearning/Project/foundation/data/paws_qqp/dev_and_test.tsv", label=label)
                out_of_domain_eval_datasets[subset_name] = paws_qqp_subset

        for label in [0, 1]:
                cola_ood_subset, subset_name = load_cola_ood_dataset(
                path="/Users/zijie-machine/Documents/DeepLearning/Project/foundation/data/cola_ood/dev.tsv", label=label)
                out_of_domain_eval_datasets[subset_name] = cola_ood_subset
        return out_of_domain_eval_datasets

In [43]:
out_of_domain_eval_datasets = load_ood_eval_datasets()

hans-lexical_overlap-entailment: 5000 examples
hans-lexical_overlap-contradiction: 5000 examples


### Data pre-processing

In [44]:
task_to_keys = {
    # labels are: 0 (entailment), 1 (contradiction)
    "rte": ("sentence1", "sentence2"),
    "mnli": ("premise", "hypothesis"),
    "mnli-original": ("premise", "hypothesis"),
    "mnli-mismatched": ("premise", "hypothesis"),
    "hans": ("premise", "hypothesis"),

    # labels are: 0 (not_duplicate), 1 (duplicate)
    "qqp": ("question1", "question2"),
    "paws-qqp": ("sentence1", "sentence2"),

    # labels are: 0 (not acceptable), 1 (acceptable)
    "cola": ("sentence", None),
    "cola-ood": ("sentence", None),
}


In [45]:
sentence1_key, sentence2_key = task_to_keys[task_name]
if pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False

if max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({max_seq_length}) is larger than the maximum length for the"
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )

max_seq_length = min(max_seq_length, tokenizer.model_max_length)




NameError: name 'max_seq_length' is not defined

In [None]:
import numpy as np

max_eval_samples = None
do_eval = True

if do_eval:
    eval_task_names = [task_name for task_name in out_of_domain_eval_datasets.keys()]
    eval_datasets = [dataset for _, dataset in out_of_domain_eval_datasets.items()]

    if max_eval_samples is not None:
                # we fix the random seed that controls the sampling
                # we need to uses a fixed seed here to make sure we evaluate on the same data
                np.random.seed(123)

                max_eval_samples = min(
                    len(eval_dataset), max_eval_samples)
                # randomly select a subset of the eval data
                indices = np.random.choice(
                    range(len(eval_dataset)), size=max_eval_samples, replace=False)
                eval_dataset = eval_dataset.select(indices)