In [2]:
import logging
import os
import random
import sys

import datasets
from datasets import ClassLabel, Value
import numpy as np
import pandas as pd
import torch

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    BitsAndBytesConfig,
)
from transformers.utils.versions import require_version

from options import DataTrainingArguments, ModelArguments, InContextLearningArguments, WandbArguments, FtArguments
from utils import create_dir, get_timestamp
from task_utils import task_to_keys, load_glue_datasets, load_hans_dataset, load_mnli_mismatched_dataset, load_paws_qqp_dataset, load_cola_ood_dataset
from custom_trainer.ft_trainer import FtTrainer
from eval_utils import create_few_shot_context, add_context_to_dataset, _select_subset_by_idx
from llama_wrapper import LlamaWithLMClassifier

from peft import LoraConfig, TaskType, get_peft_model

logger = logging.getLogger(__name__)
torch_dtype = torch.float16



In [3]:
from huggingface_hub import login

# Log in to Hugging Face
hf_token = "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"
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
def _load_model(model_args: ModelArguments):
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Add vanilla fine-tuning specific args to the model config
    config.classifier_type = None

    # Add pattern-verbalizer fine-tuning specific args to the model config
    config.untie_embeddings = False

    # Add adapter specific args to the model config
    config.use_adapters = False
    config.adapter_type = None
    config.adapter_dim = None

    # Add soft prompt tuning specific args to the model config
    config.use_soft_prompt = False
    config.num_soft_prompt_tokens = None

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_use_double_quant=True,
    )

    model = LlamaWithLMClassifier(config).from_pretrained(
        model_args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        attn_implementation="eager",
    )

    peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, 
                            inference_mode=False, 
                            lora_alpha=16,
                            lora_dropout=0.05,
                            r=16,
                            bias="none",
                            target_modules=["q_proj", "v_proj"])
    model = get_peft_model(model, peft_config)

    # We need to add a padding token for llama
    tokenizer.add_special_tokens({"pad_token":"<pad>"})
    config.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    tokenizer.pad_token_id = config.pad_token_id
    tokenizer.padding_side = "right"

    # Save the updated configuration back to the model
    model.config.pad_token_id = config.pad_token_id

    # Update Model Embeddings:
    model.resize_token_embeddings(len(tokenizer))

    return config, tokenizer, model

In [5]:
def _add_args_to_results(args, results):
    # Save results in a dataframe
    results["task_description"] = args.task_description if args.task_description is not None else " "
    results["pattern"] = args.pattern
    results["target_tokens"] = args.target_tokens
    results["num_shots"] = args.num_shots
    results["separate_shots_by"] = args.separate_shots_by
    results["balanced"] = args.balanced
    results["shuffle"] = args.shuffle
    results["target_prefix"] = args.target_prefix
    results["group"] = args.group

    return results

In [6]:
def _create_df(results):
    data = {k: [v] for k, v in results.items()}
    df = pd.DataFrame.from_dict(data)
    return df

In [7]:
def preprocess_function(examples):
    # Tokenize the texts

    # Apply a pattern to the inputs
    if context != "":
        # we add the context here
        pattern = f"{context}{in_context_args.pattern}"
    else:
        pattern = in_context_args.pattern

    if in_context_args.target_prefix != "":
        pattern = f"{pattern} {in_context_args.target_prefix.strip()}"

    pattern_examples = [
        pattern.format(
            text1=examples[sentence1_key][idx],
            text2=examples[sentence2_key][idx] if sentence2_key is not None else None)
        for idx in range(len(examples[sentence1_key]))
    ]

    args = (pattern_examples,)
    result = tokenizer(*args, padding=padding,
                        max_length=max_seq_length, truncation=True)

    # Get tokens
    result["input_tokens"] = [tokenizer.convert_ids_to_tokens(
        ids) for ids in result["input_ids"]]

    # Decode input
    result["input_text"] = [tokenizer.decode(
        ids) for ids in result["input_ids"]]

    # Replace labels by target tokens indices when using lm_head
    result["label"] = [target_tokens_ids[l] for l in examples["label"]]
    result["label_text"] = [id_to_target_token[l] if l != -1 else "unlabeled"
                            for l in examples["label"]]

    return result

In [8]:
def compute_metrics(p: EvalPrediction):
    result = {}

    preds = p.predictions[0] if isinstance(
        p.predictions, tuple) else p.predictions
    labels = p.label_ids
    predicted_token_ids = np.argmax(preds, axis=1)
    # get the logits for each of the target tokens
    class_logits = [[logits[target_tokens_ids[0]], logits[target_tokens_ids[1]]]
                    for _, logits in enumerate(preds)]
    class_logits = np.asarray(class_logits)

    # Compute exact match
    result["accuracy"] = np.mean(labels == predicted_token_ids)

    # Compute score based performance
    # TODO(mm): speed this up
    scores = []
    for idx, batch_logits in enumerate(class_logits):
        # we get the class id of the label token
        class_id = token_id_to_label_id[labels[idx]]
        # does it receive larger probability than the other classes?
        predicted_token_class = np.argmax(batch_logits)
        score = predicted_token_class == class_id
        scores.append(score)

    scores = np.asarray(scores)
    result["score_accuracy"] = np.mean(scores)

    return result

In [9]:
model_args: ModelArguments = ModelArguments(
    model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",
    use_auth_token=hf_token,
)


In [10]:
data_args: DataTrainingArguments = DataTrainingArguments(
    task_name="mnli",
    max_seq_length=2048,
    eval_task_name="hans",
)

In [11]:
in_context_args: InContextLearningArguments = InContextLearningArguments(
    pattern="{text1} question: {text2} Yes or No?",
    target_prefix=" answer: ",
    target_tokens="ĠYes,ĠNo",
    separate_shots_by="\n\n",
    group="gpt-3", # TODO: Change this
    num_shots=2, # 2, 16, 32
    balanced=True,
    shuffle=True,
)

In [12]:
training_args: TrainingArguments = TrainingArguments(
    output_dir="./",
    do_eval=True,
    per_device_eval_batch_size=10,
    fp16=True,
    seed=0,
    report_to="none"
)

In [13]:
os.environ["WANDB_DISABLED"] = "False"

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)

# Set seed before initializing model.
set_seed(training_args.seed)



In [14]:
# Load training dataset and validation set for in-domain data
if data_args.task_name in ["rte", "mnli", "mnli-original", "qqp", "cola"]:
    raw_datasets, label_list, num_labels, is_regression = load_glue_datasets(
        data_args, model_args)

additional_evaluation_datasets = {}
if data_args.eval_task_name == "hans":
    for heuristic in ["lexical_overlap"]:
        # for heuristic in ["lexical_overlap", "subsequence", "constituent"]:
        # Load HANS subsets as additional validation data
        for label in [0, 1]:
            hans_subset, subset_name = load_hans_dataset(
                data_args.dataset_cache_dir, heuristic=heuristic, subcase=None, label=label)
            additional_evaluation_datasets[subset_name] = hans_subset

elif data_args.eval_task_name == "mnli-mismatched":
    # Load mnli mismatched validation set
    for label in [0, 1]:
        mnli_mm_subset, subset_name = load_mnli_mismatched_dataset(
            data_args, label=label)
        additional_evaluation_datasets[subset_name] = mnli_mm_subset

elif data_args.eval_task_name == "paws-qqp":
    for label in [0, 1]:
        paws_qqp_subset, subset_name = load_paws_qqp_dataset(
            data_args.eval_task_path, label=label, cache_dir=data_args.dataset_cache_dir)
        additional_evaluation_datasets[subset_name] = paws_qqp_subset

elif data_args.eval_task_name == "cola-ood":
    for label in [0, 1]:
        cola_ood_subset, subset_name = load_cola_ood_dataset(
            data_args.eval_task_path, label=label, cache_dir=data_args.dataset_cache_dir)
        additional_evaluation_datasets[subset_name] = cola_ood_subset

In [15]:
config, tokenizer, model = _load_model(model_args)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
if data_args.task_name is not None:
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
else:
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    non_label_column_names = [
        name for name in raw_datasets["train"].column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None


In [17]:
# Padding strategy
if data_args.pad_to_max_length:
    padding = "max_length"
else:
    # We will pad later, dynamically at batch creation, to the max sequence length in each batch
    padding = False

In [18]:
# Some models have set the order of the labels to use, so let's make sure we do use it.
label_to_id = None
if (
    model.config.label2id != PretrainedConfig(
        num_labels=num_labels).label2id
    and data_args.task_name is not None
    and not is_regression
):
    # Some have all caps in their config, some don't.
    label_name_to_id = {
        k.lower(): v for k, v in model.config.label2id.items()}
    if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
        label_to_id = {
            i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
    else:
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: ",
            f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
            "\nIgnoring the model labels as a result.",
        )
elif data_args.task_name is None and not is_regression:
    label_to_id = {v: i for i, v in enumerate(label_list)}

if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {
        id: label for label, id in config.label2id.items()}
elif data_args.task_name is not None and not is_regression:
    model.config.label2id = {l: i for i, l in enumerate(label_list)}
    model.config.id2label = {
        id: label for label, id in config.label2id.items()}

print(model.config.label2id)
print(model.config.id2label)

{'entailment': 0, 'contradiction': 1}
{0: 'LABEL_0', 1: 'LABEL_1'}


In [19]:
# map targets to ids and vice versa
target_tokens = [t.strip()
                    for t in in_context_args.target_tokens.split(",")]
target_tokens_ids = tokenizer.convert_tokens_to_ids(target_tokens)
id_to_target_token = {idx: t for idx, t in enumerate(target_tokens)}
target_token_to_id = {t: idx for idx, t in enumerate(target_tokens)}
token_id_to_label_id = {tidx: lidx for lidx,
                        tidx in enumerate(target_tokens_ids)}

In [20]:
# Compute max_seq_length
if data_args.max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )

max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

In [21]:
# Create in-context learning prompt from training data
context, contex_indices = create_few_shot_context(
    data_args.task_name, raw_datasets["train"], in_context_args.num_shots, pattern=in_context_args.pattern,
    label_to_tokens=id_to_target_token,
    separate_shots_by=in_context_args.separate_shots_by, description=in_context_args.task_description,
    target_prefix=in_context_args.target_prefix,
    from_indices=in_context_args.sample_indices_file, balanced=in_context_args.balanced, shuffle=in_context_args.shuffle,
    seed=training_args.data_seed
)
# inspect context
logger.info("Using the following context:")
logger.info(context)

Filter:   0%|          | 0/261802 [00:00<?, ? examples/s]

In [22]:
# tokenize context
result = tokenizer(context, padding=padding,
                    max_length=max_seq_length, truncation=False)
# print(result["input_ids"])
# print(len(result["input_ids"]))
if len(result["input_ids"]) > max_seq_length:
    # we skip the current run. The context is too long
    print("Context is too long. Skipping run")

In [23]:
if training_args.do_eval:
    # Get the in-domain validation dataset
    eval_dataset = raw_datasets["validation_matched" if data_args.task_name in
                                ["mnli", "mnli-original"] else "validation"]

    # (optional) subsample eval datasets
    if data_args.max_eval_samples is not None:
        # we fix the random seed that controls the sampling
        # we need to uses a fixed seed here to make sure we evaluate on the same data
        np.random.seed(123)

        max_eval_samples = min(
            len(eval_dataset), data_args.max_eval_samples)
        # randomly select a subset of the eval data
        indices = np.random.choice(
            range(len(eval_dataset)), size=max_eval_samples, replace=False)
        eval_dataset = eval_dataset.select(indices)

    for name, dataset in additional_evaluation_datasets.items():
        if data_args.max_eval_samples is not None:
            # we fix the random seed that controls the sampling
            # we need to uses a fixed seed here to make sure we evaluate on the same data
            np.random.seed(123)

            max_eval_samples = min(
                len(dataset), data_args.max_eval_samples)
            # randomly select a subset of the eval data
            indices = np.random.choice(
                range(len(dataset)), size=max_eval_samples, replace=False)
            dataset = dataset.select(indices)
            additional_evaluation_datasets[name] = dataset

    # set all random seeds again (not sure if this is really needed)
    set_seed(training_args.seed)

    # We need to update the number of classes of the dataset when using the lm_head
    if in_context_args.target_tokens is not None and not in_context_args.target_tokens_logits_only:
        new_features = eval_dataset.features.copy()
        names = [f"{idx}" for idx in np.arange(len(tokenizer))]
        new_features["label"] = ClassLabel(
            names=names, num_classes=len(tokenizer))
        eval_dataset = eval_dataset.cast(new_features)

        for name, dataset in additional_evaluation_datasets.items():
            new_features = dataset.features.copy()
            names = [f"{idx}" for idx in np.arange(len(tokenizer))]
            new_features["label"] = ClassLabel(
                names=names, num_classes=len(tokenizer))
            additional_evaluation_datasets[name] = dataset.cast(
                new_features)

    # Tokenize and encode validation datasets
    with training_args.main_process_first(desc="dataset map pre-processing"):
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            batch_size=1000,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )

        for name, dataset in additional_evaluation_datasets.items():
            sentence1_key, sentence2_key = task_to_keys[data_args.eval_task_name]
            dataset = dataset.map(
                preprocess_function,
                batched=True,
                batch_size=1000,
                load_from_cache_file=False,
                desc="Running tokenizer on dataset",
            )
            additional_evaluation_datasets[name] = dataset


Running tokenizer on dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [24]:
# Log a few random samples from the validation set:
for index in random.sample(range(len(eval_dataset)), 1):
    print(f"Sample {index} of the validation set: {eval_dataset[index]}.")
    logger.info(f"Sample {index} of the validation set: {eval_dataset[index]}.")

Sample 3155 of the validation set: {'premise': 'And Alan Tonelson, of the U.S.', 'hypothesis': 'In the U.S., there is a person named Alan Tonelson.', 'label': 7566, 'idx': 4606, 'input_ids': [128000, 8100, 11075, 311, 636, 3412, 315, 430, 5684, 520, 682, 7194, 11, 323, 304, 420, 11175, 6140, 3782, 311, 1077, 12576, 13, 220, 3488, 25, 3005, 574, 9087, 555, 1077, 26314, 311, 636, 3412, 315, 279, 5684, 13, 7566, 477, 2360, 30, 4320, 25, 7566, 271, 53, 1466, 3458, 31645, 323, 75371, 13, 3488, 25, 650, 1466, 3458, 10837, 3201, 87625, 662, 220, 7566, 477, 2360, 30, 4320, 25, 2360, 271, 3112, 26349, 31816, 22110, 11, 315, 279, 549, 815, 13, 3488, 25, 763, 279, 549, 815, 2637, 1070, 374, 264, 1732, 7086, 26349, 31816, 22110, 13, 7566, 477, 2360, 30, 4320, 25, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256

In [25]:
# Iterate over the validation set and make sure the last token is a padding token
keep_counter = {}
keep_indices = []
for sample in eval_dataset:
    # assert sample["input_ids"][-1] == tokenizer.pad_token_id, sample["input_text"]
    if sample["input_ids"][-1] == tokenizer.pad_token_id:
        # the last position is a padding token
        keep_indices.append(sample["idx"])
# keep only those eval samples that fit into the context
keep_num_samples = len(keep_indices)
if keep_num_samples > 0:
    logger.info(f"Keeping {keep_num_samples} validation examples")
    print(f"Keeping {keep_num_samples} validation examples")
    eval_dataset = _select_subset_by_idx(eval_dataset, keep_indices)
    keep_counter["in-domain"] = keep_num_samples
else:
    logger.info("Skipping the current run. The prompt is too long.")
    print("Skipping the current run. The prompt is too long.")

Keeping 6692 validation examples


Filter:   0%|          | 0/6692 [00:00<?, ? examples/s]

In [26]:
# for ood
additional_evaluation_datasets_tmp = {}
for name, dataset in additional_evaluation_datasets.items():
    keep_indices = []
    for sample in dataset:
        # assert sample["input_ids"][-1] == tokenizer.pad_token_id, sample["input_text"]
        if sample["input_ids"][-1] == tokenizer.pad_token_id:
            keep_indices.append(sample["idx"])
    # keep only those eval samples that fit into the context
    keep_num_samples = len(keep_indices)
    if keep_num_samples > 0:
        logger.info(f"Keeping {keep_num_samples} validation examples")
        print(f"Keeping {keep_num_samples} validation examples")
        tmp_dataset = _select_subset_by_idx(dataset, keep_indices)
        additional_evaluation_datasets_tmp[name] = tmp_dataset
        keep_counter[name] = keep_num_samples
    else:
        logger.info("Skipping the current run. The prompt is too long.")
        print("Skipping the current run. The prompt is too long.")

additional_evaluation_datasets = additional_evaluation_datasets_tmp

Keeping 5000 validation examples


Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Keeping 5000 validation examples


Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [27]:
# Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
# we already did the padding.
if data_args.pad_to_max_length:
    data_collator = default_data_collator
elif training_args.fp16:
    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8)
else:
    data_collator = None

In [28]:
ft_args = FtArguments(
    head_only = False,
    bitfit = True,
    adapter_type: None,  #  "lora", "ia3", "parallel-attn", "parallel-fc", "parallel"
    use_adapters = False
)

In [29]:
trainer = FtTrainer(
    model=model,
    args=training_args,
    train_dataset=None,
    eval_dataset=None,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    data_args=data_args,
    ft_args = ft_args,
    eval_only=True
)


In [30]:
if training_args.do_eval:
    logger.info("*** In-context learning evaluation ***")

    # Get datasets
    eval_task_names = [data_args.task_name]
    eval_task_names += [task_name for task_name in additional_evaluation_datasets.keys()]
    eval_datasets = [eval_dataset]
    eval_datasets += [dataset for _,
                        dataset in additional_evaluation_datasets.items()]

    all_results = {}
    for task_name, dataset in zip(eval_task_names, eval_datasets):
        outputs = trainer.predict(
            dataset, metric_key_prefix=task_name, ignore_keys=["past_key_values"])
        predictions = outputs.predictions
        labels = outputs.label_ids
        metrics = outputs.metrics
        all_results = {**metrics, **all_results}
        # output_predict_file = os.path.join(
        #     training_args.output_dir, f"predict_results_{task}.txt")

    if trainer.is_world_process_zero():

        #     with open(output_predict_file, "w") as writer:
        #         logger.info(f"***** Predict results {task} *****")
        #         writer.write("index\tprediction\n")
        #         for index, item in enumerate(predictions):
        #             if is_regression:
        #                 writer.write(f"{index}\t{item:3.3f}\n")
        #             else:
        #                 item = label_list[item]
        #                 writer.write(f"{index}\t{item}\n")

        # Save everything to in a dataframe
        all_results = _add_args_to_results(in_context_args, all_results)
        all_results["indices"] = contex_indices
        all_results["context"] = context
        all_results["data_seed"] = training_args.data_seed
        all_results["keep_samples_in-domain"] = keep_counter["in-domain"]
        for name in additional_evaluation_datasets.keys():
            all_results[f"keep_samples_{name}"] = keep_counter[name]

        df = _create_df(all_results)
        file_name = f"llama3" + \
            f"_{data_args.task_name}" + \
            f"_{data_args.eval_task_name}"

        output_file = os.path.join(
            training_args.output_dir, f"{file_name}.csv")
        if os.path.exists(output_file):
            # if the file already exists, we append to it
            df.to_csv(output_file, mode='a', header=False)
        else:
            df.to_csv(output_file)

