# 1. IMPORT LIBRARIES

In [1]:
!pip install -q datasets accelerate
!pip install -q git+https://github.com/huggingface/transformers.git@main


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, OPTForCausalLM, GPT2Tokenizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [2]:

!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter


Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp_iouwrkz".


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

## 2. SET MAIN INPUTS FOR NOTEBOOK

In [4]:
bx_size = 8
format_train_val = 'gpt3'             # 'minimal' or 'gpt3
task_name = 'mnli'
model_name = "facebook/opt-1.3b"
examples_per_exp =  16
num_experiments = 10
num_validations = 1024


# model_name = "facebook/opt-125m"
# model_name = "facebook/opt-350m"
# model_name = "facebook/opt-1.3b"
# model_name = "facebook/opt-2.7b"
# model_name = "facebook/opt-6.7b"

## 3. SET DEVICE

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [6]:
import torch

# Check if CUDA (GPU support) is available
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")

# If CUDA is available, print the GPU name(s)
if cuda_available:
    print(f"GPU Name(s): {torch.cuda.get_device_name(0)}")

CUDA Available: True
GPU Name(s): NVIDIA A100-SXM4-40GB


In [7]:
device = torch.device("cuda")

device_count = torch.cuda.device_count()
if device_count > 0:
    print("Select GPU device")
    device = torch.device("cuda")
else:
    print("Select GPU device")
    device = torch.device("cpu")

print(device)
torch.cuda.is_available()

Select GPU device
cuda


True

## 4. IMPORT TOKENIZER AND INSTANTIATE SELECTED MODEL

In [8]:
# Choose model to work with:

# model_name = "facebook/opt-125m"
# model_name = "facebook/opt-350m"
# model_name = "facebook/opt-1.3b"
# model_name = "facebook/opt-2.7b"
# model_name = "facebook/opt-6.7b"

model_name = model_name # it is set up in top of NB

In [9]:
OPT_tokenizer = GPT2Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

In [10]:
# Create class as myBaseOPT_ICL to work with in-context learning set up

class myBaseOPT_ICL(nn.Module):

  def __init__(self, load_model_name = "facebook/opt-350m",model_max_tokens=2048, device = 'cuda'):
    super(myBaseOPT_ICL, self).__init__()

    self.model_max_tokens = model_max_tokens
    self.device = device

    self.coreOPT = OPTForCausalLM.from_pretrained(
    load_model_name,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    ).model

    self.lm_OPT_head = OPTForCausalLM.from_pretrained(
    load_model_name,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    ).lm_head

  def forward(self, src, attention_mask):

    src.to(device)
    attention_mask.to(device)

    core_outputs = self.coreOPT.forward(
        src,
        attention_mask=attention_mask
    )['last_hidden_state']

    final_outputs = self.lm_OPT_head.forward(core_outputs)

    return final_outputs

  def forward_generate(self, src, attention_mask):
    # forward used in generate_text function,
    # separated from forward function to avoid sending again to device to avoid any issues

    core_outputs = self.coreOPT.forward(
        src,
        attention_mask=attention_mask
    )['last_hidden_state']


    final_outputs = self.lm_OPT_head.forward(core_outputs)

    return final_outputs


  def generate_text(self, src_inputs, src_attn, gen_tokens=torch.tensor(1)):


    src_len = src_inputs.shape[1]

    gen_tokens = gen_tokens.item()


    outputs = torch.zeros((src_inputs.shape[0], src_inputs.shape[1] + gen_tokens), dtype=torch.long).to(self.device)
    att_mask = torch.zeros((src_attn.shape[0], src_attn.shape[1] + gen_tokens), dtype=torch.long).to(self.device)

    outputs[:,0:src_inputs.shape[1]] = src_inputs
    att_mask[:,0:src_attn.shape[1]] = src_attn

    for t_step in range(gen_tokens):

      all_scores = self.forward_generate(outputs[:,0:src_inputs.shape[1]+t_step], att_mask[:,0:src_attn.shape[1]+t_step])

      new_tokens = torch.argmax(all_scores[:,-1,:], dim=1)

      outputs[:,src_inputs.shape[1]+t_step] = new_tokens
      att_mask[:,src_attn.shape[1]+t_step] = 1

    # Yes token = 9904
    # No token = 3084
    binary_yes_no = torch.zeros(all_scores.shape[0]).half().to(self.device)
    binary_yes_no[:] = all_scores[:,-1,9904] - all_scores[:,-1,3084]

    binary_yes_no[binary_yes_no >= 0] = 0 # or 9904 if token used "Yes"
    binary_yes_no[binary_yes_no < 0] = 1 # or 3084 if token used "No"

    last_scores = all_scores[:,-1,:]

    return outputs, binary_yes_no, last_scores


In [11]:
# Instantiate model and send to selected device
example_myBaseOPT_ICL = myBaseOPT_ICL(load_model_name = model_name)
example_myBaseOPT_ICL.half()
example_myBaseOPT_ICL.to(device)
example_myBaseOPT_ICL

pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

myBaseOPT_ICL(
  (coreOPT): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 2048, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 2048)
      (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=2048, bias=True)
          (final_layer_norm): Layer

In [12]:
# Check total number of model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Model Parameters: ", count_parameters(example_myBaseOPT_ICL))

Model Parameters:  1418715136


In [13]:
# Check base model output correctness (from OPT HF example)
prompt_example = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
              "there?")
prompt_example_tokenized = OPT_tokenizer(prompt_example)
example_myBaseOPT_ICL.eval()
outputs_ex_sp, binary_ex_sp, scores_ex_sp = example_myBaseOPT_ICL.generate_text(src_inputs = torch.unsqueeze(torch.tensor(prompt_example_tokenized['input_ids']),0).to(device),
                                    src_attn = torch.unsqueeze(torch.tensor(prompt_example_tokenized['attention_mask']),0).to(device),
                                    gen_tokens=torch.tensor(200).to(device))
OPT_tokenizer.batch_decode(outputs_ex_sp)


['</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for over 100 years.\nHuman: What do you do?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue: I am a tourist attraction.\nHuman: What do you do when you are not a tourist attraction?\nStatue:']

## 5. IMPORT NLI DATASET FOR IN CONTEXT LEARNING (MNLI)

In [14]:
# reference: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
# this reference is useful for cleaning the neutral sentences of the dataset, just keeping the 0 and 1.

In [15]:
from collections import Counter
from datasets import load_dataset, ClassLabel

In [16]:
# this comes from original paper, to remove neutral examples from MNLI
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset


In [17]:
# Select GLUE task (set it at the top of NB for simplicity with all other inputs)

# task_name = "rte"
# task_name = "mnli"
# task_name = "qqp"
# task_name = "cola"

task_name = task_name

In [18]:
dataset = load_dataset("glue", task_name)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [19]:
# binarize dataset
if task_name == "mnli":
    dataset = binarize_mnli(dataset, remove_neutral=True) # mnli


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/261802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [20]:
# analyze and visualize dataset imported

print("task_name:", task_name)
# for split in ["train", "validation"]:
for split in ["train", "validation_matched"]:
    c = Counter(dataset[split]["label"])
    total = len(list(c.elements()))
    print("Total number of samples:", total)
    print(split)
    for k in c:
        print(f"fraction of labels per class: {k}={c[k] / total}")
print(dataset)

task_name: mnli
Total number of samples: 261802
train
fraction of labels per class: 0=0.49999236063895613
fraction of labels per class: 1=0.5000076393610439
Total number of samples: 6692
validation_matched
fraction of labels per class: 1=0.4801255230125523
fraction of labels per class: 0=0.5198744769874477
DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 261802
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 6692
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 6703
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})


In [21]:
# Perform the filters and splits from the original datasets


random_split_seed = 42

examples_per_exp =  examples_per_exp # 16
num_experiments = num_experiments # 10
num_validations = num_validations # 16*64 #64*16 = 1024 #6692

max_train_samples = examples_per_exp*num_experiments
train_dataset = dataset['train']

train_dataset_yes = dataset['train'].filter(lambda example: example["label"] == 0)
train_dataset_no = dataset['train'].filter(lambda example: example["label"] == 1)

val_dataset = dataset['validation_matched']

# randomly select a subset of the training data
max_train_samples = min(len(train_dataset), max_train_samples)

np.random.seed(random_split_seed)
indices_yes = np.random.choice(range(len(train_dataset_yes)), size=int(max_train_samples/2), replace=False)

np.random.seed(random_split_seed+1)
indices_no = np.random.choice(range(len(train_dataset_no)), size=int(max_train_samples/2), replace=False)

np.random.seed(random_split_seed+2)
indices_val = np.random.choice(range(len(val_dataset)), size=num_validations, replace=False)

train_dataset_yes = train_dataset_yes.select(indices_yes)
train_dataset_no = train_dataset_no.select(indices_no)

val_dataset = val_dataset.select(indices_val)
print("Train Dataset Yes: ", train_dataset_yes)
print("Train Dataset No: ", train_dataset_no)
print("Validation Dataset (in-domain): ", val_dataset)

Filter:   0%|          | 0/261802 [00:00<?, ? examples/s]

Filter:   0%|          | 0/261802 [00:00<?, ? examples/s]

Train Dataset Yes:  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 80
})
Train Dataset No:  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 80
})
Validation Dataset (in-domain):  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 1024
})


In [22]:
# Calculate the number of 0 and 1 in validation dataset
# and calculate the majority class accuracy

val_dataset_yes = val_dataset.filter(lambda example: example["label"] == 0)
val_dataset_no = val_dataset.filter(lambda example: example["label"] == 1)
print(val_dataset_yes)
print(val_dataset_no)
print("Majority Class Accuracy: ", 100*max(len(val_dataset_yes), len(val_dataset_no))/(len(val_dataset_yes) + len(val_dataset_no)))

Filter:   0%|          | 0/1024 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1024 [00:00<?, ? examples/s]

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 536
})
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 488
})
Majority Class Accuracy:  52.34375


In [23]:
# This is specific for ICL
# format examples functions formats according to different types of formats for ICL both training and validation examples

# select format to use here:
format_train_val = format_train_val # set it at the top of notebook in a common place


def format_examples_train(example_train, format_train=format_train_val):
    # format examples of train data for ICL
    # select format

    if format_train == 'minimal':
      # "minimal" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠNo \n\n"}
    elif format_train == 'gpt3':
      # "gpt-3" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠNo \n\n"}

def format_examples_validation(example_val, format_val = format_train_val):
    if format_val== 'minimal':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} {" + example_val['hypothesis'] + "}" + " ? Ġ"}
    elif format_val== 'gpt3':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} question: {" + example_val['hypothesis'] + "}" + " Yes or No? answer: Ġ"}

def create_combined_dataset(train_ds_yes, train_ds_no, val_dataset, num_expts=num_experiments, num_train_examples=examples_per_exp):
    combined_dataset = []
    train_examples_yes = [example for example in train_ds_yes]
    train_examples_no = [example for example in train_ds_no]

    for irep in range(num_expts):
      for val_ex in val_dataset:
            sampled_train_exs_yes = train_examples_yes[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
            sampled_train_exs_no = train_examples_no[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
            # for random option if used below
            merged_sampled_train_exs = sampled_train_exs_yes + sampled_train_exs_no
            shuffled_list = merged_sampled_train_exs.copy()
            # Shuffle the copy
            random.seed(irep)
            random.shuffle(shuffled_list)

            combined_ex = {'text': '', 'label': val_ex['label'], 'exp': irep}

            # Way 1: set examples Yes, No, Yes, No, ...
            '''
            for idx_train in range(len(sampled_train_exs_yes)):
              # put order one Yes and another No consecutively
              combined_ex['text'] += sampled_train_exs_yes[idx_train]['text']
              combined_ex['text'] += sampled_train_exs_no[idx_train]['text']
            '''

            # Way 2: set randomized
            for idx_shuffled_list in range(len(shuffled_list)):
              # random option
              combined_ex['text'] += shuffled_list[idx_shuffled_list]['text']

            # Add the example to predict (validation)
            combined_ex['text'] += val_ex['text']

            # Append the new combined example to the combined dataset
            combined_dataset.append([combined_ex])

    return combined_dataset


def dynamic_padding_collate_fn(batch):
    # This function is created to be able to tokenize dynamically to max length within each batch
    # Also, by modifying the tokenizer used, several other options are available
    # for example, if we set padding to a specified max_length, for example the model max_length, is also an option, not the default though
    # the default is the dynamic padding

    batch = [item for sublist in batch for item in sublist]

    texts = [item['text'] for item in batch]
    labels = [item['label'] for item in batch]

    # choose option
    tokenized_inputs = OPT_tokenizer(texts, padding="longest", truncation=True, return_tensors="pt")
    # tokenized_inputs = OPT_tokenizer(texts, padding="max_length", max_length = 2048, truncation=True, return_tensors="pt")

    labels_tensor = torch.tensor(labels, dtype=torch.long)


    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels_tensor
    }


class CustomDataset(Dataset):
    def __init__(self, combined_dataset):
        self.dataset = combined_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]


In [24]:
# First the samples are formatted according to selection above

formatted_train_dataset_yes = train_dataset_yes.map(format_examples_train)
formatted_train_dataset_no = train_dataset_no.map(format_examples_train)
formatted_val_dataset = val_dataset.map(format_examples_validation)


combined_dataset = create_combined_dataset(formatted_train_dataset_yes, formatted_train_dataset_no, formatted_val_dataset)
custom_dataset = CustomDataset(combined_dataset)
print(custom_dataset)

bx_size = bx_size # set it up at the beg of NB
dataloader = DataLoader(custom_dataset, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn, shuffle=False) #shuffle=False for reproducibility

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

<__main__.CustomDataset object at 0x7fe1e0b1ff40>


In [25]:

for i, batch in enumerate(dataloader):
    if i<10:
      print("ORIGINAL: ", i, batch)
      print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ORIGINAL:  0 {'input_ids': tensor([[    2, 45152,   133,  ...,     1,     1,     1],
        [    2, 45152,   133,  ...,     1,     1,     1],
        [    2, 45152,   133,  ...,     1,     1,     1],
        ...,
        [    2, 45152,   133,  ...,     1,     1,     1],
        [    2, 45152,   133,  ...,     1,     1,     1],
        [    2, 45152,   133,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 0, 0, 0, 0])}
TOKENIZE / DETOKENIZE:  ["</s>{The chief complaint of reformers these days is that the power of special-interest money is breeding public cynicism about the political process.} question: {Reformers never complain about special interest money.  } Yes or No? answer: ĠNo \n\n{The game's up.} question: {The game keeps going.} Yes o

# 6. EVALUATION OF IN CONTEXT LEARNING FOR IN-DOMAIN PERFORMANCE

In [26]:


example_myBaseOPT_ICL.eval()

model_pred = torch.zeros(0, dtype=torch.long).to(device)
ground_truth = torch.zeros(0, dtype=torch.long).to(device)

with torch.no_grad():
    for i, batch in enumerate(dataloader):


        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        gen_tokens = torch.tensor(1)
        gen_tokens = gen_tokens.to(device)


        # output only the binary yes/no,
        _, binary_yes_no, _ = example_myBaseOPT_ICL.generate_text(input_ids, attention_mask, gen_tokens=gen_tokens)
        model_pred = torch.cat((model_pred, binary_yes_no), dim=0)
        ground_truth = torch.cat((ground_truth, batch['labels'].to(device)), dim=0)

        #print("BATCH#: ", i, "NUM EXPTS TOTAL: ", (i+1)*bx_size, "PREDICTION: ", binary_yes_no, "TRUE LABELS: ", batch['labels'].detach())



In [27]:
# Evaluate results beyond accuracy:

print("YES answer MODEL (%): ", ((torch.sum(model_pred==0))/len(model_pred)).item()*100)
print("NO  answer MODEL (%): ", ((torch.sum(model_pred==1))/len(model_pred)).item()*100)

print("YES answer LABEL (%): ", ((torch.sum(ground_truth==0))/len(ground_truth)).item()*100)
print("NO  answer LABEL (%): ", ((torch.sum(ground_truth==1))/len(ground_truth)).item()*100)


YES answer MODEL (%):  14.658203721046448
NO  answer MODEL (%):  85.34179925918579
YES answer LABEL (%):  52.34375
NO  answer LABEL (%):  47.65625


In [28]:
# For each of the experiments, typically 10, the accuracy is calculated
# Results are printed out

list_acc = []

for exp_i in range(num_experiments):

  model_pred_expi = model_pred[exp_i*num_validations : (exp_i +1)*num_validations]
  ground_truth_expi = ground_truth[exp_i*num_validations : (exp_i +1)*num_validations]

  accuracy_calc = 100*torch.sum(model_pred_expi == ground_truth_expi)/(model_pred_expi.shape[0])
  list_acc.append(accuracy_calc.item())

  print("Experiment#: ", exp_i+1, "Accuracy: ", accuracy_calc.item())
print("Average Accuracy: ", np.mean(list_acc))

Experiment#:  1 Accuracy:  53.7109375
Experiment#:  2 Accuracy:  54.58984375
Experiment#:  3 Accuracy:  53.41796875
Experiment#:  4 Accuracy:  58.49609375
Experiment#:  5 Accuracy:  52.83203125
Experiment#:  6 Accuracy:  48.6328125
Experiment#:  7 Accuracy:  52.24609375
Experiment#:  8 Accuracy:  56.0546875
Experiment#:  9 Accuracy:  55.37109375
Experiment#:  10 Accuracy:  54.78515625
Average Accuracy:  54.013671875


# 7. OOD VALIDATION: HANS DATASET

In [29]:
dataset_ood = load_dataset("hans")
dataset_ood

Downloading data:   0%|          | 0.00/3.14M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/30000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
        num_rows: 30000
    })
})

In [30]:
dataset_ood_val = (dataset_ood['validation']).filter(lambda example: example["heuristic"] == 'lexical_overlap')

Filter:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [31]:
# Perform the filters and splits from the original datasets


random_split_seed_ood = 42 # set above, equal to 42

examples_per_exp =  examples_per_exp # 16
num_experiments = num_experiments # 10
num_validations = num_validations # 16*64 #64*16 = 1024 #6692

np.random.seed(random_split_seed_ood)
indices_ood_val = np.random.choice(range(len(dataset_ood_val)), size=num_validations, replace=False)
print("indices_ood_val: ", indices_ood_val)

dataset_ood_val_sel = dataset_ood_val.select(indices_ood_val)
dataset_ood_val_sel

indices_ood_val:  [6252 4684 1731 ... 9410 1671  474]


Dataset({
    features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
    num_rows: 1024
})

In [32]:
# format examples functions formats according to different types of formats for ICL both training and validation examples

# select format to use here:
format_train_val = format_train_val # set it at the top of notebook in a common place

def format_examples_train_VALOOD(example_train, format_train=format_train_val):
    # format examples of train data for ICL
    # select format

    if format_train == 'minimal':
      # "minimal" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠNo \n\n"}
    elif format_train == 'gpt3':
      # "gpt-3" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠNo \n\n"}

def format_examples_validation_VALOOD(example_val, format_val = format_train_val):
    if format_val== 'minimal':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} {" + example_val['hypothesis'] + "}" + " ? Ġ"}
    elif format_val== 'gpt3':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} question: {" + example_val['hypothesis'] + "}" + " Yes or No? answer: Ġ"}


def create_combined_dataset_VALOOD(train_ds_yes, train_ds_no, val_dataset, num_expts=num_experiments, num_train_examples=examples_per_exp):
    combined_dataset = []
    train_examples_yes = [example for example in train_ds_yes]
    train_examples_no = [example for example in train_ds_no]

    for irep in range(num_expts):
      for val_ex in val_dataset:
            sampled_train_exs_yes = train_examples_yes[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)] # random.sample(train_examples, num_train_examples)
            sampled_train_exs_no = train_examples_no[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
            # for random option if used below
            merged_sampled_train_exs = sampled_train_exs_yes + sampled_train_exs_no
            shuffled_list = merged_sampled_train_exs.copy()
            # Shuffle the copy
            random.seed(irep)
            random.shuffle(shuffled_list)

            combined_ex = {'text': '', 'label': val_ex['label'], 'exp': irep+1}

            # Way 1: set examples Yes, No, Yes, No, ...
            '''
            for idx_train in range(len(sampled_train_exs_yes)):
              # put order one Yes and another No consecutively
              combined_ex['text'] += sampled_train_exs_yes[idx_train]['text']
              combined_ex['text'] += sampled_train_exs_no[idx_train]['text']
            '''

            # Way 2: set randomized
            for idx_shuffled_list in range(len(shuffled_list)):
              # random option
              combined_ex['text'] += shuffled_list[idx_shuffled_list]['text']

            # Add the example to predict (validation)
            combined_ex['text'] += val_ex['text']

            # Append the new combined example to the combined dataset
            combined_dataset.append([combined_ex])

    return combined_dataset


def dynamic_padding_collate_fn_VALOOD(batch):
    # This function is created to be able to tokenize dynamically to max length within each batch
    # Also, by modifying the tokenizer used, several other options are available
    # for example, if we set padding to a specified max_length, for example the model max_length, is also an option, not the default though
    # the default is the dynamic padding

    batch = [item for sublist in batch for item in sublist]

    texts = [item['text'] for item in batch]
    labels = [item['label'] for item in batch]
    exps = [item['exp'] for item in batch]

    # choose option
    tokenized_inputs = OPT_tokenizer(texts, padding="longest", truncation=True, return_tensors="pt")
    # tokenized_inputs = OPT_tokenizer(texts, padding="max_length", max_length = 2048, truncation=True, return_tensors="pt")

    labels_tensor = torch.tensor(labels, dtype=torch.long)
    exps_tensor = torch.tensor(exps, dtype=torch.long).to(device)

    # return here the outputs desired
    # we have chosen the input_ids, attention_mask, label of the validation samples
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels_tensor,
        'exps': exps_tensor
    }


class CustomDataset(Dataset):
    def __init__(self, combined_dataset):
        self.dataset = combined_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]


In [33]:


formatted_train_dataset_yes = train_dataset_yes.map(format_examples_train)
formatted_train_dataset_no = train_dataset_no.map(format_examples_train)
formatted_val_dataset_ood = dataset_ood_val_sel.map(format_examples_validation_VALOOD)


combined_dataset_VALOOD = create_combined_dataset_VALOOD(
                                          formatted_train_dataset_yes,
                                          formatted_train_dataset_no,
                                          val_dataset = formatted_val_dataset_ood,
                                          num_expts=num_experiments
                                           )

custom_dataset_VALOOD = CustomDataset(combined_dataset_VALOOD)
print(custom_dataset_VALOOD)

# Last step, we create Dataloader passing the bx_size for inference (typically: 1, 4, 8, 16)
bx_size = bx_size # set it up at the beg of NB
dataloader_VALOOD = DataLoader(custom_dataset_VALOOD, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn_VALOOD, shuffle=False) #shuffle=False for reproducibility

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

<__main__.CustomDataset object at 0x7fe1cc0309a0>


In [34]:
# Set eval model for inference
# initialize to store results of model predictions and compare with ground-truth
# use generate text with only one token
# extract only the max score token YES (0 label) or NO (1 label)

example_myBaseOPT_ICL.eval()

model_pred_ood = torch.zeros(0, dtype=torch.long).to(device)
ground_truth_ood = torch.zeros(0, dtype=torch.long).to(device)

with torch.no_grad():
    for i, batch in enumerate(dataloader_VALOOD):


        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        gen_tokens = torch.tensor(1)
        gen_tokens = gen_tokens.to(device)


        # output only the binary yes/no,
        _, binary_yes_no, _ = example_myBaseOPT_ICL.generate_text(input_ids, attention_mask, gen_tokens=gen_tokens)
        model_pred_ood = torch.cat((model_pred_ood, binary_yes_no), dim=0)
        ground_truth_ood = torch.cat((ground_truth_ood, batch['labels'].to(device)), dim=0)

        #print("BATCH#: ", i, "NUM EXPTS TOTAL: ", (i+1)*bx_size, "PREDICTION: ", binary_yes_no, "TRUE LABELS: ", batch['labels'].detach())



In [35]:
# Evaluate results beyond accuracy:

print("YES answer MODEL OOD (%): ", ((torch.sum(model_pred_ood==0))/len(model_pred_ood)).item()*100)
print("NO  answer MODEL OOD (%): ", ((torch.sum(model_pred_ood==1))/len(model_pred_ood)).item()*100)

print("YES answer LABEL OOD (%): ", ((torch.sum(ground_truth_ood==0))/len(ground_truth_ood)).item()*100)
print("NO  answer LABEL OOD (%): ", ((torch.sum(ground_truth_ood==1))/len(ground_truth_ood)).item()*100)


YES answer MODEL OOD (%):  23.896484076976776
NO  answer MODEL OOD (%):  76.1035144329071
YES answer LABEL OOD (%):  50.5859375
NO  answer LABEL OOD (%):  49.4140625


In [36]:
# For each of the experiments, typically 10, the accuracy is calculated
# Results are printed out

list_acc_ood = []

for exp_i in range(num_experiments):

  model_pred_ood_expi = model_pred_ood[exp_i*num_validations : (exp_i +1)*num_validations]
  ground_truth_ood_expi = ground_truth_ood[exp_i*num_validations : (exp_i +1)*num_validations]

  accuracy_calc_ood = 100*torch.sum(model_pred_ood_expi == ground_truth_ood_expi)/(model_pred_ood_expi.shape[0])
  list_acc_ood.append(accuracy_calc_ood.item())

  print("Experiment#: ", exp_i+1, "Accuracy OOD: ", accuracy_calc_ood.item())
print("Average Accuracy OOD: ", np.mean(list_acc_ood))

Experiment#:  1 Accuracy OOD:  53.90625
Experiment#:  2 Accuracy OOD:  54.00390625
Experiment#:  3 Accuracy OOD:  53.3203125
Experiment#:  4 Accuracy OOD:  50.09765625
Experiment#:  5 Accuracy OOD:  54.00390625
Experiment#:  6 Accuracy OOD:  49.609375
Experiment#:  7 Accuracy OOD:  54.78515625
Experiment#:  8 Accuracy OOD:  53.515625
Experiment#:  9 Accuracy OOD:  54.1015625
Experiment#:  10 Accuracy OOD:  52.63671875
Average Accuracy OOD:  52.998046875
