# 1. IMPORT LIBRARIES

In [None]:

!pip install -q datasets
!pip install -q transformers==4.33.1


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gc
from torch.cuda.amp import autocast, GradScaler


from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, OPTForCausalLM, GPT2Tokenizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[?25h

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [None]:

!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter


Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp9bjtuptt".


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

## 2. SET MAIN INPUTS FOR NOTEBOOK

In [None]:
bx_size = 1
format_train_val = 'gpt3'
task_name = 'mnli'
model_name = "facebook/opt-350m"
examples_per_exp =  16
num_experiments = 10
num_validations = 1024
num_reasoning_context_per_example = 16
num_repeats = 100

# for context distillation
number_max_probs_match = 50


# model_name = "facebook/opt-125m"
# model_name = "facebook/opt-350m"
# model_name = "facebook/opt-1.3b"
# model_name = "facebook/opt-2.7b"
# model_name = "facebook/opt-6.7b"

## 3. SET DEVICE

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import torch

# Check if CUDA (GPU support) is available
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")

# If CUDA is available, print the GPU name(s)
if cuda_available:
    print(f"GPU Name(s): {torch.cuda.get_device_name(0)}")

CUDA Available: True
GPU Name(s): NVIDIA A100-SXM4-40GB


In [None]:
device = torch.device("cuda")

device_count = torch.cuda.device_count()
if device_count > 0:
    print("Select GPU device")
    device = torch.device("cuda")
else:
    print("Select GPU device")
    device = torch.device("cpu")

print(device)
torch.cuda.is_available()

Select GPU device
cuda


True

## 4. IMPORT TOKENIZER AND INSTANTIATE LLM MODEL SELECTED

In [None]:
# Choose model to work with:

# model_name = "facebook/opt-125m"
# model_name = "facebook/opt-350m"
# model_name = "facebook/opt-1.3b"
# model_name = "facebook/opt-2.7b"
# model_name = "facebook/opt-6.7b"

model_name = model_name # it is set up in top of NB

In [None]:
OPT_tokenizer = GPT2Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

In [None]:
# Create class as myBaseOPT_ICL to work with in-context learning set up

class myBaseOPT_CD(nn.Module):

  def __init__(self, load_model_name = "facebook/opt-350m",model_max_tokens=2048, device = 'cuda'):
    super(myBaseOPT_CD, self).__init__()

    self.model_max_tokens = model_max_tokens
    self.device = device

    self.coreOPT = OPTForCausalLM.from_pretrained(
    load_model_name,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    ).model

    self.lm_OPT_head = OPTForCausalLM.from_pretrained(
    load_model_name,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    ).lm_head

  def forward(self, src, attention_mask):

    core_outputs = self.coreOPT.forward(
        src,
        attention_mask=attention_mask
    )['last_hidden_state']

    final_outputs = self.lm_OPT_head.forward(core_outputs)

    return final_outputs

  def forward_generate(self, src, attention_mask):
    # forward used in generate_text function,
    # separated from forward function to avoid sending again to device to avoid any issues

    core_outputs = self.coreOPT.forward(
        src,
        attention_mask=attention_mask
    )['last_hidden_state']


    final_outputs = self.lm_OPT_head.forward(core_outputs)

    return final_outputs


  def generate_text(self, src_inputs, src_attn, gen_tokens=torch.tensor(1)):


    src_len = src_inputs.shape[1]

    gen_tokens = gen_tokens.item()


    outputs = torch.zeros((src_inputs.shape[0], src_inputs.shape[1] + gen_tokens), dtype=torch.long).to(self.device)
    att_mask = torch.zeros((src_attn.shape[0], src_attn.shape[1] + gen_tokens), dtype=torch.long).to(self.device)

    outputs[:,0:src_inputs.shape[1]] = src_inputs
    att_mask[:,0:src_attn.shape[1]] = src_attn

    for t_step in range(gen_tokens):

      all_scores = self.forward_generate(outputs[:,0:src_inputs.shape[1]+t_step], att_mask[:,0:src_attn.shape[1]+t_step])

      new_tokens = torch.argmax(all_scores[:,-1,:], dim=1)

      outputs[:,src_inputs.shape[1]+t_step] = new_tokens
      att_mask[:,src_attn.shape[1]+t_step] = 1

    # Yes token = 9904
    # No token = 3084
    binary_yes_no = torch.zeros(all_scores.shape[0]).half().to(self.device)
    binary_yes_no[:] = all_scores[:,-1,9904] - all_scores[:,-1,3084]

    binary_yes_no[binary_yes_no >= 0] = 0 # or 9904 if token used "Yes"
    binary_yes_no[binary_yes_no < 0] = 1 # or 3084 if token used "No"

    last_scores = all_scores[:,-1,:]

    return outputs, binary_yes_no, last_scores


In [None]:
# Instantiate model and send to selected device
example_myBaseOPT_CD = myBaseOPT_CD(load_model_name = model_name, device = device)
example_myBaseOPT_CD.half()
example_myBaseOPT_CD.to(device)

for param in example_myBaseOPT_CD.parameters():
  if param.ndim <=2:
    param.data = param.data.to(torch.float32)

example_myBaseOPT_CD

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

myBaseOPT_CD(
  (coreOPT): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [None]:
# Check total number of model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Model Parameters: ", count_parameters(example_myBaseOPT_CD))

Model Parameters:  356935680


In [None]:

# Check base model output correctness (from OPT HF example)
prompt_example = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
              "there?")
prompt_example_tokenized = OPT_tokenizer(prompt_example )
example_myBaseOPT_CD.eval()
outputs_ex_sp, binary_ex_sp, scores_ex_sp = example_myBaseOPT_CD.generate_text(src_inputs = torch.unsqueeze(torch.tensor(prompt_example_tokenized['input_ids']),0).to(device),
                                    src_attn = torch.unsqueeze(torch.tensor(prompt_example_tokenized['attention_mask']),0).to(device),
                                    gen_tokens=torch.tensor(100).to(device))
OPT_tokenizer.batch_decode(outputs_ex_sp)



['</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love to eat at the New York City Subway.\nHuman: What is your favorite movie?\nStatue: I love to watch movies.\nHuman: What is your favorite book?\nStatue: I love to read books.\nHuman: What is your favorite movie?\nStatue: I love to watch movies.\nHuman: What']

## 5. IMPORT NLI DATASET FOR TRAINING AND VALIDATION: MNLI

In [None]:
# reference: https://github.com/uds-lsv/llmft/blob/main/notebooks/majority_baseline.ipynb
# this reference is useful for cleaning the neutral sentences of the dataset, just keeping the 0 and 1.

In [None]:
from collections import Counter
from datasets import load_dataset, ClassLabel

In [None]:
# this comes from original paper, to remove neutral examples from MNLI
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example

    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features

    return dataset


In [None]:
# Select GLUE task (set it at the top of NB for simplicity with all other inputs)

# task_name = "rte"
# task_name = "mnli"
# task_name = "qqp"
# task_name = "cola"

task_name = task_name

In [None]:
dataset = load_dataset("glue", task_name)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [None]:
# binarize dataset
if task_name == "mnli":
    dataset = binarize_mnli(dataset, remove_neutral=True) # mnli


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9832 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9796 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/261802 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6692 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6703 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9796 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [None]:
# analyze and visualize dataset imported

print("task_name:", task_name)
# for split in ["train", "validation"]:
for split in ["train", "validation_matched"]:
    c = Counter(dataset[split]["label"])
    total = len(list(c.elements()))
    print("Total number of samples:", total)
    print(split)
    for k in c:
        print(f"fraction of labels per class: {k}={c[k] / total}")
print(dataset)

task_name: mnli
Total number of samples: 261802
train
fraction of labels per class: 0=0.49999236063895613
fraction of labels per class: 1=0.5000076393610439
Total number of samples: 6692
validation_matched
fraction of labels per class: 1=0.4801255230125523
fraction of labels per class: 0=0.5198744769874477
DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 261802
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 6692
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 6703
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})


In [None]:
# Perform the filters and splits from the original datasets


random_split_seed = 42

examples_per_exp =  examples_per_exp # 16
num_experiments = num_experiments # 10
num_validations = num_validations # 16*64 #64*16 = 1024 #6692

max_train_samples = examples_per_exp*num_experiments
train_dataset = dataset['train']
print(train_dataset)

train_dataset_yes_all = dataset['train'].filter(lambda example: example["label"] == 0)
train_dataset_no_all = dataset['train'].filter(lambda example: example["label"] == 1)
print(train_dataset_yes_all)
print(train_dataset_no_all)

val_dataset_all = dataset['validation_matched']

# randomly select a subset of the training data
max_train_samples = min(len(train_dataset), max_train_samples)

np.random.seed(random_split_seed)
indices_yes = np.random.choice(range(len(train_dataset_yes_all)), size=int(max_train_samples/2), replace=False)
#print("indices_yes: ", indices_yes)

np.random.seed(random_split_seed+1)
indices_no = np.random.choice(range(len(train_dataset_no_all)), size=int(max_train_samples/2), replace=False)
#print("indices_no: ", indices_no)

np.random.seed(random_split_seed+2)
indices_val = np.random.choice(range(len(val_dataset_all)), size=num_validations, replace=False)
#print("indices_val: ", indices_val)

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 261802
})


Filter:   0%|          | 0/261802 [00:00<?, ? examples/s]

Filter:   0%|          | 0/261802 [00:00<?, ? examples/s]

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 130899
})
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 130903
})


In [None]:
train_dataset_yes = train_dataset_yes_all.select(indices_yes)
train_dataset_no = train_dataset_no_all.select(indices_no)

val_dataset = val_dataset_all.select(indices_val)
print("Train Dataset Yes: ", train_dataset_yes)
print("Train Dataset No: ", train_dataset_no)
print("Validation Dataset (in-domain): ", val_dataset)

Train Dataset Yes:  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 80
})
Train Dataset No:  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 80
})
Validation Dataset (in-domain):  Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 1024
})


In [None]:

# Combine indices of selected subsets

used_indices_yes_no = train_dataset_yes['idx'] + train_dataset_no['idx']

train_ind_remaining = [i for i, element in enumerate(train_dataset['idx']) if element not in used_indices_yes_no]

train_dataset_remaining = train_dataset.select(train_ind_remaining)

print("train_dataset_remaining:", len(train_dataset_remaining))
print(train_dataset_remaining)

train_dataset_remaining_yes_all = train_dataset_remaining.filter(lambda example: example["label"] == 0)
train_dataset_remaining_no_all = train_dataset_remaining.filter(lambda example: example["label"] == 1)

print("train_dataset_remaining_yes_all:", len(train_dataset_remaining_yes_all))
print(train_dataset_remaining_yes_all)
#print(train_dataset_remaining_yes_all['idx'])

print("train_dataset_remaining_no_all:", len(train_dataset_remaining_no_all))
print(train_dataset_remaining_no_all)
#print(train_dataset_remaining_no_all['idx'])



train_dataset_remaining: 261642
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 261642
})


Filter:   0%|          | 0/261642 [00:00<?, ? examples/s]

Filter:   0%|          | 0/261642 [00:00<?, ? examples/s]

train_dataset_remaining_yes_all: 130819
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 130819
})
train_dataset_remaining_no_all: 130823
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 130823
})


In [None]:
num_reasoning_context_per_example = num_reasoning_context_per_example
total_to_select_CD_YES = int(num_repeats*num_experiments*examples_per_exp*num_reasoning_context_per_example/2)
total_to_select_CD_NO = int(num_repeats*num_experiments*examples_per_exp*num_reasoning_context_per_example/2)
print("total_to_select_CD_YES: ", total_to_select_CD_YES)
print("total_to_select_CD_NO: ", total_to_select_CD_NO)

total_to_select_CD_YES:  128000
total_to_select_CD_NO:  128000


In [None]:
random_split_seed_CD = 100

np.random.seed(random_split_seed_CD)
indices_rem_yes = np.random.choice(range(len(train_dataset_remaining_yes_all)), size=total_to_select_CD_YES, replace=False)
print("indices_yes: ", indices_yes)

np.random.seed(random_split_seed_CD+1)
indices_rem_no = np.random.choice(range(len(train_dataset_remaining_no_all)), size=total_to_select_CD_NO, replace=False)
print("indices_no: ", indices_no)

train_dataset_CD_yes = train_dataset_remaining_yes_all.select(indices_rem_yes)
train_dataset_CD_no = train_dataset_remaining_no_all.select(indices_rem_no)

print("Train Dataset CD Yes: ", train_dataset_CD_yes)
print("Train Dataset CD No: ", train_dataset_CD_no)

indices_yes:  [108195  86013  39482  39689  10288  11589  94511  78690  36953  74067
  93678  83921  83896  21665  76736    651  48482  40811 127490  49367
 121664  39918  60933 126502  65765  12966  33438   7201  19815  49187
  29116  48565 125127  60274  33985 130032 104535 120345 104033  44914
  89806  87143 103906  15697  29521   4906  46884  75442  57625  32365
  70562  78463  18684  45639  30223 118624  40945  75797  63681  77117
  16126 130579   2132 113346  68080   7433 120366 122242  75493  64389
  95467  86480  52323  42308 101738  51386 126981  27346  45655 121440]
indices_no:  [ 54039  34647  34994 102702  14063 110662  33077  24477  24337  19083
  61263 109299 107760  88071  22063  90740 113958   9163  45235  32885
  58399  59560 102582  10964  38283  16146  72067  55788  60576  21220
  41478 123489  38278  15117  71374  69791  39777 122448  10098  35761
  74547 109598  19072  61567  56626 102957  18014  14118  46250 117891
  87958 113798 107148 121622  88599   8239 119796

In [None]:
# Calculate the number of 0 and 1 in validation dataset
# and calculate the majority class accuracy

val_dataset_yes = val_dataset.filter(lambda example: example["label"] == 0)
val_dataset_no = val_dataset.filter(lambda example: example["label"] == 1)
print(val_dataset_yes)
print(val_dataset_no)
print("Majority Class Accuracy: ", 100*max(len(val_dataset_yes), len(val_dataset_no))/(len(val_dataset_yes) + len(val_dataset_no)))

Filter:   0%|          | 0/1024 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1024 [00:00<?, ? examples/s]

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 536
})
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 488
})
Majority Class Accuracy:  52.34375


In [None]:


# select format to use here:
format_train_val = format_train_val # set it at the top of notebook in a common place


def format_examples_CD_train(example_train, format_train=format_train_val):
    # format examples of train data for ICL
    # select format

    if format_train == 'minimal':
      # "minimal" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} {" + example_train['hypothesis'] + "}" + " ? ĠNo \n\n"}
    elif format_train == 'gpt3':
      # "gpt-3" format
      if example_train['label'] == 0:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠYes \n\n"}
      elif example_train['label'] == 1:
        return {'text': "{"  + example_train['premise'] + "} question: {" + example_train['hypothesis'] + "}" + " Yes or No? answer: ĠNo \n\n"}

def format_examples_train(example_val, format_val = format_train_val):
    if format_val== 'minimal':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} {" + example_val['hypothesis'] + "}" + " ? Ġ"}
    elif format_val== 'gpt3':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} question: {" + example_val['hypothesis'] + "}" + " Yes or No? answer: Ġ"}

def create_combined_dataset(train_ds_yes, train_ds_no, context_ds_yes, context_ds_no, num_expts=num_experiments, num_train_examples=examples_per_exp, num_contxt_dist_examples = num_reasoning_context_per_example, num_repeats = num_repeats):
    combined_dataset = []
    train_examples_yes = [example for example in train_ds_yes]
    train_examples_no = [example for example in train_ds_no]
    context_examples_yes = [example for example in context_ds_yes]
    context_examples_no = [example for example in context_ds_no]

    for irep in range(num_expts):

          sampled_train_exs_yes = train_examples_yes[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
          sampled_train_exs_no = train_examples_no[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
          # for random option if used below
          merged_sampled_train_exs = sampled_train_exs_yes + sampled_train_exs_no
          shuffled_list = merged_sampled_train_exs.copy()
          # Shuffle the copy
          random.seed(irep)
          random.shuffle(shuffled_list)

          # Way 2: set randomized
          for idx_shuffled_list in range(len(shuffled_list)):

            # NEW FOR REPETITIONS
            for idx_repeat in range(num_repeats):

                  sampled_context_exs_yes = context_examples_yes[int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + idx_shuffled_list*num_repeats + idx_repeat)):\
                                                                  int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + (idx_shuffled_list)*num_repeats + (idx_repeat+1)))]
                  sampled_context_exs_no = context_examples_no[int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + idx_shuffled_list*num_repeats + idx_repeat)): \
                                                                  int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + (idx_shuffled_list)*num_repeats + (idx_repeat+1)))]
                  #print("IDX INIT: ", int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + idx_shuffled_list*num_repeats + idx_repeat)), "IDX FINAL: ", int((num_contxt_dist_examples/2)*(irep*num_train_examples*num_repeats + idx_shuffled_list*num_repeats + (idx_repeat+1))))

                  # random option
                  merged_sampled_context_exs = sampled_context_exs_yes + sampled_context_exs_no
                  context_shuffled_list = merged_sampled_context_exs.copy()
                  # Shuffle the copy
                  random.seed(int((irep+1)*(idx_shuffled_list+1)*(idx_repeat+1)))
                  random.shuffle(context_shuffled_list)

                  combined_ex = {'text': '', 'label': shuffled_list[idx_shuffled_list]['label'], 'pred_label': -100, 'equal_label': False, 'exp': irep+1, 'idx_example': idx_shuffled_list+1, 'idx_repeat': idx_repeat+1, 'OPT_prob_CD': torch.zeros(0).half().to(device), 'OPT_idx_CD': torch.zeros(0).half().to(device)}

                  for example_context_shuffled_list in context_shuffled_list:

                      combined_ex['text'] += example_context_shuffled_list['text']

                  combined_ex['text'] += shuffled_list[idx_shuffled_list]['text']

                  combined_dataset.append([combined_ex])

    return combined_dataset


def dynamic_padding_collate_fn(batch):
    # This function is created to be able to tokenize dynamically to max length within each batch
    # Also, by modifying the tokenizer used, several other options are available
    # for example, if we set padding to a specified max_length, for example the model max_length, is also an option, not the default though
    # the default is the dynamic padding

    batch = [item for sublist in batch for item in sublist]

    texts = [item['text'] for item in batch]
    labels = [item['label'] for item in batch]
    equal_labels = [item['equal_label'] for item in batch]
    pred_labels = [item['pred_label'] for item in batch]

    exps = [item['exp'] for item in batch]
    idx_exs = [item['idx_example'] for item in batch]
    idx_reps = [item['idx_repeat'] for item in batch]
    OPT_probs_cd = [item['OPT_prob_CD'] for item in batch]
    OPT_idxs_cd = [item['OPT_idx_CD'] for item in batch]

    # choose option
    tokenized_inputs = OPT_tokenizer(texts, padding="longest", truncation=True, return_tensors="pt")

    labels_tensor = torch.tensor(labels, dtype=torch.long).to(device)
    pred_labels_tensor = torch.tensor(pred_labels, dtype=torch.long).to(device)
    exps_tensor = torch.tensor(exps, dtype=torch.long).to(device)
    equal_labels_tensor = torch.tensor(equal_labels, dtype=torch.long).to(device)
    idx_exs_tensor = torch.tensor(idx_exs, dtype=torch.long).to(device)
    idx_reps_tensor = torch.tensor(idx_reps, dtype=torch.long).to(device)

    OPT_probs_cd_tensor = torch.cat(OPT_probs_cd, dim=0)

    OPT_idxs_cd_tensor = torch.cat(OPT_idxs_cd, dim=0)

    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels_tensor,
        'pred_labels': pred_labels_tensor,
        'equal_labels': equal_labels_tensor,
        'exps': exps_tensor,
        'idx_exs': idx_exs_tensor,
        'idx_reps': idx_reps_tensor,
        'OPT_probs': OPT_probs_cd_tensor,
        'OPT_idxs': OPT_idxs_cd_tensor,
    }

class CustomDataset(Dataset):
    def __init__(self, combined_dataset):
        self.dataset = combined_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]


In [None]:
# First the samples are formatted according to selection above

formatted_train_dataset_yes = train_dataset_yes.map(format_examples_train)
formatted_train_dataset_no = train_dataset_no.map(format_examples_train)
formatted_train_CD_dataset_yes = train_dataset_CD_yes.map(format_examples_CD_train)
formatted_train_CD_dataset_no = train_dataset_CD_no.map(format_examples_CD_train)

# Initialize custom dataset with the combined dataset

combined_dataset = create_combined_dataset(
                                          train_ds_yes = formatted_train_dataset_yes,
                                          train_ds_no = formatted_train_dataset_no,
                                          context_ds_yes = formatted_train_CD_dataset_yes,
                                          context_ds_no = formatted_train_CD_dataset_no,
                                          num_expts=num_experiments,
                                          num_train_examples=examples_per_exp,
                                          num_contxt_dist_examples = num_reasoning_context_per_example,
                                          num_repeats = num_repeats
                                           )

custom_dataset = CustomDataset(combined_dataset)
print(custom_dataset)

# Last step, we create Dataloader passing the bx_size for inference (typically: 1, 4, 8, 16)
bx_size = bx_size # set it up at the beg of NB
dataloader = DataLoader(custom_dataset, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn, shuffle=False) #shuffle=False for reproducibility

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/128000 [00:00<?, ? examples/s]

Map:   0%|          | 0/128000 [00:00<?, ? examples/s]

<__main__.CustomDataset object at 0x7ff9dbb81000>


In [None]:
idxcount = 0
for elements in dataloader:
  if idxcount <10:
    print(elements)
    idxcount = idxcount +1
  else:
    break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[    2, 45152,  1185,  1467,  5476,   328,   660,  1090,   197,  3215,
           375,  8238, 49463,   864,    35, 25522,  4688,  1090,  1410,   375,
          8238, 49463,  3216,    50,   440,   116,  1948,    35,  4236, 21402,
          9904,  1437, 50118, 50118, 45152, 31033,  2512,  2710,     9,    86,
            13,    10,  8694,  1656,    66,    15,     5,  5645, 49463,   864,
            35, 25522,   970,    16,  1085,     7,   192,    15,     5,  5645,
         49463,  3216,    50,   440,   116,  1948,    35,  4236, 21402,  3084,
          1437, 50118, 50118, 45152,   574, 11842,   965,    75,  1227,    11,
           143,   403,     6, 12521,     4,    20,  6741,   661,  1224,   683,
            55,     7,    39,   979, 49463,   864,    35, 25522,   574, 11842,
           965,    75,  2460,   648,     6, 12521,     8,     5,  6741,   661,
          1224,   124,     7,    39,   920, 49463,  3216,    50,   440,   116,
          1948,    35,  4236, 21402,  

In [None]:
# This is to inspect that the dataloader is performing as expected
# Also using the decoding to check back that results are expected and examples can be compared
# Only done for few examples

for i, batch in enumerate(dataloader):
    if i<10:
      print("ORIGINAL: ", i, " ||| experiment#: ", batch['exps'], " ||| example#: ", batch['idx_exs'], " ||| repetition: ", batch['idx_reps'])
      print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

ORIGINAL:  0  ||| experiment#:  tensor([1], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([1], device='cuda:0')
TOKENIZE / DETOKENIZE:  ["</s>{You knew Pa! Anse shouldered past Drew.} question: {Anse moved past Drew.} Yes or No? answer: ĠYes \n\n{Give yourself plenty of time for a spectacular walk out on the roof.} question: {There is nothing to see on the roof.} Yes or No? answer: ĠNo \n\n{Lunch isn't ready in any case, Doctor. The Industrialist turned once more to his son.} question: {Lunch isn't prepared yet, Doctor and the Industrialist turned back to his child.} Yes or No? answer: ĠYes \n\n{what uh how they develop uh what the candidate stands for the you know the views and uh} question: {There is no candidate in the election at all.} Yes or No? answer: ĠNo \n\n{During the night, he'd partially awakened in agony to find Nema chanting and gesturing desperately beside him, and he'd been sure he was on the verge of his second death.} question:

In [31]:
# Set eval model for inference
# initialize to store results of model predictions and compare with ground-truth
# use generate text with only one token
# extract all probabilities of next token

example_myBaseOPT_CD.eval()
import time


model_target_bin_int = torch.zeros(0, dtype=torch.long).to(device)
#ground_truth = torch.zeros(0, dtype=torch.long).to(device)

with torch.no_grad():
    for i, batch in enumerate(dataloader):

        #print("BATCH#: ", i, "NUM RUNS TOTAL: ", (i+1)*bx_size)

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # output only the binary yes/no,
        output_scores = (example_myBaseOPT_CD.forward(input_ids, attention_mask))[:,-1,:]

        output_probs_raw = torch.softmax(output_scores, dim=1)

        output_probs = output_probs_raw.to(dtype=torch.float16)

        output_binary = output_probs[:,9904] - output_probs[:,3084]
        output_binary[output_binary >= 0] = 0 # or 9904 if token used "Yes"
        output_binary[output_binary < 0] = 1 # or 3084 if token used "No"

        output_binary = output_binary.to(dtype=torch.long)

        model_target_bin_int = torch.cat((model_target_bin_int, output_binary), dim=0)
        time.sleep(0.04)

model_target_bin = model_target_bin_int.detach()
print("model_target_probs SHAPE: ", model_target_bin.shape)
print("model_target_probs: ", model_target_bin)

model_target_probs SHAPE:  torch.Size([16000])
model_target_probs:  tensor([0, 0, 0,  ..., 0, 1, 0], device='cuda:0')


In [32]:
print("total YES: ", torch.sum(model_target_bin==0))
print("total NO: ", torch.sum(model_target_bin==1))

total YES:  tensor(11019, device='cuda:0')
total NO:  tensor(4981, device='cuda:0')


In [35]:
def add_ICL_pred_label_to_dataset(dataset, pred_bin_tensor_input):
    new_data = []
    idx = 0
    for item in dataset:
        new_item = item.copy()

        new_item[0]['pred_label'] = torch.unsqueeze(pred_bin_tensor_input[idx],0)
        new_item[0]['equal_label'] = (new_item[0]['pred_label'] == new_item[0]['label']).item()

        new_data.append(new_item)
        idx = idx +1

    return CustomDataset(new_data)


In [36]:
custom_dataset_FILTERED_ICL_Pred = add_ICL_pred_label_to_dataset(custom_dataset, model_target_bin)
print(custom_dataset_FILTERED_ICL_Pred)


<__main__.CustomDataset object at 0x7ff9a412fd00>


In [37]:
dataloader_FILTERED_ICL_Pred = DataLoader(custom_dataset_FILTERED_ICL_Pred, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn, shuffle=False)

for i, batch in enumerate(dataloader_FILTERED_ICL_Pred):
    if i<200:
      print("ORIGINAL: ", i, " ||| experiment#: ", batch['exps'], " ||| equal_lab? #: ", batch['equal_labels'], " ||| labels#: ", batch['labels'], " ||| pred_labels#: ", batch['pred_labels'], " ||| example#: ", batch['idx_exs'], " ||| repetition: ", batch['idx_reps'])
      print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

ORIGINAL:  0  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([0], device='cuda:0')  ||| labels#:  tensor([1], device='cuda:0')  ||| pred_labels#:  tensor([0], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([1], device='cuda:0')
TOKENIZE / DETOKENIZE:  ["</s>{You knew Pa! Anse shouldered past Drew.} question: {Anse moved past Drew.} Yes or No? answer: ĠYes \n\n{Give yourself plenty of time for a spectacular walk out on the roof.} question: {There is nothing to see on the roof.} Yes or No? answer: ĠNo \n\n{Lunch isn't ready in any case, Doctor. The Industrialist turned once more to his son.} question: {Lunch isn't prepared yet, Doctor and the Industrialist turned back to his child.} Yes or No? answer: ĠYes \n\n{what uh how they develop uh what the candidate stands for the you know the views and uh} question: {There is no candidate in the election at all.} Yes or No? answer: ĠNo \n\n{During the night, he'd partially awaken

In [38]:
custom_dataset_TO_FILTER_ICL_INT = CustomDataset([item for item in custom_dataset_FILTERED_ICL_Pred if item[0]['equal_label'] == True])
custom_dataset_TO_FILTER_ICL_INT

<__main__.CustomDataset at 0x7ff995aa8850>

In [40]:
dataloader_TO_FILTER_ICL_INT = DataLoader(custom_dataset_TO_FILTER_ICL_INT, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn, shuffle=False)

for i, batch in enumerate(dataloader_TO_FILTER_ICL_INT):
    if i<10:
      print("ORIGINAL: ", i, " ||| experiment#: ", batch['exps'], " ||| equal_lab? #: ", batch['equal_labels'], " ||| labels#: ", batch['labels'], " ||| pred_labels#: ", batch['pred_labels'], " ||| example#: ", batch['idx_exs'], " ||| repetition: ", batch['idx_reps'])
      #print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

ORIGINAL:  0  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([1], device='cuda:0')  ||| labels#:  tensor([1], device='cuda:0')  ||| pred_labels#:  tensor([1], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([5], device='cuda:0')
ORIGINAL:  1  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([1], device='cuda:0')  ||| labels#:  tensor([1], device='cuda:0')  ||| pred_labels#:  tensor([1], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([11], device='cuda:0')
ORIGINAL:  2  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([1], device='cuda:0')  ||| labels#:  tensor([1], device='cuda:0')  ||| pred_labels#:  tensor([1], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([17], device='cuda:0')
ORIGINAL:  3  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([1], device='cuda:0')  ||

In [41]:

list_FILTERED_ICL_INT = []
for exp_id in range(num_experiments):
    for example_id in range(examples_per_exp):
        print("Exp ID: ", exp_id+1, "||| Example ID: ", example_id+1)
        matched = False
        for item in custom_dataset_TO_FILTER_ICL_INT:
            #print(item)
            if item[0]['exp'] == exp_id+1 and item[0]['idx_example'] == example_id+1:
                print(item)
                list_FILTERED_ICL_INT.append(item)
                matched = True
                break
        if matched:
            continue


Exp ID:  1 ||| Example ID:  1
[{'text': "{especially if you've got kids} question: {Especially if you have kids.} Yes or No? answer: ĠYes \n\n{OIRA approved the final rule as complying with the order on November 7, 1997.} question: {The final rule was marked as complied with on November 7, 1997.} Yes or No? answer: ĠYes \n\n{Exhibits 12 and 13 present a summary of health effects benefits resulting from improvements in air quality between the Base Case and the Clear Skies Act scenarios.} question: {There is a summary of health effects benefits in exhibits 12 and 13.} Yes or No? answer: ĠYes \n\n{The other venerable house in the square, now a restaurant, is the Maison Kam?\xadmer?\xadzell.} question: {There are no other venerable houses in the square.} Yes or No? answer: ĠNo \n\n{This is celestially ordained blondness, the mark of God's favor, affirming the signal beauty of the old pagan deities who had already given all blondes--torrid or chilly, fake or real--an edge for 2,000 years.} 

In [42]:
custom_dataset_FILTERED_ICL_INT = CustomDataset(list_FILTERED_ICL_INT)

In [43]:
dataloader_FILTERED_ICL_INT = DataLoader(custom_dataset_FILTERED_ICL_INT, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn, shuffle=False)

for i, batch in enumerate(dataloader_FILTERED_ICL_INT):
    if i<2000:
      print("ORIGINAL: ", i, " ||| experiment#: ", batch['exps'], " ||| equal_lab? #: ", batch['equal_labels'], " ||| labels#: ", batch['labels'], " ||| pred_labels#: ", batch['pred_labels'], " ||| example#: ", batch['idx_exs'], " ||| repetition: ", batch['idx_reps'])
      print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

ORIGINAL:  0  ||| experiment#:  tensor([1], device='cuda:0')  ||| equal_lab? #:  tensor([1], device='cuda:0')  ||| labels#:  tensor([1], device='cuda:0')  ||| pred_labels#:  tensor([1], device='cuda:0')  ||| example#:  tensor([1], device='cuda:0')  ||| repetition:  tensor([5], device='cuda:0')
TOKENIZE / DETOKENIZE:  ["</s>{especially if you've got kids} question: {Especially if you have kids.} Yes or No? answer: ĠYes \n\n{OIRA approved the final rule as complying with the order on November 7, 1997.} question: {The final rule was marked as complied with on November 7, 1997.} Yes or No? answer: ĠYes \n\n{Exhibits 12 and 13 present a summary of health effects benefits resulting from improvements in air quality between the Base Case and the Clear Skies Act scenarios.} question: {There is a summary of health effects benefits in exhibits 12 and 13.} Yes or No? answer: ĠYes \n\n{The other venerable house in the square, now a restaurant, is the Maison Kam?\xadmer?\xadzell.} question: {There a

# GENERATE PROBABILITIES WITH MODEL IN EVAL (ONLY FOR FILTERED DATASET)

In [44]:
# Set eval model for inference
# initialize to store results of model predictions and compare with ground-truth
# use generate text with only one token
# extract all probabilities of next token

example_myBaseOPT_CD.eval()

model_target_probs_int = torch.zeros(0, dtype=torch.long).to(device)
#ground_truth = torch.zeros(0, dtype=torch.long).to(device)

with torch.no_grad():
    for i, batch in enumerate(dataloader_FILTERED_ICL_INT):

        #print("BATCH#: ", i, "NUM RUNS TOTAL: ", (i+1)*bx_size)

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # output only the binary yes/no,
        output_scores = (example_myBaseOPT_CD.forward(input_ids, attention_mask))[:,-1,:]

        output_probs = torch.softmax(output_scores, dim=1)

        model_target_probs_int = torch.cat((model_target_probs_int, output_probs), dim=0)

model_target_probs = model_target_probs_int.detach()
print("model_target_probs SHAPE: ", model_target_probs.shape)
print("model_target_probs: ", model_target_probs)

model_target_probs SHAPE:  torch.Size([160, 50272])
model_target_probs:  tensor([[1.3728e-09, 2.7518e-11, 7.7708e-07,  ..., 1.7232e-11, 5.0136e-11,
         3.2752e-11],
        [3.2152e-09, 7.4173e-11, 8.6289e-07,  ..., 2.5806e-11, 7.7101e-11,
         6.5238e-11],
        [7.0206e-10, 1.7906e-11, 5.4827e-07,  ..., 6.2805e-12, 1.2679e-11,
         2.0025e-11],
        ...,
        [1.1444e-09, 5.9779e-11, 1.0637e-06,  ..., 9.2651e-11, 5.0164e-11,
         3.7827e-11],
        [7.9555e-10, 3.6107e-11, 3.3264e-07,  ..., 1.2233e-11, 4.5242e-11,
         4.4712e-11],
        [1.5236e-09, 3.7738e-11, 2.2589e-07,  ..., 1.0364e-11, 8.9644e-11,
         5.3470e-11]], device='cuda:0')


In [45]:
# Scores for YES token
model_target_probs[:, 9904]

tensor([0.4063, 0.4725, 0.5354, 0.7058, 0.3315, 0.6194, 0.5042, 0.4804, 0.3897,
        0.6542, 0.4500, 0.5704, 0.6677, 0.4897, 0.4335, 0.4201, 0.6482, 0.4703,
        0.5123, 0.3577, 0.6697, 0.6304, 0.6272, 0.4201, 0.5507, 0.4331, 0.4201,
        0.4921, 0.3879, 0.3710, 0.4778, 0.5447, 0.3897, 0.3586, 0.5921, 0.4004,
        0.5373, 0.7412, 0.4099, 0.6058, 0.4502, 0.7361, 0.4382, 0.5988, 0.6441,
        0.4182, 0.4334, 0.5737, 0.6488, 0.4749, 0.4645, 0.3881, 0.3776, 0.6053,
        0.5740, 0.6157, 0.4918, 0.4149, 0.2919, 0.6987, 0.5847, 0.4044, 0.3190,
        0.6834, 0.4638, 0.8251, 0.3870, 0.4798, 0.3954, 0.5057, 0.6025, 0.4654,
        0.4421, 0.5358, 0.3940, 0.4895, 0.3859, 0.5341, 0.5753, 0.6528, 0.6921,
        0.5959, 0.6743, 0.4222, 0.3975, 0.5157, 0.5326, 0.5552, 0.7270, 0.2928,
        0.4650, 0.4019, 0.4508, 0.6340, 0.4627, 0.4860, 0.4653, 0.6927, 0.3831,
        0.6560, 0.7240, 0.4626, 0.3717, 0.5160, 0.4726, 0.4338, 0.4401, 0.5436,
        0.5798, 0.4672, 0.5743, 0.6235, 

In [46]:
# Scores for NO token
model_target_probs[:, 3084]

tensor([0.5707, 0.4739, 0.4332, 0.2662, 0.6531, 0.3676, 0.4741, 0.4995, 0.5919,
        0.3080, 0.5064, 0.4129, 0.3054, 0.3592, 0.5457, 0.5466, 0.3338, 0.5005,
        0.4570, 0.6110, 0.2772, 0.3344, 0.3435, 0.5541, 0.3373, 0.5434, 0.5147,
        0.4912, 0.5820, 0.6074, 0.4935, 0.3803, 0.5741, 0.5593, 0.3846, 0.5632,
        0.4030, 0.2319, 0.5582, 0.3418, 0.5175, 0.2373, 0.5391, 0.3500, 0.3323,
        0.5392, 0.5429, 0.3723, 0.3257, 0.4872, 0.4875, 0.5749, 0.5207, 0.3639,
        0.4083, 0.3422, 0.4797, 0.4184, 0.6931, 0.2807, 0.3962, 0.5515, 0.6517,
        0.3029, 0.5052, 0.1393, 0.5908, 0.4991, 0.5932, 0.4780, 0.3622, 0.4884,
        0.5257, 0.4443, 0.5700, 0.3952, 0.5844, 0.4305, 0.4030, 0.3133, 0.2818,
        0.3810, 0.3050, 0.5350, 0.5672, 0.4626, 0.4379, 0.4006, 0.2546, 0.6786,
        0.5179, 0.5756, 0.5190, 0.3424, 0.5011, 0.4963, 0.4929, 0.2698, 0.5538,
        0.3193, 0.2549, 0.5112, 0.5909, 0.4546, 0.5073, 0.5463, 0.5122, 0.4005,
        0.3963, 0.5037, 0.4054, 0.3486, 

In [47]:
sorted_probs, idx_probs = model_target_probs.sort(dim=1, descending=True)
print("sorted_probs SHAPE: ", sorted_probs.shape)
print("idx_probs SHAPE: ", idx_probs.shape)

sorted_probs SHAPE:  torch.Size([160, 50272])
idx_probs SHAPE:  torch.Size([160, 50272])


In [48]:
# EXTRACT PROBABILITIES

# TOP x PROBS TO MATCH
target_prob_match = torch.zeros(sorted_probs.shape[0], number_max_probs_match + 1).half().to(device)
target_prob_match[:, 0:number_max_probs_match] = sorted_probs[:, 0:number_max_probs_match]
target_prob_match[:, number_max_probs_match] = 1-torch.sum(target_prob_match, dim=1)
print("target_prob_match SHAPE: ", target_prob_match.shape)
print("target_prob_match: ", target_prob_match)
print("Check sum 1: ", torch.sum(target_prob_match, dim=1))

# INDEX TOP X TO MATCH
target_idx_match = torch.zeros(idx_probs.shape[0], number_max_probs_match, dtype=torch.long).to(device)
target_idx_match[:, 0:number_max_probs_match] = idx_probs[:,0:number_max_probs_match]
print("target_idx_match SHAPE: ", target_idx_match.shape)
print("target_idx_match: ", target_idx_match)

target_prob_match SHAPE:  torch.Size([160, 51])
target_prob_match:  tensor([[5.7080e-01, 4.0625e-01, 2.3651e-03,  ..., 7.4744e-05, 7.3493e-05,
         6.3477e-03],
        [4.7388e-01, 4.7241e-01, 7.3128e-03,  ..., 1.6963e-04, 1.6940e-04,
         1.4648e-02],
        [5.3564e-01, 4.3311e-01, 4.3411e-03,  ..., 9.7394e-05, 9.7275e-05,
         7.8125e-03],
        ...,
        [7.7002e-01, 1.7236e-01, 1.0185e-02,  ..., 1.2815e-04, 1.2720e-04,
         1.1719e-02],
        [5.9326e-01, 3.8184e-01, 3.3150e-03,  ..., 7.8321e-05, 7.6354e-05,
         6.3477e-03],
        [5.8252e-01, 3.9087e-01, 3.0785e-03,  ..., 8.3029e-05, 7.9513e-05,
         6.8359e-03]], device='cuda:0', dtype=torch.float16)
Check sum 1:  tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 

# SELECT RELEVANT INDEXES FOR LATER ON SELECTING THE PREPARED TRAINING DATASET

In [49]:
# extract which experiment and example indexes have data to generate data to train

list_datapoints = [str(item[0]['exp'])+"_"+str(item[0]['idx_example']) for item in custom_dataset_FILTERED_ICL_INT]
list_datapoints


['1_1',
 '1_2',
 '1_3',
 '1_4',
 '1_5',
 '1_6',
 '1_7',
 '1_8',
 '1_9',
 '1_10',
 '1_11',
 '1_12',
 '1_13',
 '1_14',
 '1_15',
 '1_16',
 '2_1',
 '2_2',
 '2_3',
 '2_4',
 '2_5',
 '2_6',
 '2_7',
 '2_8',
 '2_9',
 '2_10',
 '2_11',
 '2_12',
 '2_13',
 '2_14',
 '2_15',
 '2_16',
 '3_1',
 '3_2',
 '3_3',
 '3_4',
 '3_5',
 '3_6',
 '3_7',
 '3_8',
 '3_9',
 '3_10',
 '3_11',
 '3_12',
 '3_13',
 '3_14',
 '3_15',
 '3_16',
 '4_1',
 '4_2',
 '4_3',
 '4_4',
 '4_5',
 '4_6',
 '4_7',
 '4_8',
 '4_9',
 '4_10',
 '4_11',
 '4_12',
 '4_13',
 '4_14',
 '4_15',
 '4_16',
 '5_1',
 '5_2',
 '5_3',
 '5_4',
 '5_5',
 '5_6',
 '5_7',
 '5_8',
 '5_9',
 '5_10',
 '5_11',
 '5_12',
 '5_13',
 '5_14',
 '5_15',
 '5_16',
 '6_1',
 '6_2',
 '6_3',
 '6_4',
 '6_5',
 '6_6',
 '6_7',
 '6_8',
 '6_9',
 '6_10',
 '6_11',
 '6_12',
 '6_13',
 '6_14',
 '6_15',
 '6_16',
 '7_1',
 '7_2',
 '7_3',
 '7_4',
 '7_5',
 '7_6',
 '7_7',
 '7_8',
 '7_9',
 '7_10',
 '7_11',
 '7_12',
 '7_13',
 '7_14',
 '7_15',
 '7_16',
 '8_1',
 '8_2',
 '8_3',
 '8_4',
 '8_5',
 '8_6',
 '8_7',

# PREPARE DATASET FOR CONTEXT DISTILLATION EXPORT

In [50]:

# select format to use here:
format_train_val = format_train_val # set it at the top of notebook in a common place


def format_examples_train_CDTRAIN(example_val, format_val = format_train_val):
    if format_val== 'minimal':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} {" + example_val['hypothesis'] + "}" + " ? Ġ"}
    elif format_val== 'gpt3':
      # "minimal" format
      return {'text': "{"  + example_val['premise'] + "} question: {" + example_val['hypothesis'] + "}" + " Yes or No? answer: Ġ"}

def create_combined_dataset_CDTRAIN(train_ds_yes, train_ds_no, num_expts=num_experiments, num_train_examples=examples_per_exp):
    combined_dataset = []
    train_examples_yes = [example for example in train_ds_yes]
    train_examples_no = [example for example in train_ds_no]


    for irep in range(num_expts):

          sampled_train_exs_yes = train_examples_yes[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
          sampled_train_exs_no = train_examples_no[int(irep*num_train_examples/2) : int((irep +1)*num_train_examples/2)]
          # for random option if used below
          merged_sampled_train_exs = sampled_train_exs_yes + sampled_train_exs_no
          shuffled_list = merged_sampled_train_exs.copy()
          # Shuffle the copy
          random.seed(irep)
          random.shuffle(shuffled_list)

          # Way 2: set randomized
          for idx_shuffled_list in range(len(shuffled_list)):


            combined_ex = {'text': '', 'label': shuffled_list[idx_shuffled_list]['label'], 'exp': torch.tensor(irep+1).to(device), 'idx_example': idx_shuffled_list+1, 'OPT_prob_CD': torch.zeros(0).half().to(device), 'OPT_idx_CD': torch.zeros(0).half().to(device)}

            combined_ex['text'] += shuffled_list[idx_shuffled_list]['text']

            combined_dataset.append([combined_ex])

    return combined_dataset


def dynamic_padding_collate_fn_CDTRAIN(batch):
    # This function is created to be able to tokenize dynamically to max length within each batch
    # Also, by modifying the tokenizer used, several other options are available
    # for example, if we set padding to a specified max_length, for example the model max_length, is also an option, not the default though
    # the default is the dynamic padding

    batch = [item for sublist in batch for item in sublist]

    texts = [item['text'] for item in batch]
    labels = [item['label'] for item in batch]
    exps = [item['exp'] for item in batch]
    idx_exs = [item['idx_example'] for item in batch]
    OPT_probs_cd = [item['OPT_prob_CD'] for item in batch]
    OPT_idxs_cd = [item['OPT_idx_CD'] for item in batch]


    # choose option
    tokenized_inputs = OPT_tokenizer(texts, padding="longest", truncation=True, return_tensors="pt")
    # tokenized_inputs = OPT_tokenizer(texts, padding="max_length", max_length = 2048, truncation=True, return_tensors="pt")

    labels_tensor = torch.tensor(labels, dtype=torch.long).to(device)
    exps_tensor = torch.tensor(exps, dtype=torch.long).to(device)
    idx_exs_tensor = torch.tensor(idx_exs, dtype=torch.long).to(device)
    OPT_probs_cd_tensor = torch.cat(OPT_probs_cd, dim=0)


    OPT_idxs_cd_tensor = torch.cat(OPT_idxs_cd, dim=0)

    return {
        'texts': texts,
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels_tensor,
        'exps': exps_tensor,
        'idx_exs': idx_exs_tensor,
        'OPT_probs': OPT_probs_cd_tensor,
        'OPT_idxs': OPT_idxs_cd_tensor,
    }

class CustomDataset(Dataset):
    def __init__(self, combined_dataset):
        self.dataset = combined_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]


In [51]:
# First the samples are formatted according to selection above

formatted_train_dataset_yes_CDTRAIN = train_dataset_yes.map(format_examples_train_CDTRAIN)
formatted_train_dataset_no_CDTRAIN = train_dataset_no.map(format_examples_train_CDTRAIN)

# Initialize custom dataset with the combined dataset

combined_dataset_CDTRAIN = create_combined_dataset_CDTRAIN(
                                          train_ds_yes = formatted_train_dataset_yes_CDTRAIN,
                                          train_ds_no = formatted_train_dataset_no_CDTRAIN,
                                          num_expts=num_experiments,
                                          num_train_examples=examples_per_exp,
                                           )

custom_dataset_CDTRAIN = CustomDataset(combined_dataset_CDTRAIN)
print(custom_dataset_CDTRAIN)

# Last step, we create Dataloader passing the bx_size for inference (typically: 1, 4, 8, 16)
bx_size = bx_size # set it up at the beg of NB
dataloader_CDTRAIN = DataLoader(custom_dataset_CDTRAIN, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn_CDTRAIN, shuffle=False) #shuffle=False for reproducibility

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

<__main__.CustomDataset object at 0x7ff9a4128a90>


In [52]:
new_combined_dataset_CDTRAIN = []

for item in combined_dataset_CDTRAIN:

    if str(item[0]['exp'].item())+"_"+str(item[0]['idx_example']) in list_datapoints:

          new_combined_dataset_CDTRAIN.append(item)

new_custom_dataset_CDTRAIN = CustomDataset(new_combined_dataset_CDTRAIN)

In [53]:
def add_scores_to_dataset(dataset, probs_tensor, idx_tensor):
    new_data = []
    idx = 0
    for item in dataset:
        new_item = item.copy()

        new_item[0]['OPT_prob_CD'] = torch.unsqueeze(probs_tensor[idx,:],0)
        new_item[0]['OPT_idx_CD'] = torch.unsqueeze(idx_tensor[idx,:],0)

        new_data.append(new_item)
        idx = idx +1

    return CustomDataset(new_data)


In [54]:
new_custom_dataset_probs_CDTRAIN = add_scores_to_dataset(new_custom_dataset_CDTRAIN, target_prob_match, target_idx_match)
print(new_custom_dataset_probs_CDTRAIN)

<__main__.CustomDataset object at 0x7ff9a412a7a0>


In [55]:
dataloader_CD_probs_CDTRAIN = DataLoader(new_custom_dataset_probs_CDTRAIN, batch_size=bx_size, collate_fn=dynamic_padding_collate_fn_CDTRAIN, shuffle=False) #shuffle=False for reproducibility
dataloader_CD_probs_CDTRAIN

for i, batch in enumerate(dataloader_CD_probs_CDTRAIN):
    if i<200:
      print("ORIGINAL: ", i, "experiment#: ", batch['exps'], batch['idx_exs'])
      print("TOKENIZE / DETOKENIZE: ", OPT_tokenizer.batch_decode(batch['input_ids']))
    else:
      break

ORIGINAL:  0 experiment#:  tensor([1], device='cuda:0') tensor([1], device='cuda:0')
TOKENIZE / DETOKENIZE:  ['</s>{The chief complaint of reformers these days is that the power of special-interest money is breeding public cynicism about the political process.} question: {Reformers never complain about special interest money.  } Yes or No? answer: Ġ']
ORIGINAL:  1 experiment#:  tensor([1], device='cuda:0') tensor([2], device='cuda:0')
TOKENIZE / DETOKENIZE:  ["</s>{The game's up.} question: {The game keeps going.} Yes or No? answer: Ġ"]
ORIGINAL:  2 experiment#:  tensor([1], device='cuda:0') tensor([3], device='cuda:0')
TOKENIZE / DETOKENIZE:  ["</s>{Nobody pointed out that at Chappaquiddick, unlike Dallas and Los Angeles, the person most responsible for the tragedy was a Kennedy, whereas the victim was not--and that Ted Kennedy's invocation of the family curse was a clever way of papering over these differences.} question: {Ted Kennedy invoked a family curse that has plagued the Kenne

In [56]:
# save file
import csv

file_path = model_name[13:] + '_CD_TRAIN_DATA_qtyprobs_'+ str(number_max_probs_match) +'_toptokens.csv'

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['text','input_ids', 'attention_mask', 'label', 'exp', 'idx_example', 'OPT_prob_CD', 'OPT_idx_CD'])
    for item in dataloader_CD_probs_CDTRAIN:
        for a in zip([item['texts']], [item['input_ids'].tolist()], [item['attention_mask'].tolist()], [item['labels'].tolist()], [item['exps'].tolist()], [item['idx_exs'].tolist()], [item['OPT_probs'].tolist()], [item['OPT_idxs'].tolist()]):
            writer.writerow(a)