**Note to self:** 
* args = parser.parse_args() MUST BE UNCOMMENTED WHEN USING THE HPC
* args = parser.parse_args() MUST BE COMMENTED WHEN USING COLAB

# **Prep**

## Parse command line arguments

In [1]:
# Configure parser for command line arguments
import argparse

# Create parser
parser = argparse.ArgumentParser(description="Parse command line arguments")

# Add arguments
# Source: https://docs.python.org/3/library/argparse.html
# Source: https://www.youtube.com/watch?v=FbEJN8FsJ9U&t=233s
# Note: every option uses nargs='?', i.e. it accepts at most ONE value. A flag
# given without a value yields None (argparse's default `const`), not `default`.

parser.add_argument('-b',
                    '--batch_size',
                    metavar='batch size',
                    type=int,
                    nargs='?',
                    default=22,
                    help='Enter batch size.')

parser.add_argument('-d',
                    '--device',
                    metavar='device',
                    type=str,
                    nargs='?',
                    default='cpu',
                    help='Enter cuda or cpu.')

parser.add_argument('-dp',
                    '--data_parallel',
                    metavar='data_parallel',
                    type=str,
                    nargs='?',
                    default='N',
                    help='Distributed inference Y/N')

parser.add_argument('-f',
                    '--filename',
                    metavar='prompt filename',
                    type=str,
                    nargs='?',
                    default='0shot_10fold',
                    help='Enter prompt pickle filename.')

parser.add_argument('-k',
                    '--topk',
                    metavar='topk',
                    type=int,
                    nargs='?',
                    default=0,
                    help='The K most likely next words.')

# Fix: the metavar used to read 'device' (copy-paste from -d), which made the
# usage/help output describe the model option as a device.
parser.add_argument('-m',
                    '--model',
                    metavar='model',
                    type=str,
                    nargs='?',
                    default='gpt-neo-125M',
                    help='Enter pre-trained transformer model.')

parser.add_argument('-n',
                    '--number_outputs',
                    metavar='number_outputs',
                    type=int,
                    nargs='?',
                    default=1,
                    help='Enter the number of independantly sampled outputs per problem.')

parser.add_argument('-o',
                    '--organisation',
                    metavar='organisation',
                    type=str,
                    nargs='?',
                    default='EleutherAI',
                    help='Enter source (i.e. organisation) of pre-trained transformer model')

parser.add_argument('-p',
                    '--platform',
                    metavar='platform',
                    type=str,
                    nargs='?',
                    default='hpc',
                    help='Enter hpc or colab.')

# The value is an integer number of tenths: it is divided by 10 right after
# parsing, so the default of 10 means top-p = 1.0.
parser.add_argument('-pr',
                    '--topp',
                    metavar='topp',
                    type=int,
                    nargs='?',
                    default=10,
                    help='Top-p probability threshold in tenths (e.g. 5 -> 0.5).')

# Fix: temperature is parsed as a float. With type=int, fractional values such
# as 0.7 were rejected, and Transformers' temperature warper only accepts a
# strictly positive float for any temperature other than 1.
parser.add_argument('-t',
                    '--temperature',
                    metavar='temperature',
                    type=float,
                    nargs='?',
                    default=1.0,
                    help='Softmax temperature.')

# Parses arguments through the parse_args() method

# HPC
args = parser.parse_args()

# Laptop

# # No arguments
# # Note: The list is passed so no arguments are passed here.
# args = parser.parse_args([])

# # top-k
# # NOTE(review): this example passes two values per flag ('10', '110', ...);
# # with nargs='?' each flag takes a single value, so it needs trimming before use.
# args = parser.parse_args(['-f', '0shot_10fold', # model input
#                           '-p', 'laptop', # platform
#                           '-k', '10', '110', # top-k
#                           '-t', '1', '11', # softmax-temp
#                           '-pr', '100', '105', # probability
#                          ])
# top-p
# args = parser.parse_args(['-f', '3shot_10fold_OOD_test', # model input
#                           '-p', 'colab', # platform
#                          '-k', '0', # top-k 
#                           '-t', '1', # softmax-temp
#                           '-pr', '5', # top-p
#                           '-d', 'cuda', # device
#                           '-n', '10', # number of independantly sampled outputs
#                           '-b', '1',
#                          ])  

In [2]:
# Unpack the parsed command line arguments into module-level names

# Where and what to run
platform = args.platform
organisation = args.organisation
model_type = args.model
filename = args.filename

# Hardware / batching
device_type = args.device
data_parallel = args.data_parallel
b_size = args.batch_size

# Sampling configuration
num_outputs = args.number_outputs
temp = args.temperature
topk = args.topk
# -pr/--topp is an integer, converted here by dividing by 10
topp = args.topp / 10

## Mount Google Drive

In [3]:
# Colab only: mount Google Drive and work from the project folder on it
if platform == 'colab':

    import os
    # Drive helper shipped with the Colab runtime
    from google.colab import drive

    # This will prompt for authorization.
    drive.mount('/content/drive')

    # Change the working directory to the relevant Google Drive directory
    os.chdir("/content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K")

Mounted at /content/drive


## Installations

In [4]:
# Installations
if platform == 'colab':

    # Hugging Face
    # Run pip through the interpreter executing this notebook so the package is
    # installed into the environment that will import it. check_call raises
    # CalledProcessError when pip fails; os.system silently ignored the exit
    # status and the failure only surfaced later as an ImportError.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])

## Import dependencies

In [5]:
# Import relevant dependencies
import bz2
import gc
import json
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.model_selection import ParameterGrid
import time
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM, set_seed

## Set torch.device
*A torch.device is an object representing the device on which a torch.Tensor is or will be allocated.*

Source: https://pytorch.org/docs/stable/tensor_attributes.html#torch.device

In [6]:
# Select the torch device
# Source: https://pytorch.org/docs/stable/tensor_attributes.html#torch.device
# A plain "cuda" request is pinned to the first GPU ("cuda:0"); any other
# device string is handed to torch.device unchanged.
torch_device = torch.device("cuda:0" if device_type == "cuda" else device_type)

print(f"Torch device: {torch_device}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

Torch device: cuda:0
Number of GPUs: 1


**Note:** Data parallelism will be implemented as per [here](https://stackoverflow.com/questions/61736317/huggingface-transformers-gpt2-generate-multiple-gpus). This means the input will be distributed across the available GPUs.

## Set random number seed values

In [7]:
# Create a function for setting/resetting the fixed seed value for pseudo-random generators
# Source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752
# Source: https://discuss.pytorch.org/t/does-pytorch-change-its-internal-seed-during-training/46505/4

def setSeedValue(seedValue):
    """Seed every pseudo-random generator used in this notebook with ``seedValue``.

    Calls, in order:
      1. torch.manual_seed  (https://pytorch.org/docs/stable/notes/randomness.html)
      2. random.seed        (Python standard library)
      3. np.random.seed     (https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html)
      4. set_seed           (Hugging Face helper for reproducible behaviour,
                             https://huggingface.co/docs/transformers/internal/trainer_utils)
    """
    seeders = (torch.manual_seed, random.seed, np.random.seed, set_seed)
    for apply_seed in seeders:
        apply_seed(seedValue)

In [8]:
# Set the fixed seed value (0) for all pseudo-random generators handled by setSeedValue
# Source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752
setSeedValue(0)

## Set relevant directories

In [9]:
# Set file directories
prompt_filename = filename + '.pickle'

# Results
# Both result files share one timestamp so the raw and processed outputs of a
# run can be paired up.
time_string = time.strftime("%Y%m%d-%H%M%S")
raw_results_file_name = f"{filename}-{model_type}-{time_string}_raw_results"
results_file_name = f"{filename}-{model_type}-{time_string}_results"

# Hub identifier of the model (organisation/model). It does not depend on the
# platform, so it is defined once here; previously it only existed on Colab.
model_directory = organisation + "/" + model_type

if platform == 'colab':

    # Prompt directory
    prompt_directory = "/content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K"

    # Model and tokenizer directory (sub-folder of the prompt directory)
    local_model_directory = prompt_directory + "/" + model_type

elif platform == 'hpc':

    # Model and tokenizer directory (archive folder)
    archive_folder = "/mnt/data/users/adbz866/"
    local_model_directory = archive_folder + model_type

    # Prompt directory (scratch folder)
    prompt_directory = "/mnt/scratch/users/adbz866/"

else:
    # Fail here with a clear message instead of a NameError further down, when
    # the undefined directories are first used.
    raise ValueError(
        f"Unsupported platform {platform!r}: expected 'colab' or 'hpc'.")

## Print variables and directories

In [10]:
# Report the run configuration: timestamp and prompt location first ...
print(f"Time: {time_string}")
print(f"Prompt directory: {prompt_directory}")
print(f"Prompt file name: {prompt_filename}")
print("\n")

# ... then the model, hardware and sampling settings
print(f"Local model directory: {local_model_directory}")
print(f"Platform: {platform}")
print(f"Organisation: {organisation}")
print(f"Model: {model_type}")
print(f"Device: {device_type}")
print(f"Top-K: {topk}")
print(f"Top-P: {topp}")
print(f"Temperature: {temp}")
print(f"Batch size: {b_size}")
print(f"Data parallelisation: {data_parallel}")
print(f"Number of outputs per problem {num_outputs}")
print("\n")

Time: 20221127-182038
Prompt directory: /content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K
Prompt file name: 3shot_10fold_OOD_test.pickle


Local model directory: /content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K/gpt-neo-125M
Platform: colab
Organisation: EleutherAI
Model: gpt-neo-125M
Device: cuda
Top-K: 0
Top-P: 50
Temperature: 1
Batch size: 1
Data parallelisation: N
Number of outputs per problem 10




## General functions

### Save/load

Objects will be saved using the pickle module and the bz2 library.

1. Create a pickle object using the dump() method. This serialises Python objects into a binary format

2. Compress the pickle object using the bz2 library

Note: Without compression, the pickle files are too large (if output scores are included, the raw output file is roughly 238 MB with 3 validation sets for only one parameter combination). 

In [11]:
# Pickle a file and then compress it
# Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
def save_object(fname, data):
    """Pickle ``data`` and write it, bz2-compressed, to the file ``fname``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with bz2.BZ2File(fname, "wb") as compressed_sink:
        pickle.dump(data, compressed_sink)

In [12]:
# Load compressed pickle file
# Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
def load_object(fname):
    """Return the object stored in the bz2-compressed pickle file ``fname``.

    Counterpart of save_object. Unpickling can execute arbitrary code, so only
    use this on files from a trusted source.
    """
    with bz2.BZ2File(fname, "rb") as compressed_source:
        return pickle.load(compressed_source)

### Delete K valued key in dict
This can be applied to nested dictionaries of arbitrary depth as it relies on a recursive function.

In [13]:
# Delete K valued key using dictionary comprehension and recursion.
# Source of code: https://www.geeksforgeeks.org/python-remove-k-valued-key-from-nested-dictionary/
# Sentinel meaning "no key passed explicitly". None cannot be used for this
# because None is itself a valid dictionary key.
_USE_GLOBAL_REM_KEY = object()


def delete_key(input, key=_USE_GLOBAL_REM_KEY):
    """Return a copy of the nested dict ``input`` without any entry keyed ``key``.

    The removal is applied recursively to every value that is itself a dict.
    Dicts held inside other containers (lists, tuples, ...) are left as they
    are. ``input`` is not modified.

    Args:
        input: The (possibly nested) dictionary to filter.
        key: The key to drop at every nesting level. When omitted, the
            module-level variable ``rem_key`` is used, which is how the
            original lambda behaved and what existing callers rely on.

    Returns:
        A new dictionary with the matching entries removed.
    """
    if key is _USE_GLOBAL_REM_KEY:
        # Backward-compatible path: the caller sets the global ``rem_key``.
        key = rem_key
    return {
        name: delete_key(value, key) if isinstance(value, dict) else value
        for name, value in input.items()
        if name != key
    }

# Load prompts

* Prompts created using NL4DV prep v3 notebook using all published NL4DV queries.

In [14]:
# Deserialise the prompt file (plain pickle, opened read-binary) into test_set
# Source: https://www.programiz.com/python-programming/file-operation
# Source: https://realpython.com/python-pickle-module/
# Source: https://ianlondon.github.io/blog/pickling-basics/
with open("/".join((prompt_directory, prompt_filename)), mode='rb') as f:
    test_set = pickle.load(f)

# **Instantiate pre-trained model**

In [15]:
# ### Download a local copy of the model

# # Instantiate the relevant model
# local_model = AutoModelForCausalLM.from_pretrained(model_directory)

# # Save a model object
# local_model.save_pretrained(save_directory = local_model_directory)

# del local_model

In [16]:
# Load the pre-trained causal LM from the local copy and move it to the selected device
chosen_model = AutoModelForCausalLM.from_pretrained(local_model_directory).to(torch_device)

# Wrap the model for data parallelism only when it was requested and CUDA is in use
# NOTE(review): device_ids is fixed to [0, 1], i.e. two GPUs are assumed - confirm
if data_parallel == "Y" and device_type == "cuda":
    chosen_model = torch.nn.DataParallel(chosen_model, device_ids=[0, 1])

# **Instantiate pre-trained tokenizer**

In [17]:
# ### Download a local copy of the tokenizer
# local_tokenizer = AutoTokenizer.from_pretrained(model_directory)
# local_tokenizer.save_pretrained(save_directory = local_model_directory)

# del local_tokenizer

In [18]:
# Instantiate the tokenizer from the same local directory the model is loaded from
chosen_tokenizer = AutoTokenizer.from_pretrained(local_model_directory)
# Bare expression: as the last statement of a notebook cell it displays the
# tokenizer class; it has no effect when the file is run as a script
type(chosen_tokenizer)
# print(chosen_tokenizer)

transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast

# **Tokenize model inputs**

## Configure padding

In [19]:
# Configure padding

# 1) Tokenizer
# Debug helpers for inspecting the tokenizer's special tokens:
# print(chosen_tokenizer.eos_token, chosen_tokenizer.encode(chosen_tokenizer.eos_token))
# print(chosen_tokenizer.bos_token, chosen_tokenizer.encode(chosen_tokenizer.bos_token))

# Reuse the end-of-sequence token as the padding token and pad on the left
chosen_tokenizer.pad_token = chosen_tokenizer.eos_token
chosen_tokenizer.padding_side = "left"

# 2) Model
# Debug helpers for inspecting the model's special token ids:
# print(chosen_model.config.eos_token_id)
# print(chosen_model.config.bos_token_id)

# Mirror the tokenizer setting in the model config: pad id = EOS id.
# When the model has been wrapped in DataParallel, the config is reached via .module.
if not (device_type == "cuda" and data_parallel == "Y"):
    chosen_model.config.pad_token_id = chosen_model.config.eos_token_id

else:
    chosen_model.module.config.pad_token_id = chosen_model.module.config.eos_token_id

## Tokenization
* Tokenize with padding.

In [20]:
# Tokenize prompts (with padding) by calling the tokenizer directly

# The model input is the last field of every test sample. All inputs are
# tokenized in one call and returned as PyTorch tensors; padding=True is what
# gives every row the same length (see the printed shapes).
tokenized_test_set = chosen_tokenizer(
    [sample[-1] for sample in test_set],
    padding=True,
    return_tensors="pt",
)

print(tokenized_test_set['input_ids'].shape)
print(tokenized_test_set['attention_mask'].shape)

torch.Size([160, 1599])
torch.Size([160, 1599])


# **Inference with Top-K/P sampling**

## Functions

**Sequence probabilities**

* Sequence probabilities can be calculated from output scores.
* As output scores are provided for all tokens in the model's vocab (~50,000) for each generated token, storing output scores for every test sample in all validation sets consumes a lot of memory.
* To counter the above, sequence probabilities are calculated immediately for each validation set and the output scores are then deleted.
* The alternative approach is to output all output scores for each token in each test sample for all validation sets. This consumes a lot of memory.

Generate probabilities for individual sequences methodology:

Source: https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175

Softmax: https://stackoverflow.com/questions/17187507/why-use-softmax-as-opposed-to-standard-normalization

In [21]:
# Generate sequence probability function
# Code adapted from: https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175

def sequence_probability(model_inputs, output_sequences, output_scores):
    """Return the probability of each generated sequence as a list of floats.

    Code adapted from:
    https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175

    Args:
        model_inputs: Object with a ``.shape`` whose last dimension is the
            prompt length - presumably the padded ``input_ids`` tensor.
        output_sequences: 2-D tensor of token ids holding the prompt followed
            by the generated tokens of each sequence.
        output_scores: Sequence with one 2-D score tensor (sequence x vocab)
            per generation step.

    Returns:
        A list with one value per row of ``output_sequences``: the product of
        the softmax probabilities of that row's generated tokens.
    """
    # Columns before this index belong to the prompt
    n_prompt_tokens = model_inputs.shape[-1]

    # Width of the per-step score tensors; only used by the debug summary below
    vocab_size = output_scores[0].shape[-1]

    # Drop the prompt so only the generated token ids remain
    generated_ids = output_sequences[:, n_prompt_tokens:]

    # Join the per-step scores along a new step dimension (dim=1), then
    # normalise over the vocabulary to get probabilities at every step
    step_probs = torch.softmax(torch.stack(output_scores, dim=1), dim=-1)

    # Pick, for every step, the probability of the token that was generated
    chosen_probs = step_probs.gather(2, generated_ids.unsqueeze(-1)).squeeze(-1)

    # # Debug summary
    # print(f"Length of padded prompts: {n_prompt_tokens}")
    # print(f"Tensor shape - Generated sequences: {generated_ids.shape}")
    # print(f"Vocab size: {vocab_size}")
    # print(f"Tensor shape - Vocab token probabilities at each step: {step_probs.shape}")
    # print(f"Tensor shape - Generated token probabilities: {chosen_probs.shape}")

    # Sequence probability = product of its per-token probabilities
    return chosen_probs.prod(-1).tolist()

The below inference function utilises generator functions to deliver batches of samples within each validation set to the model.

Relevant references:
* https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
* https://www.lachlaneagling.com/reducing-memory-consumption-python/
* https://www.programiz.com/python-programming/generator
* https://djangostars.com/blog/list-comprehensions-and-generator-expressions/
* https://nolowiz.com/split-list-into-batches-using-generator-in-python/

In [22]:
# Inference function when using generator objects

def create_generator(v_set, b_size):
    """Lazily yield consecutive slices of ``v_set`` holding at most ``b_size`` items.

    The final slice is shorter when ``len(v_set)`` is not a multiple of ``b_size``.
    """
    total = len(v_set)
    for start in range(0, total, b_size):
        stop = start + b_size
        yield v_set[start:stop]


def model_inference(tokenized_set):
    """Sample continuations for every prompt in ``tokenized_set``, batch by batch.

    Besides its argument, the function reads these module-level settings:
    ``chosen_model``, ``b_size``, ``torch_device``, ``device_type``,
    ``data_parallel``, ``topk``, ``topp``, ``temp``, ``max_length`` and
    ``num_outputs``.

    Args:
        tokenized_set: Mapping with the keys ``'input_ids'`` and
            ``'attention_mask'``, each a sliceable batch of equal length
            (the tokenizer output).

    Returns:
        dict with
          * ``'Inference time'``: seconds elapsed for the whole loop (float),
          * ``'Input sequences'``: list with one CPU tensor of input ids per batch,
          * ``'Output sequences'``: list with one CPU tensor of generated
            sequences per batch.
    """
    on_cuda = device_type == "cuda"

    # generate() lives on the underlying model, not on the DataParallel wrapper
    generation_model = (
        chosen_model.module if on_cuda and data_parallel == "Y" else chosen_model
    )

    # Fix: batches are drawn from the argument. The previous version ignored
    # ``tokenized_set`` and always read the global ``tokenized_test_set``.
    batches = zip(
        create_generator(tokenized_set['input_ids'], b_size),
        create_generator(tokenized_set['attention_mask'], b_size),
    )

    # Lists to collect the inputs/outputs of each batch
    input_batches = []
    output_batches_sequences = []

    # Reset seed value for pseudo-random generators.
    # Source: https://discuss.pytorch.org/t/does-pytorch-change-its-internal-seed-during-training/46505/4
    setSeedValue(0)
    start = time.time()  # Start time

    for input_batch, attn_batch in batches:
        batch_input = input_batch.to(torch_device)
        attn_mask = attn_batch.to(torch_device)

        output_batch = generation_model.generate(
            batch_input,
            attention_mask=attn_mask,
            num_beams=1,
            do_sample=True,
            top_k=topk,
            temperature=temp,
            top_p=topp,
            early_stopping=True,
            max_new_tokens=max_length,
            # Taken from the model config instead of the hard-coded 50256, in
            # line with how the padding token id is configured for this model.
            eos_token_id=generation_model.config.eos_token_id,
            # Prediction scores are not returned (they consume a lot of memory).
            output_scores=False,
            return_dict_in_generate=True,
            num_return_sequences=num_outputs,
        )

        # Keep only CPU copies of what is collected (a no-op move when the
        # tensors are already on the CPU) ...
        sequences = output_batch.sequences.detach().cpu()
        batch_input = batch_input.detach().cpu()

        # ... and drop the remaining references to device tensors
        del output_batch, attn_mask

        if on_cuda:
            # Release GPU memory: trigger a manual garbage collection and
            # release all unoccupied cached memory
            gc.collect()
            torch.cuda.empty_cache()

        # Collect inputs/outputs
        input_batches.append(batch_input)
        output_batches_sequences.append(sequences)

    output = {
        'Inference time': None,
        'Input sequences': input_batches,
        'Output sequences': output_batches_sequences,
    }

    # Inference time
    output['Inference time'] = time.time() - start

    return output

## Stopping criteria
* The early stopping criteria will be based on *max_new_tokens* and the *EOS* token.

* Max_new_tokens is estimated by determining the number of tokens in the largest specification example and then adding 10% more tokens.

In [23]:
# Calculate maximum specification length

# Create a single list containing all serialised specifications.
# Each test_set row is a 10-tuple:
# (query, query_type, dataset, data_url, metaData, spec, markType, context, prompt, input)
# Only the spec is needed; the other fields get throwaway names (this also
# avoids a loop variable called ``input``, which shadows the builtin).
flattened_specs = [
    json.dumps(spec)
    for (_query, _query_type, _dataset, _data_url, _meta_data, spec,
         _mark_type, _context, _prompt, _model_input) in test_set
]

# Determine max spec length after tokenization
max_length = 0

for i, spec in enumerate(flattened_specs, start=1):
    # NOTE(review): ``spec`` was already serialised with json.dumps above, so
    # this second json.dumps measures the re-escaped string. It is kept as is
    # so the token budget does not change - confirm whether it is intended.
    length = chosen_tokenizer(json.dumps(
        spec), return_tensors="pt", padding=False).input_ids.shape[1]
    # print(length)

    # Report every new maximum together with its 1-based position
    if length > max_length:
        max_length = length
        print(i)
        print(spec)

# print(max_length)
# Max_new_tokens = max length + 10%
# Fix: round() without ndigits returns an int. round(x, 0) returned a float,
# so a float was passed on as max_new_tokens, which is a token count.
max_length = round(max_length + max_length * 0.1)
print(f"Max output length: {max_length}")

1
"{'$schema': 'https://vega.github.io/schema/vega-lite/v4.17.0.json', 'data': {'url': 'https://raw.githubusercontent.com/nlvcorpus/nlvcorpus.github.io/main/datasets/movies.csv'}, 'mark': {'type': 'bar', 'tooltip': None}, 'encoding': {'column': {'field': 'Content Rating', 'type': 'ordinal'}, 'x': {'field': 'Creative Type', 'scale': {}, 'type': 'nominal', 'axis': {'title': '', 'labels': False, 'ticks': False}}, 'y': {'aggregate': 'mean', 'field': 'Production Budget', 'type': 'quantitative', 'axis': {'title': 'AVG (Production Budget)', 'format': '~s'}}, 'color': {'field': 'Creative Type', 'type': 'nominal'}}}"
139
"{'$schema': 'https://vega.github.io/schema/vega-lite/v4.17.0.json', 'mark': {'type': 'point', 'tooltip': True}, 'encoding': {'x': {'field': 'Worldwide Gross', 'type': 'quantitative', 'aggregate': None, 'axis': {'format': 's'}}, 'y': {'field': 'Production Budget', 'type': 'quantitative', 'aggregate': None, 'axis': {'format': 's'}}, 'tooltip': {'field': 'Title'}}, 'transform': [

## Inference

In [24]:
# Conduct inference on the tokenized test set
# The result is a dict with the keys 'Inference time', 'Input sequences' and 'Output sequences'
output_test_set = model_inference(tokenized_test_set)
# print(output_test_set.keys())
# print(len(output_test_set['Output sequences']))

## Save raw output

In [25]:
# Save the raw inference output via serialisation (pickle) and compression (bz2)
# The file name has no extension and no directory part, so the file is written
# relative to the current working directory
print(raw_results_file_name)
save_object(raw_results_file_name, output_test_set)

3shot_10fold_OOD_test-gpt-neo-125M-20221127-182038_raw_results


# **Process outputs**

## Functions

In [26]:
# Split list function
# Source: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
def break_list(l, n):
    """Yield successive chunks of ``l`` with ``n`` items each (the last may be shorter).

    This is a generator; wrap the call in ``list()`` to get a list of lists.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [27]:
# Decode function
def decode_outputs(raw_sequences):
    """Decode token ids into text with the module-level tokenizer.

    Special tokens are kept in the decoded text (``skip_special_tokens=False``).
    """
    decoded_text = chosen_tokenizer.decode(
        raw_sequences,
        skip_special_tokens=False,
    )
    return decoded_text

## Decode outputs

In [28]:
# Decode every raw output sequence generated by the model.
# Batches are walked in order and each sequence is decoded on its own, which
# gives one flat list of decoded strings.
output_test_set['Decoded sequences'] = [
    decode_outputs(sequence)
    for batch_sequences in output_test_set['Output sequences']
    for sequence in batch_sequences
]

# Group together independently sampled outputs: consecutive runs of
# num_outputs decoded strings become one sub-list
if num_outputs > 1:
    output_test_set['Decoded sequences'] = list(
        break_list(output_test_set['Decoded sequences'], num_outputs))

## Reduce memory consumption

To save memory, raw output sequences, output scores and raw input sequences are removed from the output_test_set

In [29]:
# Delete specified keys
# delete_key reads the key to remove from the module-level name `rem_key`, so
# the loop variable must keep exactly this name
for rem_key in ['Output scores', 'Input sequences', 'Output sequences']:
    # Each pass rebinds output_test_set to a filtered copy without `rem_key`
    output_test_set = delete_key(output_test_set)

## Save decoded output

In [30]:
# Save the processed output via serialisation (pickle) and compression (bz2)
# At this point the raw input/output sequences have been removed from output_test_set
print(results_file_name)
save_object(results_file_name, output_test_set)

3shot_10fold_OOD_test-gpt-neo-125M-20221127-182038_results
