# Set-up

## Select Colab or HPC

In [1]:
PLATFORM = 'colab'

## Import dependencies

In [2]:
import argparse
import bz2
import gc
import json
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.model_selection import ParameterGrid
import time
import torch

In [3]:
if PLATFORM == 'colab':
    
    # Install Hugging Face library using a shell command
    import os
    os.system("pip install transformers")

    # Mount Google Drive and CD using a shell command
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir("/content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K")

ModuleNotFoundError: No module named 'google'

In [4]:
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM, set_seed

## Top level functions

In [5]:
def break_list(l, n):
    """ Turn a list into a list of lists with size n
    Source: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]


def decode_outputs(raw_sequences):
    """Decode raw output sequences for a given batch"""
    return chosen_tokenizer.decode(raw_sequences, skip_special_tokens=False)


def save_object(fname, data):
    """Pickle a file and compress it.
    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
    """
    with bz2.open(fname, "wb") as f:
        pickle.dump(data, f)


def load_object(fname):
    """Load compressed pickle file
    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
    """
    with bz2.open(fname, "rb") as f:
        data = pickle.load(f)
    return data


"""Delete K valued key using dictionary comprehension and recursion. 
Source of code: https://www.geeksforgeeks.org/python-remove-k-valued-key-from-nested-dictionary
"""
delete_key = lambda input: {key: delete_key(value) if isinstance(value, dict) else value
      for key, value in input.items() if key != rem_key}


def set_seed_value(seed_value):
    """Create a function for setting/resetting the fixed seed value for pseudo-random generators.
    Source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752
    Source: https://discuss.pytorch.org/t/does-pytorch-change-its-internal-seed-during-training/46505/4
    Source: https://huggingface.co/docs/transformers/internal/trainer_utils
    Source: https://huggingface.co/Narsil/gpt2
    """
    torch.manual_seed(seed_value) # 1. Torch
    random.seed(seed_value) # 2. Python
    np.random.seed(seed_value) # 3. Numpy
    # 4. HuggingFace helper function to set the seed in random , numpy , torch
    set_seed(seed_value)

In [24]:
def batch_generator(tokenized_inputs, BATCH_SIZE):
    """Generator object used to deliver batches of the tokenized inputs
    and their associated attention masks to the model thereby reducing memory consumption. 
    """
    for i in range(0, len(tokenized_inputs), BATCH_SIZE):
        yield tokenized_inputs[i: i + BATCH_SIZE]


def model_inference(tokenized_inputs, BATCH_SIZE, TEMPERATURE, TOPK, TOPP, NUM_OUTPUTS, MAX_OUTPUT_LENGTH):
    """This function enables batched inference using the chosen model on either the CPU or GPU.
    The latter includes optional parallelisation."""
    set_seed_value(0)
    inference_results = {'Inference time': None,
                         'Output sequences': None
                        }
    model_outputs = []

    # Iterate over generator objects
    input_ids_generator = batch_generator(tokenized_test_set['input_ids'],
                                           BATCH_SIZE
                                          )
    attn_mask_generator = batch_generator(tokenized_test_set['attention_mask'],
                                           BATCH_SIZE
                                          )
    start = time.time()
    generator_count = 0
    for input_batch, attn_batch in zip(input_ids_generator, attn_mask_generator):
        generator_count += 1
        input_batch = input_batch.to(TORCH_DEVICE)
        attn_batch = attn_batch.to(TORCH_DEVICE)

        if DEVICE_TYPE == "cuda" and DATA_PARALLEL == "Y":
            output = chosen_model.module.generate(input_batch,
                                                  attention_mask=attn_batch,
                                                  num_beams=1,
                                                  do_sample=True,
                                                  top_k=TOPK,
                                                  temperature=TEMPERATURE,
                                                  top_p=TOPP,
                                                  early_stopping=True,
                                                  max_new_tokens=MAX_OUTPUT_LENGTH,
                                                  eos_token_id=50256,
                                                  output_scores=False,
                                                  return_dict_in_generate=True,
                                                  num_return_sequences=NUM_OUTPUTS,
                                                 )

            # Release GPU memory
            # Step 1: Detach tensors, create a copy on the CPU and overwrite variables
            # Step 2: Deleting unused objects by trigerring a manual garbage collection process and releasing all unoccupied cached memory
            output.sequences = output.sequences.detach().cpu()
            attn_batch = attn_batch.detach().cpu()
            input_batch = input_batch.detach().cpu()
            gc.collect()
            torch.cuda.empty_cache()

        elif DEVICE_TYPE == "cuda" and DATA_PARALLEL == "N":
            output = chosen_model.generate(input_batch,
                                           attention_mask=attn_batch,
                                           num_beams=1,
                                           do_sample=True,
                                           top_k=TOPK,
                                           temperature=TEMPERATURE,
                                           top_p=TOPP,
                                           max_new_tokens=MAX_OUTPUT_LENGTH,
                                           eos_token_id=50256,
                                           output_scores=False,
                                           return_dict_in_generate=True,
                                           num_return_sequences=NUM_OUTPUTS,
                                          )

            # Release GPU memory
            # Step 1: Detach tensors, create a copy on the CPU and overwrite variables
            # Step 2: Deleting unused objects by trigerring a manual garbage collection process and releasing all unoccupied cached memory
            output.sequences = output.sequences.detach().cpu()
            attn_mask = attn_mask.detach().cpu()
            input_ids = input_ids.detach().cpu()
            gc.collect()
            torch.cuda.empty_cache()

        else:
            output = chosen_model.generate(input_batch,
                                           attention_mask=attn_batch,
                                           num_beams=1,
                                           do_sample=True,
                                           top_k=TOPK,
                                           temperature=TEMPERATURE,
                                           top_p=TOPP,
                                           early_stopping=True,
                                           max_new_tokens=MAX_OUTPUT_LENGTH,
                                           eos_token_id=50256,
                                           output_scores=False,
                                           return_dict_in_generate=True,
                                           num_return_sequences=NUM_OUTPUTS,
                                          )
            
        model_outputs.append(output.sequences)
        del output

    end = time.time()
    inference_time = end - start
    inference_results['Inference time'] = inference_time
    model_outputs = [output_sequence for output_sequence in model_outputs]
    inference_results['Output sequences'] = model_outputs
    return inference_results


def sequence_probability(tokenized_inputs, output_sequences, output_scores):
    """Calculate sequence probabilities from output scores.
    Code adapted from: 1) https://discuss.huggingface.co/t/generation-probabilities-how-to-compute-probabilities-of-output-scores-for-gpt2/3175
    2) https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F17187507%2Fwhy-use-softmax-as-opposed-to-standard-normalization
    Note:
    model_inputs = tokenized_prompts = {input_ids: tensor([prompt 1], [prompt 2]...), attention_mask: tensor([prompt 1], [prompt 2]...)}
    model_outputs = output_batch = SampleDecoderOnlyOutput([(sequences, tensor([output 1], [output 2]...), (scores, tensor([output 1], [output 2]...))]))
    """
    prompt_length = tokenized_inputs.shape[-1]
    vocab_size = output_scores[0].shape[-1]
    generated_tokens = output_sequences[:, prompt_length:] # generated tokens ONLY (i.e. exclude prompt)

    # Stack the logits generated at each step and calculate probabilities across the entire vocab at each step
    # Note: PyTorch torch.stack() method joins (concatenates) a sequence of tensors (two or more tensors) along a new dimension.
    vocab_logits = torch.stack(output_scores, dim=1)
    vocab_probs = logits.softmax(-1)

    # Collect probability of generated tokens & calculate sequence probability
    generated_tokens_probs  = torch.gather(vocab_probs, 2,
                                          generated_tokens[:, :, None]).squeeze(-1)
    generated_sequence_probs = generated_tokens_probs.prod(-1)

    # # Print summary information
    # print(f"Length of padded prompts: {prompt_length}")
    # print(f"Number of generated sequences: {generated_tokens.shape[0]}")
    # print(f"Length of generated sequences: {generated_tokens.shape[-1]}")
    # print(f"Tensor shape - Generated sequences: {generated_tokens.shape}")
    # print("\n")
    # print(f"Vocab size: {vocab_size}")
    # print(f"Tensor shape - Vocab token logits at each step: {vocab_logits.shape}")
    # print(f"Tensor shape - Vocab token probabilities at each step: {vocab_logits.shape}")
    # print(f"Tensor shape - Generated token probabilities: {generated_tokens_probs.shape}")
    # print(f"Tensor shape - Generated sequence probabilities: {generated_sequence_probs.shape}")
    # print(f"Sequence probabilities: {generated_sequence_probs.tolist()}")
    
    return generated_sequence_probs.tolist()

## Parameters

### Command line arguments

In [7]:
# Create parser for command line arguments
# Source: https://docs.python.org/3/library/argparse.html
parser = argparse.ArgumentParser(description="Parse command line arguments")
parser.add_argument('-b',
                    '--batch_size',
                    metavar='batch size',
                    type=int,
                    nargs='?',
                    default=22,
                    help='Batch size.')

parser.add_argument('-dp',
                    '--data_parallel',
                    metavar='data_parallel',
                    type=str,
                    nargs='?',
                    default='N',
                    help='Distributed inference Y/N.')

parser.add_argument('-d',
                    '--device',
                    metavar='device',
                    type=str,
                    nargs='?',
                    default='cpu',
                    help='Cuda or cpu.')

parser.add_argument('-f',
                    '--filename',
                    metavar='prompt filename',
                    type=str,
                    nargs='?',
                    default='ID_3shot_test_set',
                    help='Input filename.')

parser.add_argument('-m',
                    '--model',
                    metavar='device',
                    type=str,
                    nargs='?',
                    default='gpt-neo-125M',
                    help='Pre-trained transformer model.')

parser.add_argument('-n',
                    '--number_outputs',
                    metavar='number_outputs',
                    type=int,
                    nargs='?',
                    default=1,
                    help='The number of independantly sampled outputs per problem.')

parser.add_argument('-t',
                    '--temperature',
                    metavar='temperature',
                    type=int,
                    nargs='?',
                    default=1,
                    help='Softmax temperature.')

parser.add_argument('-k',
                    '--topk',
                    metavar='topk',
                    type=int,
                    nargs='?',
                    default=0,
                    help='The K most likely next words.')

parser.add_argument('-pr',
                    '--topp',
                    metavar='topp',
                    type=int,
                    nargs='?',
                    default=10,
                    help='Probability threshold');

In [8]:
if PLATFORM=='colab':
    # args = parser.parse_args([])
    args = parser.parse_args(['--batch_size', '1',
                              '--data_parallel', 'N',
                              '--device', 'cpu',
                              '--filename', 'ID_3shot_test_set',
                              '--model', 'gpt-neo-125M',
                              '--number_outputs', '1',
                              '--temperature', '1',
                              '--topk', '0',
                              '--topp', '5'])

In [9]:
# Parses arguments through the parse_args() method
if PLATFORM=='HPC':
    args = parser.parse_args()

BATCH_SIZE = args.batch_size
DATA_PARALLEL = args.data_parallel
DEVICE_TYPE = args.device
FILENAME = args.filename
MODEL_TYPE = args.model
NUM_OUTPUTS = args.number_outputs
TEMPERATURE = args.temperature
TOPK = args.topk
TOPP = args.topp/10

### File parameters

In [10]:
TIME_STRING = time.strftime("%Y%m%d-%H%M%S")
PROMPT_FILENAME = FILENAME + '.pickle'
RAW_RESULTS_FILENAME = FILENAME + "-" + MODEL_TYPE + "-" + TIME_STRING + '_raw_results'
RESULTS_FILENAME = FILENAME + "-" + MODEL_TYPE + "-" + TIME_STRING + '_results'

if PLATFORM == 'colab':
    MODEL_DIRECTORY =  'EleutherAI/' + MODEL_TYPE
    LOCAL_MODEL_DIRECTORY = 'C:/Users/billy/OneDrive/Documents/Python Scripts/1. Portfolio/1. NL2VIS/' + MODEL_TYPE
    PROMPT_DIRECTORY = 'C:/Users/billy/OneDrive/Documents/Python Scripts/1. Portfolio/1. NL2VIS/'

elif platform == 'hpc':
    ARCHIVE_FOLDER = "/mnt/data/users/adbz866/"
    LOCAL_MODEL_DIRECTORY = archive_folder + model_type
    PROMPT_DIRECTORY = "/mnt/scratch/users/adbz866/"

In [11]:
print("Time:", TIME_STRING)
print("Prompt directory:", PROMPT_DIRECTORY)
print("Prompt file name:", PROMPT_FILENAME)
print("\n")
print("Local model directory:", LOCAL_MODEL_DIRECTORY)
print("Platform:", PLATFORM)
print("Model:", MODEL_TYPE)
print("Device:", DEVICE_TYPE)
print("Top-K:", TOPK)
print("Top-P:", TOPP)
print("Temperature:", TEMPERATURE)
print("Batch size:", BATCH_SIZE)
print("Data parallelisation:" , DATA_PARALLEL)
print("Number of outputs per problem", NUM_OUTPUTS )

Time: 20230829-125135
Prompt directory: C:/Users/billy/OneDrive/Documents/Python Scripts/1. Portfolio/1. NL2VIS/
Prompt file name: ID_3shot_test_set.pickle


Local model directory: C:/Users/billy/OneDrive/Documents/Python Scripts/1. Portfolio/1. NL2VIS/gpt-neo-125M
Platform: colab
Model: gpt-neo-125M
Device: cpu
Top-K: 0
Top-P: 0.5
Temperature: 1
Batch size: 1
Data parallelisation: N
Number of outputs per problem 1


## Torch device
* The torch.device objects represents the device on which torch.tensors will be allocated.ce

In [12]:
# Source: https://pytorch.org/docs/stable/tensor_attributes.html#torch.device
if DEVICE_TYPE == "cuda":
    TORCH_DEVICE = torch.device("cuda:0")
else:
    TORCH_DEVICE = torch.device(DEVICE_TYPE)

print(f"Torch device: {TORCH_DEVICE}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

Torch device: cpu
Number of GPUs: 0


# Instantiate models

## Tokenizer
* Tokenize with padding.

In [13]:
# local_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIRECTORY)
# local_tokenizer.save_pretrained(save_directory=LOCAL_MODEL_DIRECTORY)
# del local_tokenizer

chosen_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIRECTORY)
print(type(chosen_tokenizer))

<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


## Pre-trained transformer
* Data parallelism is implemented as per [here](https://stackoverflow.com/questions/61736317/huggingface-transformers-gpt2-generate-multiple-gpus). This means the input will be distributed across the available GPUs.

In [14]:
# local_model = AutoModelForCausalLM.from_pretrained(MODEL_DIRECTORY)
# local_model.save_pretrained(save_directory=LOCAL_MODEL_DIRECTORY)
# del local_model

chosen_model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL_DIRECTORY)
chosen_model.to(TORCH_DEVICE)
if DEVICE_TYPE == "cuda" and DATA_PARALLEL == "Y":
    chosen_model = torch.nn.DataParallel(chosen_model, device_ids=[0, 1])

## Configure padding
* EOS = end of sequence token
* BOS = beginning of sequence token

In [15]:
# Tokenizer
chosen_tokenizer.padding_side = "left"
chosen_tokenizer.pad_token = chosen_tokenizer.eos_token
print(chosen_tokenizer.eos_token)
print(chosen_tokenizer.encode(chosen_tokenizer.eos_token))
print(chosen_tokenizer.bos_token)
print(chosen_tokenizer.encode(chosen_tokenizer.bos_token))

# Model
if DEVICE_TYPE == "cuda" and  DATA_PARALLEL == "Y":
    chosen_model.module.config.pad_token_id = chosen_model.module.config.eos_token_id
else:
    chosen_model.config.pad_token_id = chosen_model.config.eos_token_id
print(chosen_model.config.eos_token_id)
print(chosen_model.config.bos_token_id)

<|endoftext|>
[50256]
<|endoftext|>
[50256]
50256
50256


# Import test data

In [16]:
# Load test data via deseriealisation using pickle
with open(PROMPT_DIRECTORY + "/" + PROMPT_FILENAME, 'rb') as f:
    test_set = pickle.load(f)

# Pre-processing

In [17]:
# Tokenize prompts (with padding) using call()
tokenized_test_set = test_set['model_input'].tolist()
tokenized_test_set = chosen_tokenizer(tokenized_test_set, 
                                      return_tensors="pt", 
                                      padding=True)

print(tokenized_test_set['input_ids'].shape)
print(tokenized_test_set['attention_mask'].shape)

torch.Size([149, 1663])
torch.Size([149, 1663])


# Inference

## Stopping criteria
* The early stopping criteria is based on *max_new_tokens* and the *EOS* token.

* Max_new_tokens is estimated by determining the number of tokens in the largest specification example. 10% more tokens is then added.

In [22]:
# Calculate maximum specification length
i = 0
MAX_OUTPUT_LENGTH = 0
for spec in test_set['spec'].tolist():
    i += 1
    length = chosen_tokenizer(json.dumps(spec),
                              return_tensors="pt",
                              padding=False).input_ids.shape[1]
    
    if length > MAX_OUTPUT_LENGTH:
        MAX_OUTPUT_LENGTH = length
        
MAX_OUTPUT_LENGTH += MAX_OUTPUT_LENGTH*0.1
MAX_OUTPUT_LENGTH = round(MAX_OUTPUT_LENGTH, 0)
print(f"Max output length: {MAX_OUTPUT_LENGTH}")

Max output length: 282.0


## Inference

Sequence probabilities are not output below. However, they can be using the *sequence_probabilities()* function defined earlier. In this instance, note the below:
* Sequence probabilities are calculated from output scores.
* However, for each generated token, output scores are provided across all tokens in the model's vocab (~50,000). Storing output scores for every sample therefore consumes a lot of memory.
* To counter this, calculate sequence probabilities immediately for each batch before deleting output scores using the function provided.

In [None]:
inference_results = model_inference(tokenized_test_set, BATCH_SIZE, TEMPERATURE, TOPK, TOPP, NUM_OUTPUTS, MAX_OUTPUT_LENGTH)
# print(inference_results.keys())
# print(len(inference_results['Output sequences']))

# Process outputs

In [None]:
# Decode & flatten batches
inference_results['Decoded sequences'] = []
for batch in inference_results['Output sequences']:
    inference_results['Decoded sequences'].append([decode_outputs(output) for output in batch])
output_test_set['Decoded sequences'] = [decode for batch in output_test_set['Decoded sequences'] for decode in batch]

# Group together independantly sampled outputs
if num_outputs > 1:
    inference_results['Decoded sequences'] = list(break_list(inference_results['Decoded sequences'],
                                                             num_outputs
                                                            )
                                                 )

# Save outputs
* To save memory, unnecessary keys are removed from the output containing decoded sequences.

In [None]:
# Raw data
save_object(RAW_RESULTS_FILENAME,
            inference_results)

# Decoded output
for rem_key in ['Output scores', 'Input sequences', 'Output sequences']:
    inference_results = delete_key(inference_results)
save_object(RESULTS_FILENAME,
            inference_results)