# Set-up

## Select Colab or HPC

In [2]:
# Execution platform selector. Later cells compare this value against
# 'colab' (Colab set-up, hard-coded arguments, Google Drive paths) and
# against 'HPC' / 'hpc' (real command-line parsing, cluster paths).
PLATFORM = 'colab'

## Import dependencies

In [3]:
import argparse
import bz2
import gc
import json
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.model_selection import ParameterGrid
import time
import torch

In [4]:
if PLATFORM == 'colab':

    # Local imports: only the Colab code path needs these modules.
    import os
    import subprocess
    import sys

    # Install the Hugging Face library into the interpreter that is running
    # this notebook. `python -m pip` targets the active environment, whereas
    # a bare `pip` shell command may resolve to a different installation.
    pip_result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "transformers"],
        check=False,
    )
    if pip_result.returncode != 0:
        # Stay best-effort (the package may already be installed), but make
        # the failure visible instead of silently discarding the exit status.
        print(f"Warning: 'pip install transformers' exited with code {pip_result.returncode}")

    # Mount Google Drive and change into the project folder
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir("/content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K")

ModuleNotFoundError: No module named 'google'

In [5]:
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM, set_seed

## Top level functions

In [7]:
def save_object(fname, data):
    """Serialise ``data`` with pickle and write it, bz2-compressed, to ``fname``.

    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
    """
    with bz2.open(fname, mode="wb") as compressed_file:
        pickle.dump(data, compressed_file)


def load_object(fname):
    """Read the bz2-compressed pickle stored at ``fname`` and return the object.

    NOTE: unpickling can execute arbitrary code, so only open files that
    come from a trusted source.

    Source: https://betterprogramming.pub/load-fast-load-big-with-compressed-pickles-5f311584507e
    """
    with bz2.open(fname, mode="rb") as compressed_file:
        return pickle.load(compressed_file)


"""Delete K valued key using dictionary comprehension and recursion. 
Source of code: https://www.geeksforgeeks.org/python-remove-k-valued-key-from-nested-dictionary
"""
_REM_KEY_UNSET = object()  # sentinel meaning "no key passed; use the global rem_key"


def delete_key(input, key_to_remove=_REM_KEY_UNSET):
    """Return a copy of a nested dictionary with one key removed at every level.

    Args:
        input: The (possibly nested) dictionary to filter. It is not
            modified. The parameter name shadows the ``input`` builtin but
            is kept so that existing keyword callers continue to work.
        key_to_remove: The key to drop. When omitted, the module-level
            variable ``rem_key`` is used, which was the only mechanism the
            original lambda offered.

    Returns:
        A new dictionary without ``key_to_remove``. Values that are
        dictionaries are filtered recursively; all other values (including
        lists that contain dictionaries) are carried over unchanged.

    Raises:
        NameError: If ``key_to_remove`` is omitted and no global ``rem_key``
            has been defined.
    """
    if key_to_remove is _REM_KEY_UNSET:
        # Backward compatibility with the original global-based behaviour.
        key_to_remove = rem_key
    return {
        key: delete_key(value, key_to_remove) if isinstance(value, dict) else value
        for key, value in input.items()
        if key != key_to_remove
    }


def set_seed_value(seed_value):
    """Set (or reset) one fixed seed on every pseudo-random generator in use.

    ``seed_value`` is passed, in this order, to PyTorch, Python's ``random``
    module, NumPy and finally the Hugging Face ``set_seed`` helper.

    Source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752
    Source: https://discuss.pytorch.org/t/does-pytorch-change-its-internal-seed-during-training/46505/4
    Source: https://huggingface.co/docs/transformers/internal/trainer_utils
    Source: https://huggingface.co/Narsil/gpt2
    """
    seeders = (
        torch.manual_seed,  # 1. Torch
        random.seed,        # 2. Python
        np.random.seed,     # 3. Numpy
        set_seed,           # 4. HuggingFace helper (seeds random, numpy, torch)
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

## Parameters

### Command line arguments

In [32]:
# Create parser for command line arguments
# Source: https://docs.python.org/3/library/argparse.html
#
# Every option uses nargs='?' with a default, so each one may be omitted
# entirely and the default is used.
parser = argparse.ArgumentParser(description="Parse command line arguments")
parser.add_argument('-b',
                    '--batch_size',
                    metavar='batch size',
                    type=int,
                    nargs='?',
                    default=22,
                    help='Batch size.')

parser.add_argument('-dp',
                    '--data_parallel',
                    metavar='data_parallel',
                    type=str,
                    nargs='?',
                    default='N',
                    help='Distributed inference Y/N.')

parser.add_argument('-d',
                    '--device',
                    metavar='device',
                    type=str,
                    nargs='?',
                    default='cpu',
                    help='Cuda or cpu.')

parser.add_argument('-f',
                    '--filename',
                    metavar='prompt filename',
                    type=str,
                    nargs='?',
                    default='ID_3shot_test_set',
                    help='Input filename.')

# metavar was a copy-paste of 'device'; it now names the option it belongs to.
parser.add_argument('-m',
                    '--model',
                    metavar='model',
                    type=str,
                    nargs='?',
                    default='gpt-neo-125M',
                    help='Pre-trained transformer model.')

parser.add_argument('-n',
                    '--number_outputs',
                    metavar='number_outputs',
                    type=int,
                    nargs='?',
                    default=1,
                    help='The number of independently sampled outputs per problem.')

# Parsed as float so fractional temperatures (e.g. 0.7) are accepted;
# integer inputs such as "1" still parse (to 1.0).
parser.add_argument('-t',
                    '--temperature',
                    metavar='temperature',
                    type=float,
                    nargs='?',
                    default=1.0,
                    help='Softmax temperature.')

parser.add_argument('-k',
                    '--topk',
                    metavar='topk',
                    type=int,
                    nargs='?',
                    default=0,
                    help='The K most likely next words.')

# Given as an integer number of tenths: the value is divided by 10 after
# parsing, so 5 means a threshold of 0.5.
parser.add_argument('-pr',
                    '--topp',
                    metavar='topp',
                    type=int,
                    nargs='?',
                    default=10,
                    help='Probability threshold in tenths (e.g. 5 means 0.5).')

In [26]:
if PLATFORM == 'colab':
    # A notebook has no real command line, so argparse is handed an explicit
    # argument list. Passing an empty list instead would select every parser
    # default.
    args = parser.parse_args(args=[
        '--batch_size=1',
        '--data_parallel=N',
        '--device=cpu',
        '--filename=3shot_10fold_OOD_test',
        '--model=gpt-neo-125M',
        '--number_outputs=10',
        '--temperature=1',
        '--topk=0',
        '--topp=5',
    ])

In [27]:
# Parse the real command line when running on the cluster. The comparison is
# case-insensitive so that both spellings used in this notebook ('HPC' and
# 'hpc') select this branch.
if PLATFORM.lower() == 'hpc':
    args = parser.parse_args()

# Expose the parsed arguments as module-level constants
BATCH_SIZE = args.batch_size
DATA_PARALLEL = args.data_parallel
DEVICE_TYPE = args.device
FILENAME = args.filename
MODEL_TYPE = args.model
NUM_OUTPUTS = args.number_outputs
TEMPERATURE = args.temperature
TOPK = args.topk
# --topp is an integer number of tenths, so convert it to a probability
TOPP = args.topp / 10

### File parameters

In [28]:
# Timestamp used to make the result filenames of each run unique
TIME_STRING = time.strftime("%Y%m%d-%H%M%S")
PROMPT_FILENAME = FILENAME + '.pickle'
RAW_RESULTS_FILENAME = f"{FILENAME}-{MODEL_TYPE}-{TIME_STRING}_raw_results"
RESULTS_FILENAME = f"{FILENAME}-{MODEL_TYPE}-{TIME_STRING}_results"

if PLATFORM == 'colab':
    MODEL_DIRECTORY = "/" + MODEL_TYPE
    PROMPT_DIRECTORY = "/content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K"
    # The local model copy lives in a sub-folder of the prompt directory
    LOCAL_MODEL_DIRECTORY = PROMPT_DIRECTORY + "/" + MODEL_TYPE

# Case-insensitive so that both 'HPC' and 'hpc' are recognised. The original
# branch referenced undefined lowercase names (platform, archive_folder,
# model_type) and raised NameError.
elif PLATFORM.lower() == 'hpc':
    ARCHIVE_FOLDER = "/mnt/data/users/adbz866/"
    LOCAL_MODEL_DIRECTORY = ARCHIVE_FOLDER + MODEL_TYPE
    PROMPT_DIRECTORY = "/mnt/scratch/users/adbz866/"

else:
    # Fail here with a clear message rather than later with a NameError on
    # the directory constants.
    raise ValueError(f"Unknown PLATFORM {PLATFORM!r}; expected 'colab' or 'hpc'.")

In [29]:
# Echo the run configuration: one entry per output line, with the bare "\n"
# entry producing the blank separator lines between the file settings and
# the model settings.
print(
    f"Time: {TIME_STRING}",
    f"Prompt directory: {PROMPT_DIRECTORY}",
    f"Prompt file name: {PROMPT_FILENAME}",
    "\n",
    f"Local model directory: {LOCAL_MODEL_DIRECTORY}",
    f"Platform: {PLATFORM}",
    f"Model: {MODEL_TYPE}",
    f"Device: {DEVICE_TYPE}",
    f"Top-K: {TOPK}",
    f"Top-P: {TOPP}",
    f"Temperature: {TEMPERATURE}",
    f"Batch size: {BATCH_SIZE}",
    f"Data parallelisation: {DATA_PARALLEL}",
    f"Number of outputs per problem {NUM_OUTPUTS}",
    sep="\n",
)

Time: 20230827-103709
Prompt directory: /content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K
Prompt file name: 3shot_10fold_OOD_test.pickle


Local model directory: /content/drive/MyDrive/Colab Notebooks/Final project/2. Models/1. Transformer-based LM/1. Autoregressive LMs/2. Top-K/gpt-neo-125M
Platform: colab
Model: gpt-neo-125M
Device: cpu
Top-K: 0
Top-P: 0.5
Temperature: 1
Batch size: 1
Data parallelisation: N
Number of outputs per problem 10


## Torch device

In [31]:
# Source: https://pytorch.org/docs/stable/tensor_attributes.html#torch.device
# A plain "cuda" request is pinned to the first GPU ("cuda:0"); any other
# device string is handed to torch.device unchanged.
TORCH_DEVICE = torch.device("cuda:0" if DEVICE_TYPE == "cuda" else DEVICE_TYPE)

print(f"Torch device: {TORCH_DEVICE}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

Torch device: cpu
Number of GPUs: 0


# Instantiate models

## Tokenizer

In [None]:
# One-off step, kept for reference: load the tokenizer from MODEL_DIRECTORY,
# save it into LOCAL_MODEL_DIRECTORY, then discard the temporary object.
# local_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIRECTORY)
# local_tokenizer.save_pretrained(save_directory=LOCAL_MODEL_DIRECTORY)
# del local_tokenizer

# Load the tokenizer from the local copy and show which class was built
chosen_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIRECTORY)
print(chosen_tokenizer.__class__)

## Pre-trained transformer

In [None]:
# One-off step, kept for reference: load the model from MODEL_DIRECTORY,
# save it into LOCAL_MODEL_DIRECTORY, then discard the temporary object.
# local_model = AutoModelForCausalLM.from_pretrained(MODEL_DIRECTORY)
# local_model.save_pretrained(save_directory=LOCAL_MODEL_DIRECTORY)
# del local_model

# Load the causal language model from the local copy and move it to the
# selected device. The device constant is TORCH_DEVICE; the lowercase name
# used previously was never defined and raised NameError.
chosen_model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL_DIRECTORY)
chosen_model.to(TORCH_DEVICE)

# Optionally replicate the model across GPUs 0 and 1 for batched inference
if DEVICE_TYPE == "cuda" and DATA_PARALLEL == "Y":
    chosen_model = torch.nn.DataParallel(chosen_model, device_ids=[0, 1])

## Configure padding
* EOS = end of sequence token
* BOS = beginning of sequence token

In [None]:
# Tokenizer: pad on the left and reuse the EOS token as the padding token
chosen_tokenizer.padding_side = "left"
chosen_tokenizer.pad_token = chosen_tokenizer.eos_token
print(chosen_tokenizer.eos_token)
print(chosen_tokenizer.encode(chosen_tokenizer.eos_token))
print(chosen_tokenizer.bos_token)
print(chosen_tokenizer.encode(chosen_tokenizer.bos_token))

# Model: torch.nn.DataParallel does not forward attribute access to the
# model it wraps, so the configuration must be reached through `.module`.
# Resolving it once keeps the prints below working in both cases (they used
# to read `chosen_model.config` directly and failed on a wrapped model).
if isinstance(chosen_model, torch.nn.DataParallel):
    model_config = chosen_model.module.config
else:
    model_config = chosen_model.config
model_config.pad_token_id = model_config.eos_token_id
print(model_config.eos_token_id)
print(model_config.bos_token_id)

# Import test data

In [None]:
# Load test data via deserialisation using pickle.
# NOTE: unpickling can execute arbitrary code, so only load prompt files
# that come from a trusted source.
# The path constants are upper-case (PROMPT_DIRECTORY / PROMPT_FILENAME);
# the lowercase names used previously were never defined.
with open(PROMPT_DIRECTORY + "/" + PROMPT_FILENAME, 'rb') as f:
    test_set = pickle.load(f)

# Pre-processing

In [None]:
# Tokenize prompts (with padding) by calling the tokenizer directly.
# The model input is taken from the last element of every test sample; the
# whole list is tokenized in one call, padded, and returned as PyTorch
# tensors.
tokenized_test_set = chosen_tokenizer(
    [sample[-1] for sample in test_set],
    padding=True,
    return_tensors="pt",
)

# Report the shapes of the token ids and of the matching attention mask
print(
    tokenized_test_set['input_ids'].shape,
    tokenized_test_set['attention_mask'].shape,
    sep="\n",
)