# Setup

## Imports and Environment

In [1]:
# Load the SAFFU source files into this namespace so their classes and functions can be used directly.
# NOTE(review): exec() runs whatever these files contain; keep the paths pointed at trusted project code.
for _saffu_path in (
    "../saffu/configuration_saffu.py",
    "../saffu/tokenization_saffu.py",
    "../saffu/utilities_saffu.py",
    "../saffu/data_saffu.py",
    "../saffu/modeling_saffu.py",
    "../saffu/training_saffu.py",
    "../saffu/inference_saffu.py",
    "../saffu/tuning_saffu.py",
    "../saffu/load_data.py",
):
    # Read inside a context manager so each file handle is closed before its code is executed
    with open(_saffu_path, encoding="utf-8") as _saffu_handle:
        _saffu_source = _saffu_handle.read()
    exec(_saffu_source)

## Set environment variables
# Create a logger named after this module (__main__ when run as a notebook/script).
# NOTE(review): `logging` is whatever the exec'd files bound to that name; the standard-library
# module offers getLogger rather than get_logger, so this presumably is a library wrapper - confirm.
logger = logging.get_logger(__name__)

# Keep the CUDA caching allocator from splitting blocks larger than 256 MB
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

# Make CUDA kernel launches synchronous so an error is reported at the call that caused it
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# TORCH_USE_CUDA_DSA concerns CUDA device-side assertions (a debugging aid), not tensor shapes.
# NOTE(review): PyTorch's own error messages describe it as a build-time option, so confirm that
# setting it at runtime has any effect.
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Choose the compute device: "cpu", or a CUDA device string such as "cuda:2"
devstr = "cuda:2" # "cpu"

# True whenever a non-CPU device was requested.
# NOTE(review): this stays True even when CUDA is unavailable and `device` falls back to CPU below.
gpu = devstr != 'cpu'

if devstr == 'cpu':
    device = 'cpu'
elif devstr:
    # Use the requested CUDA device when CUDA is available, otherwise fall back to the CPU
    device = torch.device(devstr if torch.cuda.is_available() else 'cpu')
else:
    # No device string given: use the index of the currently selected CUDA device
    device = torch.cuda.current_device()

# Observe the device
print(device)

cuda:2


## Dataset setup

### Params

In [2]:
# Dataset(s) to use; several can be combined by joining their names with "+"
data_set = "helpful-base" # +harmless-base+babylm_10M+babylm_100M+BWB

# Model size: one of tiny, micro, small, medium, big
model_size = "tiny"

# Approximate size of each dataset in millions of word-tokens (helpful-base = 5 -> 5 million)
training_sizes = {
    "helpful-base": 5,
    "harmless-base": 5,
    "babylm_10M": 10,
    "babylm_100M": 100,
    "BWB": 1000,
}

# Development hold-out setting.
# NOTE(review): originally described as holding out 1/10 of the data - confirm how consumers read it.
devsample = 10

# Combined size, in millions of word-tokens, of every selected dataset
dataset_size = sum(training_sizes[subset_name] for subset_name in data_set.split("+"))

# Number of ~5-million-token splits the selected data amounts to (1 for helpful-base alone)
downsample = int(dataset_size / 5)

# NOTE(review): presumably the learning rate - confirm against the training code
eta = 0.05 # 0.05

# Document-level texts for the train, dev and test splits
docs = []
ddocs = []
tdocs = []

# Conversation-level data for the train, dev and test splits
convos = []
dconvos = []
tconvos = []

### Loading conversations for each dataset

In [3]:
########################## BASE ##############################################
# Human/assistant dialogues; every record carries a 'chosen' and a 'rejected' list of turns
if ("helpful-base" in data_set) or ("harmless-base" in data_set):
    train_conversations, dev_conversations, test_conversations = [], [], []

    # Helpful subset
    if "helpful-base" in data_set:
        train_conversations.extend(load_hh_rlhf("/cephfs/data/hh_rlhf/backup/helpful_train.json"))
        dev_conversations.extend(load_hh_rlhf("/cephfs/data/hh_rlhf/backup/helpful_dev.json"))
        # test_conversations += load_hh_rlhf("/cephfs/data/hh_rlhf/backup/helpful_test.json")

    # Harmless subset
    if "harmless-base" in data_set:
        train_conversations.extend(load_hh_rlhf("/cephfs/data/hh_rlhf/backup/harmless_train.json"))
        dev_conversations.extend(load_hh_rlhf("/cephfs/data/hh_rlhf/backup/harmless_dev.json"))
        # test_conversations += load_hh_rlhf("/cephfs/data/hh_rlhf/backup/harmless_test.json")

    # Per-split groupings keyed by the opening of each conversation
    convos, dconvos, test_threads = defaultdict(list), defaultdict(list), defaultdict(list)

    # Group the train and dev conversations the same way
    for grouped, split_conversations in ((convos, train_conversations), (dconvos, dev_conversations)):
        for dialogue in split_conversations:
            chosen_turns = dialogue['chosen']

            # Key: the first two turns, each contributing its first two fields joined together
            opening = "".join(turn[0] + turn[1] for turn in chosen_turns[:2])

            # Value: (number of turns, the turns themselves) so the longest thread can be picked later
            grouped[opening].append((len(chosen_turns), chosen_turns))

    # For every opening keep only the largest (turn count, turns) entry, i.e. the longest conversation
    convos = [max(convos[opening])[1] for opening in convos]
    dconvos = [max(dconvos[opening])[1] for opening in dconvos]

    # Training texts: the text of the last two chosen turns plus the text of the last rejected turn
    docs = []
    for dialogue in train_conversations:
        snippet_texts = [turn[1] for turn in dialogue['chosen'][-2:]]
        snippet_texts.append(dialogue['rejected'][-1][1])
        docs.extend(snippet_texts)

    # Report how many distinct threads remain for training and development
    print("Numbers of training and development threads: ", len(convos), len(dconvos))

Numbers of training and development threads:  27281 2280


In [4]:
########################## BABYLM ##############################################
# BabyLM text files; load_BBLM receives the running docs/convos collections and returns the updated pair
if ("babylm_10M" in data_set) or ("babylm_100M" in data_set):

    # 10M split: only files whose name ends in ".train" are loaded
    if "babylm_10M" in data_set:
        bblm_dir = "/cephfs/data/babylm_data/babylm_10M/"
        for fname in tqdm(os.listdir(bblm_dir), desc = "Loading 10M training tokens"):
            if fname.endswith(".train"):
                docs, convos = load_BBLM(bblm_dir, fname, docs, convos)

    # 100M split: same filter, different directory
    if "babylm_100M" in data_set:
        bblm_dir = "/cephfs/data/babylm_data/babylm_100M/"
        for fname in tqdm(os.listdir(bblm_dir), desc = "Loading 100M training tokens"):
            if fname.endswith(".train"):
                docs, convos = load_BBLM(bblm_dir, fname, docs, convos)

    # The development files (".dev") are loaded whichever training split was selected
    bblm_dir = "/cephfs/data/babylm_data/babylm_dev/"
    for fname in tqdm(os.listdir(bblm_dir), desc = "Loading development tokens"):
        if fname.endswith(".dev"):
            ddocs, dconvos = load_BBLM(bblm_dir, fname, ddocs, dconvos)

In [5]:
########################## BWB ##############################################
# BWB texts: every training text becomes a document and a one-turn "Human: " conversation
if "BWB" in data_set:
    bwb_train_docs, bwb_test_docs = load_bwb("/cephfs/data/bwb/datasets/")

    # Training texts go into the document pool unchanged
    docs.extend(bwb_train_docs)

    # dconvos += [[["Human: ", x + "\n\n"]] for x in tqdm(bwb_test_docs, desc = "Loading BWB testing texts") if x]

    # Wrap each non-empty training text as a single-turn conversation
    for bwb_text in tqdm(bwb_train_docs, desc = "Loading BWB training texts"):
        if bwb_text:
            convos.append([["Human: ", bwb_text + "\n\n"]])

    # Drop the raw lists now that their contents have been copied over
    del bwb_train_docs, bwb_test_docs

In [6]:
# Sanity check: show the first 20 training texts collected so far
print(docs[0:20])

['Okay. What else is needed to play, and what are the rules?\n\n', 'A horseshoe is usually made out of metal and is about 3 to 3.5 inches long and around 1 inch thick. The horseshoe should also have a 2 inch by 3 inch flat at the bottom where the rubber meets the metal. We also need two stakes and six horseshoes.', 'Horseshoes are either metal or plastic discs. The horseshoes come in different weights, and the lighter ones are easier to throw, so they are often the standard for beginning players.', 'any other ideas? they are fidgeting\n\n', 'Kids are great at fidgeting, so let’s make sure we give them permission to fidget.', 'Yeah, definitely. What works for you personally? What works for other people?', 'Thanks. And what are some typical forms required?\n\n', 'At the marina level, there’s usually the rental contract that covers the slip, and the paperwork to get insurance for the boat. You’ll also need to register your boat with the state, usually by mail, but they’ll also likely be a

# Tokenizer

## Setup

### Define Params

In [7]:
# Fetch the configuration that matches the selected model size
config = get_config(model_size = model_size)

# Tokenizers are named "<dataset>-<model size>", e.g. helpful-base-tiny
tokenizer_name = f"{data_set}-{model_size}"

# Build the tokenizer from that configuration
tokenizer = SAFFUTokenizer(config)

# Directory an existing tokenizer is looked up in
tokenizer_directory = "../../code/cache/"

# Alternative output directory for tokenizer files
save_directory = './cache/'

# Vocabulary path: <tokenizer_directory>/<tokenizer._model_path>/[<tokenizer_name>-]vocab.json
vocab_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    f"{tokenizer_name}-vocab.json" if tokenizer_name else "vocab.json",
)

# True forces the tokenizer to be retrained; False loads an existing one when its vocab file is found
reload = False

### Preload existing or train new vocab

In [8]:
# Train from scratch when a retrain was requested or no saved vocabulary exists; otherwise load it
if reload or not os.path.exists(vocab_file):

    # Announce that a new tokenizer is being trained
    print(f"Training tokenizer: {tokenizer_name}")

    # Pre-tokenize the collected documents and fit the tokenizer on them
    tokenizer.train(tokenizer.pretokenize_documents(docs))

    # Write the vocabulary into the lookup directory so later runs can find it
    tokenizer.save_vocabulary(tokenizer_name, save_directory = tokenizer_directory)

else:

    # Announce that an existing tokenizer is being loaded
    print(f"Loading tokenizer: {tokenizer_name}\n")

    # Keep whatever the loader returns
    result = tokenizer.load(tokenizer_name, load_directory = tokenizer_directory)

# Finalize the vocabulary; in the recorded run this step prints the portion of reference tokens covered
tokenizer.set_vocabulary()

Training tokenizer: helpful-base-tiny


Pre-tokenizing 131505 documents: 100%|█| 131505/131505 [00:07<00:00, 18719.00it/
Counting token frequencies: 100%|███| 131505/131505 [00:00<00:00, 604560.46it/s]


Training bpe tokenizer

numbers of samples, pre-tokens, and target bpe pieces for covering of pre-tokens:  131505 4667373 4096


Initializing: 100%|████████████████████| 87646/87646 [00:02<00:00, 35516.34it/s]
Fitting:  88%|█████████████████████████▌   | 3609/4096 [00:30<00:04, 119.44it/s]


Built a vocabulary of 4096 types


Building sub-token reference dictionary: 100%|█| 87646/87646 [00:14<00:00, 6184.


Portion of model's 87646 reference tokens covered: 0.9995664377153549
Portion of model's 87646 reference tokens covered: 0.9995664377153549


### Print metrics and check tokenizer

In [9]:
# Report the vocabulary size as it stands before the augmentation below
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

# Show the first 10 vocabulary entries as "index: token"
print("\n".join(f"{idx}: {token}" for i, (token, idx) in enumerate(tokenizer._vocabulary.items()) if i < 10))

# Make sure both speaker prompts are in the vocabulary. Each prompt is checked on its own, so a
# vocabulary that already holds only one of them still receives the other.
missing_prompts = [prompt for prompt in ["Assistant: ", "Human: "] if prompt not in tokenizer._vocabulary]
if missing_prompts:
    tokenizer.augment_vocabulary(missing_prompts)

# Path of the data file for this configuration; the name encodes the config's
# _space, _r, _b, _heads and _N values
data_file = os.path.join(
    tokenizer_directory,
    tokenizer._model_path,
    (tokenizer_name + "-" if tokenizer_name else "")
    + f"data-space_{tokenizer.config._space}-r_{tokenizer.config._r}"
      f"-b_{tokenizer.config._b}-heads_{tokenizer.config._heads}"
      f"-N_{tokenizer.config._N}.json",
)

# Tokenize an example sentence once and show the resulting pieces
example_pieces = tokenizer._tokenize("These casseroles disgust Kayla.")
print(example_pieces)

# Check that every piece is a vocabulary entry (left as the cell's final expression so it is displayed)
[piece in tokenizer._vocabulary for piece in example_pieces]

Vocabulary size for experiment:  6387
0: <pad>
1: <oov>
2: <sod>
3: <eod>
4: <frg>
5: ,
6:  the
7:  you
8:  to
9: 

['The', 'se', ' ca', 'ss', 'er', 'ol', 'es', ' dis', 'gu', 'st', ' K', 'ay', 'la', '.']


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

# Modelling

In [10]:
# Re-run the modeling file so the model classes used below reflect its current contents.
# The file is read inside a context manager so its handle is closed before the code executes.
with open("../saffu/modeling_saffu.py", encoding="utf-8") as _modeling_handle:
    _modeling_source = _modeling_handle.read()
exec(_modeling_source)

# Build the decoder around an encoder that shares the config and tokenizer, then move it to the device
model = SAFFUDecoder(config, SAFFUEncoder(config, tokenizer)).to(device)

# Stage label used in the saved model's file name
stage = "init"

# NOTE: this reassigns the same `reload` name used by the tokenizer cells; here it forces a re-save
reload = False

# Save the freshly initialized model when forced to, or when no state file for it exists yet.
# NOTE(review): assumes save_model writes to the same path that is checked here - confirm.
if reload or (not os.path.exists(f"../../code/models_to_test/{data_set}-{model_size}-{stage}.state")):
    save_model(model, data_set, model_size, stage)

SAFFUDecoder(
  (encoder): SAFFUEncoder(
    (logsoft): LogSoftmax(dim=0)
    (_V): Embedding(25252, 512)
    (BS): ModuleList(
      (0): SAFFULayer(
        (activate): LogSoftmax(dim=0)
        (logsoft): LogSoftmax(dim=0)
        (_W): Linear(in_features=1024, out_features=1024, bias=False)
        (_U): Linear(in_features=512, out_features=256, bias=False)
      )
    )
    (RS): ModuleList(
      (0): SAFFULayer(
        (activate): LogSoftmax(dim=0)
        (logsoft): LogSoftmax(dim=0)
        (_W): Linear(in_features=8, out_features=8, bias=False)
        (_U): Linear(in_features=4096, out_features=256, bias=False)
      )
    )
    (_D): Linear(in_features=512, out_features=128, bias=False)
  )
  (_Uc): Linear(in_features=640, out_features=25252, bias=False)
  (_Ud): Linear(in_features=128, out_features=101009, bias=False)
  (logsoft): LogSoftmax(dim=0)
)