# Setup

## Notebook formatting

## Imports

In [10]:
# Import importlib to reload modules and sys and os to add the path for other imports
import importlib
import sys
import os

# Append the parent directory to the path to import the necessary modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the utilities and the dataloader
from utils import selfutil, saffuutil 
from classes import TestRNN

# Now reload the modules to ensure they are up-to-date
importlib.reload(selfutil)
importlib.reload(saffuutil)
importlib.reload(TestRNN)

# Import the funcs needed from utils
from utils.saffuutil import load_saffutok, dir2convos, get_saffuloader, saffutok_traindata

# Import the SAFFUDataLoader class
from classes.TestRNN import TestRNN

# Other regular imports
import torch.nn as nn
import torch
from tqdm import tqdm_notebook as tqdm
import gc
import os
import pandas as pd
import math
import time

# Tokenizer 

## Imports

In [2]:
# Read and execute saffu files for using functionality
exec(open("../saffu/configuration_saffu.py").read())
exec(open("../saffu/tokenization_saffu.py").read())
exec(open("../saffu/utilities_saffu.py").read())
exec(open("../saffu/data_saffu.py").read())
exec(open("../saffu/modeling_saffu.py").read())
exec(open("../saffu/training_saffu.py").read())
exec(open("../saffu/inference_saffu.py").read())
exec(open("../saffu/tuning_saffu.py").read())
exec(open("../saffu/load_data.py").read())

## Set environment variables
# Creates logger object named __main__ for debug messages
logger = logging.get_logger(__name__)

# Doesn't split memory chunks of more than 256 MB
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

# Makes code synchronous meaning GPU finishes running then CPU rund
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Enable dynamic shape allocation of tensor sizes without predefining them
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Set the gpu or cpu device
devstr = "cuda:1" # "cpu" 
gpu = False if (devstr == 'cpu') else True
device = 'cpu' if (devstr == 'cpu') else (torch.device(devstr if torch.cuda.is_available() else 'cpu') 
                                          if devstr else torch.cuda.current_device())
# Observe the device
print(device)

cuda:1


## Params

In [3]:
# Define the dataset being used, can also combine different ones with a +
data_set = "train_big" # +harmless-base+babylm_10M+babylm_100M+BWB

# Define model size from tiny, micro, small, medium, big
model_size = "tiny"

# Size of different datasets in millions of word tokens
training_sizes = {"helpful-base": 5, "harmless-base": 5, "babylm_10M": 10, "babylm_100M": 100, "BWB": 1000,
                 "train":2.1877, "train_big":5} 

# Define the % of data held out for development so 1/10 of total available below
devsample = 10 

# Total size of all datasets in millions, currently 2.1877 million should be
dataset_size =  sum([training_sizes[data_subset] for data_subset in data_set.split("+")])

# Get downsample size which would be 1 = 1 million below
downsample = max(int(dataset_size / 5), 1) # roughly 5 million word-tokens per split

# Hyperparameter for learning rate probably
eta = 0.05 # 0.05

# Empty lists to store document or conversation level data for normal, dev and test
docs, ddocs, tdocs = [], [], []
convos, dconvos, tconvos = [], [], []

# Get the configuration params for current model medium
config = get_config(model_size = model_size)

# Name the current tokenizer combo of dataset+model names
tokenizer_name = f"{data_set}-{model_size}" # helpful-base-medium

# Create the tokenizer object inherited from HF PreTrainedTokenizer class therefore init params not in custom
tokenizer = SAFFUTokenizer(config)

# Determine the directory where you wanna retreive tokenizer from
tokenizer_directory = "./cache/"

# Determine the directory where you wanna store tokenizer
save_directory = './cache/'

# Form the vocab file with a of directory, model path in tokenization_saffu.py, and name if given
vocab_file = os.path.join(tokenizer_directory, tokenizer._model_path,
                          (tokenizer_name + "-" if tokenizer_name else "") + "vocab.json")

# True if retraining the tokenizer, False to load an existing one available
reload = False

# Now call load func to setup tokenizer
load_saffutok(reload, vocab_file, tokenizer, tokenizer_name, tokenizer_directory, train_dir = '../data/train_big/')


# Name the data_file path
data_file = os.path.join(tokenizer_directory, tokenizer._model_path,
                         (tokenizer_name + "-" if tokenizer_name else "") + 
                         f"data-space_{tokenizer.config._space}-r_{tokenizer.config._r}-b_{tokenizer.config._b}-heads_{tokenizer.config._heads}-N_{tokenizer.config._N}.json")

Loading tokenizer: train_big-tiny



In [4]:
# Print new vocab size for this experiment after BPE
print("Vocabulary size for experiment: ", len(tokenizer._vocabulary))

Vocabulary size for experiment:  7206


In [5]:
print(tokenizer._tokenize("These casseroles disgust Kayla."))
tokenizer._vocabulary

['Th', 'ese', ' ca', 'ss', 'er', 'ol', 'es', ' di', 's', 'gu', 'st', ' K', 'ay', 'la', '.']


{'\n': 360,
 '\n ': 6432,
 ' ': 924,
 ' \t': 7013,
 ' \t3': 6242,
 ' \n': 1963,
 '  ': 207,
 '  \n': 2097,
 '   ': 2032,
 '    ': 2382,
 '     ': 1934,
 '      ': 2387,
 '       ': 2389,
 '        ': 1861,
 '         ': 1910,
 '          ': 2393,
 '           ': 2407,
 '            ': 2365,
 '             ': 2420,
 '              ': 2628,
 '               ': 2658,
 '                ': 2832,
 '                 ': 2903,
 '                  ': 2838,
 '                   ': 3167,
 '                    ': 3295,
 '                     ': 3392,
 '                      ': 3594,
 '                       ': 3508,
 '                        ': 3795,
 '                         ': 3750,
 '                          ': 4197,
 '                           ': 4196,
 '                            ': 4847,
 '                             ': 6342,
 '                              ': 5347,
 '                               ': 5077,
 '                                ': 5038,
 '                                 ': 

# Modeling

## Setup

In [6]:
# Take the tokenizer and create Encoder then use that to create decoder and full model
model = SAFFUDecoder(config, SAFFUEncoder(config, tokenizer)).to(device)

# Define the current stage of the model initial
stage = "init"

# Set to determine whether we are reloading or not
reload = False

# If in reload mode or the path doesnt exist for this dataset-model-stage combo then save_model
if reload or (not os.path.exists(f"./models_to_test/{data_set}-{model_size}-{stage}.state")):
    
    """
    Saves the following information about current dataset-model_size-stage combo of the model
    
    1. state - Weights/params of the model which can be loaded later for resuming training or inference
    2. losses - Training losses over epochs
    3. counts - Frequency of words/subwords important for BPE
    4. vocabulary - Mapping of words to indices to form vocab with keys as words and index as value
    5. raw_td - Merge pairs dictating how subwords were combined to form new tokens, important for BPE
    6. subtoken_reference - Maps text to its subwords, important for mapping output to og format
    7. docsizes - Sizes of docs or number of tokens per doc
    8. reference - Metadata related to training data or model perhaps
    """
    save_model(model, data_set, model_size, stage)

Step by step explanation of model:

**Encoder**
- 1. logsoft (LogSoftmax)
    - a. Converts input tokens to log-probability representation for smoothing, preventing over/underflow, normalizing. 
    - b. Could be that the input tokens are treated as if they already carry certain relationships and LogSoftmax helps the model understand them probabilistically before passing them to Embeding layer _V
- 2. _V (Embedding Layer)
    - a. Is of dim: vocab_size x embed_size and converts each incoming word into embed_size vector
    - b. Frozen during warm start and explicitly initialized to avoid changing during early training

In [7]:
print(f'Full Model:\n{model}')
print(f'\nEncoder Part to Use:\n{model.encoder}')

Full Model:
SAFFUDecoder(
  (encoder): SAFFUEncoder(
    (logsoft): LogSoftmax(dim=0)
    (_V): Embedding(7206, 128)
    (BS): ModuleList(
      (0): SAFFULayer(
        (activate): LogSoftmax(dim=0)
        (logsoft): LogSoftmax(dim=0)
        (_W): Linear(in_features=256, out_features=256, bias=False)
        (_U): Linear(in_features=128, out_features=64, bias=False)
      )
    )
    (RS): ModuleList(
      (0): SAFFULayer(
        (activate): LogSoftmax(dim=0)
        (logsoft): LogSoftmax(dim=0)
        (_W): Linear(in_features=2, out_features=2, bias=False)
        (_U): Linear(in_features=256, out_features=64, bias=False)
      )
    )
    (_W): FFULayer(
      (activate): LogSoftmax(dim=0)
      (_U): Linear(in_features=160, out_features=160, bias=False)
    )
    (_D): FFULayer(
      (activate): LogSoftmax(dim=0)
      (_U): Linear(in_features=128, out_features=32, bias=False)
    )
  )
  (_Uc): FFULayer(
    (activate): LogSoftmax(dim=0)
    (_U): Linear(in_features=160, out

# CHecker to Clear Parsing Up

# Data Setup for SAFFU

## Create Dataloaders

In [8]:
# Define the training directory and get loader
train_dir = '../data/train_big/'
train_loader = get_saffuloader(train_dir, tokenizer)

Processing Files: 100%|█████████████████████████| 10/10 [00:22<00:00,  2.20s/it]


In [17]:
# # Define the validation directory and get loader
# val_dir = '../data/train_small/'
# val_loader = get_saffuloader(val_dir, tokenizer)

In [18]:
# # Code to observe loaders if required
# print(train_loader.file_paths[0])
# print(train_loader.x_tok[0][17,1,:])
# print(train_loader.y_tok[0][17,1,:])

## Generate Docs and Convos/Dconvos

In [12]:
# Get the docs list and observe
docs = saffutok_traindata(train_dir)

# Code to observe docs
print(f'\nDocs length ({len(docs)}), first 50 elems:\n{docs[0]}')

PARA - Generating Docs:   0%|          | 0/1240 [00:00<?, ?it/s]


Docs length (8115697), first 50 elems:



In [14]:
# Put every integer divisible by 5 in dconvos and rest in convos essentially splitting it into 80-20 train-val split
convos = [[doc] for i, doc in tqdm(enumerate(docs), desc="Generating convos") if (i % 5) and doc]
dconvos = [[doc] for i, doc in tqdm(enumerate(docs), desc="Generating dconvos") if (not i % 5) and doc]

# Code to observe convos and dconvos
print(f'\nConvos length ({len(convos)}), first 10 elems:\n{convos[:10]}')
print(f'\nDConvos length ({len(dconvos)}), first 10 elems:\n{dconvos[:10]}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  convos = [[doc] for i, doc in tqdm(enumerate(docs), desc="Generating convos") if (i % 5) and doc]


Generating convos: 0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  dconvos = [[doc] for i, doc in tqdm(enumerate(docs), desc="Generating dconvos") if (not i % 5) and doc]


Generating dconvos: 0it [00:00, ?it/s]


Convos length (1884116), first 10 elems:
[['TRANSWESTERN PIPELINE COMPANY'], ['Rate calculation is based on the spread of two indices less variable charges (fuel/usage) less fixed rate or spread.  PG&E to provide index price calc.'], ['Settlement Based Max Reservation Rates and TCR Surcharges changed eff 11/1/00; GRD rate changed eff 1/1/01.'], ['DEALS ABOVE MAX TARIFF RATE'], ['RLSE'], ['REPLACEMENT'], ['REPLACEMENT'], ['OFFER'], ['REPLACEMENT'], ['REPLACEMENT']]

DConvos length (469624), first 10 elems:
[['CAPACITY RELEASE REPORT'], ['                                                                                                                                                                                                                                                               '], ['CREDIT TO'], ['RE-'], ['Rate'], ['RLSE SHIPPER'], ['DELIVERY POI'], ['VOL'], ['MAX RATE**'], ['ARR']]


# Tuning

## Warm Start

In [15]:
# Set grad to false to freeze embedding layer since we are warm starting
model.encoder._V.weight.requires_grad = False

# Define params
seed = 691; ignore_space = False; ignore_case = False; warm_start = True; verbose = True; 
warm_vecs = 1*(2**0 + 0.99999); identity_ratio = 2**(-1); icf = True 
log_label = False; nlabels = 1*(2**0); centroids = False; label_iterations = 1*(2**10) # None 
epochs = int(np.max([int(downsample/5), 1]))*(2**5)
#epochs = int(np.max([int(downsample/5), 1]))
patience = 2**1

# Decide if new run
reload = False

# Generate warm file
warm_file = "".join([data_file[:-5] + "-", 
                     f"b_{tokenizer.config._bits}-hb_{tokenizer.config._hidden}-",
                     f"we_{int(tokenizer.config._wave_encode)}-oa_{tokenizer.config._o_agg}-ra_{tokenizer.config._r_agg}-ba_{tokenizer.config._b_agg}-",
                     f"mr_{int(tokenizer.config._mask_r)}-mb_{int(tokenizer.config._mask_b)}-md_{tokenizer.config._model_documents}-",
                     f"is_{int(ignore_space)}-ic_{int(ignore_case)}-ws_{int(warm_start)}-wv_{int(warm_vecs)}-ds_{downsample}-seed_{seed}"])

# Create the tuner
tuner = SAFFUTuner(ignore_case, ignore_space, devstr, warm_vecs, identity_ratio = identity_ratio, 
                   label_iterations = label_iterations, log_label = log_label, nlabels = nlabels, centroids = centroids, icf = icf)

# Initiate warm start
tuner.warm_start(model, convos, dconvos, downsample*10, seed, epochs, patience, devsample = devsample, model_file = warm_file, reload = reload, verbose = verbose)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


Warming V-matrix:   0%|          | 0/10 [00:00<?, ?it/s]

Aggregated data from 22255515 token pieces.


RuntimeError: sparse tensors do not have strides

device(type='cuda', index=1)

In [118]:
class TestRNN2(nn.Module):

    # Constructor of the RNN_LM class, initializing the layers and weights
    def __init__(self, saffu_model, dropout_rate=0.0, nonlinearity='relu'): # hidden_state_dim, rnn_layers, embedding_matrix

        # Ensures functions of parent class nn.Module are called in subclass RNN_LM
        super(TestRNN2, self).__init__()

        # Rows of embed matrix = Each word in the vocabulary
        # self.vocab_size = embedding_matrix.shape[0]  # vocab_size = 34057
        self.vocab_size = len(saffu_model.encoder.tokenizer._vocabulary)

        # Cols of embed matrix = Length of each embedding vector
        # self.embedding_dim = embedding_matrix.shape[1]  # embed_dim = 50
        self.embedding_dim = saffu_model.encoder.config._bits

        # The dimension of the hidden state vector 'h' for each step/token
        # self.hidden_dim = hidden_state_dim  # hid_dim = 100
        self.hidden_dim = saffu_model.encoder.config._encoder_dim

        # Number of recurrent layers we will use
        # self.rnn_layers = rnn_layers  # rnn_layers = 2
        self.saffu_model = saffu_model

        # Creates an embedding layer from the pre-trained embedding matrix that maps input tokens to their corresponding word vectors
        # If freezing then embeddings don't change during training, we need False because we need them to finetune to our task
        # self._embed = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Randomly zeroes out a percentage of input units determined by dropout_rate for each update during training
        self._drop = nn.Dropout(dropout_rate)

        # RNN layer with 'relu' nonlinearity but not managing exploding gradients, dropout and multiple recurrent layers
        # self._rnn = nn.RNN(
        #     self.embedding_dim,
        #     self.hidden_dim,
        #     self.rnn_layers,
        #     nonlinearity=nonlinearity,
        #     dropout=dropout_rate
        # )

        # Linear layer to map the concatenated hidden states to logits (1 to predict bold or not)
        self._pred = nn.Linear(2 * self.hidden_dim, 1)

    def cell_hs(self, x):

        # Set the manual seed for reproducibility
        torch.manual_seed(0)

        # Initialize H_local as a zero tensor with the appropriate shape (num_cells, hidden_dim)
        # H_local = torch.zeros(x.shape[1] * x.shape[2], self.hidden_dim, device=x.device) # cells x hidden_dim
        H_local = torch.zeros(x.shape[0], x.shape[1] * x.shape[2], self.hidden_dim, device=x.device) # batch x cells x hidden_dim
        
#         # DEBUG PRINT
#         print(f'Input x: {x.shape}')
#         print(f'\nH_local before {H_local.shape}:\n{H_local}')

        # Iterate over each cell
        for cell in tqdm(range(x.shape[1] * x.shape[2]),desc="Getting Cells"):
            
            # Get the current row and col
            row = cell // x.shape[2]
            col = cell % x.shape[2]
            
            # Extract cell tokens across batches for current cell
            celltoks_across_batch = x[:, row, col, :] # batch_size x tokens equivalent to 'blocks', except it is missing leading pad tokens

            # Get tokens in embedding dim and apply dropout
            # embedded_toks = self._drop(self._embed(celltoks_across_batch)) # batch_size x tokens x embed_dim

            # Now run RNN on dropout
            # _, h = self._rnn(embedded_toks)
            _, h, _ = self.saffu_model.encoder(self.saffu_model.encoder.inflate(celltoks_across_batch), celltoks_across_batch, False, 
                                               range(celltoks_across_batch.shape[0]), range(celltoks_across_batch.shape[1] - (self.saffu_model.encoder.config._r + 1))).to_tuple() # (batch_size * tokens) x hidden
            h = torch.reshape(h, (celltoks_across_batch.shape[0], celltoks_across_batch.shape[1] - (self.saffu_model.encoder.config._r + 1), -1)) # batch_size x tokens x hidden
            
            # Store hidden state from last rnn layer for last token in H_local tensor
            # H_local[cell] = h[-1, -1, :]
            H_local[:, cell, :] = h[:,-1,:]
            
#             # DEBUG PRINT
#             if cell == 0:
#                 print(f'\nInside Cell {cell}\nRow {row}, Col {col}')
#                 print(f'\nCell Across {celltoks_across_batch.shape}:\n{celltoks_across_batch}')
#                 print(f'\nCell Embedded Toks {embedded_toks.shape}:\n{embedded_toks}')
#                 print(f'\nRNN H {h.shape}:\n{h}')
#                 print(f'\nLast RNN Layer Last Token HS {H_local[cell].shape}:\n{H_local[cell]}')

            # Delete intermediate tensors to free up memory
            del celltoks_across_batch
            # del embedded_toks
            del h


        # Now get the sum of all the HS in size cells x hidden_dim and subtract individual HS
        # ans = H_local.sum(dim=0, keepdim=True) - H_local # cells x hidden_dim
        ans = H_local.sum(dim=1, keepdim=True) - H_local # batch x cells x hidden_dim
        
#         # DEBUG PRINT
#         print(f'\nFinal H_local {H_local.shape}:\n{H_local}')
#         print(f'\nFinal Returned Tensor {ans.shape}:\n{ans}')

        # Delete the H_local 
        del H_local
        
        # Calculate the global sum and return the adjusted tensor
        return ans
    
    # Forward function
    def forward(self, x):
        
        # Set the manual seed
        torch.manual_seed(0)

        # Global hidden states containing info around current cell already on gpu
        H_global = self.cell_hs(x) # cells x hidden_dim

        # Tensor to store the full macro cube of size batch x rows x cols
        S_cube = torch.zeros((x.shape[0], x.shape[1], x.shape[2]), device=x.device)
        
#         # DEBUG PRINT
#         print(f'\nInput x {x.shape}')
#         print(f'\nInitial H_global {H_global.shape}:\n{H_global}')
#         print(f'\nInitial S_cube {S_cube.shape}:\n{S_cube}')

        # Loop through all rows x cols cells
        for cell in range(x.shape[1] * x.shape[2]):
            
            # Get the current row and col
            row = cell // x.shape[2]
            col = cell % x.shape[2]
            
            # Extract cell tokens across batches for current cell
            celltoks_across_batch = x[:, row, col, :] # batch_size x tokens
            
            # Get tokens in embedding dim and apply dropout
            # embedded_toks = self._drop(self._embed(celltoks_across_batch)) # batch_size x tokens x embed_dim

            # Now run RNN on embedded toks
            # z, _ = self._rnn(embedded_toks) # batch_size x tokens x hidden_dim
            _, h, _ = self.saffu_model.encoder(self.saffu_model.encoder.inflate(celltoks_across_batch), celltoks_across_batch, False, 
                                               range(celltoks_across_batch.shape[0]), range(celltoks_across_batch.shape[1] - (self.saffu_model.encoder.config._r + 1))).to_tuple() # (batch_size * tokens) x hidden
            h = torch.reshape(h, (celltoks_across_batch.shape[0], celltoks_across_batch.shape[1] - (self.saffu_model.encoder.config._r + 1), -1)) # batch_size x tokens x hidden
            
            # Get z for last token across all batches and hidden dim
            # z_lasttok = z[:, -1, :] # batch_size x hidden_dim
            
            # Extract H_global for current cell and introduce first dimension, then expand first dim to batch_size
            # H_cell = H_global[cell].unsqueeze(0).expand(x.shape[0], -1) # batch_size x hidden_dim

            # Concatenate global/local context of cell along first dim batch_size then apply dropout
            # concat_hs = self._drop(torch.cat((z_lasttok, H_cell), dim = 1)) # batch_size x (2 * hidden_dim)

            concat_hs = self._drop(torch.cat((h[:, -1, :], H_global[:, cell, :]), dim = 2)) # batch_size x (2 * hidden_dim)
            
            # Make preds using this HS and adjust to be batch_size, set to current location in S_cube
            S_cube[:, row, col] = self._pred(concat_hs).view(-1) # batch_size
            
#             # DEBUG PRINT
#             if cell == 0:
#                 print(f'\nInside Cell {cell}\nRow {row}, Col {col}')
#                 print(f'\nCell Across {celltoks_across_batch.shape}:\n{celltoks_across_batch}')
#                 print(f'\nCell Embedded Toks {embedded_toks.shape}:\n{embedded_toks}')
#                 print(f'\nRNN Z {z.shape}:\n{z}')
#                 print(f'\nRNN Z Last Token {z_lasttok.shape}:\n{z_lasttok}')
#                 print(f'\nH_cell global HS for cell {H_cell.shape}:\n{H_cell}')
#                 print(f'\nConcatenated HS {concat_hs.shape}:\n{concat_hs}')
#                 print(f'\nPredictions {S_cube[:, row, col].shape}:\n{S_cube[:, row, col]}')


            # Delete intermediate tensors to free up memory
            del celltoks_across_batch
            # del embedded_toks
            # del z
            # del z_lasttok
            # del H_cell
            del concat_hs
        
        
#         # DEBUG PRINT
#         print(f'\nFinal S_cube {S_cube.shape}:\n{S_cube}')
        
        # Delete H_global finally
        del H_global
        
        # Return the final S_cube
        return S_cube
        
                


# Trying New Model with SAFFU

In [119]:
rnn_model2 = TestRNN2(model).to(device)
rnn_model2

TestRNN2(
  (saffu_model): SAFFUDecoder(
    (encoder): SAFFUEncoder(
      (logsoft): LogSoftmax(dim=0)
      (_V): Embedding(6130, 128)
      (BS): ModuleList(
        (0): SAFFULayer(
          (activate): LogSoftmax(dim=0)
          (logsoft): LogSoftmax(dim=0)
          (_W): Linear(in_features=256, out_features=256, bias=False)
          (_U): Linear(in_features=128, out_features=64, bias=False)
        )
      )
      (RS): ModuleList(
        (0): SAFFULayer(
          (activate): LogSoftmax(dim=0)
          (logsoft): LogSoftmax(dim=0)
          (_W): Linear(in_features=2, out_features=2, bias=False)
          (_U): Linear(in_features=256, out_features=64, bias=False)
        )
      )
      (_W): FFULayer(
        (activate): LogSoftmax(dim=0)
        (_U): Linear(in_features=160, out_features=160, bias=False)
      )
      (_D): FFULayer(
        (activate): LogSoftmax(dim=0)
        (_U): Linear(in_features=128, out_features=32, bias=False)
      )
    )
    (_Uc): FFULayer

In [120]:


# CUDA Vars to avoid randomization
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Create a DataLoader from your check_loader
# test_loader = torch.utils.data.DataLoader(val_loader, batch_size=2, shuffle=False)
test_loader = torch.utils.data.DataLoader(train_loader, batch_size=2, shuffle=False)

# Get one batch from the DataLoader
batch = next(iter(test_loader))

# Extract single x_tok example from batch
ex_input = batch['x_tok'][0]

# Extract single cell's tokens
cell_tokens = ex_input[0,0,:]
#cell_tokens = cell_tokens.unsqueeze(0)

# Observe shape and tensor
print(f'Single x_tok tensor from val_loader {ex_input.shape}')
print(f'\nTokens for first cell shaped {cell_tokens.shape}:\n{cell_tokens}')

# # Pass the single tensor through SAFFU model
# one_pass = model.forward(cell_tokens, cell_tokens.tolist())

Single x_tok tensor from val_loader torch.Size([100, 100, 35])

Tokens for first cell shaped torch.Size([35]):
tensor([0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [121]:
print(batch['x_tok'].shape, device)

torch.Size([2, 100, 100, 35]) cuda:1


In [122]:
rnn_model2(batch['x_tok'].to(device))

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 1 has a total capacity of 11.77 GiB of which 12.25 MiB is free. Process 1307884 has 1.54 GiB memory in use. Including non-PyTorch memory, this process has 10.21 GiB memory in use. Of the allocated memory 9.82 GiB is allocated by PyTorch, and 24.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [18]:
# Checking input with single cell tokens in encoder
output = model.encoder(cell_tokens)
output

TypeError: forward() missing 4 required positional arguments: 'blocks', 'is_inference', 'bis', and 'ms'

# Tuning

## Setup params

In [11]:
# Total params in the model so the number of elements for each param name all together
total_params = 0

# Total learnable params
total_learnable = 0

# Iterate through each macro param in model's params like Psi_b, _V.weight, RS.0._U.weight etc.   
for name, param in model.named_parameters():
    
    # Current param count is product of tensor dims (log-exp avoids overflow -> one less than actual)
    curr_params = int(np.exp(sum(np.log(param.shape))))
    
    # Add curr_params to learnable params if curr_param requires grad
    total_learnable += curr_params if param.requires_grad else 0
    
    # Add curr_params to total_params
    total_params += curr_params

# Print the ratio of learnable to all params
print(f"Total numbers of learnable/all parameters: {total_learnable}/{total_params}")

# Set grad to false to freeze embedding layer since we are warm starting
model.encoder._V.weight.requires_grad = False

# Set seed for reproducibility
seed = 691

# Define whether warm starting or not
warm_start = True

# Determine if verbose output needed when running tuner.warm_start()
verbose = True

# Adds spaces with tokens if set to False else removes them if True
ignore_space = False

# Case sensitive if False and lowercases everything if True
ignore_case = False

# Used for initializating _V (embeddings) matrix during the warm start. Higher = Richer init repr for each token
warm_vecs = 1*(2**0 + 0.99999) # = 2

# Do-nothing transform, sets part embeddings to identity stabilizing over-aggressive/random init states
identity_ratio = 2**(-1) # = 0.5

# Inverse Context Freq: co-occurrence counts reweight words based on rarity/importance
# Highlights contextually significant words leading to better differentiation if set to True
icf = True 

# Indicates if token labels are log/linear. Log reduces impact of high freq elements, linear simpler 
log_label = False

# Number of distinct clusters to assign to tokens during training
nlabels = 1*(2**0) # = 1 = No subdivision cluster of toks

# Should model consider centroids during clustering/quantization process
centroids = False

# Iterations to refine label assignments in training. Higher = Reclusturing more for each token getting more accurate groups
label_iterations = 1*(2**10) # = 1024

# Determine epochs and scale by factor of 1024 to account for smaller datasets
epochs = int(np.max([int(downsample/5), 1]))*(2**10)

# Determine if reloading or new
reload = False

# Determine patience to of observing no loss reduction
patience = 2**1

# Finally name the file used for warm start
warm_file = "".join([data_file[:-5] + "-", 
                     f"b_{tokenizer.config._bits}-hb_{tokenizer.config._hidden}-",
                     f"we_{int(tokenizer.config._wave_encode)}-oa_{tokenizer.config._o_agg}-ra_{tokenizer.config._r_agg}-ba_{tokenizer.config._b_agg}-",
                     f"mr_{int(tokenizer.config._mask_r)}-mb_{int(tokenizer.config._mask_b)}-md_{tokenizer.config._model_documents}-",
                     f"is_{int(ignore_space)}-ic_{int(ignore_case)}-ws_{int(warm_start)}-wv_{int(warm_vecs)}-ds_{downsample}-seed_{seed}"])

# Define traning and dev directories
train_dir = '../data/train/'; dev_dir = '../data/train_small'

Total numbers of learnable/all parameters: 2670335/2703611


In [12]:
# Update the convos lists required for tuner
convos = dir2convos(train_dir)
dconvos = dir2convos(dev_dir)

Getting convos list:  26%|█████               | 159/623 [00:05<00:13, 33.95it/s]

ERROR ../data/train/randall_gay_000_1_1.pst.21.xls: Error tokenizing data. C error: Expected 1 fields in line 39, saw 3



Getting convos list: 100%|████████████████████| 623/623 [00:25<00:00, 24.63it/s]
Getting convos list: 100%|██████████████████████| 10/10 [00:01<00:00,  7.68it/s]


In [13]:
convos[0]

[['Cell', ''],
 ['Cell', 'TRANSWESTERN PIPELINE COMPANY'],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell',
  'Rate calculation is based on the spread of two indices less variable charges (fuel/usage) less fixed rate or spread.  PG&E to provide index price calc.'],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell', ''],
 ['Cell'

## Warm Start

In [11]:
# Define the tuner
tuner = SAFFUTuner(ignore_case, ignore_space, devstr, warm_vecs, identity_ratio = identity_ratio,
                   label_iterations = label_iterations, log_label = log_label, nlabels = nlabels,
                   centroids = centroids, icf = icf)

# Warm start with params
tuner.warm_start(model, convos, dconvos, downsample*10, seed, epochs, 
                 patience, devsample = devsample, model_file = warm_file,
                 reload = reload, verbose = verbose)

In [None]:
# total_params, total_learnable = 0, 0
# for name, param in model.named_parameters():
#     total_params += int(np.exp(sum(np.log(param.shape))))
#     if param.requires_grad:
#         total_learnable += int(np.exp(sum(np.log(param.shape)))) # param.shape[0]*param.shape[1]
#         # print(name, param.shape[0]*param.shape[1])

# print(f"Total numbers of learnable/all parameters: {total_learnable}/{total_params}")

# model.encoder._V.weight.requires_grad = False

# seed = 691; ignore_space = False; ignore_case = False; warm_start = True; verbose = True; 
# warm_vecs = 1*(2**0 + 0.99999); identity_ratio = 2**(-1); icf = True 
# log_label = False; nlabels = 1*(2**0); centroids = False; label_iterations = 1*(2**10) # None 
# epochs = int(np.max([int(downsample/5), 1]))*(2**5)
# patience = 2**1
# reload = False
# warm_file = "".join([data_file[:-5] + "-", 
#                      f"b_{tokenizer.config._bits}-hb_{tokenizer.config._hidden}-",
#                      f"we_{int(tokenizer.config._wave_encode)}-oa_{tokenizer.config._o_agg}-ra_{tokenizer.config._r_agg}-ba_{tokenizer.config._b_agg}-",
#                      f"mr_{int(tokenizer.config._mask_r)}-mb_{int(tokenizer.config._mask_b)}-md_{tokenizer.config._model_documents}-",
#                      f"is_{int(ignore_space)}-ic_{int(ignore_case)}-ws_{int(warm_start)}-wv_{int(warm_vecs)}-ds_{downsample}-seed_{seed}"])
# tuner = SAFFUTuner(ignore_case, ignore_space, devstr, warm_vecs, identity_ratio = identity_ratio, 
#                    label_iterations = label_iterations, log_label = log_label, nlabels = nlabels, centroids = centroids, icf = icf)
# tuner.warm_start(model, convos, dconvos, downsample*10, seed, epochs, patience, devsample = devsample, model_file = warm_file, reload = reload, verbose = verbose