## Function Definitions

In [10]:
from pathlib import Path
import sys
sys.path.insert(1, str(Path.cwd().parent))
from tensor_hero.model import Transformer, ColabLazyDataset
import torch
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import os
import json
import pickle

def __load_model(model_directory):
    '''
    Loads the param dict stored in model_directory. Useful for continuing training.
    Helper function for initialize_params()
    
    ~~~~ ARGUMENTS ~~~~
    - model_directory (Path): Folder containing model weights and params
        - probably ./model/saved_models/<model name>
        - must contain a file named params.pkl
    
    ~~~~ RAISES ~~~~
    - FileNotFoundError: If model_directory / 'params.pkl' does not exist.
    
    ~~~~ RETURNS ~~~~
    - dict: params loaded from model directory
    '''
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at params.pkl files that were written by a trusted training run.
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(model_directory / 'params.pkl', 'rb') as f:
        params = pickle.load(f)

    return params

def initialize_params(params):
    '''
    Takes the original params and modifies them to match the current training objective.
        - Will load params from model directory if desired
        - Initializes new params for training a new model
        - Asks for user input regarding descriptions of new models

    Every model lives in <cwd>/model/saved_models/<model name>. The params are pickled to
    params.pkl inside that folder, which is the same location they are loaded from when
    continuing training.

    ~~~~ ARGUMENTS ~~~~
    - params (dict): Dictionary containing relevant model information. Must contain 'model_name'.
        - When a new model is initialized this dict is modified in place
        - When a pretrained model is loaded it is replaced by the dict stored in params.pkl

    ~~~~ RAISES ~~~~
    - SystemExit: If the user answers 'n' when asked whether the parameters are correct.

    ~~~~ RETURNS ~~~~
    - dict: model and training parameters, with 'LOAD', 'model_file_name', 'model_outfile'
      and 'experiment_description' filled in
    '''
    # Single source of truth for where models are stored, so that the directory that is
    # checked, created, saved to and loaded from is always the same one.
    saved_models_dir = Path.cwd() / 'model' / 'saved_models'

    # Ask user whether they will load a pretrained model to continue training or initialize a new model
    load_prompt = 'Load from pretrained model? (y/n): '
    response = str(input(load_prompt)).lower()
    while response not in ['y', 'n']:
        response = str(input('invalid input\n' + load_prompt)).lower()

    if response == 'y':  # If loading from pretrained model, load its params from its directory
        response = str(input('Enter name of model to load: '))  # Name of the directory under ./model/saved_models
        # An empty name would resolve to saved_models_dir itself, so reject it explicitly
        while not response.strip() or not os.path.isdir(saved_models_dir / response):
            print('Error: {} is not a valid directory'.format(response))
            response = str(input('Enter name of model to load: '))
        params = __load_model(saved_models_dir / response)
        params['LOAD'] = True

    else:  # If initializing new model, make sure its name does not collide with an existing one
        while os.path.isdir(saved_models_dir / params['model_name']):
            new_name = input('Directory already exists.\nEnter new model name: ')
            params['model_name'] = str(new_name)
        params['LOAD'] = False

    model_directory = saved_models_dir / params['model_name']
    params['model_file_name'] = params['model_name'] + '.pt'  # Holds weights of model
    params['model_outfile'] = str(model_directory / params['model_file_name'])  # Full path to the weights file

    # Validate parameters
    # default=str lets non-JSON values (e.g. pathlib.Path entries) be displayed instead of raising TypeError
    print(json.dumps(params, indent=4, default=str))
    confirm_prompt = 'Are the parameters correct (y/n)?: '
    response = str(input(confirm_prompt)).lower()
    while response not in ['y', 'n']:
        response = str(input('invalid input\n' + confirm_prompt)).lower()
    if response == 'n':
        raise SystemExit(0)

    # Gather description of experiment from user
    if 'experiment_description' not in params:
        params['experiment_description'] = input('Enter experiment description: ')

    # Create the model directory only after the user has confirmed, so an aborted run
    # does not leave an empty folder that blocks the same model name next time.
    os.makedirs(model_directory, exist_ok=True)

    # Save parameters (the with-statement closes the file)
    with open(model_directory / 'params.pkl', 'wb') as f:
        pickle.dump(params, f)

    print('parameters saved\n')

    return params

def initialize_model(params, device):
    '''
    Builds a Transformer (imported from tensor_hero.model) from the hyperparameters in params
    and moves it onto device.
    
    ~~~~ ARGUMENTS ~~~~
    - params (dict): Model and training parameters. Should be the output from initialize_params()
        - keys read: 'embedding_size', 'trg_vocab_size', 'num_heads', 'num_encoder_layers',
          'num_decoder_layers', 'forward_expansion', 'dropout', 'max_src_len'
    - device: Target device. Passed both to the Transformer constructor and to .to()
    
    ~~~~ RETURNS ~~~~
        PyTorch model: Transformer model initialized with params and sent to device
    '''
    embed_dim = params['embedding_size']

    transformer_kwargs = {
        'embedding_size': embed_dim,
        'trg_vocab_size': params['trg_vocab_size'],
        'num_heads': params['num_heads'],
        'num_encoder_layers': params['num_encoder_layers'],
        'num_decoder_layers': params['num_decoder_layers'],
        # params['forward_expansion'] is a multiplier on the embedding size
        'forward_expansion': embed_dim * params['forward_expansion'],
        'dropout': params['dropout'],
        # The maximum length handed to the model is the source length
        'max_len': params['max_src_len'],
        'device': device,
    }

    network = Transformer(**transformer_kwargs)
    return network.to(device)


## Model and Training Initialization

In [11]:
# Experiment configuration. Entries tagged CHANGEME have to be edited for every experiment.
params = dict(
    training_data='train separated',       # CHANGEME
    model_name='loss_function_test2',      # CHANGEME
    optimizer='Adam',                      # CHANGEME (informational only: the optimizer below is created by hand)
    train_path=Path.cwd().parent / 'Training_Data' / 'training_ready' / 'train',

    # Training schedule and DataLoader settings
    num_epochs=500,
    batch_size=12,
    shuffle=True,
    num_workers=4,
    drop_last=True,
    last_global_step=0,

    # Sequence and vocabulary sizes
    # NOTE: keep max_trg_len <= max_src_len, otherwise an assertion error is triggered
    # (presumably a CUDA device-side assert -- TODO confirm)
    max_trg_len=500,
    max_src_len=500,
    trg_vocab_size=435,
    pad_idx=434,
    embedding_size=512,

    # Optimizer and architecture hyperparameters
    lr=1e-4,
    num_heads=8,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dropout=0.1,
    forward_expansion=4,

    date=datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
)

# ---------------------------------------------------------------------------- #
#                                  DATALOADER                                  #
# ---------------------------------------------------------------------------- #

# The DataLoader options are the subset of params that share its keyword names
dl_params = {
    option: params[option]
    for option in ('batch_size', 'shuffle', 'num_workers', 'drop_last')
}

# Dataset and loader for the training split
train_data = ColabLazyDataset(
    Path(params['train_path']),
    params['max_src_len'],
    params['max_trg_len'],
    params['pad_idx'],
)
train_loader = torch.utils.data.DataLoader(train_data, **dl_params)

# ---------------------------------------------------------------------------- #
#                              TRAINING PARAMETERS                             #
# ---------------------------------------------------------------------------- #

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Training hyperparameters
learning_rate, num_epochs = params['lr'], params['num_epochs']

model = initialize_model(params, device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Multi-class loss; ignore_index is set to the padding index so padded target
# positions do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=params['pad_idx'])

100%|██████████| 2008/2008 [00:00<00:00, 6456.07it/s]


0 datapoints removed due to exceeding maximum length


## Training Loop

Let's run a single batch through this loop to inspect the shapes of the model's inputs and outputs, and of the tensors passed to the loss function.

In [23]:
model.train()  # Switch the model to training mode before running the optimisation step
for batch_idx, batch in enumerate(train_loader):
    # Each batch is a tuple, as defined by the return statement of __getitem__ in the Dataset
    spec = batch[0].to(device)
    notes = batch[1].to(device)
    
    print('MODEL INPUTS  (the notes are input for training purposes)\n~~~~~~~~~~~~~~~~')
    print(f'spec shape: {spec.shape}')
    print(f'notes shape: {notes.shape}')

    # Forward pass: the decoder is fed everything except the last element, which it has to predict
    decoder_input = notes[..., :-1]
    output = model(spec, decoder_input)

    print('\nMODEL OUTPUT\n~~~~~~~~~~~~~~~')
    print(f'output shape: {output.shape}')
    # The targets are the notes shifted by one position relative to the decoder input
    notes = notes[..., 1:]
    print(f'notes shape: {notes.shape}')
    optimizer.zero_grad()                           # Clear gradients left over from a previous step

    # The loss is given the class dimension at index 1, hence the permute
    candidate = output.permute(0, 2, 1)
    loss = criterion(candidate, notes)              # Candidate scores vs ground truth
    
    print('\nLOSS FUNCTION INPUT SHAPE\n~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(f'Ground Truth Notes shape: {notes.shape}')
    print(f'Candidate Notes shape: {candidate.shape}')
    print('\nIt looks like the ground truth is input as notes only and the candidate is a series of probabilities.')
    print('In order to format the candidate like the ground truth, we need to take the argmax over the 2nd dimension')
    predicted = torch.argmax(candidate, dim=1)      # Most likely class index per position
    print(f'Here is the shape when argmaxed over the second dimension: torch.argmax(output.permute(0,2,1),dim=1).shape = {predicted.shape}')

    print('\nZooming in on the loss function inputs:')
    print('Let\'s ignore the batch dimension and print just a single ground truth and candidate:')
    print('~~~~~~~~~~~~~~~~~~~~~~~')
    print(f'Single Ground Truth Notes: {notes[0]}')
    print(f'\nSingle Candidate Notes: {candidate[0]}')
    print(f'\nArgmax of Single Candidate Notes: {predicted[0]}')
    print('\nWow this model is stupid right now, makes sense it hasn\'nt been trained at all yet.')
    
    print(f'\nThis is output by loss(): {loss}')
    
    # Backpropagate: compute the gradient of the loss with respect to the model parameters
    loss.backward()

    # Clip the gradient norm to avoid the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()    # Update model parameters
    params['last_global_step'] += 1
    break               # Only one batch is needed to inspect the shapes


MODEL INPUTS  (the notes are input for training purposes)
~~~~~~~~~~~~~~~~
spec shape: torch.Size([12, 512, 500])
notes shape: torch.Size([12, 500])

MODEL OUTPUT
~~~~~~~~~~~~~~~
output shape: torch.Size([12, 499, 435])
notes shape: torch.Size([12, 499])

LOSS FUNCTION INPUT SHAPE
~~~~~~~~~~~~~~~~~~~~~~~~~
Ground Truth Notes shape: torch.Size([12, 499])
Candidate Notes shape: torch.Size([12, 435, 499])

It looks like the ground truth is input as notes only and the candidate is a series of probabilities.
In order to format the candidate like the ground truth, we need to take the argmax over the 2nd dimension
Here is the shape when argmaxed over the second dimension: torch.argmax(output.permute(0,2,1),dim=1).shape = torch.Size([12, 499])

Zooming in on the loss function inputs:
Let's ignore the batch dimension and print just a single ground truth and candidate:
~~~~~~~~~~~~~~~~~~~~~~~
Single Ground Truth Notes: tensor([ 35,   1,  39,   2,  43,   1,  46,   0,  50,   4,  54,   3,  58,   2,