# Preamble

## Variables

In [0]:
ON_COLAB = True

In [0]:
if ON_COLAB:
    project_dir = 'drive/My Drive/Colab Notebooks/TextGen'
    local_root = ''
else:
    project_dir = '..'
    local_root = '..'

## Setup and Imports

In [0]:
# Load the "autoreload" extension so that code can change
%load_ext autoreload

# Always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import os
import sys

In [0]:
if ON_COLAB:
    %tensorflow_version 2.x

In [6]:
if ON_COLAB:# Allow access to src functions from google drive
    from google.colab import drive
    drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
if ON_COLAB:
    # Install extra requirements (src, socialml)

    # src files (added via path, colab doesn't seem to like installing it via pip)
    src_path = os.path.join(project_dir)
    sys.path.append(src_path)

In [0]:
from src.models import lstm_seq
from src.data_transform import preprocessing as pp
from src.utils import colab_utils
import sys
import os
import logging
from src.utils.specs import load_spec
import json
import tensorflow as tf
import shutil
import matplotlib.pyplot as plt
import time

logging.basicConfig(level=logging.INFO)
logging.basicConfig(format='%(process)d-%(levelname)s-%(message)s')
logger = logging.getLogger()

# Training

## Make sure the data is in the right place

In [0]:
def create_paths(project_dir, experiment):

    # Setup file paths relative to project directory
    drive_data_loc = os.path.join(project_dir, 'data/processed')
    spec_path = os.path.join(project_dir, 'specs', experiment+'.yaml')
    log_dir = os.path.join(project_dir, 'models/logs', experiment)
    checkpoint_dir = os.path.join(project_dir, 'models/checkpoints', experiment)
    train_script_path = os.path.join(project_dir, 'src/models/train_model.py')

    # If on colab copy contents of data/processed from gdrive to local machine (reading directly from gdrive is slow)
    force_copy = False
    if ON_COLAB:
        if not os.path.exists('data/processed') or force_copy:
            colab_utils.copy_to_local(drive_data_loc, 'data/processed')

    # Data files relative to processed data directory
    data_path = os.path.join(local_root, 'data/processed')

    return data_path, log_dir, checkpoint_dir, spec_path

## Training functions

In [0]:
def _get_callbacks(logdir, checkpoint_dir):
    _clean_and_create_dir(checkpoint_dir)
    _clean_and_create_dir(logdir)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=0)

    # Checkpoint callbacks
    checkpoint = os.path.join(checkpoint_dir, "model-weights-epoch-{epoch:02d}-loss-{loss:.4f}.tf")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint,
        save_weights_only=True,
        monitor='loss',
        save_best_only=True,
        mode='min',
    )

    return [tensorboard_callback, checkpoint_callback]


def clean_and_create_dir(dir_path):
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    os.mkdir(dir_path)


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

## LSTM Sequence Model

### Load Data

In [11]:
# Load specs from yaml file
experiment = 'exp00'
data_path, log_dir, checkpoint_dir, spec_path = create_paths(project_dir, experiment)

specs = load_spec(spec_path)
tf_dataset_params = specs['tf_dataset_params']
model_params = specs['model_params']
training_params = specs['training_params']
print(json.dumps(specs))

# Load data from text
dataset = pp.load_from_txt(os.path.join(data_path, 'social/train/social.txt'))

# Preprocess the data
logger.info(f'Preprocessing dataset')
dataset, integer_encoder = pp.preprocess_data(dataset, **specs['tf_dataset_params'])

# Vocab list required when building the model
vocab_list = integer_encoder.tokens

clean_and_create_dir(checkpoint_dir)

# Save vocab as of training so we have a copy incase dataset changes
with open(os.path.join(checkpoint_dir, 'vocab.json'), 'w') as f:
    json.dump(vocab_list, f)

{"model_params": {"depth": 2, "n_units": 128}, "tf_dataset_params": {"batch_size": 32, "buffer_size": 10000, "reverse_context": 1, "vocab_size": 20000, "x_max_length": 50, "y_max_length": 50}, "training_params": {"epochs": 10}}


INFO:root:Preprocessing dataset


### Learning Rate and Optimizer

In [0]:
learning_rate = CustomSchedule(1000)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [0]:
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

### Define Loss Functions

In [0]:
EPOCHS = training_params['epochs']
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

### Checkpoints and callbacks

In [0]:
ckpt = tf.train.Checkpoint(model=model,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

### Training loop

In [0]:
# Build The model
model = lstm_seq.LSTMSeqModel(vocab_list, **model_params)


train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
    tf.TensorSpec(shape=(None, None), dtype=tf.int32),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar_inp, tar_real):
  
    with tf.GradientTape() as tape:
        predictions, _ = model(inp, tar_inp)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    gradients = [(tf.clip_by_value(grad, -0.25, 0.25))
                                  for grad in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  
    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [0]:
def print_output(msg):
    print('\r',msg, end='')

In [0]:
for epoch in range(EPOCHS):
    start = time.time()
  
    train_loss.reset_states()
    train_accuracy.reset_states()
  
    for (batch, ((inp, tar_inp), tar_real)) in enumerate(dataset):
        train_step(inp, tar_inp, tar_real)
    
        if batch % 3 == 0:
            output = f'Epoch {epoch+1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}'
            print_output(output)

        
    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print (f'\nSaving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
    
    print(f'Epoch {epoch+1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Time taken for 1 epoch: {time.time() - start}')

 Epoch 1 Batch 1677 Loss 9.8158 Accuracy 0.0182
Saving checkpoint for epoch 1 at drive/My Drive/Colab Notebooks/TextGen/models/checkpoints/exp00/ckpt-1
Epoch 1 Loss 9.8158 Accuracy 0.0182
Time taken for 1 epoch: 1341.0481584072113
 Epoch 2 Batch 1677 Loss 9.3558 Accuracy 0.0183
Saving checkpoint for epoch 2 at drive/My Drive/Colab Notebooks/TextGen/models/checkpoints/exp00/ckpt-2
Epoch 2 Loss 9.3558 Accuracy 0.0183
Time taken for 1 epoch: 1285.6143164634705
 Epoch 3 Batch 1677 Loss 8.5531 Accuracy 0.0182
Saving checkpoint for epoch 3 at drive/My Drive/Colab Notebooks/TextGen/models/checkpoints/exp00/ckpt-3
Epoch 3 Loss 8.5531 Accuracy 0.0182
Time taken for 1 epoch: 1299.303772687912
 Epoch 4 Batch 1677 Loss 7.8056 Accuracy 0.0181
Saving checkpoint for epoch 4 at drive/My Drive/Colab Notebooks/TextGen/models/checkpoints/exp00/ckpt-4
Epoch 4 Loss 7.8056 Accuracy 0.0181
Time taken for 1 epoch: 1295.3534667491913
 Epoch 5 Batch 1677 Loss 7.2746 Accuracy 0.0181
Saving checkpoint for epoch 5