In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import logging
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# read-in cleaned data and parse to corpus and tokenizer
# path = '../data/'
path = '/content/drive/MyDrive/Colab Notebooks/'
filepath = path + 'allphishsets.csv'

df = pd.read_csv(filepath)
df = df.sort_values(by=['showdate', 'set', 'position'],
                    ascending=[True, True, True])

df.loc[df['times_played'] <= 2, 'slug'] = 'wildcard'
df.loc[df['times_played'] <= 2, 'times_played'] = 510

songstring = df[['showdate', 'set', 'slug']].groupby(['showdate', 'set'])['slug']\
                                            .apply(lambda x: '|'.join(x)).reset_index()

songstring['full'] = songstring.apply(lambda row: f"set-{row['set']}|{row['slug']}", axis=1)

songstring = songstring[['showdate', 'full']].groupby(['showdate'])['full']\
                                             .apply(lambda x: '|'.join(x)).reset_index()

songstring['full'] += '|eos'

songstring = songstring[songstring['showdate']>'1990-01-01']

corpus = [''.join(map(lambda s: s.replace('|', ' '), f))
          for f in songstring['full']]

corpus = ' '.join(corpus).split(' ')

def PrepareDataset(corpus: list, n: int,
                   batch_size: int, train_split: float):
    """
    Prepares Datasets for training and validation data from Setlist data
    Args:
      corpus :: list :: full corpus of songs composed of song sequences
      n :: int :: sequence length to trim
      batch_size :: int :: batch size for datasets
      train_split :: float :: values between 0 and 1, splits the data for
                              training and validation
    """
    texts = []
    for i in range(len(corpus)-n):
        texts.append(' '.join(corpus[i:i+n]))

    texts = texts[::n]

    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)

    x_inputs = []
    x_outputs = []
    for line in texts:
        token_list = tokenizer.texts_to_sequences([line])[0]
        x_inputs.append(token_list[:-1])  #drop last song
        x_outputs.append(token_list[1:])  #drop first song

    x_inputs = np.array(x_inputs)
    x_outputs = np.array(x_outputs)

    buffer_size = len(x_inputs)
    train_size = int(train_split*buffer_size)

    dataset = tf.data.Dataset.from_tensor_slices((x_inputs, x_outputs))
    shuffled_data = dataset.shuffle(buffer_size)

    train_data = dataset.take(train_size) \
                        .batch(batch_size) \
                        .prefetch(buffer_size=tf.data.AUTOTUNE)

    val_data = dataset.skip(train_size) \
                      .batch(batch_size) \
                      .prefetch(buffer_size=tf.data.AUTOTUNE)

    return train_data, val_data, tokenizer

In [3]:
# https://www.tensorflow.org/text/tutorials/transformer
# classes for positional embedding and attention layers
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)


class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()


class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [4]:
# classes for NN model architecture
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    y = self.causal_self_attention(x=x)

    x = self.add([x, y])
    x = self.layer_norm(x)

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x


class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

  def call(self, x):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x)

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x


class GPT(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, x):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.

    x = self.decoder(x)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    # Drop the keras mask, so it doesn't scale the losses/metrics.
    try:
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


# performance metrics
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [23]:
train_data, val_data, tokenizer= PrepareDataset(
    corpus=corpus,
    n=125,
    batch_size=32,
    train_split=0.8
)

unique_words = len(tokenizer.word_index) + 1
d_model = 128

learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9
)

num_layers = 8
dff = 512
num_heads = 8
dropout_rate = 0.1
epochs = 100

gpt = GPT(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=unique_words,
    target_vocab_size=unique_words,
    dropout_rate=dropout_rate
)

gpt.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy]
)

gpt.fit(train_data, validation_data=val_data, epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7992fe1bbf40>

In [41]:
# model the output
test_idx = 1

for input, labels in val_data.take(test_idx):
    break

test_labels = labels[0][12:-13]

output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)

for i in range(len(test_labels.numpy())):
    output_array = output_array.write(i, test_labels.numpy()[i])

# output_array = output_array.write(0, 1)
# output_array = output_array.write(1, 100)

for i in tf.range(len(output_array.stack().numpy()), len(input[0])*2):
    output = tf.reshape(output_array.stack(), (1, len(output_array.stack())))
    predictions = gpt(output, training=False)

    # Select the last token from the `seq_len` dimension.
    predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

    predicted_id = tf.argmax(predictions, axis=-1)[0][0]

    # Concatenate the `predicted_id` to the output which is given to the
    # decoder as its input.
    output_array = output_array.write(i+1, predicted_id)

    if predicted_id == 4:
        break

[tokenizer.index_word[s] for s in output_array.stack().numpy() if s != 0]

['funky-bitch',
 'maze',
 'ocelot',
 'sparkle',
 'cavern',
 'wingsuit',
 'set-2',
 'carini',
 'ghost',
 'mikes-song',
 'simple',
 'joy',
 'weekapaug-groove',
 'julius',
 'sand',
 'wading-in-the-velvet-sea',
 'you-enjoy-myself',
 'set-e',
 'quinn-the-eskimo-the-mighty-quinn',
 'eos',
 'set-1',
 'my-soul',
 'bathtub-gin',
 '555',
 'pebbles-and-marbles',
 'the-line',
 'vultures',
 'fast-enough-for-you',
 'back-on-the-train',
 'taste',
 'gumbo',
 'halfway-to-the-moon',
 'stealing-time-from-the-faulty-plan',
 'suzy-greenberg',
 'set-2',
 'chalk-dust-torture',
 'scents-and-subtle-sounds',
 'twist',
 'fuego',
 'the-wedge',
 'light',
 'harry-hood',
 'first-tube',
 'set-e',
 'fluffhead',
 'eos',
 'set-1',
 'llama',
 'undermind',
 'stash',
 'halfway-to-the-moon',
 'i-didnt-know',
 'nellie-kane',
 'guyute',
 'the-line',
 'ocelot',
 'no-quarter',
 'ha-ha-ha',
 'suzy-greenberg',
 'set-2',
 '46-days',
 'back-on-the-train',
 'simple',
 'ghost',
 'backwards-down-the-number-line',
 'harry-hood',
 'wadi

In [42]:
print(test_labels)

print(output_array.stack())

tf.Tensor(
[ 62  25 110  27  11 197   2  85  49   9  51 167  10  42  80 108   8   3
 228   4   1 126  35 152 238 209 186 134  79  83  94 182 133  19   2   7
 198  70 131  89  93  21  92   3  52   4   1  30 170  16 182  54 136  87
 209 110 267 219  19   2  84  79  51  49  88  21 108  15   3  41   4   1
  48  56  82 133 152  33  31 261 223  27 197  17  11   2  28 158  85  93
 131  44 143 165   3  66  60  52   4], shape=(99,), dtype=int64)
tf.Tensor(
[ 62  25 110  27  11 197   2  85  49   9  51 167  10  42  80 108   8   3
 228   4   1 126  35 152 238 209 186 134  79  83  94 182 133  19   2   7
 198  70 131  89  93  21  92   3  52   4   1  30 170  16 182  54 136  87
 209 110 267 219  19   2  84  79  51  49  88  21 108  15   3  41   4   1
  48  56  82 133 152  33  31 261 223  27 197  17  11   2  28 158  85  93
 131  44 143 165   3  66  60  52   4   0   1 103  56  33  35  56  33  48
 110  16   2  28  80  70  57 108  57 108  21   3  77   4], shape=(122,), dtype=int64)


In [None]:
[tokenizer.index_word[s] for s in labels[test_idx-1].numpy() if s != 0]

NameError: ignored

In [None]:
[tokenizer.index_word[s] for s in encoder_input.numpy()[0] if s != 0]

['set-1',
 'fuego',
 'my-soul',
 'back-on-the-train',
 '555',
 'dog-faced-boy',
 'fuck-your-face',
 'horn',
 'frankie-says',
 'my-friend-my-friend',
 'roses-are-free',
 'roggae',
 'birds-of-a-feather',
 'wingsuit',
 'set-2',
 'possum',
 'crosseyed-and-painless',
 'light',
 'the-dogs',
 'lengthwise',
 'twist',
 'wading-in-the-velvet-sea',
 'harry-hood',
 'golgi-apparatus',
 'backwards-down-the-number-line',
 'set-e',
 'waiting-all-night',
 'sing-monica',
 'the-star-spangled-banner',
 'eos']