In [1]:
import numpy as np
import torch
import torchvision
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor
from torch import nn
import random
import matplotlib.pyplot as plt
import torch.optim as optim
import bs4
import urllib.request

webpage = str(urllib.request.urlopen("https://www.gutenberg.org/cache/epub/345/pg345.txt").read())
soup = bs4.BeautifulSoup(webpage)

text = soup.get_text()
text = text.replace("\\r\\n", '\n')
print(text[0:1000])

tokens = [ "[UNK]" ] + list(set(text))
print("Tokens:", tokens[0:10])


## All the above is as before

b'\xef\xbb\xbfThe Project Gutenberg eBook of Dracula
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Dracula

Author: Bram Stoker

Release date: October 1, 1995 [eBook #345]
                Most recently updated: November 12, 2023

Language: English

Credits: Chuck Greif and the Online Distributed Proofreading Team


*** START OF THE PROJECT GUTENBERG EBOOK DRACULA ***




                                DRACULA

                                  _by_

                              Bram Stoker

                        [Illustration: colophon]

                                NEW YORK



In [2]:
def get_random_substring(text, length):
  start_point = random.randint(0, len( text ) - 1 - length - 1)
  # We want to get a sequence of length 'length', and the next character
  substring = text[start_point:start_point + length]
  next_token = text[start_point + length]

  return substring, next_token

print( get_random_substring(text, 100) )

def generate_batch(text, length, batch_size, token_dict):
  x_index_lists = []
  y_index_list = []

  for i in range( batch_size ):
    substring, next_token = get_random_substring( text, length )
    x_index_lists.append( [ token_dict.index( c ) for c in substring ]  )
    y_index_list.append( token_dict.index( next_token ))

  return torch.LongTensor(x_index_lists), torch.LongTensor(y_index_list)

generate_batch(text, 10, 5, tokens)


### All the above, as before

('l. We only know that the box is somewhere on the water,\nmoving along. The customs and the octroi, if', ' ')


(tensor([[23,  3, 67, 76, 10, 23, 81,  8, 31, 47],
         [23,  8, 86, 23,  8, 34, 23, 46, 23, 81],
         [81,  8, 31, 33, 23, 32, 81, 25, 10, 23],
         [46, 86,  8, 76, 10, 23, 76,  2, 23, 52],
         [13, 46,  8, 10, 23, 76, 10, 23, 10, 25]]),
 tensor([23,  3, 81, 76, 13]))

In [120]:
class TokenTransformerModel(nn.Module):
  def __init__(self, token_dictionary, embedding_size, n_heads):
    super(TokenTransformerModel, self).__init__()


    ## Note that, among other things, one thing we need to specify for a transformer model is how many 'attention heads' we want in each attention layer.

    self.tokens = token_dictionary
    self.n_tokens = len( self.tokens )
    self.dim = embedding_size
    self.n_heads = n_heads

    ## As before, we need an embedding matrix, to transform the token indices into representative vectors of a specified dimension
    self.embedding = nn.Embedding( self.n_tokens, self.dim, max_norm=True)
    self.embedding_dropout = nn.Dropout( 0.1 )

    #######################################
    ## The below specifies the parameters we need for the first attention layer
    ## For each attention head, (n_heads in total), we need a query matrix, for converting the terms of the sequence to query vectors
    ## This I am (arbitrarily) setting to be mapped into a space half as large as the embedding (dim // 2), though it could be larger, could be smaller.
    ## The dimension of W_Q1 is thus (n_heads) x (dim / 2) x (dim), or rather n_heads many (dim/2)x(dim) matrices

    ## The key matrices are set up similarly

    ## For this first attention layer, the 'value' vectors will just be the original sequence vectors

    self.W_Q1 = nn.Parameter( torch.randn( self.n_heads, self.dim // 2, self.dim ), requires_grad = True ) * 0.1
    self.W_K1 = nn.Parameter( torch.randn( self.n_heads, self.dim // 2, self.dim ), requires_grad = True ) * 0.1

    ## Once we compute the attention heads, we want to pass the results (and the original sequence items) into a small feed forward network,
    ## In this case, just two layers deep. This is as set up in 'Attention Is All You Need'. Note I am again arbitrarily setting the internal dimension
    ## of the network, in this case to 200.

    ## The input will be ( each attention head + the t-th sequence item), the output will be a computed 'summary' for that sequence item, that factors in the 'global' info
    ## from the attention heads.
    self.attn_layer_1_layer_1 = nn.Linear( in_features = ( self.n_heads + 1 ) * self.dim, out_features = 200 )
    self.attn_layer_1_layer_2 = nn.Linear( in_features = 200, out_features = self.dim )

    ## Again following 'attention is all you need', we will batchnorm the result along the sequence
    self.batch_norm_1 = nn.BatchNorm1d( num_features = 20 )
    #######################################

    #######################################
    ## Below, similar parameters for the second attention layer

    self.W_Q2 = nn.Parameter( torch.randn( self.n_heads, self.dim // 2, self.dim ), requires_grad = True ) * 0.1
    self.W_K2 = nn.Parameter( torch.randn( self.n_heads, self.dim // 2, self.dim ), requires_grad = True ) * 0.1

    self.attn_layer_2_layer_1 = nn.Linear( in_features = ( self.n_heads + 1 ) * self.dim, out_features = 200 )
    self.attn_layer_2_layer_2 = nn.Linear( in_features = 200, out_features = self.dim )

    self.batch_norm_2 = nn.BatchNorm1d( num_features = 20 )
    #######################################

    #######################################
    ## Below, parameters for one final attention head
    ## But on this third layer, I am only computing a single 'attention head', representing a final 'summary' of the whole sequence
    ## So I only need one of each kind of matrix

    ## Making an arbitrary decision here that the value vectors for this layer will be twice as large as the original sequence vectors

    self.W_Q3 = nn.Parameter( torch.randn( self.dim // 2, self.dim ), requires_grad = True ) * 0.1
    self.W_K3 = nn.Parameter( torch.randn( self.dim // 2, self.dim ), requires_grad = True ) * 0.1
    self.W_V3 = nn.Parameter( torch.randn( 2 * self.dim, self.dim ) , requires_grad = True ) * 0.1

    ## The final vector result of this layer will be a vector of size 2*dim, representing a summary of the whole input sequence.
    #######################################


    #######################################
    ## Again, we convert that vector summary to a vector of logits, so that we can predict the next token in the sequence.
    self.logit_layer = nn.Linear( in_features = 2 * self.dim, out_features = self.n_tokens )
    #######################################


  def forward(self, input_tensor, masking = False):

    ######################################################################################
    ## As before, mask if requested, removing about 15% of the original tokens (forcing the network)
    ## To learn longer term dependencies
    embedded = input_tensor.clone()
    batch_size, seq_length = embedded.shape

    if masking:
      for i in range( batch_size ):
        for t in range( seq_length ):
          if random.random() <= 0.15:
            embedded[i, t] = 0
    ######################################################################################



    embedded = self.embedding( embedded ) ## Convert the tensor of indices into embedded vectors
    batch_size, seq_length, feature_count = embedded.shape

    ############################################################################################
    ## Here I am calculating the positional embedding for every postion in the sequence
    ## And once each embedding vector (of size feature count) is calculated, I add the results
    ## To the token embeddings, and apply dropout.
    pos_embedding = torch.zeros( seq_length, feature_count )
    div_term = torch.exp(torch.arange(0, feature_count, 2) * (-np.log(1000.0) / feature_count))

    for i in range( seq_length ):
      pos_embedding[i, 0:feature_count:2] = torch.sin(i * div_term)
      pos_embedding[i, 1:feature_count:2] = torch.cos(i * div_term)

    embedded = embedded + pos_embedding
    embedded = self.embedding_dropout( embedded )

    ## At this point, 'embedded' stores a vector representation of combined information about
    ## /what/ token is present, and /where/ that token is in the sequence.
    ############################################################################################

    ############################################################################################

    ## ATTENTION LAYER 1
    ## First thing we need to do is compute the 'attention head' for each head.

    attention_heads_1 = []
    for h in range( self.n_heads ):
      test_query_vector_sequence = torch.matmul( embedded, self.W_Q1[h].t() ) # This will multiply every sequence of vectors by W_Q1 for that head, giving a set of (batch_size, seq_length, dim/2) query vectors
      test_key_vector_sequence = torch.matmul( embedded, self.W_K1[h].t() ) # Similarly, (batch_size, seq_length, dim/2) key vectors

      test_attention_weights = test_query_vector_sequence * test_key_vector_sequence # This uses broadcasting to multiply each term of each query vector by the corresponding term of the corresponding key vector
      test_attention_weights = torch.sum( test_attention_weights, dim = 2 ) # This sums each (query * key) vector, essentially computing the dot product for each pair. (batch_size, seq_length) final dot products
      test_attention_weights = test_attention_weights  / np.sqrt( self.dim // 2 ) # Scale the dot products by 1/sqrt(dim) - this gives 'scaled dot product attention' weights
      test_attention_weights = torch.nn.Softmax(dim = 1)( test_attention_weights ) # Softmax to get the final attention weights, (batch_size, seq_length) - for each seq in the batch, a set of weights for each term of the sequence
      test_attention_weights = test_attention_weights.unsqueeze(2) # This final line here is going to expand it to a tensor of size (batch_size, seq_length, 1), with this superfluous component on the end to make dimensions line up

      test_attention_head = torch.sum( embedded * test_attention_weights, dim = 1 ) # This multiplies each embedded vector by its corresponding weight, and then adds across each scaled term of the sequence

      # Final result here is going to be of size ( batch_size, feature_count ) - for each sequence in the batch, we have an attention vector of size 'feature_count' that summarizes that sequence

      attention_heads_1.append( test_attention_head.unsqueeze(1) )
      # Add the computed attention head to 'attention_heads_1', modifying it to dimension (batch_size, 1, feature_count)

    attention_heads_1 = torch.concat( attention_heads_1, dim = 1 )
    # This glues all the attention heads into one tensor of shape (batch_size, n_heads, feature_count)
    # We have computed all the attention heads, for each sequence in the batch


    # We then process each sequence, based on teh computed attention
    processed_sequence_term = []
    for i in range( seq_length ):
      seq_term = embedded[:, i].unsqueeze(1) # For each sequence in the batch, we get sequence term i, shaping it to be a tensor of size (batch_size, 1, feature_count)
      seq_term = torch.concat( [ seq_term, attention_heads_1 ], dim = 1 ) # We glue it to the attention heads, creating a tensor of size (batch_size, n_heads + 1, feature_count)
      seq_term = torch.nn.Flatten()( seq_term ) # And flatten it to a tensor of (batch_size, (n_heads+1)*feature_count) - note that this vector (n_heads+1)*feature_count contains information abt sequence element i, /and the entire sequence/ based on the attention vectors

      seq_term = self.attn_layer_1_layer_1( seq_term ) # Pass it into a dense layer
      seq_term = nn.ReLU()( seq_term ) # Activation function
      seq_term = self.attn_layer_1_layer_2( seq_term ) # Pass it into a dense layer

      seq_term = self.batch_norm_1( embedded[:, i] + seq_term ) # We add the computed 'residual' back to the original sequence element, and then batch normalize

      # Final shape: (batch_size, feature_count)

      processed_sequence_term.append( seq_term.unsqueeze(1) ) # Collect it, reshaped as (batch_size, 1, feature_count), so that we can glue each term of the sequence back together

    embedded = torch.concat( processed_sequence_term, dim = 1 )

    # Embedded now has shape (batch_size, seq_length, feature_count), each term of each sequence in the batch representing 'processed' information about how that sequence item relates to everything else in the sequence
    ################################################################################################

    ############################################################################################
    ## ATTENTION HEAD 2
    ## Much the same as attention head 1!

    attention_heads_2 = []
    for h in range( self.n_heads ):
      test_query_vector_sequence = torch.matmul( embedded, self.W_Q2[h].t() )
      test_key_vector_sequence = torch.matmul( embedded, self.W_K2[h].t() )

      test_attention_weights = torch.sum( test_query_vector_sequence * test_key_vector_sequence, dim = 2 )  / np.sqrt( self.dim // 2 )
      test_attention_weights = torch.nn.Softmax(dim = 1)( test_attention_weights )
      test_attention_weights = test_attention_weights.unsqueeze(2)

      test_attention_head = torch.sum( embedded * test_attention_weights, dim = 1 )

      attention_heads_2.append( test_attention_head.unsqueeze(1) )

    attention_heads_2 = torch.concat( attention_heads_2, dim = 1 )

    processed_sequence_term = []
    for i in range( seq_length ):
      seq_term = embedded[:, i].unsqueeze(1)
      seq_term = torch.concat( [ seq_term, attention_heads_2 ], dim = 1 )
      seq_term = torch.nn.Flatten()( seq_term )

      seq_term = self.attn_layer_2_layer_1( seq_term )
      seq_term = nn.ReLU()( seq_term )
      seq_term = self.attn_layer_2_layer_2( seq_term )

      seq_term = self.batch_norm_2( embedded[:, i] + seq_term )

      processed_sequence_term.append( seq_term.unsqueeze(1) )

    embedded = torch.concat( processed_sequence_term, dim = 1 )
    ################################################################################################

    ############################################################################################
    ## Final attention layer

    ## Note that at this layer, we only have one attention head
    ## The value vectors that we are combining are now mapped versions of the embedded sequence, though, of dimension twice as large

    test_query_vector_sequence = torch.matmul( embedded, self.W_Q3.t() )
    test_key_vector_sequence = torch.matmul( embedded, self.W_K3.t() )
    test_value_vector_sequence = torch.matmul( embedded, self.W_V3.t() )

    test_attention_weights = torch.sum( test_query_vector_sequence * test_key_vector_sequence, dim = 2 )  / np.sqrt( self.dim // 2 )
    test_attention_weights = torch.nn.Softmax(dim = 1)( test_attention_weights )
    test_attention_weights = test_attention_weights.unsqueeze(2)

    test_attention_head = torch.sum( test_value_vector_sequence * test_attention_weights, dim = 1 )

    # Final dimension here: (batch_size, 2 * feature_count )

    logits = self.logit_layer( test_attention_head ) # Convert to probabilities

    return logits
    ################################################################################################

In [121]:
transformer_model = TokenTransformerModel(tokens, embedding_size = 20, n_heads = 5)

In [122]:
batch_size = 7
x_batch, y_batch = generate_batch(text, 35, batch_size, tokens)

In [123]:
transformer_model( x_batch ).shape

torch.Size([7, 87])

In [142]:
def predict_next_character(model, substring, tokens):
  indices = torch.LongTensor( [ [ tokens.index(c) for c in substring ] ] )
  next_token_logits = model( indices ).detach()[0]
  next_token_probabilities = torch.nn.Softmax( dim = 0 )( next_token_logits )
  next_token_probabilities = np.asarray( next_token_probabilities )

  next_character = random.choices( tokens, weights = next_token_probabilities, k = 1 )
  return next_character[0]

def extend_prompt(model, prompt, tokens, character_count):
  model.eval()
  output = prompt
  for i in range( character_count ):
    next_character = predict_next_character(model, output, tokens)
    output += next_character
  return output

extend_prompt(transformer_model, "Dracula, where is my money? ", tokens, 40)

'Dracula, where is my money?  ,rtlsetid;  ldure,em&tu!;tkts lt  ,t kt'

In [143]:
transformer_model = TokenTransformerModel(tokens, embedding_size = 20, n_heads = 4)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam( transformer_model.parameters(), lr = 0.005, weight_decay = 0.001 )

In [None]:
batch_size = 100

for epoch in range(100):
  print("Training Epoch", epoch)

  transformer_model.train()

  for batch in range(100):
    seq_length = random.randint(1,50)
    x_batch, y_batch =  generate_batch(text, seq_length, batch_size, tokens)
    optimizer.zero_grad()
    logits = transformer_model( x_batch, masking = True )
    loss = loss_function( logits, y_batch )
    loss.backward( retain_graph = True ) #NOTE: I do not know why I needed to add retain_graph, it seems to thing I botched the computation graph somewhere, but I'm pretty sure I did not >.< This fixes it though, just runs a little slower than I'd like.
    optimizer.step()
    #print( loss.item() )

  print("########################################################")
  print( extend_prompt(transformer_model, "Mina, ", tokens, 40) )
  print("########################################################")

Training Epoch 0
########################################################
Mina, t0\0x9x09xx99e  e  weffplb eanrohd
 hhe 
########################################################
Training Epoch 1
########################################################
Mina, \x9899x29x92e\990Ser reo9   t xvx\0\ c  
########################################################
Training Epoch 2
########################################################
Mina, 8888990899o0 \n0x0Ttxn08y8\8x9
\\9xeox8x
########################################################
Training Epoch 3
########################################################
Mina, \98998099\0999990299\#\\999999\9x8000x8\
########################################################
Training Epoch 4
########################################################
Mina, 89899x8x8908280x9D\cxx8880cx0x990txx(x8\
########################################################
Training Epoch 5
########################################################
Mina, \x9xxx9xxx\02x\x8x8x8x98\x980x