In [1]:
import numpy as np
import torch
import torchvision
import torchvision.datasets as datasets
from torchvision.transforms import ToTensor
from torch import nn
import random
import matplotlib.pyplot as plt
import torch.optim as optim
import bs4
import urllib.request

webpage = str(urllib.request.urlopen("https://www.gutenberg.org/cache/epub/345/pg345.txt").read())
soup = bs4.BeautifulSoup(webpage)
text = soup.get_text()

# In the above lines, I download a webpage from the link (project gutenberg, free e-books)
# Specifically the text to Bram Stoker's 'Dracula'
# soup is the internal beautiful soup object/representation of the webpage
# and I just grab the text from the webpage, representing the full text

text = text.replace("\\r\\n", '\n')
# Note that because of the processing that occurs, special characters like newlines are parsed as '\' and 'n'
# So this takes all pairs that /should/ represent newlines, and replaces them with actual newline characters

print(text[0:1000]) # First 1000 characters of the text

b'\xef\xbb\xbfThe Project Gutenberg eBook of Dracula
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Dracula

Author: Bram Stoker

Release date: October 1, 1995 [eBook #345]
                Most recently updated: November 12, 2023

Language: English

Credits: Chuck Greif and the Online Distributed Proofreading Team


*** START OF THE PROJECT GUTENBERG EBOOK DRACULA ***




                                DRACULA

                                  _by_

                              Bram Stoker

                        [Illustration: colophon]

                                NEW YORK



In [2]:
tokens = list(set(text)) # This creates an ordered list of every unique character that occurs /somewhere/ in the text
tokens = [ "[UNK]" ] + tokens # Here I am appending a special 'unknown' token for when tokens are 'masked' or deliberately obscured from the text

In [3]:
len( tokens ) # Total, 87 unique tokens ( 86 characters + one mask token )

87

In [4]:
tokens[0:10]

['[UNK]', 'r', 'm', '\\', '!', '3', '$', ']', 'l', 'B']

In [5]:
## Purpose of this code is to generate a random subsequence from the text, of a given length
## and grab the next character that follows

# This will be used to generate training data pairs for the problem of "predict the next token in this sequence"

def generate_sample_token_sequence(text, length):
  start_point = random.randint(0, len( text ) - 1 - length - 1)
  # We want to get a sequence of length 'length', and the next character
  substring = text[start_point:start_point + length]
  next_token = text[start_point + length]

  return substring, next_token

generate_sample_token_sequence(text, 100)

(' when we got back, save for some poor creature who\nwas screaming away in one of the distant wards, a',
 'n')

In [6]:
# This code generates a training batch
# There will be batch_size instances of token sequences, of length 'length', and the corresponding next token for each
# Note that these will be expressed in terms of 'indices', based on where the token exists in the dictionary
# So that a string as a sequence of tokens is transformed into a sequence of integers

# Index = 0 represents the [UNK] token, recall.

def generate_batch(text, length, batch_size, token_dict):
  x_index_lists = []
  y_index_list = []

  for i in range( batch_size ):
    substring, next_token = generate_sample_token_sequence( text, length )
    x_index_lists.append( [ token_dict.index( c ) for c in substring ]  )
    y_index_list.append( token_dict.index( next_token ))

  return torch.LongTensor(x_index_lists), torch.LongTensor(y_index_list)

In [7]:
generate_batch(text, 10, 5, tokens)

(tensor([[15, 56,  8, 43, 55, 15, 56, 60, 23, 80],
         [55, 45, 67, 15, 55, 60, 23, 44, 80,  2],
         [60, 55, 56, 77, 55, 20, 32, 26, 55, 77],
         [18, 55, 67, 72, 43, 55, 15, 44, 55, 45],
         [55, 43, 56, 43, 55, 72, 32, 60, 55, 70]]),
 tensor([44, 67, 44, 11, 32]))

In [8]:
class TokenRNNModel(nn.Module):
  def __init__(self, token_dictionary, embedding_size):
    super(TokenRNNModel, self).__init__()

    self.tokens = token_dictionary
    self.embedding_dim = embedding_size

    self.embedding = nn.Embedding( len( self.tokens ), self.embedding_dim, max_norm=True)

    # We create an embedding matrix, with a number of entries equal to the number of tokens
    # And the embedding dimension (size of each token vector) as specified
    # max_norm = True means that while the embedding vectors are learnable parameters, their norms
    # will be kept from running away to infinity


    # Here we create a stacked LSTM for processing a sequence of tokens
    # The input_size is the number of features in each term of the sequence, in this case the embedding dimension
    # The hidden size is the size of the internal memory (i.e. number of 'nodes' in the LSTM cell), which I am arbitrarily setting to 3 * the dimension of the embedding.
    # We are stacking two LSTMs on top of each other - so the second LSTM will process the output sequence from the first
    # batch_first just means that the data will be passed in a tensor based on (batch_size, sequence_length, feature_count)
    # Dropout meaning that dropout will be applied between the iterations of the LSTM cells
    # And bidirectional = True means that the sequence will be processed in two directions (read forward and read backwards)
    # and the final output of the model will be the result of the forward pass concatenated with the result of the backwards pass

    self.lstm = nn.LSTM(input_size = self.embedding_dim,
                              hidden_size = 3 * self.embedding_dim, num_layers=2, bias=True,
                              batch_first = True, dropout=0.25, bidirectional=True)

    # The final output of the LSTM will be be a vector of size 3*embedding_dim + 3*embedding_dim (one for the forward pass, one for the backward pass)
    # We want to convert this vector into a vector of probabilities, one for each token, to represent what token comes 'next' in the sequence
    # So we will have an output dimension equal to the number of tokens

    self.logits = nn.Linear(in_features = 3*self.embedding_dim*2, out_features = len( self.tokens ) )

  def forward(self, input_tensor, masking = False):
    #############################################
    ## If masking is set to true, we arbitrarily and at random 'mask'
    ## about 15% of the tokens in the sequence, setting them to the [UNK] token
    ## This forces the LSTM to learn relationships that are /not/ just between tokens
    ## That are right next to each other - it has to learn longer distance correlations
    ## In order to account for the 'lost' tokens. This is very similar to dropout.

    embedded = input_tensor.clone()
    batch_size, seq_length = embedded.shape

    if masking:
      for i in range( batch_size ):
        for t in range( seq_length ):
          if random.random() <= 0.15:
            embedded[i, t] = 0
    ############################################

    embedded = self.embedding( embedded ) ## We take the 'masked' indices, and convert them to embedded vectors

    ## NOTE: It may be useful here to print out the shape of embedded during a forward pass, to show exactly why it has the dimensions it does

    lstm_output = self.lstm( embedded )

    ## We pass it into the LSTM and collect the output. Note that the LSTM output is a complicated tuple of a) the computed sequences, and b) the final output vectors

    lstm_output = lstm_output[0][:, -1, :]

    # lstm_output[0] is the tensor of computed sequences from the lstm
    # The first index of this tensor is the batch number, so we take all of them with the ':' slice
    # The second index is the term of the sequence we want, and we want the last one, so we take -1
    # The last index is the features, and we want all features of the last term of the sequence

    # Again, printing out the shape of lstm_output[0] may be useful,
    # as well as looking at the LSTM api page

    output = self.logits( lstm_output ) # Lastly, for each sequence in the batch, we compute a vector of token logits (which we can use for softmaxing to get the probabilities for each token)
    return output

In [9]:
x_batch, y_batch = generate_batch(text, 10, 5, tokens)

In [10]:
lstm_model = TokenRNNModel(tokens, 20)

In [11]:
# The purpose of this code is to take a sequence and predict the next token

def predict_next_character(model, substring, tokens):
  indices = torch.LongTensor( [ [ tokens.index(c) for c in substring ] ] ) # For each character in the sequence, identify its token index
  next_token_logits = model( indices ).detach()[0] # Pass in the indices to the model, detach the results from the computation graph, and take the vector of logits that results
  next_token_probabilities = torch.nn.Softmax( dim = 0 )( next_token_logits ) # Convert the logits to probabilities
  next_token_probabilities = np.asarray( next_token_probabilities ) # Saving the probabilities as a numpy array.

  next_character = random.choices( tokens, weights = next_token_probabilities, k = 1 ) # From the tokens, we choose a token, in accordance with the probabilities computed by the model
  return next_character[0] # We return the selected token (there is only one element in this list since k = 1)

# The following code starts with a prompt, and iteratively predicts the next token,
# Appends it to the prompt, then uses that to predict the next token after,
# until it has been extended to the desired length.

def extend_prompt(model, prompt, tokens, character_count):
  model.eval()
  output = prompt
  for i in range( character_count ):
    next_character = predict_next_character(model, output, tokens)
    output += next_character
  return output

In [12]:
extend_prompt(lstm_model, "Dracula, where is my money? ", tokens, 40)

# Note that for the untrained model, this produces essentially a garbage sequence of random tokens - no surprises there.

"Dracula, where is my money? {][\nw'.Uc.W4pFy-8X]2Q9HU\nkHNxbj#!tnbwa5'"

In [16]:
lstm_model = TokenRNNModel(tokens, 20)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam( lstm_model.parameters(), lr = 0.01 )

In [17]:
extend_prompt(lstm_model, "Mina, ", tokens, 40)

# Note, Mina Harker is one of the main characters of the story.

'Mina, RfQi2Cih,xIni[UNK]#%ydwg[v\\}[UNK],NTE16MBHBxx1Gw'

In [18]:
batch_size = 100

for epoch in range(100):
  print("Training Epoch", epoch)
  print()

  lstm_model.train()

  for batch in range(100):
    seq_length = random.randint(1,50) # Pick a random sequence length to generate data with (I chose 50 to keep things small)
    x_batch, y_batch =  generate_batch(text, seq_length, batch_size, tokens)
    optimizer.zero_grad()
    logits = lstm_model( x_batch, masking = True ) # Note that I am setting masking to true, to try to force the LSTM to learn longer dependencies
    loss = loss_function( logits, y_batch )
    loss.backward()
    optimizer.step()
    #print( loss.item() ) ## If you uncomment this, you'll generally see the loss decreasing, but it clutters up the output

  print("########################################################")
  print( extend_prompt(lstm_model, "Mina, ", tokens, 40) ) # Observe how good the model is by testing its generative capacity
  print("########################################################")

  ### NOTE
  ### The important thing to observe here as this trains is that at first, the output is completely random
  ### The first thing that you should observe though is that it starts to break up sequences of tokens with spaces
  ### In a way that starts to mimic the sizes of words.
  ### As it continues, it may start to get punctuation in "the right place"
  ### And may even start to get sequences of tokens that spell actual words
  ### It is slowly but surely learning what text 'looks like', and generating text accordingly.

  ### One thing to consider here: What if we had used words as the tokens instead of characters?
  ### Then the problem would be to learn relationships between words rather than relationships between characters
  ### Which turns very quickly to relationships in 'meaning'. Whatever that means.
  ### And the text generated could very quickly start to turn 'realistic'.

Training Epoch 0

########################################################
Mina, jaceiknthe t arosthelo-e\o melusous te t
########################################################
Training Epoch 1

########################################################
Mina, wy ag-yeate, bus ig terr nall, Vher\xe2\
########################################################
Training Epoch 2

########################################################
Mina, tot ackanover. illlcet et terere wat to 
########################################################
Training Epoch 3

########################################################
Mina, He dew beve ofed to toll to shat hi he n
########################################################
Training Epoch 4

########################################################
Mina, helveone is ane seat yore the to uede to
########################################################
Training Epoch 5

########################################################
Mina, Thath cnourd. She dead t