<a href="https://colab.research.google.com/github/edwardb1203/COMP590_NLP/blob/main/Copy_of_hw3_neural_language_mlp_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys
import time
import math
import argparse
from dataclasses import dataclass
from typing import List

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter

torch.manual_seed(1337)

<torch._C.Generator at 0x7f246815d330>

In [2]:
# Download the tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-04-07 00:45:56--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-04-07 00:45:56 (103 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [3]:
# Read the text file
input_file_path = os.path.join(os.path.dirname('./'), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)


In [4]:
# Examine the dataset
with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

length of dataset in characters: 1,115,394


In [5]:
# get the vocabulary (i.e. the unique chars)
# chars should be a SORTED list of unique characters in the data object

chars = sorted(set([*data]))

vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


In [6]:
# a mapping from character to integer
ctoi = {s:i for i,s in enumerate(chars)}


# a mapping from integer to character
# that reverses the ctoi constructed above
itoc =  {num: char for char, num in ctoi.items()}

In [7]:
# encode the dataset
def encode(text):
  # input: a string
  # output: a list of integers that represents each character in the string
  return [ctoi[s] for s in text]


# decode the dataset
def decode(rep):
  # input:  a list of integers 
  # output: a string that was represented by the input list of integers
  return [itoc[i] for i in rep]

  

In [8]:
# create the train and test splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = torch.tensor(encode(train_data),dtype=torch.long)
val_ids = torch.tensor(encode(val_data),dtype=torch.long)

In [9]:
print(len(train_ids))

1003854


In [35]:
# Define a block size, batch_size, and construct the batch
# Create the training/test datasets
# Recall that we want to take in the past block_size characeters (where we 
# choose block_size) and output the next charcater
block_size = 32

# Args: 
#   split: set to train when want to use train_ids, set to val when want to use
#          val_ids
#   batch_size: the size of the batch for training
# Outputs:
#   x: training/val data, where each row is a list of previous character ids
#   y: training/val ground truth, where each entry is the ground-truth character
#      id for the current character
# 
# In essence, we want to use the list of previous character ids ot predict the 
# current character id

# Note that the output x and y should be torch tensors. 

def get_batch(split,batch_size):
  data = train_ids if split=='train' else val_ids
  # This line of code generates a random index for each batch in the training or validation data.
  # It makes a tensor ix of shape (batch_size, ), where each element is a random integer between 0 and len(data) - block_size.
  ix = torch.randint(len(data)-block_size,(batch_size,))
  # creates a column vector from the ix tensor, and the expression torch.arange(block_size) creates a row vector of values
  # from 0 to block_size. Adding these two tensors creates a matrix where each row is a sequence of character IDs from the original data.
  x = data[ix[:, None] + torch.arange(block_size)]
  # makes a tensor containing the indices of the last character in each sequence.
  # Indexing data with this tensor returns the ground-truth output for each input sequence.
  y = data[ix + block_size]
  return x,y

In [36]:
# Initialize the embedding dictionary
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embd = 80
n_embd2 = 256

class MLP_neural_language(nn.Module):
    """
    takes the previous block_size tokens, encodes them with a lookup table,
    concatenates the vectors and predicts the next token with an MLP.
    Reference:
    Bengio et al. 2003 https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf
    """

    def __init__(self, block_size,vocab_size,n_embd,n_embd2):
        super().__init__()
        self.block_size =block_size
        self.vocab_size =vocab_size
        self.wte = nn.Embedding(vocab_size,n_embd) 
        self.mlp = nn.Sequential(
            nn.Linear(self.block_size *n_embd,n_embd2),
            nn.Tanh(),
            nn.Linear(n_embd2, self.vocab_size)
        )

    def get_block_size(self):
        return self.block_size

    def forward(self, inputs_idx, targets=None):
        inputs_embd = self.wte(inputs_idx)
        inputs_embd = inputs_embd.reshape(inputs_embd.shape[0],-1)
        logits = self.mlp(inputs_embd)
        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss


m = MLP_neural_language(block_size,vocab_size,n_embd,n_embd2)

In [43]:
# a simple training loop
def train_model(m, block_size):
  m = m.to('cuda')
  batch_size = 4096
  num_epochs = 30
  num_batches = 256
  optimizer = torch.optim.Adam(m.parameters(),lr=1e-3)
  for i in range(num_epochs):
    epoch_loss = 0.0
    for steps in range(num_batches):
      optimizer.zero_grad(set_to_none=True)

      # sample a batch of data
      xb,yb = get_batch('train', batch_size)

      xb = xb.to('cuda')
      yb = yb.to('cuda')

      # evaluate the loss
      logits, loss = m.forward(xb,yb)

      loss.backward()
    
      optimizer.step()

      epoch_loss += loss.item()

    # print(epoch_loss/num_batches)

  print("Block size is: " + str(block_size) + " and loss is: " + str(loss.item()))

In [44]:
block_size = 128
m = MLP_neural_language(block_size,vocab_size,n_embd,n_embd2)
train_model(m, block_size)

Block size is: 128 and loss is: 0.5276082754135132


In [30]:
# # C 
# # context sizes to experiment with
# context_sizes = [2, 16, 64, 128]

# # train the language model for each context size
# for block_size in context_sizes:
#   # initialize the model again
#   m = MLP_neural_language(block_size,vocab_size,n_embd,n_embd2)

#   # train the model with the new context size and eval loss
#   train_model(m, block_size)

Block size is: 2 and loss is: 1.8821747303009033
Block size is: 16 and loss is: 1.3281852006912231
Block size is: 64 and loss is: 0.8763432502746582
Block size is: 128 and loss is: 0.5356072783470154


In [48]:
# Generation
# We use the first block_size number of characters in the val set to start 
# generating Shakespear-styled writings. 

m = m.to('cpu')
limit = 1000
# the first block_size number of characters in val_ids
context = val_ids[:block_size].tolist()
generation = context

for i in range(limit):
  # convert to tensor
  context_tensor = torch.tensor(context).unsqueeze(0)
  # output is a tensor of shape (1, vocab_size)
  output = m(context_tensor)
  # next character is the argmax of the output tensor along the second dimension
  pred = output[0].argmax(dim=1).item()
  # update context by appending the generated character and removing the first character
  context.append(pred)
  context = context[1:]
  # Append the generated character to the generation list
  generation.append(pred)


In [32]:
# Decode the generation and see what the AI came up with :)
print(' '.join(decode(generation)))
blcok_size_32 = ' '.join(decode(generation))

? 
 
 G R E M I O : 
 G o o d   m o r r o w ,   n e i g h b o u r r   i n   t h e   f i e l d   w e   w a s t i n g   b l a w   h i s   d e s t , 
 A n d   t r u e   s e r v i c e   o f   i n   t h e   s i c k   t o   m y   s p e a k   t o   c a l l s . 
 
 K I N G   E D W A R D   I V : 
 A l a s ,   M a r k   t h e   b e s t a t e   a n   o f   a n   i n t i o n   o f   d e a t h , 
 A n d   b e a r   t h e e   a t   t h y   s t a t   o f   h i m e , 
 W h o   c a n n o t   w h a t   s h o w   t o   y o u r   g a n d e d 
 T o   s i n   t h e   d a y   t h e   c o u l d   n o t   s o u l   t e n t e r   m a i d 
 A   m a n   a n d   t h e   s e e n   t o   y o u r   t h i n g   t h e i r   o w n 
 s p e a k   t h e   s t r a i n   o n c l e s   n o t   e s s e e d   t o   y o u r   t h e   f a t h e r   c a n n o t   d o n e   t o   s u c h 
 a n d   n o t   s o   d e a t h   t h e   k i n g   o f   h i s   f a i t   b e f o r e   h e a d   m y   l i f e , 
 A n d   m a k e   m y   l 

In [50]:
print(' '.join(decode(generation)))

? 
 
 G R E M I O : 
 G o o d   m o r r o w ,   n e i g h b o u r   B a p t i s t a . 
 
 B A P T I S T A : 
 G o o d   m o r r o w ,   n e i g h b o u r   G r e m i o . 
 G o d   s a v e   y o u ,   g e n t l e m e n ! 
 
 P E T R U C H I O : 
 A n d   y o o u ,   s i n   i s   s l u r e ;   I   s c u l l   t a t   t h e   h e   o n e   t o r r . 
 
 G U R E T : 
 A y ,   y f t r i a n   s i k e   m y   l i f ,   s o   n o g e :   t h e y ' g . 
 
 N E E N O : 
 B e c i s ,   m e t   n o t h e r ,   w a y   d o   i f   t h e   n e s . 
 
 K I N G   R I C H A R D   I I I : 
 S e e   s h y   d i d e s t   b a k i n g ,   E x t e r a n - g e   i t   n a y 
 u n t   d o n e   p a t i r e s t .   G o a t l ;   a t   I   c o m a n g   w e t h 
 s h o u l   b e   s e c e   t h a t   p r e c e ,   I   c r u c h e r   t h e m , 
 W h a t   s o   y o u r   m i s t   o f   t h u t ?   W a t ,   t h e   n o t   d o m e s , 
 A n d   a l l   t h o u   m a k e t '   t h e   b e   m a n   a m e n   