<a href="https://colab.research.google.com/github/ggolani/ML/blob/main/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Mount Google Drive at /content/gdrive so that files written under that path
# (e.g. model checkpoints) are stored on the Drive.
# NOTE: google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the pretrained 'gpt2' tokenizer; it is used both to encode the training
# text into token ids and to decode generated ids back into text.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# hyperparameters for GPT2-124M
# (these are read as notebook-level globals by the model classes and by get_data_batch)
n_vocab = tokenizer.vocab_size # vocabulary size, taken from the tokenizer
embed_dim = 768 # width of the token / position embedding vectors
seq_len = 256 # context window in tokens; also the number of rows in the position-embedding table
n_heads = 12 # attention heads per block (per-head size is embed_dim // n_heads)
n_blocks = 12 # number of stacked transformer blocks
batch_size = 32 # number of sequences drawn per batch
dropout = 0 # range [0-1]; shared by the embedding, attention and block dropout layers

In [None]:
# prefer the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
print(device)

cuda:0


In [None]:
import requests

# Download the training corpus (plain-text Project Gutenberg ebook #829).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting an
# error page be tokenized as if it were the book.
response = requests.get('https://www.gutenberg.org/cache/epub/829/pg829.txt', timeout=30)
response.raise_for_status()
text = response.text

# Encode the whole text as a single 1-D tensor of token ids.
# The tokenizer warns that the sequence is longer than its 1024-token model limit;
# the notebook only ever indexes seq_len-sized windows out of this tensor.
gtTokens = torch.tensor(tokenizer.encode(text))
print(len(gtTokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (158345 > 1024). Running this sequence through the model will result in indexing errors


158345


In [None]:
import math

# Split the token stream into contiguous shards and, inside every shard, send the
# first `train_ratio` of the tokens to the training set and the rest to the test
# set, so the test tokens are drawn from all parts of the text.
train_ratio = 0.9
n_shards = 10

train_chunks = []
test_chunks = []

for i in range(n_shards):
  # integer arithmetic keeps the shard boundaries exact (the last one is len(gtTokens))
  shard_min = len(gtTokens) * i // n_shards
  shard_max = len(gtTokens) * (i+1) // n_shards
  train_max = math.ceil(shard_min + (shard_max - shard_min) * train_ratio)

  # Slice ends are exclusive: the test slice starts exactly at train_max and stops
  # at shard_max. Using train_max+1 / shard_max-1 here would silently drop two
  # tokens from every shard.
  train_chunks.append(gtTokens[shard_min:train_max])
  test_chunks.append(gtTokens[train_max:shard_max])

# concatenate once instead of re-allocating the growing tensors on every iteration
train_data = torch.cat(train_chunks)
test_data = torch.cat(test_chunks)

# every token must land in exactly one of the two sets
assert len(train_data) + len(test_data) == len(gtTokens)



train index: 0 14251
test index: 14252 15833
train index: 15834 30086
test index: 30087 31668
train index: 31669 45920
test index: 45921 47502
train index: 47503 61755
test index: 61756 63337
train index: 63338 77589
test index: 77590 79171
train index: 79172 93424
test index: 93425 95006
train index: 95007 109258
test index: 109259 110840
train index: 110841 125093
test index: 125094 126675
train index: 126676 140927
test index: 140928 142509
train index: 142510 156762
test index: 156763 158344
142515
15810
torch.Size([142515])
torch.Size([15810])


In [None]:
# draw one random mini-batch of (input, next-token target) windows
def get_data_batch(training=True):
  """Sample `batch_size` random windows of `seq_len` consecutive tokens.

  training: True reads from train_data, False reads from test_data.
  Returns (X, y); each row of y is the matching row of X shifted one token ahead.
  """
  source = train_data if training else test_data

  # random window starts; the upper bound leaves room for the one-token shift of the targets
  starts = torch.randint(len(source)-seq_len,size=(batch_size,))

  # a column of starts broadcast against a row of offsets gives every window index at once
  offsets = torch.arange(seq_len)
  window_idx = starts[:,None] + offsets

  X = source[window_idx]
  y = source[window_idx + 1]
  return X,y

Input data (size torch.Size([32, 256])):
 tensor([[  724,    82,    11,  ...,  2366,    11,   290],
        [  201,   198,   464,  ...,   447,   247,    82],
        [  198, 10414,   364,  ...,   201,   198,    83],
        ...,
        [  669, 21242,   201,  ...,   286,  8122,   201],
        [  284, 28020,   286,  ...,   198,  6615,    11],
        [   11,   706,   543,  ...,    11,   290, 34730]])


Targets (size torch.Size([32, 256])):
 tensor([[   82,    11, 27793,  ...,    11,   290, 12277],
        [  198,   464, 20136,  ...,   247,    82,  4931],
        [10414,   364,   319,  ...,   198,    83, 34715],
        ...,
        [21242,   201,   198,  ...,  8122,   201,   198],
        [28020,   286,   597,  ...,  6615,    11, 10597],
        [  706,   543,   339,  ...,   290, 34730,  2405]])


In [None]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused Q/K/V projection.

    Layer sizes come from the notebook-level `embed_dim` and `n_heads`; attention
    dropout uses the notebook-level `dropout` and is applied only in training mode.
    """
    def __init__(self):
        super().__init__()

        self.num_heads = n_heads
        self.head_dim = embed_dim // n_heads

        # a single linear layer produces queries, keys and values side by side
        self.QKV = nn.Linear(embed_dim, 3*embed_dim, bias=True)
        # output projection that mixes the recombined heads
        self.W0 = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(self, x):
        """Apply causal self-attention.

        x: tensor of shape [batch, seq_len, embed_dim].
        Returns a tensor of the same shape.
        """
        B, T, E = x.shape # [batch, seq_len, embed_dim]
        qkv = self.QKV(x)
        q,k,v = torch.split(qkv, E, dim=2)

        # split the embedding axis into heads: [B, T, E] -> [B, nHeads, T, head_dim]
        q = q.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
        k = k.view(B, T, self.num_heads, self.head_dim).transpose(1,2)
        v = v.view(B, T, self.num_heads, self.head_dim).transpose(1,2)

        # attention dropout is only active while training
        dropp = dropout if self.training else 0
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=dropp) # [B, nHeads, T, head_dim]

        # recombine heads: (B, nHeads, T, head_dim) -> [B, T, E]
        # reshape() instead of view(): after the transpose the tensor is not guaranteed
        # to be laid out contiguously (that depends on the memory layout the attention
        # kernel returned), and view() raises a RuntimeError in that case, whereas
        # reshape() copies only when it has to.
        out = out.transpose(1,2).reshape(B, T, E)

        # finally, linearly mix the attention heads
        out = self.W0(out)

        return out

In [None]:
class TransformerBlock(nn.Module):
    """One transformer block: a LayerNorm + attention branch followed by a
    LayerNorm + MLP branch, each added back onto its input (residual connection)."""
    def __init__(self):
        super().__init__()

        # attention sub-layer
        self.layernorm_1 = nn.LayerNorm(embed_dim, eps=1e-5)
        self.attn = MultiHeadAttention()

        # feed-forward sub-layer: 4x expansion, GELU, contraction
        self.layernorm_2 = nn.LayerNorm(embed_dim, eps=1e-5)

        hidden_dim = 4*embed_dim
        self.mlp_1 = nn.Linear(embed_dim, hidden_dim, bias=True)
        self.gelu = nn.GELU()
        self.mlp_2 = nn.Linear(hidden_dim, embed_dim, bias=True)

        # a single dropout module, applied to the output of both branches
        self.trn_dropout = nn.Dropout(dropout)

    def forward(self, x):
        # attention branch, added back onto the incoming activations
        attn_branch = self.attn(self.layernorm_1(x))
        after_attn = self.trn_dropout(attn_branch) + x

        # MLP branch (expansion-contraction), added back onto the attention output
        expanded = self.mlp_1(self.layernorm_2(after_attn))
        mlp_branch = self.mlp_2(self.gelu(expanded))

        return after_attn + self.trn_dropout(mlp_branch)

In [None]:
class LLM(nn.Module):
    """GPT-2 style decoder-only language model.

    forward() returns log-probabilities (log-softmax over the vocabulary), not raw
    logits, so it is meant to be paired with a negative log-likelihood loss.
    All sizes are read from the notebook-level hyperparameters.
    """
    def __init__(self):
        super().__init__()

        self.wte = nn.Embedding(n_vocab, embed_dim) # token embeddings
        self.wpe = nn.Embedding(seq_len, embed_dim) # position embeddings
        #n dropout
        self.emb_dropout = nn.Dropout(dropout)

        self.transformerBlocks = nn.Sequential(*[TransformerBlock() for _ in range(n_blocks)])

        self.layernorm_final = nn.LayerNorm(embed_dim, eps=1e-5)

        self.final_head = nn.Linear(embed_dim, n_vocab, bias=False)
        # Weight tying: the output head must reuse the *same* Parameter object as the
        # token embedding. Wrapping it in a new nn.Parameter(...) registers a second
        # parameter that aliases the same storage, so model.parameters() yields the
        # matrix twice and the optimizer updates (and weight-decays) it twice per step.
        # The state_dict keys are unchanged by this.
        self.final_head.weight = self.wte.weight

        self.apply(self.weightInits)

    def weightInits(self, module):
        """Xavier-normal init for Linear and Embedding weights; zeros for Linear biases."""
        # revisit initialization to optimize for choice of activation function
        if isinstance(module, nn.Linear):
          nn.init.xavier_normal_(module.weight)
          if module.bias is not None:
            nn.init.zeros_(module.bias)

        if isinstance(module, nn.Embedding):
          nn.init.xavier_normal_(module.weight)


    def forward(self, idx):
        """Map token ids `idx` ([batch, tokens]) to per-position log-probabilities
        over the vocabulary ([batch, tokens, n_vocab])."""
        token_embeddings = self.wte(idx)
        # create the position indices on the same device as the input, so the model
        # does not depend on the notebook-level `device` matching where it lives
        positions = torch.arange(idx.shape[-1], device=idx.device)
        pos_embeddings = self.wpe(positions)
        x = token_embeddings + pos_embeddings
        x = self.emb_dropout(x) #n dropout after summing E+P

        x = self.transformerBlocks(x)
        x = self.layernorm_final(x)

        logits = self.final_head(x)

        # logits are scaled down by sqrt(embed_dim) before the log-softmax
        outputs = F.log_softmax(logits/np.sqrt(embed_dim),dim=-1)

        return outputs

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens=50):
        """Autoregressively sample `max_new_tokens` tokens after the prompt `idx`.

        idx: [batch, tokens] tensor of token ids.
        Returns idx with the sampled tokens appended along dim 1.
        """
        # sample with dropout disabled, then restore whichever mode the model was in
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # the position table only covers seq_len tokens, so crop the context
                idx_cond = idx[:, -seq_len:]

                # forward() returns log-probabilities; keep only the last position
                log_probs = self(idx_cond)[:, -1, :]
                probs = torch.exp(log_probs)

                idx_next = torch.multinomial(probs, num_samples=1)

                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx

In [None]:

CHECKPOINT_PATH = '/content/gdrive/My Drive/my_checkpoint.pth'

def save_checkpoint(model, optimizer, epoch, loss, filepath):
    """Write a training snapshot to `filepath` with torch.save.

    The snapshot is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'; `epoch` and `loss` are stored as given.
    """
    snapshot = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(snapshot, filepath)

In [None]:
# build the network, then move its parameters onto the selected device
model = LLM()
model = model.to(device)

# to continue from a checkpoint stored on Google Drive, uncomment the lines below
#checkpoint = torch.load(CHECKPOINT_PATH, map_location=torch.device(device))
#model.load_state_dict(checkpoint['model_state_dict'])
#model.to(device)

LLM(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(256, 768)
  (emb_dropout): Dropout(p=0, inplace=False)
  (transformerBlocks): Sequential(
    (0): TransformerBlock(
      (layernorm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (QKV): Linear(in_features=768, out_features=2304, bias=True)
        (W0): Linear(in_features=768, out_features=768, bias=True)
      )
      (layernorm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp_1): Linear(in_features=768, out_features=3072, bias=True)
      (gelu): GELU(approximate='none')
      (mlp_2): Linear(in_features=3072, out_features=768, bias=True)
      (trn_dropout): Dropout(p=0, inplace=False)
    )
    (1): TransformerBlock(
      (layernorm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (QKV): Linear(in_features=768, out_features=2304, bias=True)
        (W0): Linear(in_features=768, out_features=768, bias=Tr

In [None]:
# the model outputs log-probabilities, so negative log-likelihood is the matching loss
loss_function = nn.NLLLoss()
loss_function = loss_function.to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-2,
)

In [None]:
# number of training steps; each step uses one freshly sampled batch
num_samples = 5001

# initialize losses
train_loss = []
test_loss = []

# make sure dropout (if any) is active for the optimisation steps
model.train()

for sampli in range(num_samples):

  # get a batch of data
  X,y = get_data_batch()

  # move data to GPU
  X,y = X.to(device), y.to(device)

  # clear previous gradients
  model.zero_grad(set_to_none=True)

  # forward pass
  log_probs = model(X)

  # calculate the losses on the (reshaped) targets
  loss = loss_function(log_probs.view(-1,log_probs.shape[-1]),y.view(-1))

  # backprop
  loss.backward()
  optimizer.step()

  # store the per-sample loss
  train_loss.append( loss.item() )

  if sampli%1000==0:
    # checkpoint the loss as a plain float rather than the graph-attached device tensor
    save_checkpoint(model, optimizer, sampli, train_loss[-1], CHECKPOINT_PATH)

  # evaluate the model with the test set
  if sampli%100==0:

    # evaluation mode: dropout must be off while measuring the test loss
    model.eval()
    with torch.no_grad():
      X,y = get_data_batch(False)       # False -> testset data
      X,y = X.to(device), y.to(device)  # push it to the GPU
      out = model(X)                    # forward pass
      thisloss = loss_function(out.view(-1,out.shape[-1]),y.view(-1)) # calculate loss
      test_loss.append( thisloss.item() )
    # back to training mode for the next optimisation step
    model.train()

    # update our progress :)
    print(f'Sample {sampli:4}, train loss: {train_loss[-1]:5.2f}, test loss: {test_loss[-1]:5.2f}')

Sample    0, train loss:  2.79, test loss:  6.23
Sample  100, train loss:  2.72, test loss:  6.19
Sample  200, train loss:  2.66, test loss:  6.32
Sample  300, train loss:  2.61, test loss:  6.31
Sample  400, train loss:  2.54, test loss:  6.53
Sample  500, train loss:  2.52, test loss:  6.41
Sample  600, train loss:  2.46, test loss:  6.52
Sample  700, train loss:  2.42, test loss:  6.80
Sample  800, train loss:  2.43, test loss:  6.62
Sample  900, train loss:  2.31, test loss:  6.85
Sample 1000, train loss:  2.34, test loss:  6.75
Sample 1100, train loss:  2.22, test loss:  6.70
Sample 1200, train loss:  2.18, test loss:  6.66
Sample 1300, train loss:  2.14, test loss:  7.03
Sample 1400, train loss:  2.06, test loss:  7.03
Sample 1500, train loss:  2.03, test loss:  6.90
Sample 1600, train loss:  2.03, test loss:  6.99
Sample 1700, train loss:  1.88, test loss:  7.19
Sample 1800, train loss:  1.87, test loss:  6.98
Sample 1900, train loss:  1.93, test loss:  7.11
Sample 2000, train l

In [None]:
# plot the losses
plt.plot(train_loss,'k',label='Train loss')

# The test loss is recorded once every 100 training steps, so its x positions must
# use the same stride of 100. A stride of 50 produces about twice as many x values
# as recorded losses, and plt.plot then raises a ValueError for mismatched lengths.
test_steps = range(0,num_samples,100)
plt.plot(test_steps,test_loss,'rs-',markerfacecolor='w',markersize=8,label='Test loss')

plt.legend()
plt.gca().set(xlabel='Epoch',ylabel='Loss')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# encode the prompt and add a leading batch dimension before moving it to the device
prompt = 'I find likewise that your printer has been so'
prompt_ids = tokenizer.encode(prompt)
in2gpt = torch.tensor(prompt_ids).unsqueeze(0).to(device)

# sample a short continuation and print it with carriage returns shown as newlines
output = model.generate(in2gpt,max_new_tokens=5)
generated_text = tokenizer.decode(output[0])
print(generated_text.replace('\r','\n'))

I went on holiday to Liliput and found the counten company”

 the compass of this thingrius had some children, which
