In [1]:
import torch
from torch import nn

In [2]:
class GPT2Model(nn.Module):
  def __init__(self, cfg_map):
    super().__init__()

    # embedding components
    self.emb_layer = nn.Embedding(cfg_map['vocab_size'], cfg_map['emb_dim'])
    self.pos_emb_layer = nn.Embedding(cfg_map['context_length'], cfg_map['emb_dim'])

    # huh
    self.dropout = nn.Dropout(cfg_map['drop_rate'])

    # transformer
    self.trfm_block = nn.Sequential(*[TransformerBlock(cfg_map) for i in range(cfg_map['n_layers'])])

    self.final_norm = LayerNorm(cfg_map['emb_dim'])

    # convert to logits
    self.out_head = nn.Linear(cfg_map['emb_dim'], cfg_map['vocab_size'], bias = False)

  def forward(self, in_idx):
    batch_size, seq_len = in_idx.shape
    tok_embed = self.emb_layer(in_idx)
    pos_embed = self.pos_emb_layer(torch.arange(seq_len, device = in_idx.device))
    x = tok_embed + pos_embed
    x = self.dropout(x)
    x = self.trfm_block(x)
    x = self.final_norm(x)

    logits = self.out_head(x)

    return logits

class TransformerBlock(nn.Module):
  def __init__(self, cfg_map):
    super().__init__()
    self.norm_1 = LayerNorm(cfg_map['emb_dim'])
    self.multihead_attention = MultiheadAttention(cfg_map['emb_dim'], cfg_map['emb_dim'], cfg_map['drop_rate'], cfg_map['context_length'], cfg_map['n_heads'])
    self.dropout = nn.Dropout(cfg_map['drop_rate'])
    self.norm_2 = LayerNorm(cfg_map['emb_dim'])
    self.ffw = FeedForward(cfg_map)
  def forward(self, x):
    shortcut_x  = x
    x = self.norm_1(x)
    x = self.multihead_attention(x)
    x = self.dropout(x)
    x = shortcut_x + x

    shortcut_x = x
    x = self.norm_2(x)
    x = self.ffw(x)
    x = self.dropout(x)
    x = x + shortcut_x
    return x

class LayerNorm(nn.Module):
  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.epsilon = eps

    # learnable params to tweak the layer norm
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    mean = torch.mean(x, dim = -1, keepdim = True)
    var = torch.var(x, dim = -1, keepdim = True, correction = False)
    norm_x = (x - mean) / (var + self.epsilon)**0.5
    return self.scale * norm_x + self.shift

class GELULayer(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self, x):
    return 0.5 * x * (1 + torch.tanh((2/torch.pi)**0.5 * (x + 0.044715 * x**3)))

class FeedForward(nn.Module):
  def __init__(self, cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']), # domain expansion
        GELULayer(), # just gelu for non-linear
        nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']), # domain contraction
    )

  def forward(self, x):
    return self.layers(x)

class MultiheadAttention(nn.Module):
  def __init__(self, d_in, d_out, drop_out_rate, context_length, num_heads, ena_bias = False):
    super().__init__()

    assert (d_out % num_heads == 0), \
      "d_out must be divisible by num_heads"

    self.d_in = d_in
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = self.d_out // self.num_heads

    self.W_Q = nn.Linear(d_in, d_out, bias = ena_bias)
    self.W_K = nn.Linear(d_in, d_out, bias = ena_bias)
    self.W_V = nn.Linear(d_in, d_out, bias = ena_bias)

    # projection?
    self.out_proj = nn.Linear(d_out, d_out)

    self.drop_out_layer = nn.Dropout(drop_out_rate)
    self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal = 1))

  def forward(self, x):

    batch, num_tokens, d_in = x.shape

    queries = self.W_Q(x)
    keys = self.W_K(x)
    values = self.W_V(x)

    queries = queries.view(batch, num_tokens, self.num_heads, self.head_dim)
    keys = keys.view(batch, num_tokens, self.num_heads, self.head_dim)
    values = values.view(batch, num_tokens, self.num_heads, self.head_dim)

    queries = queries.transpose(1,2)
    keys = keys.transpose(1,2)
    values = values.transpose(1,2)

    attention_score = queries @ keys.transpose(2, 3)

    mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
    attention_score.masked_fill_(mask_bool, -torch.inf)
    attention_score = attention_score / self.head_dim**0.5
    attention_weight = torch.softmax(attention_score, dim = -1)
    attention_weight = self.drop_out_layer(attention_weight)

    context_vectors = (attention_weight @ values).transpose(1, 2)

    context_vectors = context_vectors.contiguous().view(batch, num_tokens, self.d_out)

    # combs for learning relationship of head's results
    context_vectors = self.out_proj(context_vectors)

    return context_vectors


In [3]:
def simple_text_generation(model, idx, context_length, maximum_token):
  for i in range(maximum_token):
    # slice the input for acceptable input size (<= context length)
    idx = idx[:, -context_length:]

    with torch.no_grad():
      logits = model(idx)

    last_vector = logits[:, -1, :]

    best_token = torch.argmax(last_vector, dim = -1, keepdim = True)

    #print(best_token)

    idx = torch.cat((idx, best_token), dim = 1) # (batch, num_token, vocab_size)

  return idx



In [4]:
GPT_CONFIG_NEW = {
  "vocab_size": 50257, # Vocabulary size
  "context_length": 256, # Context length
  "emb_dim": 768, # Embedding dimension
  "n_heads": 12, # Number of attention heads
  "n_layers": 12, # Number of layers
  "drop_rate": 0.1, # Dropout rate
  "qkv_bias": False # Query-Key-Value bias
}

In [5]:
import tiktoken

def text_to_token_ids(text, tokenizer):
  encoded = tokenizer.encode(text,  allowed_special = {'<|endoftext|>'})
  encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dim
  return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
  # tiktoken only accept integer numpy array, never tensor
  if isinstance(token_ids, torch.Tensor):
      token_ids = token_ids.squeeze().tolist()  # [seq_len]
  return tokenizer.decode(token_ids)

In [9]:
torch.manual_seed(42)
model = GPT2Model(GPT_CONFIG_NEW)

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = simple_text_generation(
model=model,
idx=text_to_token_ids(start_context, tokenizer),
maximum_token=10,
context_length=GPT_CONFIG_NEW["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youodonicle ' directly inflamm honeyopoly Kw ply benefit


In [6]:
inputs = torch.tensor([[16833, 3626, 6100], # ["every effort moves",
                          [40, 1107, 588]]) # "I really like"]

targets = torch.tensor([[3626, 6100, 345 ], # [" effort moves you",
                        [1107, 588, 11311]]) # " really like chocolate"]

In [10]:
with torch.no_grad(): # disable training
  logits = model(inputs)
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [11]:
# predicted token ids
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[24851],
         [  406],
         [17670]],

        [[29716],
         [48014],
         [44693]]])


In [12]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1:"
f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1: etti L variants


In [13]:
targets[0]

tensor([3626, 6100,  345])

In [14]:
probas

tensor([[[5.7738e-05, 1.5567e-05, 6.4669e-06,  ..., 1.0048e-05,
          2.0194e-05, 1.2653e-05],
         [2.4166e-05, 1.5800e-05, 1.4866e-05,  ..., 5.5020e-06,
          8.9726e-06, 5.9605e-05],
         [1.3420e-05, 1.3131e-05, 2.8548e-05,  ..., 1.6662e-05,
          3.3616e-06, 4.7226e-05]],

        [[1.8145e-05, 3.0270e-05, 2.1098e-05,  ..., 2.1022e-05,
          2.8455e-05, 4.1221e-05],
         [9.5000e-06, 3.1086e-05, 1.0641e-05,  ..., 1.5377e-05,
          1.9063e-05, 5.9135e-05],
         [2.5252e-05, 9.3113e-06, 1.8893e-05,  ..., 9.4687e-06,
          4.9631e-06, 9.6070e-06]]])

In [15]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)
text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

# text_idx: batch index
# [0, 1, 2]: token index
# targets[]: take the target probability.

Text 1: tensor([3.7068e-06, 2.0301e-05, 1.7742e-05])
Text 2: tensor([1.4735e-05, 2.3505e-05, 6.3795e-06])


In [16]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([-12.5053, -10.8049, -10.9396, -11.1253, -10.6583, -11.9624])


In [17]:
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-11.3326)


In [18]:
neg_avg_probas = avg_log_probas * -1
neg_avg_probas

tensor(11.3326)

In [None]:
ce_logits = torch.flatten(logits, start_dim = 0, end_dim = 1)
ce_logits.shape

In [None]:
ce_targets = torch.flatten(targets)
ce_targets.shape

In [None]:
loss = torch.nn.functional.cross_entropy(ce_logits, ce_targets)
loss

In [None]:
with open("/content/the-verdict.txt", "r", encoding = "utf-8") as file:
    raw_text = file.read()
print(len(raw_text))
raw_text[:99]

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
p = [1, 2, 3, 4, 5, 6, 7]
p[0:3]

In [None]:
class GPTDataset(Dataset):
  def __init__(self, text, tokenizer, context_length, stride):
    self.input_chunks = []
    self.output_chunks = []

    encoded_text = tokenizer.encode(text)

    for i in range(0, len(encoded_text) - context_length, stride):
      input = encoded_text[i : i + context_length]
      output = encoded_text[i + 1 : i + context_length + 1]

      self.input_chunks.append(torch.tensor(input))
      self.output_chunks.append(torch.tensor(output))

  def __len__(self):
    return len(self.input_chunks)

  def __getitem__(self, index):
    return self.input_chunks[index], self.output_chunks[index]

In [None]:
def create_dataloader_v1(text, context_length, stride, batch_size, shuffle, drop_last, num_workers):
  tokenizer = tiktoken.get_encoding("gpt2")

  dataset = GPTDataset(text, tokenizer, context_length, stride)

  dataloader = DataLoader(
      dataset = dataset,
      batch_size = batch_size,
      shuffle = shuffle,
      drop_last = drop_last,
      num_workers = num_workers
  )
  return dataloader

In [None]:
train_ratio = 0.8
split_idx = int(len(raw_text) * train_ratio)
train_text = raw_text[:split_idx]
val_text = raw_text[split_idx:]

torch.manual_seed(123)

In [None]:
train_loader = create_dataloader_v1(
    text = train_text,
    context_length = GPT_CONFIG_NEW['context_length'],
    stride = GPT_CONFIG_NEW['context_length'],
    batch_size = 2,
    shuffle = True,
    drop_last = True,
    num_workers = 0
)

val_loader = create_dataloader_v1(
    text = val_text,
    context_length = GPT_CONFIG_NEW['context_length'],
    stride = GPT_CONFIG_NEW['context_length'],
    batch_size = 2,
    shuffle = True,
    drop_last = True,
    num_workers = 0
)

In [None]:
total_tokens = len(tokenizer.encode(raw_text))

if total_tokens * train_ratio < GPT_CONFIG_NEW['context_length']:
  print("shit")
else:
  print("ok")

In [None]:
if total_tokens * (1-train_ratio) < GPT_CONFIG_NEW['context_length']:
  print("shit")
else:
  print("ok")

In [None]:
print("Train loader:")
for x, y in train_loader:
  print(x.shape, y.shape)
print("\nValidation loader:")
for x, y in val_loader:
  print(x.shape, y.shape)

In [None]:
def calc_loss_batch(input, target, model, device):
  input_batch = input.to(device)
  target_batch = target.to(device)

  logits = model(input_batch)

  loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target.flatten())

  return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
  total_loss = 0.
  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader) # iter all the batch if user didnt specify the 'num_batches'
  else:
    num_batches = min(num_batches, len(data_loader)) # if user give smaller 'num_batches' than actual length

  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i < num_batches:
      loss = calc_loss_batch(input_batch, target_batch, model, device)

      total_loss += loss.item()

    else:
      break

  return total_loss / num_batches

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
  train_loss = calc_loss_loader(train_loader, model, device)
  val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

In [None]:
def evaluate_model(model, train_loader, val_loader, eval_iter, device):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, eval_iter)

    val_loss = calc_loss_loader(val_loader, model, device, eval_iter)

  model.train()
  return train_loss, val_loss

In [None]:
def generate_and_print_sample(model, tokenizer, device, start_context):
  model.eval()
  context_length = model.pos_emb_layer.weight.shape[0]
  encoded = text_to_token_ids(start_context, tokenizer)
  with torch.no_grad():
    tokens = simple_text_generation(model, encoded, context_length, 50)

  decoded_text = token_ids_to_text(tokens, tokenizer)
  print(decoded_text.replace("\n", " "))
  model.train()

In [None]:
def train_model_simple(model, tokenizer, device, start_context, train_loader,
                       val_loader, eval_iter, train_epochs, eval_freq, optimizer):
  train_losses, val_losses, track_tokens_seen = [], [], []
  tokens_seen, global_step = 0, -1
  for epoch in range(train_epochs):
    model.train()
    for (input_batch, target_batch) in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch(input_batch, target_batch, model, device)
      loss.backward()
      optimizer.step()

      tokens_seen += input_batch.numel()
      global_step += 1

      if global_step % eval_freq == 0: #6
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, eval_iter, device)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        track_tokens_seen.append(tokens_seen)
        print(f"Ep {epoch+1} (Step {global_step:06d}): "
              f"Train loss {train_loss:.3f}, "
              f"Val loss {val_loss:.3f}"
        )
    generate_and_print_sample( #7
    model, tokenizer, device, start_context
    )

  return train_losses, val_losses, track_tokens_seen

In [None]:
torch.manual_seed(123)
dumb_gpt = GPT2Model(GPT_CONFIG_NEW)
start_context = "My name is"
optimizer = torch.optim.AdamW(dumb_gpt.parameters(), lr = 0.0004, weight_decay = 0.1)

train_losses, val_losses, track_tokens_seen = train_model_simple(
    model = dumb_gpt, tokenizer = tokenizer, device = device,
    start_context = start_context, train_loader = train_loader,
    val_loader = val_loader, eval_iter = 5, train_epochs = 10,
    eval_freq = 5, optimizer = optimizer
)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
num_epochs = 10
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
  fig, ax1 = plt.subplots(figsize=(5, 3))
  ax1.plot(epochs_seen, train_losses, label="Training loss")
  ax1.plot(
  epochs_seen, val_losses, linestyle="-.", label="Validation loss"
  )
  ax1.set_xlabel("Epochs")
  ax1.set_ylabel("Loss")
  ax1.legend(loc="upper right")
  ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
  ax2 = ax1.twiny() #1
  ax2.plot(tokens_seen, train_losses, alpha=0) #2
  ax2.set_xlabel("Tokens seen")
  fig.tight_layout()
  plt.show()
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, track_tokens_seen, train_losses, val_losses)

In [None]:
torch.manual_seed(42)

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = simple_text_generation(
model=dumb_gpt,
idx=text_to_token_ids(start_context, tokenizer),
maximum_token=25,
context_length=GPT_CONFIG_NEW["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [None]:
def generate(model, idx, context_length, maximum_token, temperature = 0.0, topk = None, eos_id = None):
  for i in range(maximum_token):
    # slice the input for acceptable input size (<= context length)
    idx = idx[:, -context_length:]

    with torch.no_grad():
      logits = model(idx)

    last_vector = logits[:, -1, :]

    if topk is not None:
      top_logits, _ = torch.topk(last_vector, topk)

      min_val = top_logits[:, -1]

      last_vector = torch.where(
          last_vector < min_val,
          torch.tensor(float('-inf')).to(last_vector.device),
          last_vector
      )

    if temperature > 0.0: #3
      last_vector = last_vector / temperature
      probs = torch.softmax(last_vector, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
    else:
      idx_next = torch.argmax(last_vector, dim = -1, keepdim = True)

    if idx_next == eos_id:
      break

    idx = torch.cat((idx, idx_next), dim = 1) # (batch, num_token, vocab_size)

  return idx



In [None]:
torch.manual_seed(123)

start_context = "Plato is drinking"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate(
model=dumb_gpt,
idx=text_to_token_ids(start_context, tokenizer),
maximum_token=25,
context_length=GPT_CONFIG_NEW["context_length"],
temperature = 1.4,
topk = 5,
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [None]:
# save
torch.save(dumb_gpt.state_dict(), "dumb_gpt.pth")

In [19]:
# load
model = GPT2Model(GPT_CONFIG_NEW)
model.load_state_dict(torch.load("model.pth", map_location=device))
model.eval()

NameError: name 'device' is not defined

In [20]:
torch.save({
  "model_state_dict": model.state_dict(),
  "optimizer_state_dict": optimizer.state_dict(),
  },
  "model_and_optimizer.pth"
)
# save both optimizer and model

NameError: name 'optimizer' is not defined

In [21]:
# load from checkpoint
checkpoint = torch.load("model_and_optimizer.pth", map_location=device)
model = GPT2Model(GPT_CONFIG_NEW)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train();

NameError: name 'device' is not defined