In [1]:
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)

In [2]:
import torch
from transformers import AutoTokenizer 

In [3]:
# Load the raw training corpus. The context manager guarantees the file
# handle is closed as soon as the text has been read (the bare
# open(...).read() form leaves closing to the garbage collector).
path_do_data = "data/english.txt"
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()

# Reuse the pretrained BERT tokenizer instead of building a vocabulary here;
# its vocabulary size is passed to the model as vocab_size.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# Train/val split: encode the whole corpus once, then cut it at 90%.
# NOTE(review): the tokenizer warns that the encoded sequence is longer than
# 512 tokens. That limit presumably belongs to the BERT model, which is not
# used here - confirm against utils.encode.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


In [4]:
from model_mix import Transformer 
# Collect the constructor arguments first so the configuration can be read
# (and inspected) in one place before the model is built.
# NOTE(review): device=2 is hard-coded while the module itself is moved to
# DEVICE below - confirm what Transformer expects for `device`.
model_kwargs = dict(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_layers=NUM_LAYER,
    num_heads=NUM_HEAD,
    dropout=DROPOUT,
    algorithm='GPT',
    device=2,
)
model = Transformer(**model_kwargs).to(DEVICE)

# Report the model size: total number of parameter elements, in millions.
param_count = sum(weights.numel() for weights in model.parameters())
print("Model with {:.2f}M parameters".format(param_count / 1e6))

Model with 89.48M parameters


In [5]:
# AdamW takes the model's parameters and the learning rate and updates the
# parameters on every optimisation step in order to minimise the loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# NOTE: this overrides the MAX_ITER value imported from utils for this run.
MAX_ITER = 2500

# NOTE(review): in the logged output the validation loss rises after step 250
# while the train loss keeps falling, i.e. the model overfits this dataset.
# Consider checkpointing on the best validation loss or stopping early.
for step in range(MAX_ITER):

    # Every EVAL_INTER steps, and on the final step, report the loss on the
    # train and validation sets.
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=model, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=model, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))
        # estimate_loss lives in utils and is not visible here; it may switch
        # the model to eval mode. Re-assert training mode so dropout is
        # active for the optimisation steps that follow.
        model.train()

    # Sample a batch of data.
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module itself rather than .forward() so that any registered
    # forward hooks are run.
    logits, loss = model(xb, yb)
    # Reset gradients from the previous step; set_to_none=True sets the
    # gradients to None instead of zero-filling them.
    optimizer.zero_grad(set_to_none=True)
    # Back-propagate: compute gradients of the loss with respect to the
    # model's parameters.
    loss.backward()
    # Apply the parameter update using the freshly computed gradients.
    optimizer.step()

step          0 | train loss 10.7195 | val loss 10.7112
step        250 | train loss 2.4492 | val loss 6.5238
step        500 | train loss 0.5321 | val loss 8.1349
step        750 | train loss 0.2151 | val loss 9.0535
step       1000 | train loss 0.1613 | val loss 9.6794
step       1250 | train loss 0.1431 | val loss 9.8860
step       1500 | train loss 0.1385 | val loss 10.0478
step       1750 | train loss 0.1408 | val loss 10.1481
step       2000 | train loss 0.1284 | val loss 10.6126
step       2250 | train loss 0.1310 | val loss 10.6251
step       2499 | train loss 0.1259 | val loss 10.4719
