In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [5]:
import torch
from model import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)

# load model from checkpoint
# m = load_model_from_checkpoint(Transformer,vocab_size=vocab_size)

# example to decode sequence
# enc_sec = m.generate(idx=torch.zeros((1,1), dtype=torch.long),
# max_new_tokens=20)[0].tolist()
# print(decode(vocab=vocab, enc_sec=enc_sec))

# raw data
path_do_data = "data/train_noi.txt"
# Read through a context manager so the file handle is closed as soon as the
# text is loaded instead of being left for the garbage collector.
with open(path_do_data, encoding="utf-8") as data_file:
    data_raw = data_file.read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
# NOTE(review): encoding the whole corpus in one call is what triggers the
# tokenizer's "sequence length is longer than the specified maximum" warning;
# presumably harmless here because the ids are fed to our own Transformer in
# BLOCK_SIZE-sized batches rather than to BERT -- confirm against utils.encode.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# train a new model
model = Transformer(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)
# load model to GPU if available
m = model.to(DEVICE)
# print the number of parameters in the model
num_params = sum(p.numel() for p in m.parameters())
print("Model with {:.2f}M parameters".format(num_params / 1e6))


Token indices sequence length is longer than the specified maximum sequence length for this model (54325 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters


In [6]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)

# NOTE(review): in the recorded run the validation loss is lowest at step 500
# (8.31) and climbs to 11.64 by the last step while the train loss drops to
# ~0.11, i.e. the model overfits. Consider early stopping or checkpointing on
# the best validation loss.
for step in range(MAX_ITER):

    # every EVAL_INTER evaluate the loss on train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module itself instead of .forward(): nn.Module.__call__ is the
    # supported entry point and also runs any registered hooks.
    logits, loss = m(xb, yb)
    # zero_grad() method sets the gradients of all parameters in the optimizer to zero
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()

step          0 | train loss 10.7617 | val loss 10.7851
step        500 | train loss 0.3862 | val loss 8.3055
step       1000 | train loss 0.1628 | val loss 9.5971
step       1500 | train loss 0.1379 | val loss 10.1851
step       2000 | train loss 0.1324 | val loss 10.0830
step       2500 | train loss 0.1219 | val loss 10.8053
step       3000 | train loss 0.1253 | val loss 11.0654
step       3500 | train loss 0.1147 | val loss 11.1153
step       4000 | train loss 0.1104 | val loss 11.3658
step       4500 | train loss 0.1136 | val loss 11.3534
step       4999 | train loss 0.1147 | val loss 11.6401


In [None]:
# Hand the trained model to the checkpoint helper imported from utils.
# `step` is the loop variable left over from the training cell, so it holds
# the index of the last iteration that ran.
save_model_to_chekpoint(
    model=m,
    path_to_checkpoint="checkpoint",
    epoch=step,
)

In [15]:
# generate some output based on the context
contexts = [torch.zeros((1, 1), dtype=torch.long, device=DEVICE),
            torch.tensor([[2000, 2003]], dtype=torch.long, device=DEVICE),
            torch.tensor([[3001,2000,2013]], dtype=torch.long, device=DEVICE),
            torch.tensor([[2836,5054]], dtype=torch.long, device=DEVICE),
            torch.tensor([[8927, 2443]], dtype=torch.long, device=DEVICE),
            torch.tensor([[1996,1997,4106]], dtype=torch.long, device=DEVICE),
            torch.tensor([[3785, 2138]], dtype=torch.long, device=DEVICE),
            torch.tensor([[2006, 9932]], dtype=torch.long, device=DEVICE),
            torch.tensor([[5461, 7778]], dtype=torch.long, device=DEVICE),
            torch.tensor([[11365, 1010]], dtype=torch.long, device=DEVICE)
            ]
for context in contexts:
  print("output:",
      decode(
          enc_sec=m.generate(idx=context, max_new_tokens=50, block_size=BLOCK_SIZE)[0],
          tokenizer=tokenizer,
      )
  )

output: [PAD] image of a matchstick. as a casual user of chatgpt or another generative model, you may well have even less of an idea of what the initial training data consisted of. ask chatgpt where its data comes from, and it
output: to is that they can be trained automatically and are simple and computationally feasible to use. in speech recognition, the hidden markov model would output a sequence of n - dimensional real - valued vectors ( with n being a small integer, such as 10 ),
output: systems to from driverless cars will be perceived by a human being interacting with an artificial entity that closely ( though imperfectly ) resembles another human. unsupervised learning : a branch of machine learning which, as the name suggests, blends elements of both
output: performancesentangles the underlying factors of variation that explain the observed data. free and open - source software feature learning is motivated by the fact that machine learning tasks such as classification often r

In [13]:
# Sample 50 new tokens starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

# Sample in eval mode (dropout off) and without autograd bookkeeping, then
# put the model back into whatever mode it was in before this cell ran.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        generated = m.generate(
            idx=context, max_new_tokens=50, block_size=BLOCK_SIZE
        )[0]
finally:
    m.train(was_training)

print(decode(enc_sec=generated, tokenizer=tokenizer))

[PAD] and their related methods " in order to " understand and analyse actual phenomena " with data. it uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge. however, generative
