In [2]:
import torch
import tiktoken
import numpy as np
# GPT-2 BPE encoding object obtained from tiktoken.
# NOTE(review): no visible cell reads `encoder`; the tokenization cell below
# requests the same "gpt2" encoding again under its own name.
encoder = tiktoken.get_encoding("gpt2")
# Pick the compute device. Apple's MPS backend (the notebook's original,
# hard-coded choice) is preferred when present; otherwise fall back to CUDA and
# finally CPU so the notebook still runs on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
block_size = 32  # number of tokens in each sampled window
batch_size = 8   # number of windows sampled per batch

## Read and tokenize text

In [None]:
# Read the whole corpus into memory as one string.
with open("input.txt","r", encoding="utf-8") as f:
    data = f.read()

# Character-level 90/10 split: the leading 90% is for training, the tail for validation.
n = len(data)
split_at = int(0.9*n)
train_data, val_data = data[:split_at], data[split_at:]

# Encode each split with tiktoken's gpt2 BPE (training text first, then validation).
enc = tiktoken.get_encoding("gpt2")
train_ids, val_ids = (enc.encode_ordinary(text) for text in (train_data, val_data))

# Convert the id lists to uint16 arrays and dump them as raw binaries.
train_ids = np.array(train_ids, dtype=np.uint16)
train_ids.tofile("train.bin")
val_ids = np.array(val_ids, dtype=np.uint16)
val_ids.tofile("val.bin")

# Load and test the model

## Better understanding of the data

In [3]:
def get_batch(split):
    """Sample a random batch of input/target token windows from disk.

    Args:
        split: "train" reads train.bin; any other value reads val.bin.

    Returns:
        A tuple (x, y) of int64 tensors moved to `device`. Each stacks
        `batch_size` windows of `block_size` tokens, and every row of y is
        the corresponding row of x shifted one position later in the file.
    """
    path = "train.bin" if split == "train" else "val.bin"
    # The memmap is opened afresh on every call.
    tokens = np.memmap(path, dtype=np.uint16, mode="r")

    def window(start):
        # Copy one block_size-long slice out of the memmap as int64.
        return torch.from_numpy((tokens[start:start+block_size]).astype(np.int64))

    # Start offsets are drawn so that the shifted target window still fits.
    starts = torch.randint(len(tokens) - block_size, (batch_size,))
    x = torch.stack([window(i) for i in starts])
    y = torch.stack([window(i + 1) for i in starts])
    return x.to(device), y.to(device)

In [4]:
# Memory-map the training token file (read-only, uint16) and peek at the first 130 ids.
# Note: this rebinds `data`, which held the raw text in the tokenization cell.
data = np.memmap("train.bin", dtype=np.uint16, mode="r")
data[:130]

memmap([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,
           11,  3285,   502,  2740,    13,   198,   198,  3237,    25,
          198,  5248,   461,    11,  2740,    13,   198,   198,  5962,
        22307,    25,   198,  1639,   389,   477, 12939,  2138,   284,
         4656,   621,   284,  1145,   680,    30,   198,   198,  3237,
           25,   198,  4965,  5634,    13, 12939,    13,   198,   198,
         5962, 22307,    25,   198,  5962,    11,   345,   760,   327,
         1872,   385,  1526, 28599,   318,  4039,  4472,   284,   262,
          661,    13,   198,   198,  3237,    25,   198,  1135,   760,
          470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,
          356,  1183,   423, 11676,   379,   674,   898,  2756,    13,
          198,  3792,   470,   257, 15593,    30,   198,   198,  3237,
           25,   198,  2949,   517,  3375,   319,   470,    26,  1309,
      

In [5]:
# Draw batch_size random start offsets, using the same call get_batch makes internally.
ix = torch.randint(len(data)-block_size, (batch_size,))
ix

tensor([173880,  79329, 300612, 263019, 209280, 216553, 247104,   5622])

In [6]:
# Number of tokens in the training file next to the window length.
len(data), block_size

(301966, 32)

In [7]:
# Sample one training batch to inspect in the following cells.
x_tr, y_tr=get_batch("train")

In [8]:
# Inputs and targets should both be (batch_size, block_size).
x_tr.shape, y_tr.shape

(torch.Size([8, 32]), torch.Size([8, 32]))

In [9]:
# First three input windows of the sampled batch, as raw token ids.
x_tr[:3]

tensor([[ 4467,   286, 11906,  1751,  1364, 17903,    25,   198,  1537,  1918,
         22027, 43388,  1549,   616,  5229,   422,  6164,  5101,    11,   198,
          1870,   458,  1347,  1549,   734,  1067,   315,  2052,   422,   616,
         46299, 21755],
        [  373,   339,    13,   198,   198,  8763,  2606,    34,  1546,  5781,
            25,   198,   464,  2116, 31642,  1438,    11,   475,   530,   286,
          1365,  3450,    13,   198,   198,    43,  2885,    56,  3537, 12161,
            25,   198],
        [  198,  1639,   550,   881, 50129,   284,   787,   465, 18021,  1745,
            25,   198,  2215,   345,  3350,   503,    11,   340,   991,  1625,
          1363,    13,   198,   198,  2538, 35830,  1546,    25,   198, 11633,
           301,  3465]], device='mps:0')

## Passing data through the model

In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from model import GPTConfig, GPT
# Build the model from a GPTConfig constructed with no arguments and move it to `device`.
config = GPTConfig()
model = GPT(config=config).to(device)

No. of paramaters:  3.31808


In [11]:
# Forward pass on the sampled batch with targets supplied; the call's result is
# unpacked into two values, named `output` and `loss` here.
output, loss = model(x_tr, y_tr)
output

tensor([[[-2.3952e-02, -3.3385e-04,  1.7458e-02,  ...,  3.2066e-01,
           1.4161e-01,  1.8533e-01],
         [-3.2040e-02,  8.3955e-02, -1.1653e-01,  ..., -1.4415e-01,
          -1.0661e-01,  1.3936e-01],
         [-4.5587e-03,  2.4526e-01, -1.3623e-01,  ..., -4.5750e-04,
          -7.6701e-02,  3.0728e-01],
         ...,
         [-7.0332e-02, -1.2510e-01, -4.0615e-01,  ...,  3.6386e-01,
           1.0711e-01,  5.3783e-01],
         [-3.2356e-01,  1.3388e-01,  2.3756e-01,  ..., -1.5389e-01,
          -3.9181e-03, -2.8393e-02],
         [ 2.1391e-01,  2.0663e-02, -2.5889e-01,  ..., -1.9707e-01,
           1.1265e-01,  3.3093e-01]],

        [[ 1.9286e-01,  5.3123e-02, -6.7029e-02,  ..., -1.9901e-02,
           1.4604e-01, -5.5481e-02],
         [-3.5802e-01, -1.9375e-01,  9.9082e-02,  ...,  9.4947e-02,
          -2.1083e-02, -2.0803e-01],
         [ 1.3091e-01,  2.4485e-01, -4.4572e-02,  ..., -1.8932e-01,
           1.0870e-01, -1.0794e-03],
         ...,
         [-2.0336e-01,  3

In [12]:
# Size of the model output for the sampled batch.
output.size()

torch.Size([8, 32, 50304])

In [13]:
# Optimizer hyperparameters; the next cell passes them to model.configure_optimizers.
learning_rate = 6e-4
weight_decay = 1e-1
# beta1 and beta2 are handed over together as the tuple (beta1, beta2).
beta1 = 0.9
beta2 = 0.95

In [14]:
# Let the model build its optimizer from the weight decay, learning rate and beta pair.
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2))

num decayed parameter tensors: 10, with 3,319,808 parameters
num non-decayed parameter tensors: 5, with 320 parameters


In [15]:
# Shape of the flat training token stream next to the shape of one sampled batch.
data.shape, x_tr.shape

((301966,), torch.Size([8, 32]))

In [16]:
# First input window of the sampled batch.
x_tr[0]

tensor([ 4467,   286, 11906,  1751,  1364, 17903,    25,   198,  1537,  1918,
        22027, 43388,  1549,   616,  5229,   422,  6164,  5101,    11,   198,
         1870,   458,  1347,  1549,   734,  1067,   315,  2052,   422,   616,
        46299, 21755], device='mps:0')

In [17]:
# Maps the zero-based epoch index to the loss recorded at the end of that epoch.
loss_dict = {}

num_epochs = 10
# Optimizer updates performed per reported "epoch". An "epoch" here is just this
# many randomly sampled batches, not a full pass over train.bin.
steps_per_epoch = 2

for epoch in range(num_epochs):

    for _ in range(steps_per_epoch):
        x_tr, y_tr = get_batch("train")
        optimizer.zero_grad()
        logits, loss = model(x_tr, y_tr)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Read the scalar once and reuse it for both the log line and the record.
    # This is the loss of the epoch's last batch only, not an average.
    epoch_loss = loss.item()
    print(f"Epoch: [{epoch+1}/{num_epochs}], Loss: {np.round(epoch_loss,4)}")
    loss_dict[epoch] = epoch_loss

Epoch: [1/10], Loss: 10.7726
Epoch: [2/10], Loss: 10.6957
Epoch: [3/10], Loss: 10.591
Epoch: [4/10], Loss: 10.51
Epoch: [5/10], Loss: 10.4492
Epoch: [6/10], Loss: 10.3302
Epoch: [7/10], Loss: 10.1916
Epoch: [8/10], Loss: 10.1491
Epoch: [9/10], Loss: 10.0197
Epoch: [10/10], Loss: 9.986
