In [1]:
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"context_length": 256, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 12, # Number of layers
"drop_rate": 0.1, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
}

In [2]:
from dataset import text_to_token_ids,token_ids_to_text
import tiktoken
import torch

tokenizer = tiktoken.get_encoding("gpt2")
text1 = "every effort moves"
text2 = "I really like"

text1_encoded = text_to_token_ids(text1,tokenizer)
text2_encoded = text_to_token_ids(text2,tokenizer)

inputs = torch.vstack((text1_encoded,text2_encoded))
print("inputs are :")
print(inputs)

target1 =  " effort moves you"
target2 = " really like chocolate"

target1_encoded = text_to_token_ids(target1,tokenizer)
target2_encoded = text_to_token_ids(target2,tokenizer)

targets = torch.vstack((target1_encoded,target2_encoded))
print("targets are :")
print(targets)

  cpu = _conversion_method_template(device=torch.device("cpu"))


inputs are :
tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
targets are :
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [3]:
from llms import GPT2
model = GPT2(GPT_CONFIG_124M)
with torch.no_grad():
    logits = model(inputs)
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [4]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)   

Token IDs:
 tensor([[[ 7001],
         [24651],
         [10481]],

        [[16491],
         [28345],
         [17464]]])


In [5]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1:"
f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Olymp sexist beach


In [6]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)
text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([1.9182e-05, 1.1305e-05, 1.3430e-05])
Text 2: tensor([2.4795e-05, 3.0446e-05, 2.1681e-05])


In [7]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)
avg_log_probas = torch.mean(log_probas) * -1 # -1 for NLL
print(avg_log_probas)

tensor([-10.8615, -11.3903, -11.2180, -10.6049, -10.3996, -10.7391])
tensor(10.8689)


In [8]:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [9]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)
perplexity = torch.exp(loss)
print(perplexity)

tensor(10.8689)
tensor(52516.9766)


In [10]:
file_path = "data/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


In [11]:
train_ratio = 0.9
split_point = int(0.9 * total_characters)

data_train = text_data[:split_point]
data_val = text_data[split_point:]



In [12]:
from dataset import create_dataloader
torch.manual_seed(123)
train_loader = create_dataloader(
    data_train,
    batch_size=2,
    max_len=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
)
val_loader = create_dataloader(
    data_val,
    batch_size=2,
    max_len=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
)

In [13]:
print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)
print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [14]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    logits = model(input_batch) #(batch_size,context_len,vocab_size)

    # we will use torch.nn.functional 
    # we must do some dimensional arrangements
    # target has size of (batch_size,context_len)
    # logits has size of #(batch_size,context_len,vocab_size)
    # We must transform targets to (batch_size * context_len)
    # We must transform logits to (batch_size * context_len, vocab_size)
    # Look Example small for cell below

    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )
    return loss

In [15]:
batch_size = 2
context_len = 2
vocab_size = 3

logits = torch.randn((batch_size,context_len,vocab_size))
targets = torch.randint(0,vocab_size,(batch_size,context_len))

print(logits)
print("-----------------------------")
print(targets)

print(targets.flatten())
print(logits.flatten(0,1))

tensor([[[ 0.9447,  0.6217, -1.3501],
         [-0.1881, -2.3891, -0.4759]],

        [[ 1.7603,  0.6547,  0.5490],
         [ 0.3671,  0.1219,  0.6466]]])
-----------------------------
tensor([[0, 1],
        [0, 1]])
tensor([0, 1, 0, 1])
tensor([[ 0.9447,  0.6217, -1.3501],
        [-0.1881, -2.3891, -0.4759],
        [ 1.7603,  0.6547,  0.5490],
        [ 0.3671,  0.1219,  0.6466]])


In [16]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader) == 0:
        # no data
        return float("nan")
    elif num_batches == None:
        # if num_batches is 0 then whole data
        num_batches = len(data_loader)
    else:
        num_batches = min(len(data_loader),num_batches)
    

    for i,(inp,target) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(inp, target, model, device)
            total_loss = loss.item()
        else:
            break
    return total_loss





In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "mps")
model.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 11.003658294677734
Validation loss: 10.96114730834961


In [19]:
from train import train_model_simple
torch.manual_seed(123)
optimizer = torch.optim.AdamW(
model.parameters(),
lr=0.0004, weight_decay=0.1
)
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
model, train_loader, val_loader, optimizer, device,
num_epochs=num_epochs, eval_freq=5, eval_iter=5,
start_context="I want cake", tokenizer=tokenizer
)

Epoch 0, Step 0, Train Loss 1.6445178985595703, Val Loss 6.760586738586426
Epoch 0, Step 5, Train Loss 0.9228534698486328, Val Loss 6.630193710327148
I want cake, I felt to go a little wild--I felt nervous and uncertain.  "Once, when I looked up, I felt to see a smile behind his close grayish beard--as if he had the donkey. "There were days when I couldn't look at that it. . . . . . . . . . . . . .  "Oh, my own lair," he said.      "Ah, I was growing like the _rose Dub
Epoch 1, Step 10, Train Loss 0.5872092247009277, Val Loss 6.589439868927002
Epoch 1, Step 15, Train Loss 0.3463591933250427, Val Loss 6.648420333862305
I want cake, in the inevitable garlanded frame. The mere outline of the frame called up all Gisburn's past!  He laughed again, and threw back his head to look up at the sketch of the donkey. "strongest," I found her so. Gisburn--couldn't face it. But I had given up his painting because he had married her.   "--that was what the women called it was not till after that even