In [1]:
import torch

# Load the whole training corpus into memory as a single string.
# The encoding is pinned so the read does not depend on the platform's default locale.
with open("../data/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(f"Text length: {len(text)}")

# Preview the first 60 characters as a quick sanity check on what was loaded.
first_60 = text[:60]
print(first_60)


Text length: 1115393
First Citizen:
Before we proceed any further, hear me speak.


## Tokens

In [68]:
import tiktoken
from src.utils.utils import decode

# Tokenize the corpus with GPT-2's BPE vocabulary.
# (For GPT-4's tokenizer use: tiktoken.get_encoding("cl100k_base"))
enc = tiktoken.get_encoding("gpt2")

# Encode the full text once and keep the token ids as an integer tensor.
# dtype is explicit so the tensor stays integer-typed even if `tokens` is empty.
tokens = enc.encode(text)
data = torch.tensor(tokens, dtype=torch.long)

# tiktoken owns the id <-> string mapping, so the vocabulary size is read from the encoding.
vocab_size = enc.n_vocab  # 50257 for the "gpt2" encoding


def decode(token_ids):
    """Convert a list of token ids back into text using the tiktoken encoding.

    NOTE(review): this intentionally replaces the `decode` imported from
    src.utils.utils at the top of this cell, so later cells use tiktoken.
    """
    return enc.decode(token_ids)


# Hold out the last 10% of the token stream for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [69]:
from src.utils.utils import BatchLoader, Evaluator
from src.utils.trainer import Trainer
from src.utils.models import BigramLanguageModel


# Seed torch's global RNG so weight initialisation (and any torch-based sampling)
# repeats from run to run.
# NOTE(review): if BatchLoader draws batches with another RNG (numpy / random),
# that one needs seeding as well — confirm against src.utils.utils.
SEED = 1337
torch.manual_seed(SEED)

# Hyperparameters for the token-level model. The values are deliberately small;
# the trailing notes record the larger configuration they were scaled down from.
batch_size = 32
block_size = 16      # kept short here; presumably the context length in tokens
n_embed = 64         # 384 is the standard size used in smaller transformers
n_heads = 4          # 6 in the larger configuration
n_layer = 1          # 6 in the larger configuration
eval_interval = 200  # matches the spacing of the "step N" lines printed during training
max_iters = 2000
learning_rate = 3e-3
dropout = 0.1

# Separate loaders over the train and validation token streams.
train_loader = BatchLoader(train_data, block_size=block_size, batch_size=batch_size)
val_loader = BatchLoader(val_data, block_size=block_size, batch_size=batch_size)

# Build the model from the tokenizer's vocabulary size and the hyperparameters above.
model = BigramLanguageModel(vocab_size, n_embed, block_size, n_layer, n_heads, dropout)

# Training components: optimizer, loss evaluator, and the trainer that drives the loop.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
evaluator = Evaluator(model, train_loader, val_loader)
trainer = Trainer(model, optimizer, train_loader, evaluator, max_iters, eval_interval)

# Run training; keep whatever the trainer returns for later inspection.
final_losses = trainer.train()

step 0: train loss 10.7429, val loss 10.7474
step 200: train loss 2.3959, val loss 2.3941
step 400: train loss 2.2412, val loss 2.2554
step 600: train loss 2.1781, val loss 2.2156
step 800: train loss 2.1156, val loss 2.1630
step 1000: train loss 2.0836, val loss 2.1245
step 1200: train loss 2.0486, val loss 2.1220
step 1400: train loss 2.0211, val loss 2.1144
step 1600: train loss 2.0097, val loss 2.0934
step 1800: train loss 1.9813, val loss 2.0685


In [70]:
# Sample 500 new tokens from the trained model, seeded with a single token of id 0.
context = torch.zeros((1, 1), dtype=torch.long)

# The model was built with dropout, which stays active in training mode and would
# perturb activations while sampling, so switch to eval mode for generation.
# Gradients are not needed here either. The previous mode is restored afterwards
# so that any later training cell is unaffected.
# NOTE(review): assumes BigramLanguageModel is a torch.nn.Module — confirm.
was_training = model.training
model.eval()
with torch.no_grad():
    generated_ids = model.generate(context, max_new_tokens=500)[0].tolist()
model.train(was_training)

generated_text = decode(generated_ids)
print(generated_text)

!!=2;42;F+!;PUNLKZLHUZTLK)!;V]L"[V"SH[L!UH[,"&THY["^V\SK"WHUL"RUUHUKTHU"UVPK"HSS"^PSS"V\Y"ZOV\UKKH[LJ'"THYKZZ"ZLLSS"[OL"Z[HZRZ"]V^'"[OL"I`V\Y"T`"[V)!!1PUK"\Z"[OHU'"[O`"[OH["[\YYVT"ILHRPU[PUN"[PJRLUZ"ZVVY"0SLT,!HUK"^V"6"^VYKZ!AV"[O`"OLYOLT"8PUN"1VVK"`HKVS"OPT`!.&SS"T`"MHPSN!DLSM"[OP[OL"SP[L)!0\[PV"THYRZ'"[H[SLHV["[OL"[OH['"^VSS!AOL"I\."RPUK'"MVY"?PJO!0VTHZZ)!!0VTL"H"SPL]LZ"TVZ["MVUZPSLWWLKL"VM"OVUNNL[LUULY)!!=?6.+"[\T+!.Z"PU"[OV\K"KYHUKTV^"T`".UK" AcceptY]L"THU"T`"ZP[ZPU[+"UV["ZSLHZ"NL^+!5V^'"6"WYHISL"H
