In [3]:
import torch

# Read the whole training corpus into memory as a single string.
# The encoding is pinned so that decoding (and therefore the character
# vocabulary derived from `text`) does not depend on the platform's default
# locale encoding.
with open("data/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(f"Text length: {len(text)}")

# Keep a short prefix of the corpus as a small sample to inspect and encode.
first_60 = text[:60]
print(first_60)


Text length: 1115393
First Citizen:
Before we proceed any further, hear me speak.


In [4]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)

# Show the vocabulary on one '|'-separated line, followed by its size.
print(*chars, sep='|')
print(f"Unique characters: {vocab_size}")


| |!|$|&|'|,|-|.|3|:|;|?|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z
Unique characters: 65


In [3]:
# Lookup tables between characters and integer ids; a character's id is its
# position in `chars`.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Return the string spelled by the integer ids in `l`.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[idx] for idx in l)


print(encode(first_60))

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8]


In [4]:
# Encode the full corpus as a tensor of token ids. The dtype is stated
# explicitly so the result is an integer tensor even when `text` is empty
# (torch.tensor([]) would otherwise default to a float dtype).
data = torch.tensor(encode(text), dtype=torch.long)

# split into train and validation: the first 90% of the tokens are used for
# training, the remainder is held out
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

block_size = 8

# A window of block_size + 1 tokens yields block_size prediction examples:
# x is the window without its last token, y is the same window shifted by one,
# so y[t] is the token that follows the prefix x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]

print(x)

# Enumerate every (prefix, next token) pair in the window.
# Author's note: useful so the transformer is used to seeing different lengths
# of data.
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")


tensor([18, 47, 56, 57, 58,  1, 15, 47])
when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [9]:
from src import SimpleBigramLanguageModel, BatchLoader, Evaluator, Trainer

# Hyperparameters for the baseline run.
# Note: block_size is rebound here; the windowing demo earlier in the notebook
# set it to 8.
batch_size = 32
block_size = 16
max_iters = 4500
eval_interval = 500
learning_rate = 3e-3

# Setup data and model
# Seed torch's global RNG before the loaders and the model are constructed.
torch.manual_seed(1337)
# One loader per split, both configured with the batch_size / block_size above.
train_loader = BatchLoader(train_data, block_size=block_size, batch_size=batch_size)
val_loader = BatchLoader(val_data, block_size=block_size, batch_size=batch_size)

# Older three-argument call left by the author. It is not runnable at this
# point in a top-to-bottom run: n_embed is only assigned in a later cell.
# model = SimpleBigramLanguageModel(vocab_size, n_embed, block_size)
model = SimpleBigramLanguageModel(vocab_size, block_size)

# Setup training components
# AdamW over all model parameters with the learning rate set above.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): Evaluator and Trainer come from the project's `src` package and
# their internals are not visible here. Presumably the evaluator computes the
# perplexity that is reported every eval_interval steps -- confirm in src.
evaluator = Evaluator(model, train_loader, val_loader, vocab_size)
trainer = Trainer(model, optimizer, train_loader, evaluator, max_iters, eval_interval)

# Train the model
# The return value of train() is kept in final_losses; it is not used again in
# this cell.
final_losses = trainer.train()

step 0: perplexity: 112.3, 
step 500: perplexity: 27.1, 
step 1000: perplexity: 15.6, 
step 1500: perplexity: 13.1, 
step 2000: perplexity: 12.4, 
step 2500: perplexity: 12.1, 
step 3000: perplexity: 11.9, 
step 3500: perplexity: 11.9, 
step 4000: perplexity: 11.8, 
step 4500: perplexity: 11.8, 
step 5000: perplexity: 11.7, 


In [12]:
# Sample text from the trained model.
# The prompt is a single token with id 0 (batch of one), i.e. the first
# character of the sorted vocabulary; 490 further tokens are requested.
context = torch.zeros(1, 1, dtype=torch.long)
token_ids = model.generate(context, max_new_tokens=490)[0].tolist()
generated_text = decode(token_ids)
print(generated_text)


O:
Anele er fe co,
LLamer squsethaittthtr ayit tifod rer; y e ined guratosoulyequg.
BUEEd tavaperelee athavis u warray, n
We by bronond man, d cr miowivero agarlan
has,

Binksue; ain'lilavealeamy y t Isoup uge o'sth r.
What Beeethisunded orachigorsh kn, Ta cheneinhit we t,
Fr s ide Bus ithikee me;
Bul ake har apy ave I arillevVIO hineeo n:
TI ad by andulcavis, scld
Atlithe day;

AO: T:
G butor benkeave y'd,
Gecknfime ttinthalond sBy wapiorasonou haverl the heayet asen d bor t man pe, t


## Self-Attention


In [15]:
from src import BigramLanguageModel

# Model parameters
batch_size = 32
block_size = 16
max_iters = 4001
eval_interval = 500
learning_rate = 3e-3
n_embed = 64
n_heads = 4
n_layer = 1
dropout = 0.1

# Build the loaders from THIS cell's batch_size / block_size. Reusing loaders
# created in an earlier cell would silently ignore the values set above and
# could feed the model sequences of a different length than it was built for.
train_loader = BatchLoader(train_data, block_size=block_size, batch_size=batch_size)
val_loader = BatchLoader(val_data, block_size=block_size, batch_size=batch_size)

# Model with the embedding width, context length, layer count, head count and
# dropout rate configured above.
model = BigramLanguageModel(vocab_size, n_embed, block_size, n_layer, n_heads, dropout)

# Setup training components
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
evaluator = Evaluator(model, train_loader, val_loader, vocab_size)
trainer = Trainer(model, optimizer, train_loader, evaluator, max_iters, eval_interval)

# Train the model
final_losses = trainer.train()

step 0: perplexity: 65.9, 
step 500: perplexity: 8.7, 
step 1000: perplexity: 7.6, 
step 1500: perplexity: 7.1, 
step 2000: perplexity: 6.8, 
step 2500: perplexity: 6.6, 
step 3000: perplexity: 6.4, 
step 3500: perplexity: 6.4, 
step 4000: perplexity: 6.2, 
step 4500: perplexity: 6.2, 
step 5000: perplexity: 6.1, 


In [57]:
# Generate some text
# The model was built with a non-zero dropout rate, and whether the Trainer
# leaves it in train or eval mode is not visible from this notebook. Switch to
# eval mode explicitly so dropout cannot perturb sampling, and disable autograd
# because no gradients are needed for generation.
# NOTE(review): assumes `model` is a torch.nn.Module (it exposes .parameters()).
model.eval()
with torch.no_grad():
    # Prompt: a single token with id 0, as a batch of one sequence.
    context = torch.zeros((1, 1), dtype=torch.long)
    token_ids = model.generate(context, max_new_tokens=300)[0].tolist()
generated_text = decode(token_ids)
print(generated_text)


your bes, you as: nowful hear of nold; ward. bath hed lequieds a firt nobor, we have's can to fordids me forim stul of them grabless wind mons to rewind, gays, buthit gner upon
to til Roe sive ve thoughd pods.
That thols, and tame, you seas, fightr, dors soo stae woung, tet, is of com staiser.

Sher.'t Caren: out be het righs. your peaut wapes, in sie;
This to my, this, twer one and med fea kin, we balwans fight rove Rorand don-hil man
I word, seer no, awrad youghtrise:
Stto hy pove, and hond an


### Scale Out

In [40]:
# Model parameters
batch_size = 32
block_size = 6
max_iters = 4001
eval_interval = 500
learning_rate = 3e-3
n_embed = 192
n_heads = 3
n_layer = 4
dropout = 0.1

# Build the loaders from THIS cell's batch_size / block_size. The loaders
# created earlier in the notebook use a different block_size, so reusing them
# would feed the model sequences that do not match the context length it is
# constructed with below.
train_loader = BatchLoader(train_data, block_size=block_size, batch_size=batch_size)
val_loader = BatchLoader(val_data, block_size=block_size, batch_size=batch_size)

# Larger model: wider embedding and more layers than the previous experiment.
model = BigramLanguageModel(vocab_size, n_embed, block_size, n_layer, n_heads, dropout)

# Setup training components
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
evaluator = Evaluator(model, train_loader, val_loader, vocab_size)
trainer = Trainer(model, optimizer, train_loader, evaluator, max_iters, eval_interval)

# Train the model
final_losses = trainer.train()

# NOTE(review): the author left the figures below without labels; presumably
# they are recorded results from a run of this cell -- meaning unconfirmed.
# 2.09 5.6

step 0: perplexity: 4412.1, 
step 500: perplexity: 7.6, 
step 1000: perplexity: 6.7, 
step 1500: perplexity: 6.2, 
step 2000: perplexity: 6.0, 
step 2500: perplexity: 5.8, 
step 3000: perplexity: 5.7, 
step 3500: perplexity: 5.5, 
step 4000: perplexity: 5.5, 


In [42]:
# Generate some text
# The model was built with a non-zero dropout rate, and whether the Trainer
# leaves it in train or eval mode is not visible from this notebook. Switch to
# eval mode explicitly so dropout cannot perturb sampling, and disable autograd
# because no gradients are needed for generation.
# NOTE(review): assumes `model` is a torch.nn.Module (it exposes .parameters()).
model.eval()
with torch.no_grad():
    # Prompt: a single token with id 0, as a batch of one sequence.
    context = torch.zeros((1, 1), dtype=torch.long)
    token_ids = model.generate(context, max_new_tokens=400)[0].tolist()
generated_text = decode(token_ids)
print(generated_text)


NOR YORK:
Love fring.

LADY CAPULET:
King Esell own so it not by
here to I will love thee.

SICINIUS:
Action; porder.
Who on the me proction.
Now you the words my mercest fornight
That a reconk lay to before-jasts a contern:
O grath?

COMINIUS:
No fro to'th with your gruerm of mror now
on yous: to't!' then't Cabsughtly senator
Your chage
Fausenue to the such in shive.
Will bord, my exaincin, and M
