In [25]:
with open("input.txt", "r", encoding="utf-8") as f: 
    text = f.read()

In [26]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [27]:
# take set of all the chars in the text 
chars = sorted(list(set(text))) 
voc_size = len(chars) 
print(''.join(chars))
print(voc_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [28]:
# mapping characters to integers and vice versa
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [29]:
print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [34]:
# encode the number representation from encoding to tensor
import torch 

data = torch.tensor(encode(text), dtype=torch.long) 
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [35]:
# split up dataset into train and validation
n = int(0.9*len(data))
train_data = data[:n] 
val_data = data[n:]

In [36]:
# set chunk size a.k.a blocksize 
block_size = 8 
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [37]:
# simulate how the completion (generation) will look like in batch training 
x = train_data[:block_size] 
y = train_data[1:block_size+1] 
for t in range(block_size): 
    context = x[:t+1] 
    target = y[t] 
    print(f"When the input is {context} the target: {target}")

When the input is tensor([18]) the target: 47
When the input is tensor([18, 47]) the target: 56
When the input is tensor([18, 47, 56]) the target: 57
When the input is tensor([18, 47, 56, 57]) the target: 58
When the input is tensor([18, 47, 56, 57, 58]) the target: 1
When the input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [10]:
# samples
torch.randint(3, 10, (3,)) # low, high, (dimension)

tensor([6, 4, 7])

In [11]:
torch.randint(len(data) - block_size, (4, ))

tensor([554189,  46403, 660826, 739944])

In [12]:
torch.stack([data[i:block_size+i] for i in range(2)])

tensor([[18, 47, 56, 57, 58,  1, 15, 47],
        [47, 56, 57, 58,  1, 15, 47, 58]])

In [39]:
torch.manual_seed(1337) 
batch_size = 4 # how many independent sequences will we process in parallel 
block_size = 8 # what is the maximum context length for predictions 
 
def get_batch(split): 
    # generate a small batch of data of inputs x and targets y 
    data = train_data if split == 'train' else val_data 
    ix = torch.randint(len(data) - block_size, (batch_size, )) # random numbers in batch_size 
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y 

xb, yb = get_batch('train') 
print("inputs:") 
print(xb.shape) 
print(xb) 
print('targets:')
print(yb.shape) 
print(yb)
    

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [40]:
for b in range(batch_size): 
    for t in range(block_size): 
        context = xb[b, :t+1] 
        target = yb[b, t] 
        print(f"when input is {context.tolist()} the target: {target}")

when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53, 56, 1, 58, 46] the target: 39
when input is [44, 53, 56, 1, 58, 46, 39] the target: 58
when input is [44, 53, 56, 1, 58, 46, 39, 58] the target: 1
when input is [52] the target: 58
when input is [52, 58] the target: 1
when input is [52, 58, 1] the target: 58
when input is [52, 58, 1, 58] the target: 46
when input is [52, 58, 1, 58, 46] the target: 39
when input is [52, 58, 1, 58, 46, 39] the t

In [60]:
embedding_layer = torch.nn.Embedding(65, 65)

pred_log = embedding_layer(xb)
print(f"original tensor shape: {xb.shape}")
print(f"logits shape (output): {pred_log.shape}")

original tensor shape: torch.Size([4, 8])
logits shape (output): torch.Size([4, 8, 65])


In [61]:
pred_log.view(4*8, 65).shape

torch.Size([32, 65])

In [68]:
pred_log[:, -1, :].shape

torch.Size([4, 65])

In [74]:
import torch.nn as nn 
from torch.nn import functional as F 
torch.manual_seed(1337) 

class BigramLanguageModel(nn.Module): 
    def __init__(self, vocab_size): 
        super().__init__() 
        # taking a tensor input (word ids, tokenized) into vector representations 
        # num_embeddings means how large is your vocabulary (in this case 65 unique symbols) 
        # embedding_dim means how long is the vector representation for each symbols (each symbol will be represented in 65 dimension vector)
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size) 

    def forward(self, input_tensors, target_tensors=None): 
        # forward pass without activation function
        logits = self.token_embedding_table(input_tensors) # (B,T,C)
        if target_tensors is None: 
            loss = None 
        else: 
            B, T, C = logits.shape 
            logits = logits.view(B*T, C) 
            target_tensors = target_tensors.view(B*T)
            loss = F.cross_entropy(logits, target_tensors)

        return logits, loss

    def generate(self, input_tensors, max_new_tokens): 
        for _ in range(max_new_tokens): 
            # get the predictions 
            logits, loss = self(input_tensors) 
            # focus only on the last time step 
            logits = logits[:, -1, :] # becomes (B, C) <=> batch and the logits for each sequence 
            # apply softmax to get probabilities 
            probs = F.softmax(logits, dim=-1)  # (B, C) 
            # sample from the distribution 
            next_tensor = torch.multinomial(probs, num_samples=1) # (B, 1) 
            # append sampled index to the running sequence 
            input_tensors = torch.cat((input_tensors, next_tensor), dim=1) # (B, T+1)
        return input_tensors
    
m = BigramLanguageModel(voc_size)
logits, loss = m(xb, yb) 
print(logits.shape)
print(loss)

print(decode(m.generate(input_tensors=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [77]:
m.parameters

<bound method Module.parameters of BigramLanguageModel(
  (token_embedding_table): Embedding(65, 65)
)>

In [79]:
# create a pytorch optimizer (optimizing parameters)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [84]:
batch_size = 32 
for steps in range(10000): 
    # sample batch  
    xb, yb = get_batch('train') 

    # evaluate the loss 
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step() 

    print(loss.item())

2.4185891151428223
2.599518060684204
2.295361042022705
2.577411413192749
2.3329381942749023
2.429128885269165
2.578380584716797
2.5815277099609375
2.521124839782715
2.403442621231079
2.5146613121032715
2.4481654167175293
2.4663002490997314
2.507948637008667
2.487795829772949
2.5057222843170166
2.4201440811157227
2.5067803859710693
2.465468406677246
2.6360366344451904
2.5174543857574463
2.5208189487457275
2.3912534713745117
2.5044260025024414
2.459249258041382
2.4825921058654785
2.4661500453948975
2.4553732872009277
2.5351030826568604
2.4736835956573486
2.338839292526245
2.4390368461608887
2.4807491302490234
2.477936267852783
2.5107412338256836
2.4946043491363525
2.409062147140503
2.4068453311920166
2.5647335052490234
2.5866634845733643
2.4308767318725586
2.604182720184326
2.5206141471862793
2.4279870986938477
2.634502649307251
2.4467709064483643
2.369974374771118
2.525829315185547
2.488741397857666
2.4655141830444336
2.504819631576538
2.4895212650299072
2.3516480922698975
2.57582378387

In [86]:
print(decode(m.generate(input_tensors=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


ORKEEYo the s tit:

BY f.
IOKI bucre fande aud ler g me fr--s,
Why w foutok'FOK:
METhowh.
pr;
Youngor an mor wice te t--weriea otheco, Yourouryo the gsillode ferpismad faveneashonou a t wand. MEEnghast t theame, imy uf
RWhy g't o,-tulowe---wil CERCo celf h IGHAn t.
Go arvasu brangremea llt d bu h ollier, lyorde t t'e byonosethe Ither s.
PAs tous st we oure f ferstt as athed me tofourter hon. mun pensteasthelme ber fethe clirouch nid, pewhare ilff ld atande Bame INore.
ROK:
emen, mt, list atort I
