# Open data and have a look

In [2]:
with open('input.txt','r', encoding='utf-8') as f:
  text = f.read()

In [3]:
print(text[0:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# Investigate the tokens

In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


# Tokenise

In [5]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

In [6]:
def encode(word,stoi):
  return [stoi[letter] for letter in word]

def decode(integers, itos):
  return ''.join([itos[i] for i in integers])
encode('hello', stoi)
decode(encode('hello', stoi),itos)

'hello'

# Encode all the words and save as a torch.Tensor

In [7]:
import torch
encoded_text = encode(text,stoi)
data = torch.tensor(encoded_text, dtype = torch.long)

In [8]:
train_size = int(len(data) * 0.9)
train_data = data[:train_size]
test_data = data[train_size:]
print('data size is',len(data))
print('train size is',len(train_data))
print('test size is',len(test_data))

data size is 1115394
train size is 1003854
test size is 111540


# Split into blocks

In [9]:
block_size = 8

train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [10]:
train_data[1:block_size+1]

tensor([47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
for i in range(block_size):
    context = train_data[:i+1]
    target = train_data[i+1]
    print(f'in the context of {context}, the target is {target}')

in the context of tensor([18]), the target is 47
in the context of tensor([18, 47]), the target is 56
in the context of tensor([18, 47, 56]), the target is 57
in the context of tensor([18, 47, 56, 57]), the target is 58
in the context of tensor([18, 47, 56, 57, 58]), the target is 1
in the context of tensor([18, 47, 56, 57, 58,  1]), the target is 15
in the context of tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
in the context of tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [12]:
block_size = 3
batch_size = 2

def get_batch(data, block_size, batch_size):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    X = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+1+block_size] for i in ix])
    return X, y

torch.manual_seed(1337)

X,y = get_batch(train_data, block_size,batch_size)

for b in range(batch_size):
    for t in range(block_size):
        xs = X[b,:t+1]
        ys = y[b,t]
        print(f'in the context of {xs}, the target is {ys}')


in the context of tensor([50]), the target is 50
in the context of tensor([50, 50]), the target is 6
in the context of tensor([50, 50,  6]), the target is 1
in the context of tensor([52]), the target is 42
in the context of tensor([52, 42]), the target is 1
in the context of tensor([52, 42,  1]), the target is 58


In [13]:
X

tensor([[50, 50,  6],
        [52, 42,  1]])

In [14]:
y

tensor([[50,  6,  1],
        [42,  1, 58]])

# Bigram Model

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F 

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.next_letter_guesser = torch.nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, X, y):
        X = X.view(X.shape[0] * X.shape[1])
        y = y.view(y.shape[0] * y.shape[1])
        logits = self.next_letter_guesser(X)
        loss = F.cross_entropy(logits,y)
        return logits, loss
    
    
    def generate(self, X, max_new_tokens):
        for _ in range(max_new_tokens):
            logits = self.next_letter_guesser(X)
            logits = logits[:,-1,:]
            probabilities_of_letter = F.softmax(logits, dim = -1)
            next_token = torch.multinomial(probabilities_of_letter, num_samples=1)
            X = torch.cat((X,next_token),dim = 1)
        return X



In [16]:
blm = BigramLanguageModel(vocab_size)
idx = torch.zeros((1,1),dtype = torch.long)
generated_output = blm.generate(idx, 100)
print(decode(generated_output.tolist()[0], itos))


m3,gvSSvbDKLnsZuDw
W;mFmjD..yERpSNJQ$p:iIOOfEPvU,QfLSNaQOtw.aK;YvdW;xEr
TFZYbmXJlmlFV!PN,EqrWDlrbiNI


# Training Loop

In [17]:
batch_size = 32
block_size = 8
optimizer = torch.optim.AdamW(blm.parameters(),lr=1e-3)

for steps in range(10000):
    Xb, yb = get_batch(train_data,block_size,batch_size)
    logits, loss = blm(Xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

    print(loss.item())




4.468088150024414
4.525026798248291
4.643395900726318
4.546878337860107
4.553181171417236
4.442619800567627
4.408929347991943
4.555429935455322
4.424373626708984
4.472684383392334
4.505117893218994
4.532315254211426
4.496657371520996
4.508419513702393
4.538655757904053
4.514628887176514
4.455508232116699
4.557900905609131
4.530964374542236
4.59072208404541
4.439437389373779
4.571071624755859
4.5483622550964355
4.372831344604492
4.5083818435668945
4.411751747131348
4.444128513336182
4.598188877105713
4.643206596374512
4.582300662994385
4.547321796417236
4.467952251434326
4.513563632965088
4.5258989334106445
4.4756011962890625
4.480522632598877
4.524418354034424
4.450125217437744
4.4152631759643555
4.5567498207092285
4.393040180206299
4.403182506561279
4.577556133270264
4.465339660644531
4.6025800704956055
4.582770347595215
4.628195285797119
4.472461223602295
4.496840000152588
4.460074424743652
4.426327228546143
4.555615425109863
4.478347301483154
4.520768165588379
4.620629787445068
4.46

In [19]:
idx = torch.zeros((1,1),dtype = torch.long)
generated_output = blm.generate(idx, 1000)
print(decode(generated_output.tolist()[0], itos))


Whes cubolave in:
LKENTENCinthourener
Im he it rtoutrurrue inded t:
OR:
H:
Andesse,
I: myo be; ARYOTh CII CLAn ENAn, BRE:
CAthe k llden th the dots ndnowha qu m, weat!
Whel t an lesblloator:
Fr thabuno we wtull maXF odot.

UMPUpututhy e!



Vowhe m y y ales neranon
An s, le sut tcod w f heatug t my't care!
Adle, cld f sthen:
AThemy TI my ne fu sthe wet:

HERDaienothichevee t si:
A:
OUS:
BOMNod he ak ath we m, eengmor t
Wesivesp, d
WWhen
Ththithart overt musthe hes hogie thake:
Sn bend swe buriman ithea
Clovert ind, ffe Cly ts;RINTE:
Whed st sus d we aves! br:
O:
Thyonoraccllleabu, se t me s:
TZWvezOL:

D ts nd ller mang, h;:
INGLe mamicerth my,-nditouche ou h f--ht ha tenofeils Phi.


Th armall matathand she, ved thag r adas t; s.
CLOLI ld n me

BRINThene mom k Isin. hr linfe; je uf oube wshor t, st veteace tourthy:
CO s coure,
Bomy:
R prd,
Iqu t, won ce diobowanonrlo?
Whore hrene dre.
HANGRororvos giee tu ry ar.
Ay y, thes bum' fo mers velas wan'Or manc sksssweckitofid?
s ' arsan t y