In [56]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", "input.txt")

('input.txt', <http.client.HTTPMessage at 0x1f03f80cb10>)

In [57]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text=f.read()
print(f'Length of text: {len(text)} characters')
print(text[:500])

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [58]:
chars=sorted(list(set(text)))
vocab_size=len(chars)
print(''.join(chars)) 
print(f'Vocab size: {vocab_size}')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


In [59]:
# tokenizer
s_to_i = { ch:i for i,ch in enumerate(chars) }
i_to_s = { i:ch for i,ch in enumerate(chars) }
encode=lambda s: [s_to_i[c] for c in s]
decode=lambda l: ''.join([i_to_s[i] for i in l])

print(encode("Hello World"))
print(decode(encode("Hello World")))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]
Hello World


In [60]:
# encoding dateser and storing in Tensor

import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # first 1000 characters encoded as integers

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [61]:
#train validation split
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [62]:
context_length=8
train_data[:context_length+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [63]:
torch.manual_seed(18)
batch_size=4
block_size=8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)



print("-----------------")
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input is {context.tolist()} the target: {target}')

inputs:
torch.Size([4, 8])
tensor([[63,  1, 57, 41, 43, 54, 58, 56],
        [58,  6,  1, 42, 43, 39, 56,  1],
        [ 1, 46, 47, 51,  1, 58, 46, 39],
        [42, 59, 58, 63,  0, 32, 53,  1]])
targets:
torch.Size([4, 8])
tensor([[ 1, 57, 41, 43, 54, 58, 56, 43],
        [ 6,  1, 42, 43, 39, 56,  1, 51],
        [46, 47, 51,  1, 58, 46, 39, 58],
        [59, 58, 63,  0, 32, 53,  1, 44]])
-----------------
when input is [63] the target: 1
when input is [63, 1] the target: 57
when input is [63, 1, 57] the target: 41
when input is [63, 1, 57, 41] the target: 43
when input is [63, 1, 57, 41, 43] the target: 54
when input is [63, 1, 57, 41, 43, 54] the target: 58
when input is [63, 1, 57, 41, 43, 54, 58] the target: 56
when input is [63, 1, 57, 41, 43, 54, 58, 56] the target: 43
when input is [58] the target: 6
when input is [58, 6] the target: 1
when input is [58, 6, 1] the target: 42
when input is [58, 6, 1, 42] the target: 43
when input is [58, 6, 1, 42, 43] the target: 39
when input i

In [64]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(18)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)   

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)  # (B,T,C)
        if targets is None:
            loss = None
        else:
            B,T,C=logits.shape
            logits=logits.view(B*T,C)
            targets=targets.view(B*T) 
            loss=F.cross_entropy(logits,targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] 

            probs = F.softmax(logits, dim=-1) 
            idx_next = torch.multinomial(probs, num_samples=1) 
            idx = torch.cat((idx, idx_next), dim=1) 
        return idx

m=BigramLanguageModel(vocab_size)
logits,loss=m(xb,yb)
print(logits.shape)
print(loss)

idx=torch.zeros((1,1),dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.9172, grad_fn=<NllLossBackward0>)

E oRvxfSFnJS,XrkdQbZQaZ
dkIvvwsSZxiLaKQoQDAyaC&$ubNLQiYaN.I?s.juJ,Z-kE:u,czIAR?gbNMweNUydsEjBj'Kqb?i


In [65]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [66]:
batch_size=32

for steps in range(10000):
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(f"Final loss: {loss.item()}")

Final loss: 2.491960048675537


In [67]:
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))


F t a ie duseeray w
promavevedrt! t$UE rhalodforLI medbe, oucoco ele t hl ameathandyourad, ggatha,
I cowot my


Hiron; bay oun er, thaids ath--mes therreshaucon3DOfal
The y t m byoimof th;
TEThou w!
DYofelo ksnge.
Bild we:
INour&&Fimyo flly, pr mabupel; inof,
Woth nsthes cetenese, d hol mares t mbe sengr loitoth deve;
Ast; a seane
O:
RCI INCENGouthenenk's t mufe his ind, as, k-
ge ththur cknoul waded, s agef io d airoouthinga; it al hond r, so, wid equ chaloy.
Tha, hand th m O tispeair:

TEXDo brese for ccaverevebl HAlou pond r be.
BHOlit, d e, viu d s ishy haned
Dile J.
G th's,


es
The mphy dveda thous!
t dofanl.
cerise s f m,

Gerail lldidkis w; fle
Ficturs bs ceqQ'd k wind ourequlaltal hyodee, th
oothr TCanQ'sed, e wale tth.
OUCULimllocshawenondels louroknviow as owarir s fo wrivomas
llout, WAPo f? play qur, lepe.

O:
KI
I it---t ak wiD sel t stistin choo w; wf mant inel k, wioranorgr f w notik; mouthy by noue y.

Thoureretl that.

R:
JWAhou? ancorthenchons mard;

Pamor wiss f,oul

In [68]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(100)
        for k in range(100):
            xb, yb = get_batch(split)
            logits, loss = m(xb, yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [69]:
B,T,C=4,8,2
x=torch.randn((B,T,C))

#v1
xbow=torch.zeros(B,T,C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b,t]=torch.mean(xprev,0)

In [70]:
#v2
wei=torch.tril(torch.ones(T,T))
wei=wei/wei.sum(1,keepdim=True)

xbow2=wei@x
torch.allclose(xbow, xbow2)

True

In [77]:
#v3
tril=torch.tril(torch.ones(T,T))
wei=torch.zeros((T,T))
wei=wei.masked_fill(tril==0, float('-inf'))
wei=F.softmax(wei,dim=-1)
xbow3=wei@x
torch.allclose(xbow, xbow3)

True

In [None]:
 #v4

torch.manual_seed(18)
B,T,C=4,8,32
x=torch.randn(B,T,C)

head_size=16

key=nn.Linear(C, head_size, bias=False)
query=nn.Linear(C, head_size, bias=False)
value=nn.Linear(C, head_size, bias=False)
k=key(x)   # (B,T,head_size)
q=query(x) # (B,T,head_size)
v=value(x)
wei=q@k.transpose(-2,-1)*head_size**-0.5


tril=torch.tril(torch.ones(T,T))
wei=torch.zeros((T,T))
wei=wei.masked_fill(tril==0, float('-inf'))
wei=F.softmax(wei,dim=-1)
out=wei@v
out.shape



torch.Size([4, 8, 16])

In [118]:
n_embed=32
block_size=128
batch_size=32

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key=nn.Linear(C, head_size, bias=False)
        self.query=nn.Linear(C, head_size, bias=False)
        self.value=nn.Linear(C, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout=nn.Dropout(0.1)

    def forward(self, x):
        B,T,C=x.shape
        k=self.key(x)   # (B,T,head_size)
        q=self.query(x) # (B,T,head_size)

        wei=q@k.transpose(-2,-1)*head_size**-0.5
        wei=wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei=F.softmax(wei,dim=-1)
        wei=self.dropout(wei)
        v=self.value(x)
        out=wei@v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection=nn.Linear(n_embed,n_embed)


    def forward(self, x):
        out=torch.cat([h(x) for h in self.heads], dim=-1)
        out=self.projection(out)
        return out
    
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout=0.1):
        super().__init__()
        self.net=nn.Sequential(
            nn.Linear(n_embed,4*n_embed),
            nn.ReLU(),
            nn.Linear(4*n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa=MultiHeadAttention(n_head, head_size)
        self.ffwd=FeedForward(n_embed)
        self.ln1=nn.LayerNorm(n_embed)
        self.ln2=nn.LayerNorm(n_embed)

    def forward(self, x):
        x =  x + self.sa(self.ln1(x))
        x =  x + self.ffwd(self.ln2(x))
        return x


In [119]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(18)

n_embed=32
B,T,C=4,8,32
class LanguageModel(nn.Module):
    def __init__(self):
        
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)   
        self.position_embedding_table=nn.Embedding(block_size,n_embed)
        self.lm_head=nn.Linear(n_embed, vocab_size)
        self.ffwd=FeedForward(n_embed)
        self.sa_heads=MultiHeadAttention(num_heads=4, head_size=n_embed//4)
        self.blocks=nn.Sequential(
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            nn.LayerNorm(n_embed),
        )
        

    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb=self.position_embedding_table(torch.arange(T))
        x=token_emb+pos_emb
        x=self.blocks(x)
        logits = self.lm_head(x)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B,T,C=logits.shape
            logits_flat=logits.view(B*T,C)
            targets_flat=targets.view(B*T) 
            loss=F.cross_entropy(logits_flat,targets_flat)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond=idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] 

            probs = F.softmax(logits, dim=-1) 
            idx_next = torch.multinomial(probs, num_samples=1) 
            idx = torch.cat((idx, idx_next), dim=1) 
        return idx

m=LanguageModel()


optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

idx=torch.zeros((1,1),dtype=torch.long)

for steps in range(5000):
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print(f"step {steps}: loss {loss.item()}")

print(f"Final loss: {loss.item()}")

step 0: loss 4.441918849945068
step 100: loss 3.056455135345459
step 200: loss 2.709581136703491
step 300: loss 2.577690839767456
step 400: loss 2.5531868934631348
step 500: loss 2.4875001907348633
step 600: loss 2.4918079376220703
step 700: loss 2.4628536701202393
step 800: loss 2.3984739780426025
step 900: loss 2.451901435852051
step 1000: loss 2.4115681648254395
step 1100: loss 2.3755342960357666
step 1200: loss 2.368553638458252
step 1300: loss 2.3368382453918457
step 1400: loss 2.3147060871124268
step 1500: loss 2.3360044956207275
step 1600: loss 2.304831027984619
step 1700: loss 2.2697925567626953
step 1800: loss 2.2710354328155518
step 1900: loss 2.2163712978363037
step 2000: loss 2.1960935592651367
step 2100: loss 2.2169835567474365
step 2200: loss 2.1868064403533936
step 2300: loss 2.212981700897217
step 2400: loss 2.152888536453247
step 2500: loss 2.1642351150512695
step 2600: loss 2.112882137298584
step 2700: loss 2.1534698009490967
step 2800: loss 2.1257407665252686
step 29

In [121]:
print(decode(m.generate(idx, max_new_tokens=10000)[0].tolist()))




ORCKINTI:
By frow, I have shen thesbuned of Mingue.

LUCIO:
Dumede, I minade cin thy dist
All kict and a ence thal naw'dsed, But cancent,
I'l to langucht jace, not that on garen to heas
Pad deevf sat condser to of shall swikel prom Moosss
Un toued hase sut lomonce I byou think me sicblon
To his shing tus this pred batht ur theat you?
Neor congsmilict Bears is your have knop Farde!

MELABENES:
Goured handse ding, poth now wich whas!
DWit Hest nothan, nif and I swan no the.

KINE VINCE ENLOM:
Whreard alsourd ling'ds wos you ching's,
This-for come weordn hokese crioos.
Pret this be nour nour I'lonem:
He all hand not pight to is rince yewf
Witillor, uth mee of that tworeftin.

HESASS:
O, bardie yould le merve copages eyou,
What, of And thims grath with as gooor you.
ISABR AUCENT:
Younk hore own make for with compp
A werence nin and to say ther onelficr:
Whose that relforke profionond.

For prepntitle heas bone bow gron that this th
That ifonqua utes shery ous nie nive,
Thix fe deckell na