### Makemore lesson 3

In [24]:
## Create train, dev/val, and test splits
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [25]:

# Load one name per line. The context manager closes the file handle as soon as
# the contents are read instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Fixed seed so the shuffled word order is reproducible from run to run.
random.seed(42)
random.shuffle(words)

# Character vocabulary: every distinct character gets an index starting at 1;
# index 0 is reserved for '.', which marks the end of a word.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

#Building the dataset - very similar to the trigram construction but dynamic
def prepare_data(data, block_size):
    """Build (context, next-character) examples from a collection of words.

    A window of `block_size` character indices slides over each word followed
    by the end token '.'. Every position yields one example: the current window
    is the input and the index of the next character is the target. The window
    starts as all zeros and shifts one position to the right after each example.

    Args:
        data: iterable of words (strings). Every character, and '.', must be a
            key of the module-level `stoi` mapping.
        block_size: context length, i.e. how many preceding character indices
            form one input row.

    Returns:
        A tuple (X, Y) of int64 tensors. X has shape (num_examples, block_size)
        and holds the contexts; Y has shape (num_examples,) and holds the
        target indices. Empty `data` yields shapes (0, block_size) and (0,)
        rather than float tensors of shape (0,).

    Raises:
        KeyError: if a character is missing from `stoi`.
    """
    X, Y = [], []
    padding = [0] * block_size  # initial context for every word

    for w in data:
        context = list(padding)
        for ch in w + '.':  # '.' closes the word, so the end token is also a target
            ix = stoi[ch]

            X.append(context)
            Y.append(ix)

            # Build a new list (never mutate the one just stored in X) and
            # move the window one character to the right.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed even with no examples.
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), len(padding))
    Y = torch.tensor(Y, dtype=torch.long)

    return X, Y

# Context length shared by all three splits.
block_size = 3

# Split boundaries over the shuffled word list:
# first 80% -> train, next 10% -> dev, remaining 10% -> test.
dev_limit = int(0.9*len(words))
train_limit = int(0.8*len(words))

# Every split goes through the same example builder; the slices are evaluated
# in train, dev, test order as the unpacking consumes the generator.
(Xtr, Ytr), (Xdev, Ydev), (Xtest, Ytest) = (
    prepare_data(words[lo:hi], block_size = block_size)
    for lo, hi in ((None, train_limit), (train_limit, dev_limit), (dev_limit, None))
)

In [26]:
# Sanity check: number of examples and context width of the train and dev inputs.
tuple(inputs.shape for inputs in (Xtr, Xdev))

(torch.Size([182625, 3]), torch.Size([22655, 3]))

In [32]:
# Model hyperparameters.
vocab_size = len(stoi)  # one output class per entry of the character mapping
embedding_dim = 10      # length of each character embedding vector
n_hidden = 200          # number of units in the tanh hidden layer

# Dedicated, seeded generator so the initialization is reproducible.
# NOTE: the order of the randn calls below fixes which random numbers each
# tensor receives, so do not reorder them.
g = torch.Generator().manual_seed(2147483647)

C = torch.randn(vocab_size, embedding_dim, generator=g)               # embedding lookup table
W1 = torch.randn(embedding_dim * block_size, n_hidden, generator=g)   # hidden-layer weights
b1 = torch.randn(n_hidden, generator=g)                               # hidden-layer bias
# The output layer is scaled down so the initial logits are near zero and the
# initial loss does not blow up.
W2 = 0.01 * torch.randn(n_hidden, vocab_size, generator=g)
b2 = 0 * torch.randn(vocab_size, generator=g)

parameters = [C, W1, b1, W2, b2]

# Turn on gradient tracking for every trainable tensor.
for p in parameters:
    p.requires_grad = True

param_sum = sum(tensor.nelement() for tensor in parameters)

print(f"Number of parameters: {param_sum}")



Number of parameters: 11897


In [33]:
max_iterations = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per optimization step
for it in range(max_iterations):

    # Sample the indices of a random minibatch of training examples
    batch = torch.randint(0, Xtr.shape[0], (batch_size,), generator = g)

    # Forward pass
    embedding = C[Xtr[batch]]                          # embedding vector for every context character
    embdcat = embedding.view(embedding.shape[0], -1)   # one flat row per example
    h = torch.tanh(embdcat @ W1 + b1)                  # hidden layer

    logits = h @ W2 + b2                               # output layer

    loss = F.cross_entropy(logits, Ytr[batch])

    lossi.append(loss.log10().item())

    # Backward pass. Gradients are reset first because backward() accumulates
    # into .grad rather than overwriting it.
    for p in parameters:
        p.grad = None
    loss.backward()

    # Step decay: larger learning rate for the first 100000 steps, smaller afterwards
    lr = 0.1 if it < 100000 else 0.01

    # SGD update. Done in place under no_grad instead of through `p.data`,
    # which bypasses autograd's correctness checks.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    if it % 10000 == 0:
        print(f'{it:7d}/{max_iterations:7d}: {loss.item():.4f}')


# NOTE: this is the loss of the final minibatch only, so it is a noisy estimate;
# the loss over the whole training split is computed by split_loss('train').
print(f"Training Loss: {loss.item()}")

    




      0/ 200000: 3.3221
  10000/ 200000: 2.1900
  20000/ 200000: 2.4196
  30000/ 200000: 2.6067
  40000/ 200000: 2.0601
  50000/ 200000: 2.4988
  60000/ 200000: 2.3902
  70000/ 200000: 2.1344
  80000/ 200000: 2.3369
  90000/ 200000: 2.1299
 100000/ 200000: 1.8329
 110000/ 200000: 2.2053
 120000/ 200000: 1.8540
 130000/ 200000: 2.4566
 140000/ 200000: 2.1879
 150000/ 200000: 2.1118
 160000/ 200000: 1.8956
 170000/ 200000: 1.8644
 180000/ 200000: 2.0326
 190000/ 200000: 1.8417
Training Loss: 2.370270013809204


In [29]:
@torch.no_grad() # evaluation only, so gradient tracking is switched off
def split_loss(split):
    """Print the cross-entropy loss of the current parameters on one data split.

    Args:
        split: 'train', 'dev' or 'test'; any other value raises KeyError.

    The shapes of the split's input and target tensors are printed first,
    followed by the split name and the loss. Nothing is returned.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'dev': (Xdev, Ydev),
        'test': (Xtest, Ytest),
    }
    inputs, targets = datasets[split]

    print(inputs.shape, targets.shape)

    # Same forward pass as in training, applied to the whole split at once.
    embedded = C[inputs]
    flattened = embedded.view(embedded.shape[0], -1)
    hidden = (flattened @ W1 + b1).tanh()
    loss = F.cross_entropy(hidden @ W2 + b2, targets)
    print(split, loss.item())


# Report the loss on each split in turn.
for split_name in ('train', 'dev', 'test'):
    split_loss(split_name)
    

torch.Size([182625, 3]) torch.Size([182625])
train 2.127152919769287
torch.Size([22655, 3]) torch.Size([22655])
dev 2.173154830932617
torch.Size([22866, 3]) torch.Size([22866])
test 2.1648871898651123


In [31]:
# Separate seeded generator so the sampled names are reproducible.
g = torch.Generator().manual_seed(2147483647 + 10)

# Sampling is inference only: disable gradient tracking (as split_loss does) so
# no autograd graph is built for the parameters, which all require grad.
with torch.no_grad():
    for _ in range(20):
        out = []

        context = [0] * block_size  # every name starts from an all-zero context

        while True:
            # Look up the embeddings of the current context in the trained table;
            # the extra list level gives a batch of one example
            emb = C[torch.tensor([context])]
            embdcat = emb.view(1, -1)
            h = torch.tanh(embdcat @ W1 + b1)

            logits = h @ W2 + b2

            probs = F.softmax(logits, dim=1)

            # Sample the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()

            # Slide the context window one position to the right
            context = context[1:] + [ix]

            out.append(ix)

            # Index 0 is the '.' end token: the name is complete
            if ix == 0:
                break

        print("".join(itos[i] for i in out))


mona.
mayah.
seen.
nah.
yam.
rensleigh.
rari.
adeendielin.
shy.
jenne.
elissopharleiyah.
hotelin.
shubergahgriel.
kendreth.
konnton.
foud.
mace.
ryyah.
fael.
yuma.
