In [2]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [3]:
#reading in all of words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s : i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
print(stoi)
itos = {i : s for s, i in stoi.items()}

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [5]:
X, Y = [], []
block_size = 3 #content length, from how much do we want to predict next char
for w in words:
    # print(w)
    context = [0] * 3 #outputs [0,0,0]
    for chr in w + '.':
        ix = stoi[chr]
        X.append(context)
        Y.append(ix)
        # print(''.join(itos[i] for i in context) + ' ----> ' + itos[ix])
        context = context[1:] + [ix] #creating new list with new char appended to finish

X = torch.tensor(X)
Y = torch.tensor(Y)


In [7]:
def build_dataset(iwords):
    X, Y = [], []
    block_size = 3 #content length, from how much do we want to predict next char
    for w in iwords:
        # print(w)
        context = [0] * 3 #outputs [0,0,0]
        for chr in w + '.':
            ix = stoi[chr]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context) + ' ----> ' + itos[ix])
            context = context[1:] + [ix] #creating new list with new char appended to finish

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y
import random
random.seed(1)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xval, Yval = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
print(Xtr.shape, Xval.shape, Xte.shape)



torch.Size([182308, 3]) torch.Size([22886, 3]) torch.Size([22952, 3])


In [8]:
C = torch.randn(27, 2) #we have 27 different characters and for now, we want 2 dimension word embedding
emb = C[Xtr] #getting embeddings for all indexes we created from context 2d array 
#basically it looks up all of integer values X is stored and places their respective 2 value arrays into them
print(emb.shape)


torch.Size([182308, 3, 2])


In [9]:
W1 = torch.randn(6, 100) #100 neurons are choice, getting 6 is total vectors we want to have.
b1 = torch.randn(100) 
#we need to convert embed shape of torch.Size([32, 3, 2]) to torch.Size([32, 6])
#you can do: 
#torch.cat(emb[:, 0, :], emb[:, 1, :], emb[:, 2, :], 1) // creates a whole new tensor in memory
#torch.cat(torch.unbind(emb, 1), 1)
#but most efficient way is without creating any additional memory in computer


In [10]:
#emb = emb.view(emb.shape[0], emb.shape[1] * emb.shape[2]) // instead of doing this, do below, torch will infer shape number automaticlly
emb = emb.view(-1, 6) #just write 6 but in prod, you want to do  emb.shape[1] * emb.shape[2]
h = torch.tanh(emb @ W1 + b1)
print(h.shape)

torch.Size([182308, 100])


In [11]:
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [12]:
logits = h @ W2 + b2
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)

In [14]:
loss = -prob[torch.arange(prob.shape[0]), Ytr].log().mean()

In [15]:
###================wrapping everything at once========================###

In [48]:
C = torch.randn(27,10) #before 2 now 10
W1 = torch.randn(30, 200) # before 100 now 300, before 300 now 200 since we want to know spesificly if embedding increase will affect result
B1 = torch.randn(200)  # before 100 now 300 before 300 now 200
W2 = torch.randn(200, 27)  # before 100 now 300 before 300 now 200
B2 = torch.randn(27)
params = [C, W1, B1, W2, B2]
for p in params:
    p.requires_grad = True

In [49]:
sum(n.nelement() for n in params) # total changable weights in our model

11897

In [50]:
# lre = torch.linspace(0.001, 1, 100)
# print(lre) # this is bad approach since it is completely linear
lre = torch.linspace(-3, 0, 1000)
lrs = 10 ** lre # this is way to much good practice since values are exponentially decreasing and then deexponentially decreasing
##IMPORTANT the way we decided on 0,001 and 1 learning rate is actually by looking our spesifics of our data and see where it ACTUALLY EXPLODES, where does grads actually becomes absurd, 
#we need to decide on threshold values from looking our grad datas.


In [73]:
lres = []
losses = []

for i in range(30000):
    #minibatch 
    ix = torch.randint(0, Xtr.shape[0], (32, )) # 32 sizes of batches 

    #forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) # 32, 100
    logits = h @ W2 + B2 # 32 27
    # counts = logits.exp()
    # probs = counts / counts.sum(1, keepdim=True)
    # print(probs.shape)
    # loss = -probs[torch.arange(probs.shape[0]), Y].log().mean()
    # 1. the same exact result appears when we use cross entropy func becuase it does not create any additional memory for all intermediate steps
    # 2. backward pass would be efficent because instead of going through derivatives or every operation of softmax // log, -minus division etc., it will already have predefined backward func.
    # 3. last reason is when there is too large values for logits, say, 100, exp operasion outputs inf, to overcome this, it finds maximum number of array of logits and subtract each element from it 
    loss = F.cross_entropy(logits, Ytr[ix])
    # print(f'{loss.item() = }')

    #backward pass
    for p in params:
        p.grad = None
    loss.backward()
    # lr = lrs[i]
    for p in params:
        # p.data += -lr * p.grad
        p.data += -0.01 * p.grad

    #graphin learning rate decay
    # lres.append(lre[i])
    # losses.append(loss.item())


#right now this modal 'overfitting' the 32 examples we give to data very easily, we cannot get 0 because
#there are for example ... ---> e ... ---> t multiple contexts pointing different variables, so we should not be getting 0

#the losses for the batches only calculate loss according to that batch so below calculating whole loss
emb = C[Xte] 
h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) 
logits = h @ W2 + B2 
loss = F.cross_entropy(logits, Yte)
print(f'final loss for all of the results validation; {loss.item() = }')

#### when we also evaluate overall loss in training and loss values are near to each other, we say we dont overfit data
#### but we underfit the data because model is not powerful enught with its small param num to memorize whole set
#### to avoid underfit, we increase params count. RESULT: didnt much improved loss, still getting underfit but overall loss decreased at most to 2.257 before.. 2.3 ishhh /// our model is slightly better than random guess === ln(27) == 3.3
#### increasing word embedding dim from 2 to 10, RESULT: loss decreased so this means its problem of embedding 2.257 to 2.137 


# emb = C[Xtr] 
# h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) 
# logits = h @ W2 + B2 
# loss = F.cross_entropy(logits, Ytr)
# print(f'final loss for all of the results training; {loss.item() = }')

#### the result we see in this graph is, we need to use 0.1 ish result for our learning rate, the minimum of the grapgh, its the place where loss is
# is much more stable 
# plt.plot(lres, losses)


final loss for all of the results validation; loss.item() = 2.2811198234558105


In [44]:
### as we add more and more parameters to our model, we are overfitting training data, we can near 0 loss with hundreds of millions of neurons
### but when we try to sample from data, it will give exact same result we get from training data.
### so what we do to evaluate these results is split data into, training, dev/validation, test splits /// 80% 10% 10

#normally you dont wait for nn training to finish like this, you assign bunch of jobs to cpu and wait them finish in days,
#normally you would initilieeze all of hyperparameteres as a single variables and change them accordingly

In [104]:
######========================== AAAALLL of them wrapped in single cell.==================================#####
import torch 
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random

words = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s : i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i : s for s, i in stoi.items()}

def build_dataset(iwords):  
    X, Y = [], []
    block_size = 3 #content length, from how much do we want to predict next char
    for w in iwords:
        # print(w)
        context = [0] * 3 #outputs [0,0,0]
        for chr in w + '.':
            ix = stoi[chr]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context) + ' ----> ' + itos[ix])
            context = context[1:] + [ix] #creating new list with new char appended to finish

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

random.seed(1)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xval, Yval = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
print(f"""Dataset splits:
Training set (Xtr):   {Xtr.shape} ({Xtr.shape[0]} examples, {Xtr.shape[1]} context length)
Validation set (Xval): {Xval.shape} ({Xval.shape[0]} examples, {Xval.shape[1]} context length)
Test set (Xte):      {Xte.shape} ({Xte.shape[0]} examples, {Xte.shape[1]} context length)
Total examples: {Xtr.shape[0] + Xval.shape[0] + Xte.shape[0]}""")

##initilize parameters
C = torch.randn(27,10) 
W1 = torch.randn(30, 200) 
B1 = torch.randn(200)  
W2 = torch.randn(200, 27)  
B2 = torch.randn(27)
params = [C, W1, B1, W2, B2]
for p in params:
    p.requires_grad = True

total_params = sum(n.nelement() for n in params)
print(f"""Model Parameter Count Breakdown:
C  (char embeddings): {C.nelement():,} parameters ({C.shape}: 27 chars × 10 embedding dim)
W1 (first weight):    {W1.nelement():,} parameters ({W1.shape}: 30 input × 200 hidden)
B1 (first bias):      {B1.nelement():,} parameters ({B1.shape}: 200 hidden)
W2 (second weight):   {W2.nelement():,} parameters ({W2.shape}: 200 hidden × 27 output)
B2 (second bias):     {B2.nelement():,} parameters ({B2.shape}: 27 output)
--------------------------------
Total Parameters:     {total_params:,}""")

for i in range(100000):
    #minibatch 
    ix = torch.randint(0, Xtr.shape[0], (32, ))  

    #forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) # 32, 100
    logits = h @ W2 + B2 # 32 27
    loss = F.cross_entropy(logits, Ytr[ix])

    #backward pass
    for p in params:
        p.grad = None
    loss.backward()

    lr = 0.1 if i < 50000 else 0.01
    for p in params:
        p.data += -0.01 * p.grad

#val loss
emb = C[Xval] 
h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) 
logits = h @ W2 + B2 
val_loss = F.cross_entropy(logits, Yval)

#train loss
emb = C[Xtr] 
h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) 
logits = h @ W2 + B2 
train_loss = F.cross_entropy(logits, Ytr)

print(f"""Model Performance:
Training Loss:   {train_loss.item():.4f}
Validation Loss: {val_loss.item():.4f}
Random Guess:    {torch.log(torch.tensor(27.0)):.4f} (ln(27) - theoretical worst case)
Gap to Random:   {torch.log(torch.tensor(27.0)).item() - val_loss.item():.4f}
""")


Dataset splits:
Training set (Xtr):   torch.Size([182254, 3]) (182254 examples, 3 context length)
Validation set (Xval): torch.Size([22914, 3]) (22914 examples, 3 context length)
Test set (Xte):      torch.Size([22978, 3]) (22978 examples, 3 context length)
Total examples: 228146
Model Parameter Count Breakdown:
C  (char embeddings): 270 parameters (torch.Size([27, 10]): 27 chars × 10 embedding dim)
W1 (first weight):    6,000 parameters (torch.Size([30, 200]): 30 input × 200 hidden)
B1 (first bias):      200 parameters (torch.Size([200]): 200 hidden)
W2 (second weight):   5,400 parameters (torch.Size([200, 27]): 200 hidden × 27 output)
B2 (second bias):     27 parameters (torch.Size([27]): 27 output)
--------------------------------
Total Parameters:     11,897
Model Performance:
Training Loss:   2.2964
Validation Loss: 2.3041
Random Guess:    3.2958 (ln(27) - theoretical worst case)
Gap to Random:   0.9918



In [106]:
# Calculate test set loss
emb = C[Xte] 
h = torch.tanh(emb.view(-1, emb.shape[1] * emb.shape[2]) @ W1 + B1) 
logits = h @ W2 + B2 
test_loss = F.cross_entropy(logits, Yte)

print(f"""Model Performance on Test Set:
Test Loss:      {test_loss.item():.4f}
Random Guess:   {torch.log(torch.tensor(27.0)):.4f} (ln(27) - theoretical worst case)
Gap to Random:  {torch.log(torch.tensor(27.0)).item() - test_loss.item():.4f}

Performance Analysis:
- The model achieves a test loss of {test_loss.item():.4f}, which is 
  {(torch.log(torch.tensor(27.0)).item() - test_loss.item()):.4f} better than random guessing
- For reference, random guessing would give a loss of ln(27) ≈ {torch.log(torch.tensor(27.0)):.4f}
""")

Model Performance on Test Set:
Test Loss:      2.3227
Random Guess:   3.2958 (ln(27) - theoretical worst case)
Gap to Random:  0.9731

Performance Analysis:
- The model achieves a test loss of 2.3227, which is 
  0.9731 better than random guessing
- For reference, random guessing would give a loss of ln(27) ≈ 3.2958



In [113]:
# sample from the model

for _ in range(20):
    
    out = []
    context = [0] * 3 # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + B1)
      logits = h @ W2 + B2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

kalishah.
bryen.
eviovaniah.
elin.
devon.
laurarle.
harisah.
suham.
bryva.
bavoy.
hanis.
ize.
ost.
cara.
kiock.
set.
kehania.
xavtenm.
satela.
alexia.
