In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
from Transformers.NanoGPT import block_size
%matplotlib inline


In [177]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)

32033

In [6]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [140]:
block_size = 3
context = [0] * block_size

X , Y = [] , []

for w in words:
    # print(w)
    context = [0] * block_size
    for ch in w + ".":
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # print("X:" , X ,  "Y:" , Y)
        # print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)


In [141]:
X.shape,X.dtype,  Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

### Lets Build our Embedding Table (Look Up Table) for our Character level model

#### Now we have make our dataset to predict the probability of nxt word. lets put these into neural network layer. Before moving to feeding, we need to reduce our 27 char we have into lower dimensions --> Two dim
-- As we are proceeding to reimplement the architecture by A Neural Probabilistic Language Model {A Neural Probabilistic Language Model}
-- they have implemented via 17000  words in our case we are proceeding with char level model so we have 27 dim
-- They compressed 17000 dim to 30 dim , So lets compress the dimension to 2.

In [27]:
# Look Up Table : -- C
# by the way these are the weights which we adjust during the back-propagation
C = torch.rand(27, 2) # intialize random numbers for Look up table
C

tensor([[0.2524, 0.6784],
        [0.9725, 0.1287],
        [0.8615, 0.9255],
        [0.1698, 0.2375],
        [0.5656, 0.2844],
        [0.8355, 0.7482],
        [0.2397, 0.1919],
        [0.9909, 0.7481],
        [0.1026, 0.3385],
        [0.2130, 0.6969],
        [0.1042, 0.1538],
        [0.2023, 0.2267],
        [0.9852, 0.0523],
        [0.2429, 0.0680],
        [0.3577, 0.5322],
        [0.5342, 0.9979],
        [0.9999, 0.7068],
        [0.1383, 0.0674],
        [0.8196, 0.4099],
        [0.5432, 0.4103],
        [0.6175, 0.0526],
        [0.7107, 0.2023],
        [0.6299, 0.5623],
        [0.6084, 0.3684],
        [0.6064, 0.5983],
        [0.3008, 0.9703],
        [0.0621, 0.8364]])

In [28]:
C[5]

tensor([0.8355, 0.7482])

In [32]:
# Now we start to generate the embeddings of our chars
# lets say we need to encode num 5 in 27 numbers
# why we need to do this,
# ----- in order to get the char embeddings we have two dim for each char, so we're extracting the embedding by
# matrix multiplying  one hot * Our Look up table (C)
F.one_hot(torch.tensor([5 ]), 27).dtype

torch.int64

In [33]:
# Now by doing this way we can pluck the relevent char embedding to feed to NN
# this is the one way, but torch offers very easy way to handle this , we can do this direclty by mentioning the indexs on the tensor itself
F.one_hot(torch.tensor([5 ]), 27).float() @ C


tensor([[0.8355, 0.7482]])

In [34]:
C[torch.tensor([5, 6, 7,7,7  ])]

tensor([[0.8355, 0.7482],
        [0.2397, 0.1919],
        [0.9909, 0.7481],
        [0.9909, 0.7481],
        [0.9909, 0.7481]])

In [38]:
# we can also give multidim array to get the tables
# Now we can get our embeddings for our characters which --> X , C--> (32 , 3) ,(27,2) -> (32, 3, 2)
C[X].shape

torch.Size([32, 3, 2])

In [40]:
# lets check this
X[13, 2]

tensor(1)

In [41]:
C[X][13, 2]

tensor([0.9725, 0.1287])

In [43]:
# its how its embed on the 3 dim array
C[1]

tensor([0.9725, 0.1287])

In [44]:
# so this our emb matrix for our chars
embed = C[X]
embed.shape

torch.Size([32, 3, 2])

In [45]:
# lets start with the first layer of our MLP
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [46]:
# to modify weights we need to follow W.X+b
# we cant do it directly
# since we have our embed shape as (32, 3, 2) which cant be multiply with [6,100]
# in order to do that we need to change our embed matrix into (32, 6)
W1 @ embed +b1

RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [32, 100] but got: [32, 3].

In [48]:
# lets concat the embed matrix into 32, 6 via 2 nd dim
torch.cat([embed[:, 0,:], embed[:,1,:], embed[:,2,:]], dim=1)

tensor([[0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.8355, 0.7482],
        [0.2524, 0.6784, 0.8355, 0.7482, 0.2429, 0.0680],
        [0.8355, 0.7482, 0.2429, 0.0680, 0.2429, 0.0680],
        [0.2429, 0.0680, 0.2429, 0.0680, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.5342, 0.9979],
        [0.2524, 0.6784, 0.5342, 0.9979, 0.9852, 0.0523],
        [0.5342, 0.9979, 0.9852, 0.0523, 0.2130, 0.6969],
        [0.9852, 0.0523, 0.2130, 0.6969, 0.6299, 0.5623],
        [0.2130, 0.6969, 0.6299, 0.5623, 0.2130, 0.6969],
        [0.6299, 0.5623, 0.2130, 0.6969, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.9725, 0.1287, 0.6299, 0.5623],
        [0.9725, 0.1287, 0.6299, 0.5623, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.252

In [53]:
torch.cat([embed[:,0,:], embed[:,1,:], embed[:,2,:]], dim=1).shape

torch.Size([32, 6])

In [57]:
# this is fine for three block size but for more block size we can use torch.unbind
# which gives == embed[:,0,:], embed[:,1,:], embed[:,2,:]
torch.unbind(embed,1)  # 1, IS DIM

(tensor([[0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.8355, 0.7482],
         [0.2429, 0.0680],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.5342, 0.9979],
         [0.9852, 0.0523],
         [0.2130, 0.6969],
         [0.6299, 0.5623],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.9725, 0.1287],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2130, 0.6969],
         [0.5432, 0.4103],
         [0.9725, 0.1287],
         [0.8615, 0.9255],
         [0.8355, 0.7482],
         [0.9852, 0.0523],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.5432, 0.4103],
         [0.5342, 0.9979],
         [0.9999, 0.7068],
         [0.1026, 0.3385]]),
 tensor([[0.2524, 0.6784],
         [0.2524, 0.6784],
         [0.8355, 0.7482],
         [0.2429, 0.0680],
         [0.2429, 0.0680],

In [59]:
# now we can concate this is same we have on top
torch.cat(torch.unbind(embed,1), 1).shape


torch.Size([32, 6])

In [62]:
# how can do this operation even more efficient way as : with the torch.view() method
# which will not effect the storage but it changes the shape of dim in what shape we need to view the storage
embed.view(32,6) # just as simple as that and it concats the way actually we want in Dim=1


tensor([[0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.8355, 0.7482],
        [0.2524, 0.6784, 0.8355, 0.7482, 0.2429, 0.0680],
        [0.8355, 0.7482, 0.2429, 0.0680, 0.2429, 0.0680],
        [0.2429, 0.0680, 0.2429, 0.0680, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.5342, 0.9979],
        [0.2524, 0.6784, 0.5342, 0.9979, 0.9852, 0.0523],
        [0.5342, 0.9979, 0.9852, 0.0523, 0.2130, 0.6969],
        [0.9852, 0.0523, 0.2130, 0.6969, 0.6299, 0.5623],
        [0.2130, 0.6969, 0.6299, 0.5623, 0.2130, 0.6969],
        [0.6299, 0.5623, 0.2130, 0.6969, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.9725, 0.1287, 0.6299, 0.5623],
        [0.9725, 0.1287, 0.6299, 0.5623, 0.9725, 0.1287],
        [0.2524, 0.6784, 0.2524, 0.6784, 0.2524, 0.6784],
        [0.252

In [64]:
# we can check this :
embed.view(32,6)  == torch.cat(torch.unbind(embed,1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [66]:
# so we can perform our first layer multiplications, Now its actually work
embed.view(32,6) @ W1 +b1

torch.Size([32, 100])

In [67]:
# for not hard coding
h = embed.view(embed.shape[0],6) @ W1 +b1  # we can do  embed.shape[0] == -1
h.shape

torch.Size([32, 100])

#### we have the  values for layer 2 of our MLP  [32,100],
#### lets introduce the non-Linearity to our layer
#####  idx --> Look_up_table[idx] --> [32, 6] (inputs of mlp) -- emb*w1+b --> [6, 100] --> tanh(layer2)

In [68]:
h = torch.tanh(embed.view(embed.shape[0],6) @ W1 +b1 )

In [69]:
h.shape

torch.Size([32, 100])

In [70]:
# We need to make sure the broadcasting rules while calculations of tensors
# in this case we are adding W1 + b
# which are  [32, 100] + [100]
# what broadcasting do
# 32, 100
#  1  , 100    it adds one dim on left which gives a one row of 100 values and adds the same row with 32 rows in w1 (that what we need to have)


In [71]:
# Finally we are here to final layer of our MLP
# we need to 100 ---> 27(our char ints) --- these are vary by vocabulary
W2 = torch.randn(100, 27)
b2 = torch.rand(27)

In [73]:
# so our logits will be h*W2 + b2

logits = h @ W2 +b2
logits.shape

torch.Size([32, 27])

In [74]:
# now we got the logits of our final layer we now need to get the softmax values of those logits
# to get the probabilites
# as we do in our last part
counts = logits.exp()
prob = counts / counts.sum(1, keepdim=True)  # these are the predictions for the next word

In [75]:
prob.shape

torch.Size([32, 27])

In [76]:
prob[0].sum()

tensor(1.0000)

In [77]:
# also  we have actual values Y
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [78]:
# now lets see how was our model predicted these chars values
prob[torch.arange(32), Y]  # these are predicted probs for actual values which are not that good

tensor([8.6524e-08, 6.4011e-03, 7.8358e-06, 6.2319e-06, 6.0953e-12, 2.1791e-09,
        4.8655e-10, 5.5937e-09, 2.0430e-06, 4.9461e-07, 1.2399e-04, 2.3388e-08,
        2.4564e-06, 1.5236e-01, 3.4163e-03, 1.5080e-07, 2.7459e-06, 1.7223e-04,
        8.7097e-05, 1.5270e-08, 5.6583e-08, 1.9838e-06, 7.7026e-08, 5.3460e-05,
        7.0146e-10, 2.1474e-04, 1.0187e-08, 4.6979e-09, 2.8162e-08, 2.3356e-11,
        9.0008e-09, 8.6235e-10])

## Loss function

In [81]:
loss = -prob[torch.arange(32), Y].log().mean()
loss # this is loss we need to minimize to predict the correct next word prediction

tensor(14.8393)

In [142]:
# ------------- > lets make this more cleaner

In [154]:
X.shape,  Y.shape # dataset

(torch.Size([228146, 3]), torch.Size([228146]))

In [155]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.rand(27, 2 , generator=g)  # look up table
W1 = torch.randn((6, 100), generator=g)  ## Layer 1
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)  # layer 2
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [156]:
sum(p.nelement() for p in parameters) # num of parameters


3481

## Forward Pass

In [145]:
# emb = C[X]  # (32, 3, 2)
# h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
# logits = h @ W2 + b2 # (32, 27)
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdim=True)
# loss = -prob[torch.arange(32), Y].log().mean()
# loss


IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [32], [228146]

In [111]:
# we can calculate the loss even more efficiently with the F.cross_entropy
# F.cross_entropy(logits, Y)
# why its efficient
# 1-- more efficient forward pass
# 2-- more efficient back pass
# 3-  and things can be numerically well behave


tensor(10.1065)

In [146]:
# lets see an example:
# in this it working fine right but what if we have extreme high value in our logits
#like [1000, 0, 10 ]
logits = torch.tensor([-5, 0, 1, 2])
counts = logits.exp()

probs = counts / counts.sum()
probs

tensor([6.0625e-04, 8.9976e-02, 2.4458e-01, 6.6484e-01])

In [147]:
# shows nan as exp(1000) --> inf
logits = torch.tensor([-5, 0, 1, 1000])

counts = logits.exp()
print(counts)
probs = counts / counts.sum()
probs

tensor([0.0067, 1.0000, 2.7183,    inf])


tensor([0., 0., 0., nan])

In [157]:
# So how pytorch tackle this in F.cross_entropy
# it just subtracts the highest value in the tensor , which gives an efficient way to represent logits
# not only this Pytorch. Cross_entropy reduce the calculations of all these exp and counts and finding probs by squeezing them which makes much more easier for backpropagation

logits = torch.tensor([-5, 0, 1, 1000]) -1000
counts = logits.exp()
print(counts)
probs = counts / counts.sum()
probs

tensor([0., 0., 0., 1.])


tensor([0., 0., 0., 1.])

In [158]:
for p in parameters:
    p.requires_grad = True

In [175]:
epochs = 100
for _ in range(epochs):

    # mini batch
    ix = torch.randint(0, X.shape[0], (32,))

    # Forward pass
    emb = C[X[ix]]  # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])
    # print(loss.item())
    # Backward Pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update parameters
    for p in parameters:
        learning_rate = -0.1
        p.data += learning_rate * p.grad
print(loss.item())

2.6803760528564453


In [176]:
emb = C[X]  # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Y)
print(loss.item())

2.7324628829956055


In [152]:
Y

tensor([ 5, 13, 13,  ..., 26, 24,  0])