In [12]:
# Import Libraries
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os

In [13]:
# Get the dataset
if not os.path.exists('names.txt'):
    !wget "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
else:
    pass

In [14]:
#Reading training data
words =  open('names.txt','r').read().splitlines()

In [17]:
len(words)

32033

In [16]:
# Create training set for the bigrams
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [24]:
# build the dataset
block_size= 3

# The X are the input for the neural net and Y is the labels corresponding to X
X, Y = [], []

for w in words[:1]:
    print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix= stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] 

X = torch.tensor(X)
Y = torch.tensor(Y)

print('target:',X)
print('label',Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
target: tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])
label tensor([ 5, 13, 13,  1,  0])


In [28]:
X.shape,X.dtype,Y.shape,Y.dtype

(torch.Size([5, 3]), torch.int64, torch.Size([5]), torch.int64)

In [29]:
C = torch.randn(27,2)
# This contains the embedding representation of characters

In [32]:
C[stoi['e']]

tensor([ 0.4903, -0.1670])

In [35]:
# lets try the one hot method and see if we are getting similiar results or not
F.one_hot(torch.tensor(stoi['e']),num_classes=27).float() @ C

tensor([ 0.4903, -0.1670])

In [48]:
# Embedding our X
C[X]
C[X]

tensor([[[ 0.6951,  0.5885],
         [ 0.6951,  0.5885],
         [ 0.6951,  0.5885]],

        [[ 0.6951,  0.5885],
         [ 0.6951,  0.5885],
         [ 0.4903, -0.1670]],

        [[ 0.6951,  0.5885],
         [ 0.4903, -0.1670],
         [-1.0997, -0.6135]],

        [[ 0.4903, -0.1670],
         [-1.0997, -0.6135],
         [-1.0997, -0.6135]],

        [[-1.0997, -0.6135],
         [-1.0997, -0.6135],
         [ 0.7726,  0.4719]]])

In [57]:
emd = C[X]

In [59]:
emd.shape

torch.Size([5, 3, 2])

In [60]:
# The tanh layer
W1 = torch.randn(6,100)
b1 = torch.randn(100)

In [61]:
emd @ W1 + b1

RuntimeError: mat1 and mat2 shapes cannot be multiplied (15x2 and 6x100)

In [75]:
# The above resulted in error because the embeddings are stacked up in the dimension (5,3,2) we need to somehow concatenate it

# First Way
# plucking out individual rows for each dimension
[emd[:,0,:], emd[:,1,:], emd[:,2,:]]

#concatenating them on axis=1
torch.cat([emd[:,0,:], emd[:,1,:], emd[:,2,:]],1).shape



torch.Size([5, 6])

In [81]:
# Second Way
# it takes the parameter that acts as a dimension on which you want the values. It spits out an list of tensors containing that exact information
torch.unbind(emd,1)


#concat it through dimension 1
torch.concat(torch.unbind(emd,1),1).shape

torch.Size([5, 6])

In [82]:
# Why is the above way in efficient?
a = torch.arange(18)

In [86]:
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [89]:
a.view(3,6)

tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16, 17]])

In [90]:
'''When we called a.view(3,3,2) on it the attributes of the tensor works internally to reflect that tensor as shape(3,3,2) 
but underlying storage doesn't change that means the no new memory is created for this operation and this 
the exact reason why torch.unbind was not an efficient choice as concatenation of tension will always create a new memory.'''

"When we called a.view(3,3,2) on it the attributes of the tensor works internally to reflect that tensor as shape(3,3,2) \nbut underlying storage doesn't change that means the no new memory is created for this operation and this \nthe exact reason why torch.unbind was not an efficient choice as concatenation of tension will always create a new memory."

In [93]:
h = emd.view(5,6)@ W1 + b1
# To make it more dynamic let's fetch the shape of tensor from the tensor itself

h = emd.view(emd.shape[0],6) @ W1 + b1

# If we give -1 then it interprets the dimension on its own
h = emd.view(-1,6) @ W1 + b1

In [103]:
# The softmax layer
# The input will be the output of previous layer that will be 100 hence my softmax layer will take input of 100 and 
# since we this is the last layer we want it to represent the characters that will ne represented by neurons in our case it will be 27
W2 = torch.randn(100,27)
b2 = torch.randn(27)

In [112]:
# The logits
logits = h @ W2 + b2
counts = logits.exp()
probs = counts/counts.sum(1,keepdim=True)


In [117]:
probs

tensor([[1.0722e-13, 1.9058e-10, 1.1091e-18, 6.7012e-08, 1.5597e-10, 2.7795e-23,
         7.1728e-28, 5.5383e-30, 8.7077e-22, 5.6067e-25, 1.9316e-04, 5.7447e-12,
         1.1414e-07, 3.0389e-23, 2.0979e-28, 9.9959e-01, 8.0854e-13, 9.1878e-09,
         2.2304e-17, 3.0658e-10, 4.8815e-06, 9.7131e-16, 4.1470e-08, 1.7691e-25,
         1.5031e-15, 2.1425e-04, 1.3438e-25],
        [4.3870e-08, 2.5043e-09, 1.8327e-21, 1.8688e-09, 1.2603e-06, 3.8913e-19,
         4.0859e-29, 5.0054e-25, 2.0458e-21, 4.6420e-22, 4.2432e-01, 1.4125e-09,
         1.1695e-04, 2.9947e-15, 5.6164e-24, 6.4353e-03, 1.0323e-10, 3.2459e-12,
         1.4123e-18, 2.8439e-06, 9.5557e-09, 2.9738e-19, 3.0809e-06, 3.5601e-23,
         3.3558e-11, 5.6912e-01, 3.7505e-21],
        [1.2824e-08, 4.7736e-12, 6.4878e-31, 3.4058e-19, 9.0286e-28, 7.3304e-14,
         1.4065e-25, 7.7113e-23, 2.2929e-22, 2.2235e-27, 8.7217e-07, 6.1552e-15,
         1.6313e-02, 2.6638e-07, 4.2773e-11, 5.6812e-12, 1.7715e-18, 1.5601e-22,
         2.6979e-

In [119]:
probs[torch.arange(5),Y]

tensor([2.7795e-23, 2.9947e-15, 2.6638e-07, 6.5361e-14, 4.4588e-27])

In [120]:
# Negative log likelihood
-probs[torch.arange(5),Y].log().mean()

tensor(38.3102)

In [121]:
# Rewriting the code to make it work with dataset containing first 5 words
# build the dataset
block_size= 3

# The X are the input for the neural net and Y is the labels corresponding to X
X, Y = [], []

for w in words[:5]:
    print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix= stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] 

X = torch.tensor(X)
Y = torch.tensor(Y)

print('target:',X)
print('label',Y)


emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .
target: tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 1

In [122]:
g = torch.Generator().manual_seed(2147483647) # for reproduciability
C = torch.randn(27,2,generator=g)
W1 = torch.randn(6,100,generator=g)
b1 = torch.randn(100,generator=g)
W2 = torch.randn(100,27,generator=g)
b2 = torch.randn(27,generator=g)

parameters = [C,W1,b1,W2,b2]


In [123]:
sum(p.nelement() for p in parameters)

3481

In [124]:
emb = C[X] #(32,3,2)
h = torch.tanh(emb.view(-1,6) @ W1+b1) # (32,100)
logits = h @ W2 + b2 # (32,27)
counts = logits.exp()
probs = counts/counts.sum(1,keepdim=True)
loss = -probs[torch.arange(32),Y].log().mean()
loss



tensor(17.7697)

In [125]:
# Introducing Cross Entropy
F.cross_entropy(logits,Y)

tensor(17.7697)