In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the dataset: one name per line.
# The context manager closes the file handle deterministically (the original
# bare open(...).read() left that to the garbage collector), and the explicit
# encoding keeps the result independent of the platform default.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [6]:
# number of names loaded from the file (one per line)
len(words)

32033

In [15]:
# Lookup tables between characters and integer ids.
# Ids 1..N are the sorted characters that occur in the names; id 0 is
# reserved for '.', the start/end-of-word marker.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}  # inverse mapping: id -> character

In [27]:
# Build a small demo dataset from the first five names, printing every
# (context -> target) pair so the sliding window is easy to follow.

block_size = 3  # number of preceding characters used to predict the next one
X, Y = [], []
for w in words[:5]:

    print(w)
    context = [0] * block_size  # every name starts from an all-'.' context (id 0)
    # walk over the ids of the name's characters plus the closing '.' marker
    for ix in (stoi[ch] for ch in w + '.'):
        X.append(context)
        Y.append(ix)
        print(f"{''.join(itos[i] for i in context)} -----> {itos[ix]}")
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [39]:
# sanity check: X holds one row of context ids per example, Y one target id per example
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [31]:
# Embedding lookup table: one randomly initialised 2-D vector per character
# id (27 rows), i.e. each character is represented by a point in 2-D space.
C = torch.randn(27, 2)

In [48]:
# Embed every context character at once: indexing C with the integer tensor X
# looks up one row of C per entry of X, so the result has X's shape plus a
# trailing embedding dimension.
emb = C[X]
emb.shape

# Worked example:
# X[13, 2] -> is 1 (the 'a' at the end of the context '..a')
# emb[13, 2], C[X[13, 2]] and C[1] are therefore the same thing: the 2-D
# embedding of the integer input 1.
# NOTE: emb[X[13, 2]] is just emb[1], which is NOT that vector -- it indexes
# example 1 and returns the embeddings of its whole 3-character context
# ('..e'), e.g.
# tensor([[ 1.5126,  1.2000],
#         [ 1.5126,  1.2000],
#         [-0.7919, -0.1932]])
# (the first two rows are equal because both are the embedding of '.')

torch.Size([32, 3, 2])

In [49]:
# Hidden-layer parameters.
# 6   -> input width: 3 context characters x 2 embedding dims each
# 100 -> number of hidden units; an arbitrary choice (hyperparameter)
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [58]:
# Hidden layer: flatten each example's context embeddings into one row, apply
# the affine map of the first layer, then tanh.
# Tensor.view does not copy or rearrange the underlying storage; it only
# reinterprets the same memory with a new shape. Reading the (N, 3, 2) tensor
# as (N, 6) is therefore equivalent to concatenating the three 2-D embeddings
# of every example.
# The batch dimension is kept explicit and -1 infers the feature width, so the
# 6 (= 3 x 2) is not hard-coded and a shape mismatch with W1 fails loudly
# instead of silently regrouping rows.
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
# h holds the 100 tanh activations of the hidden layer for every example
# (cf. Bengio et al. 2003)
h

tensor([[-0.8064,  0.8649,  0.9566,  ..., -0.4694,  0.9214, -0.8978],
        [ 0.3819, -0.7671, -0.8991,  ..., -0.8792,  0.7239, -0.8545],
        [ 0.8336,  0.2300, -0.1703,  ..., -0.9996, -0.9830,  0.2964],
        ...,
        [ 0.8771, -0.4381,  0.9549,  ..., -1.0000, -0.9967, -0.9179],
        [ 0.9720,  0.2754,  0.9995,  ...,  0.6776, -0.9609,  0.9840],
        [ 0.9274, -0.8437,  0.9696,  ...,  0.9955, -0.0239, -0.1782]])

In [60]:
# Output (softmax) layer parameters: 100 hidden activations -> 27 scores,
# one per character id.
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [61]:
# unnormalised scores of the output layer, one row per example
logits = torch.matmul(h, W2) + b2

In [62]:
# expect one row per example and one column per character id
logits.shape

torch.Size([32, 27])

In [63]:
# Softmax numerators ("fake counts"). Each row's largest logit is subtracted
# before exponentiating: the common factor cancels when the rows are
# normalised, so the probabilities are unchanged, but exp() can no longer
# overflow to inf (and produce NaN probabilities) on large logits.
counts = (logits - logits.max(dim=1, keepdim=True).values).exp()

In [64]:
# normalise every row to sum to 1 -> one probability distribution per example
prob = counts / counts.sum(dim=1, keepdim=True)

In [69]:
# target ids: the character that actually follows each context in X
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [73]:
# Mean negative log-likelihood: for every example pick the probability the
# model assigned to its true next character, take the log and average.
# The row index is derived from Y instead of a hard-coded 32, so the cell
# keeps working when the dataset size changes.
loss = -torch.log(prob[torch.arange(Y.shape[0]), Y]).mean()
loss

tensor(14.9209)

## Putting the MLP together

In [116]:
# Build the full dataset: every character of every name (plus the closing
# '.' marker) becomes one training example.

block_size = 3  # context length: how many previous characters predict the next
X, Y = [], []
for w in words:

    context = [0] * block_size  # start from an all-'.' context (id 0)
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # rolling window: copy everything but the oldest id, then add the new
        # one (the slice is a fresh list, so the row stored in X is untouched)
        context = context[1:]
        context.append(ix)

X, Y = torch.tensor(X), torch.tensor(Y)

In [117]:
X.shape, Y.shape  # full dataset: one (context, target) pair per character of every name, end markers included

(torch.Size([228146, 3]), torch.Size([228146]))

In [118]:
# dedicated random generator with a fixed seed so that everything drawn from
# it is reproducible
SEED = 2**31 - 1  # 2147483647
g = torch.Generator().manual_seed(SEED)

In [119]:
# Parameters of the MLP, all drawn from the seeded generator.
# NOTE: the creation order matters -- every randn call advances g, so
# reordering these lines would change the initial values.
C = torch.randn(27, 2, generator=g)     # embedding table: 27 character ids -> 2-D vectors
W1 = torch.randn(6, 100, generator=g)   # hidden layer weights: 3 x 2 inputs -> 100 units
b1 = torch.randn((100,), generator=g)   # hidden layer bias
W2 = torch.randn(100, 27, generator=g)  # output layer weights: 100 units -> 27 logits
b2 = torch.randn((27,), generator=g)    # output layer bias
params = [C, W1, b1, W2, b2]

In [120]:
# total number of scalar parameters in the network
sum(p.numel() for p in params)

3481

In [121]:
# switch on gradient tracking for every trainable tensor
for p in params:
    p.requires_grad_(True)

In [139]:
# Mini-batch SGD.
# NOTE: this cell updates the parameters in place and does not reinitialise
# them, so re-running it continues training from the current state.
n_steps = 100     # gradient steps per run of this cell
batch_size = 32   # examples per minibatch
lr = 0.1          # learning rate

for i in range(n_steps):

    # minibatch: draw the example indices from the seeded generator so a
    # fresh Restart & Run All samples the same batches every time
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    # forward prop
    emb = C[X[ix]]
    # flatten the context embeddings per example; the width is inferred so
    # the 6 (= 3 chars x 2 dims) is not hard-coded
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    # F.cross_entropy performs the softmax + negative log-likelihood that was
    # spelled out by hand earlier, without materialising the intermediate
    # tensors
    loss = F.cross_entropy(logits, Y[ix])

    # backward prop
    for p in params:
        p.grad = None
    loss.backward()

    # update: plain SGD step; done under no_grad so autograd does not track
    # the in-place modification of the leaf tensors
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad

# loss of the last minibatch only (noisy estimate of the training loss)
print(loss.item())

2.504465103149414


In [140]:
#overall loss
emb = C[X]
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
loss

tensor(2.6897, grad_fn=<NllLossBackward0>)