In [13]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Read the names dataset, one name per line. The with-block closes the file
# as soon as its contents have been read instead of leaving the handle open.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [15]:
# Vocabulary: every distinct character that appears in the names, in sorted order
chars = sorted(set(''.join(words)))

# index -> character: the characters get 1..len(chars), then index 0 is assigned to '.'
i2s = dict(enumerate(chars, start=1))
i2s[0] = '.'

# character -> index: the inverse mapping of i2s
s2i = {s: i for i, s in i2s.items()}

    


In [16]:
block_size = 3  # how many preceding characters form the context for one prediction
X, Y = [], []  # X gathers the context windows (inputs), Y the index of the character that follows (labels)

for w in words[:5]:
    context = [0] * block_size  # index 0 maps to '.', so each word starts from an all-'.' window
    for ch in w + '.':
        print(w)
        ix = s2i[ch]
        X.append(context)  # the window as it stands before ch is seen
        Y.append(ix)
        print(''.join(map(i2s.__getitem__, context)), '--->', i2s[ix])
        context = [*context[1:], ix]  # slide the window: drop the oldest index, append the newest

X = torch.tensor(X)  # [n, block_size]: one context window per row
Y = torch.tensor(Y)  # [n]: the index of the character to predict for each row

emma
... ---> e
emma
..e ---> m
emma
.em ---> m
emma
emm ---> a
emma
mma ---> .
olivia
... ---> o
olivia
..o ---> l
olivia
.ol ---> i
olivia
oli ---> v
olivia
liv ---> i
olivia
ivi ---> a
olivia
via ---> .
ava
... ---> a
ava
..a ---> v
ava
.av ---> a
ava
ava ---> .
isabella
... ---> i
isabella
..i ---> s
isabella
.is ---> a
isabella
isa ---> b
isabella
sab ---> e
isabella
abe ---> l
isabella
bel ---> l
isabella
ell ---> a
isabella
lla ---> .
sophia
... ---> s
sophia
..s ---> o
sophia
.so ---> p
sophia
sop ---> h
sophia
oph ---> i
sophia
phi ---> a
sophia
hia ---> .


In [5]:
g2 = torch.Generator().manual_seed(214748364)

# Embedding table: 27 rows, each a 2-dimensional vector
C = torch.randn(27, 2, generator=g2)

# Row lookup written as a matrix product: V is a one-hot vector of length 27 with a 1 at index 5,
# so V @ C picks out row 5 of C
V = F.one_hot(torch.tensor(5), num_classes=27).to(torch.float32)
A = torch.matmul(V, C)

# Indexing C with the integer tensor X fetches one row of C for every entry of X
emb = C[X]
emb.shape   # X's shape followed by the embedding size 2

torch.Size([32, 3, 2])

In [6]:
# PyTorch indexing: C[X] fetches one row of C for every integer stored in X

print(X.size())
print(C[X].size())
print(C[X][13][2])  # the row of C fetched for entry [13, 2] of X
print(C[1])         # a single row of C, for comparison with the line above

torch.Size([32, 3])
torch.Size([32, 3, 2])
tensor([-0.7303,  1.1832])
tensor([-0.7303,  1.1832])


In [7]:
# First hidden layer: 100 neurons, each taking 6 inputs

W1 = torch.randn(6, 100, generator=g2)
b1 = torch.randn((100,), generator=g2)

# The goal is emb @ W1 + b1, but emb is not yet a matrix with 6 columns, so the product will not work as-is

In [8]:
# Join the slices of emb taken along dimension 1 side by side, giving one flat row per example.
# Written out by hand for three positions this would be:
#   N = torch.cat((emb[:, 0, :], emb[:, 1], emb[:, 2]), 1)
#   N.shape
# unbind yields the same slices without hard-coding how many positions there are.

Z = torch.cat(emb.unbind(dim=1), dim=1)
Z.shape

torch.Size([32, 6])

In [9]:
# Flatten emb by concatenating its slices along dimension 1.
# Inefficient: torch.cat builds a brand-new tensor, which costs extra memory.
pieces = torch.unbind(emb, 1)  # tuple of the slices of emb along dimension 1
Z = torch.cat(pieces, 1)
Z.shape

# Q = torch.unbind(emb, 1)  this breaks the [32, 3, 2] matrix into a tuple of 3 [32, 2] arrays
# Q[0].shape

torch.Size([32, 6])

In [10]:
## practice with a.view

a = torch.arange(0, 18)
print(a.size())
a.view((3, 3, 2))  # the requested sizes must multiply to the element count (3 * 3 * 2 == 18)

torch.Size([18])


tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [11]:
# Hidden-layer activations: reshape emb to rows of 6 values, apply W1 and b1, then squash with tanh
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h

tensor([[-0.7437,  0.9111,  0.4172,  ..., -0.8906,  0.0780, -0.1556],
        [-0.9579,  0.9981,  0.9558,  ..., -0.7515,  0.7401,  0.9818],
        [ 0.5338,  0.9274,  0.1247,  ...,  0.9116,  0.4257,  0.8823],
        ...,
        [-0.9653,  0.9928,  0.9868,  ...,  0.8227, -0.8342,  0.9816],
        [-0.4200,  0.9607,  0.2070,  ...,  0.8971,  0.9916,  0.2514],
        [-0.9407,  0.9780, -0.9071,  ...,  0.9787, -0.9759, -0.2924]])

In [22]:
g2 = torch.Generator().manual_seed(214748364)

# Embedding table: a 2-dimensional vector for each of the 27 character indices
C = torch.randn((27, 2), generator=g2)

# Hidden layer: 6 inputs (3 context characters x 2 embedding values) -> 100 neurons
W1 = torch.randn((6, 100), generator=g2)
b1 = torch.randn(100, generator=g2)

# Output layer: 100 hidden activations -> 27 logits, one per character index
W2 = torch.randn((100, 27), generator=g2)
b2 = torch.randn((27), generator=g2)

params = [C, W1, W2, b1, b2]
num = sum(p.nelement() for p in params)  # total number of trainable values
# print(num)

for p in params:
    p.requires_grad = True

learning_rate = 0.2  # step size for plain gradient descent
num_steps = 10       # number of full-batch updates to run

for _ in range(num_steps):
    # Forward pass: embed the contexts, apply the hidden layer, produce logits
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss)
    # F.cross_entropy replaces the manual computation below:
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdim=True)   # each row sums to 1 (normalized)
    # loss = -prob[torch.arange(32), Y].log().mean()  # picks the probability assigned to each label in Y

    # Backward pass: clear stale gradients, then backpropagate
    for p in params:
        p.grad = None

    loss.backward()

    # Gradient-descent step. The in-place update runs under no_grad so autograd does not
    # record it; this replaces writing through p.data, which bypasses autograd's
    # in-place modification checks.
    with torch.no_grad():
        for p in params:
            p += -learning_rate * p.grad
    

tensor(18.3554, grad_fn=<NllLossBackward0>)
tensor(12.3675, grad_fn=<NllLossBackward0>)
tensor(9.3520, grad_fn=<NllLossBackward0>)
tensor(7.5759, grad_fn=<NllLossBackward0>)
tensor(6.1386, grad_fn=<NllLossBackward0>)
tensor(5.5536, grad_fn=<NllLossBackward0>)
tensor(4.2515, grad_fn=<NllLossBackward0>)
tensor(3.6685, grad_fn=<NllLossBackward0>)
tensor(3.1533, grad_fn=<NllLossBackward0>)
tensor(2.6511, grad_fn=<NllLossBackward0>)
