In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the corpus of names, one name per line. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open until the
# garbage collector happens to reclaim it.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
# Collect every distinct character that occurs in any name and show the container type.
type({ch for w in words for ch in w})

set

In [4]:
# Vocabulary: the distinct characters of the corpus, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index. Letters are numbered from 1 so that index 0 stays free for '.'.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0  # '.' is used both as context padding and as the end-of-word marker
# Index -> character: the inverse mapping, built in the same insertion order as stoi.
itos = {idx: ch for ch, idx in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

Now let's build the dataset

How the code works is the following:

- block_size specifies the context length, i.e. the number of characters that will be used to predict the next one
- context initially starts off as a list of dots. as you increment the context, you add the subsequent characters into the context, specifically the indices
- personally i think the print statement is genius. it really visualizes what's going on in context, and it helps me understand how itos really works

In [5]:
block_size = 3  # context length: how many preceding characters are used to predict the next one
X, Y = [], []
for w in words[:5]:
    print(f'word: {w}')
    # every word starts from a context made entirely of '.' (index 0)
    context = [0] * block_size
    for ch in f'{w}.':
        ix = stoi[ch]
        # one training pair: the current context and the index of the character that follows it
        X.append(context)
        Y.append(ix)
        shown = ''.join([itos[i] for i in context])
        print(shown, '--->', itos[ix])
        # slide the window: drop the oldest index, append the one just seen
        context = [*context[1:], ix]

# convert the collected Python lists into tensors
X, Y = torch.tensor(X), torch.tensor(Y)

word: emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
word: olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
word: ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
word: isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
word: sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [6]:
# now let's look at the specifics of our dataset:
# shape and dtype of the inputs X (one row of context indices per example) and of the targets Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In the paper that we are reproducing, they embed 17,000 words in as little as 30 dimensions, so since we only have 27 characters to worry about, we'll start by embedding them in 2 dimensions

In [7]:
# Embedding table: one 2-dimensional row per character index (27 rows).
# A dedicated, explicitly seeded generator makes the random initialisation
# reproducible, so a fresh "Restart & Run All" yields the same numbers every time.
gen_C = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=gen_C)
C, C.shape, C.dtype

(tensor([[-0.5065, -1.1483],
         [ 0.8822,  0.5330],
         [-1.2642,  0.0869],
         [-0.0562, -0.3326],
         [ 0.0144,  1.2099],
         [ 1.8999, -1.2768],
         [ 0.1334,  0.1299],
         [ 0.4965, -0.9700],
         [ 0.7935,  0.0548],
         [-0.6443,  1.4054],
         [-1.5379, -0.0958],
         [-1.1648, -0.5040],
         [-0.8052,  1.8072],
         [-1.3181,  0.5950],
         [-0.3643,  1.0934],
         [ 0.1604,  0.3971],
         [ 0.9695, -0.4810],
         [ 0.4978,  2.4461],
         [-0.7249, -2.0963],
         [ 0.4252, -0.4802],
         [ 0.0848, -0.2283],
         [-1.0108,  1.3149],
         [-0.5131,  0.0509],
         [ 1.4330,  1.0075],
         [ 0.6535, -0.2736],
         [-0.6682,  0.1172],
         [-0.2573, -0.3574]]),
 torch.Size([27, 2]),
 torch.float32)

Now say we wanted to look up the embedding of a character in our C matrix:

In [8]:
C[5] # indexing is 0-based: this picks out the row at index 5 of C (its sixth row), i.e. the 2-d embedding of character index 5

tensor([ 1.8999, -1.2768])

In [9]:
# Several rows of C can be fetched in one go, because pytorch indexing accepts
# a collection of indices. This is what later lets us look up all of X at once.
rows_from_list = C[[5, 6, 7]]                        # a plain Python list of row indices
rows_from_tensor = C[torch.tensor([5, 6, 7])]        # the same indices given as a tensor
rows_with_repeat = C[torch.tensor([5, 6, 7, 7, 9])]  # an index may be repeated
print(rows_from_list, '\n', rows_from_tensor, '\n', rows_with_repeat)

tensor([[ 1.8999, -1.2768],
        [ 0.1334,  0.1299],
        [ 0.4965, -0.9700]]) 
 tensor([[ 1.8999, -1.2768],
        [ 0.1334,  0.1299],
        [ 0.4965, -0.9700]]) 
 tensor([[ 1.8999, -1.2768],
        [ 0.1334,  0.1299],
        [ 0.4965, -0.9700],
        [ 0.4965, -0.9700],
        [-0.6443,  1.4054]])


In [10]:
C[X] # integer-tensor indexing: every index stored in X is replaced by the matching row of C, so each 3-index context of X becomes three 2-d row vectors

tensor([[[-0.5065, -1.1483],
         [-0.5065, -1.1483],
         [-0.5065, -1.1483]],

        [[-0.5065, -1.1483],
         [-0.5065, -1.1483],
         [ 1.8999, -1.2768]],

        [[-0.5065, -1.1483],
         [ 1.8999, -1.2768],
         [-1.3181,  0.5950]],

        [[ 1.8999, -1.2768],
         [-1.3181,  0.5950],
         [-1.3181,  0.5950]],

        [[-1.3181,  0.5950],
         [-1.3181,  0.5950],
         [ 0.8822,  0.5330]],

        [[-0.5065, -1.1483],
         [-0.5065, -1.1483],
         [-0.5065, -1.1483]],

        [[-0.5065, -1.1483],
         [-0.5065, -1.1483],
         [ 0.1604,  0.3971]],

        [[-0.5065, -1.1483],
         [ 0.1604,  0.3971],
         [-0.8052,  1.8072]],

        [[ 0.1604,  0.3971],
         [-0.8052,  1.8072],
         [-0.6443,  1.4054]],

        [[-0.8052,  1.8072],
         [-0.6443,  1.4054],
         [-0.5131,  0.0509]],

        [[-0.6443,  1.4054],
         [-0.5131,  0.0509],
         [-0.6443,  1.4054]],

        [[-0.5131,  0

In [11]:
C[X][0] # embeddings of the first example's context, which is all '.' padding (index 0), hence three copies of the same row

tensor([[-0.5065, -1.1483],
        [-0.5065, -1.1483],
        [-0.5065, -1.1483]])

Let's do more examples with pytorch indexing

In [12]:
X[13] # the row at index 13 of X (0-based); not the cell's last expression, so its value is not displayed
X[13, 2] # the entry at position 2 (the last of the three context slots) of that row

tensor(1)

In [13]:
C[X[13,2]] # look up the row of C for the character index stored at X[13, 2]

tensor([0.8822, 0.5330])

In [14]:
C[X][13, 2] # the same lookup the other way round: embed all of X first, then pick example 13, context slot 2

tensor([0.8822, 0.5330])

In [15]:
C[1] # X[13, 2] holds index 1, so this row matches the embedding picked out via X

tensor([0.8822, 0.5330])

now let's formally define our embedding

This says that we have 32 training examples, each made of 3 context characters (one per input position), and each of those characters is represented by its two-dimensional embedding

In [16]:
emb = C[X] # embed every context of X with a single indexing operation
emb.shape # (number of examples, block_size, embedding dimension)

torch.Size([32, 3, 2])

Now let's start making our weights

first dimension of W needs to equal the number of inputs that's coming from C: since we have two dimensional embeddings, and our context length was 3, the first dimension must be $3\times2 = 6$ to handle all of the inputs

the second dimension of W1 will be some arbitrary number representing the neurons in this hidden layer

the biases have to equal the number of neurons that I set in W1---just a vector of scalar elements

In [17]:
# First (hidden) layer parameters: 6 inputs (3 context characters x 2 embedding
# dimensions) feeding 100 neurons, plus one bias per neuron.
# The explicitly seeded generator makes the initialisation reproducible across kernel restarts.
gen_l1 = torch.Generator().manual_seed(2147483646)
W1 = torch.randn((6, 100), generator=gen_l1)
b1 = torch.randn(100, generator=gen_l1)

Our normal way of tensor multiplication won't work now, however, since emb @ W1 doesn't have compatible shapes

Remember that the shape of emb is: the first dimension is the number of examples that we saved, the second dimension is the context length---i.e. the number of input positions---and the third dimension is the embedding size of each character, 2. W1, on the other hand, is 6 by 100.

In [18]:
emb.shape, W1.shape # the last dimension of emb (2) does not match the first dimension of W1 (6)

(torch.Size([32, 3, 2]), torch.Size([6, 100]))

In [19]:
# This product is *expected* to fail because the shapes are incompatible.
# The error is caught and printed so that the demonstration stays visible
# without aborting a "Restart & Run All" of the notebook.
try:
    emb @ W1
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

So now we need to do something else

Now what would work is if we concatenated emb so that we could turn the 32x3x2 into a _x6 tensor so that we can do the tensor multiplication

The first step we need to do is pluck out the first embedding, but keep all of the relevant information

In [20]:
# concatenate the embeddings of the three context positions side by side;
# the positions are written out by hand, so this form only fits a context length of 3
torch.cat((emb[:, 0], emb[:, 1], emb[:, 2]), dim=1).shape

torch.Size([32, 6])

However this isn't good enough, since we'd have to change the .cat if the context length changed, so we need something else---we basically are isolating dim=1, so we can use torch.unbind for this

this creates a tuple of tensors, which is equivalent to the arg within torch.cat

In [21]:
# torch.unbind(emb, dim=1) yields one tensor per context position, so no position is hard-coded here
torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  1.8999, -1.2768],
        [-0.5065, -1.1483,  1.8999, -1.2768, -1.3181,  0.5950],
        [ 1.8999, -1.2768, -1.3181,  0.5950, -1.3181,  0.5950],
        [-1.3181,  0.5950, -1.3181,  0.5950,  0.8822,  0.5330],
        [-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  0.1604,  0.3971],
        [-0.5065, -1.1483,  0.1604,  0.3971, -0.8052,  1.8072],
        [ 0.1604,  0.3971, -0.8052,  1.8072, -0.6443,  1.4054],
        [-0.8052,  1.8072, -0.6443,  1.4054, -0.5131,  0.0509],
        [-0.6443,  1.4054, -0.5131,  0.0509, -0.6443,  1.4054],
        [-0.5131,  0.0509, -0.6443,  1.4054,  0.8822,  0.5330],
        [-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  0.8822,  0.5330],
        [-0.5065, -1.1483,  0.8822,  0.5330, -0.5131,  0.0509],
        [ 0.8822,  0.5330, -0.5131,  0.0

We're starting to go into an aside, so that we can learn what's going on with torch.tensor under the hood

In [22]:
a = torch.arange(18) # 1-d tensor holding the integers 0..17
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [23]:
a.shape # a single dimension of length 18

torch.Size([18])

In [24]:
a.view(3, 3, 2) # the same 18 values, read as a 3x3x2 tensor

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

Now we can call .storage() and see how these tensors are stored in the computer's memory

In [25]:
# The recorded output lists the 18 values as one flat sequence.
# NOTE(review): the stray echoed source line at the top of this cell's output looks like
# the tail of a deprecation warning for Tensor.storage() -- confirm against the installed
# PyTorch version before relying on this call.
a.storage() # computers store vectors!

  a.storage() # computers store vectors!


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

So turns out we can just use .view() instead of using the .cat operation!

In [26]:
# read the 32x3x2 embeddings as 32 rows of 6 values; both sizes are hard-coded here
emb.view(32, 6)

tensor([[-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  1.8999, -1.2768],
        [-0.5065, -1.1483,  1.8999, -1.2768, -1.3181,  0.5950],
        [ 1.8999, -1.2768, -1.3181,  0.5950, -1.3181,  0.5950],
        [-1.3181,  0.5950, -1.3181,  0.5950,  0.8822,  0.5330],
        [-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  0.1604,  0.3971],
        [-0.5065, -1.1483,  0.1604,  0.3971, -0.8052,  1.8072],
        [ 0.1604,  0.3971, -0.8052,  1.8072, -0.6443,  1.4054],
        [-0.8052,  1.8072, -0.6443,  1.4054, -0.5131,  0.0509],
        [-0.6443,  1.4054, -0.5131,  0.0509, -0.6443,  1.4054],
        [-0.5131,  0.0509, -0.6443,  1.4054,  0.8822,  0.5330],
        [-0.5065, -1.1483, -0.5065, -1.1483, -0.5065, -1.1483],
        [-0.5065, -1.1483, -0.5065, -1.1483,  0.8822,  0.5330],
        [-0.5065, -1.1483,  0.8822,  0.5330, -0.5131,  0.0509],
        [ 0.8822,  0.5330, -0.5131,  0.0

So now this is all what we need to do!

In [27]:
# Flatten each example's (block_size, embedding) pair of axes into one row and apply the first layer.
# More rigid spellings of the same reshape, kept for comparison:
#   emb.view(32, 6)                                    -- both sizes hard-coded (least flexible)
#   emb.view(emb.shape[0], emb.shape[1]*emb.shape[2])  -- sizes read from emb (better)
# Passing -1 lets pytorch infer the number of rows on its own.
flat_width = emb.shape[1] * emb.shape[2]
h = emb.view(-1, flat_width) @ W1 + b1

In [28]:
h.shape # one row per example, one column per hidden neuron

torch.Size([32, 100])

This is a much better way to manipulate tensors because torch.cat needs more memory to create this new tensor, and then further do manipulations with it

but remember that we want h to be tanh

In [29]:
# Same affine map as in the previous step, now passed through tanh to obtain the hidden activations.
flat_width = emb.shape[1] * emb.shape[2]
h = (emb.view(-1, flat_width) @ W1 + b1).tanh()
h

tensor([[-0.9961,  0.6846,  0.9854,  ..., -0.2181, -0.7957, -0.9717],
        [-0.4027, -0.1303,  0.9906,  ...,  0.9999, -0.9998, -0.9985],
        [ 0.8235,  0.8517,  0.9596,  ..., -0.9883, -0.9744,  0.4970],
        ...,
        [ 0.9917,  0.2004,  0.9947,  ...,  0.9733, -0.4136, -0.7867],
        [ 0.8529,  0.8911,  0.5631,  ..., -0.9920, -0.4192, -0.8091],
        [-0.6496, -0.2859, -0.6388,  ...,  0.8515,  0.7850, -0.9923]])

In [30]:
h.shape # hidden-layer activations: one row of 100 values for every one of our 32 examples

torch.Size([32, 100])

Remember to be careful with tensor broadcasting (when I'm using addition)
you must make sure that the shapes are able to be broadcasted together

In [31]:
# h.shape = [32, 100]  (same shape as emb.view(...) @ W1, the term b1 is added to)
# b1.shape = [100]
# so how these will be broadcasted is the following
# 32, 100
#     100 (align at the right)

# 32, 100
#  1, 100 # create a fake dimension
# the single row of b1 is then added to each of the 32 rows

Now we're going to create the last layer in the network, where we're going to SoftMax the hell out of our activation layer

In [37]:
# Output layer parameters: 100 hidden neurons feeding 27 outputs (one per
# possible character), plus one bias per output.
# The explicitly seeded generator makes the initialisation reproducible across kernel restarts.
gen_l2 = torch.Generator().manual_seed(2147483645)
W2 = torch.randn((100, 27), generator=gen_l2) # 100 for number of neurons, 27 is possible number of characters
b2 = torch.randn(27, generator=gen_l2)
# display the bias vector: the output recorded for this cell is a 27-element tensor,
# which the cell can only show if it ends with an expression
b2

tensor([ 0.2328, -1.9877, -0.2004, -0.2033,  0.1437, -1.9307, -0.0372, -0.8194,
        -2.8052,  0.3659, -0.5587,  0.0999, -1.5277,  0.8663,  0.8419, -0.3162,
         0.5580, -1.1929,  0.4629, -0.3704,  0.2857,  1.5748, -0.8777, -0.4344,
        -1.6901, -0.6643, -1.7464])

And now to make our logits, which are computed from the output of the activation layer (our hidden layer)

In [40]:
logits = h @ W2 + b2 # (32, 100) @ (100, 27) -> (32, 27); b2 is broadcast across the rows
logits.shape

torch.Size([32, 27])

Just like before we need to exponentiate our logits, and then normalize them in order to get a probability

In [124]:
# counts has to exist before it can be inspected: the unnormalised scores are the
# exponentiated logits. Without this assignment the cell raises NameError on a
# fresh kernel, because counts is otherwise only assigned in a later cell.
counts = logits.exp()
counts[0].shape

torch.Size([27])

counts by itself is a $32 \times 27$ tensor: each of the 32 rows is a length-27 vector holding the unnormalized scores of one example over the 27 possible next characters. To normalize properly, we sum up all of the elements of a particular row and divide that row by its sum. Note that the `dim` argument names the dimension that gets summed away, which can feel backwards from the usual "row, column" way of reading matrices.

so you need to be careful how you sum. you have to sum on dim=1, since we want to sum all of the row elements within each tensor, keeping dim=0 will sum up particular elements in a column, which isn't correct!

In [125]:
# summing over dim=1 adds up the 27 entries of each row; keepdim=True keeps a size-1 axis in its place
counts.sum(dim=1, keepdim=True).shape

torch.Size([32, 1])

In [132]:
# this is not what you want to do: dim=0 sums down each column, i.e. across the 32 examples
counts.sum(dim=0, keepdim=True)[0][0]

tensor(1214.3671)

In [129]:
# because it leads to this: the same value as adding up column 0 over all of the examples
counts[:, 0].sum() 

tensor(1214.3671)

In [84]:
# this is correct: dim=1 sums the 27 entries of each row; [0][0] is the total for the first example
counts.sum(dim=1, keepdim=True)[0][0]

tensor(548391.6875)

In [135]:
# which leads to us learning how to properly normalize:
# exponentiate the logits, then divide every row by its own sum so each row is a
# probability distribution over the 27 characters.
# keepdim (the spelling used in the other cells) keeps the row sums as a (32, 1)
# column so the division broadcasts row by row.
# NOTE: exp() can overflow for large logits; F.softmax(logits, dim=1) is the
# numerically stable way to compute the same probabilities.
counts = logits.exp()
prob = counts / counts.sum(dim=1, keepdim=True)
prob.shape

torch.Size([32, 27])

We can check to make sure that every row should sum to one now!

In [136]:
prob.sum(dim=1) # and it does! every row sums to 1, as a probability distribution must

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

Now we need to understand how to take Y into account (our desired predicted labels)

We have to iterate through prob, and then pluck out the associated probability that corresponds to the particular label Y

In [147]:
# For every example (row), pick the probability the model assigned to the correct
# next character. The row indices are derived from Y rather than hard-coded to 32,
# so the cell keeps working when the dataset size changes.
prob[torch.arange(Y.shape[0]), Y]

tensor([2.4699e-06, 2.0522e-02, 1.2604e-07, 2.1381e-13, 6.5617e-07, 1.6194e-07,
        5.0987e-13, 3.4829e-04, 6.4209e-05, 1.0275e-11, 2.4695e-08, 2.0583e-09,
        1.6098e-04, 5.8448e-11, 4.7563e-08, 3.7173e-13, 8.8967e-01, 9.6893e-06,
        2.8236e-08, 6.4799e-14, 1.1435e-06, 3.1050e-16, 1.0801e-08, 1.5387e-13,
        4.0439e-13, 1.2600e-07, 5.0489e-06, 4.6009e-10, 9.6622e-12, 1.8915e-07,
        5.2537e-13, 3.6565e-14])

Now we turn it into a loss

In [155]:
# Average negative log-likelihood of the correct next characters.
# The row indices are derived from Y rather than hard-coded to 32, so the loss
# stays valid when the dataset size changes.
# (F.cross_entropy(logits, Y) computes the same quantity directly from the logits.)
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(19.1586)

This is the loss which we want to minimize in order to get the network to predict the next character in the sequence