### Makemore Part 2

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
#read in all the words
words = open('names.txt', 'r').read().splitlines()

In [3]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [4]:
#Building the dataset - very similar to the trigram construction but dynamic

block_size = 3 #alternative for context length

X, Y = [], []

for w in words[:5]:

    print(w, '--- word of interest')
    context = [0] * block_size #How many characters to consider from the left to the right
    for ch in w + '.': #adding end word
        
        ix = stoi[ch]

        X.append(context)
        Y.append(ix)

        print(''.join(itos[i] for i in context), '--->', itos[ix])

        context = context[1:] + [ix] #context is redefined as a new list and we move the window to the right


X = torch.tensor(X)
Y = torch.tensor(Y)

emma --- word of interest
... ---> e
..e ---> m
.em ---> m
emm ---> .
mm. ---> .
olivia --- word of interest
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> .
vi. ---> .
ava --- word of interest
... ---> .
... ---> v
..v ---> .
.v. ---> .
isabella --- word of interest
... ---> i
..i ---> s
.is ---> .
is. ---> b
s.b ---> e
.be ---> l
bel ---> l
ell ---> .
ll. ---> .
sophia --- word of interest
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> .
hi. ---> .


In [5]:
##look up table. 
##The column metric is showing how many "features" are contained within each vector
C = torch.randn((27,2))

#Can execute look ups with single integers (for one character retrieval at a time)
embedding_of_char_5 = C[5]
print(f"Embedding of the char 5: {embedding_of_char_5}")
##Or
#Can execute look ups with lists of integers and/or tensors to retrieve multiple embeddings to begin with
embedding_of_chars_5_6_7 = C[torch.tensor([5,6,7])]

print(f"Embedding of the chars 5,6, and 7: {embedding_of_chars_5_6_7}")

Embedding of the char 5: tensor([-0.4521, -2.0867])
Embedding of the chars 5,6, and 7: tensor([[-0.4521, -2.0867],
        [-0.2620,  0.5576],
        [-1.4036,  1.1917]])


In [6]:
""" Using the method above we can get C[X] which represents
    the 2 dimensional embeddings for each character stored in the Xn (32 for first 5 words) samples
    of char sequences
"""

' Using the method above we can get C[X] which represents\n    the 2 dimensional embeddings for each character stored in the Xn (32 for first 5 words) samples\n    of char sequences\n'

In [7]:
embedding = C[X]
print(f"embedding shape {embedding.shape}")

embedding shape torch.Size([32, 3, 2])


In [8]:
print(f"10th sample is chars: {X[10]}")

10th sample is chars: tensor([ 8, 21,  8])


In [9]:
print(f"Embedding for the 10th sample should be dims(3 x2): {embedding[10]}")
print("This is showing the 2d embding representation of the characters in tensor above")

Embedding for the 10th sample should be dims(3 x2): tensor([[ 1.1741,  0.8073],
        [-2.2416, -0.3208],
        [ 1.1741,  0.8073]])
This is showing the 2d embding representation of the characters in tensor above


In [10]:
W1 = torch.randn((6, 100)) #100 neurons [weights to be tuned]
b1 = torch.randn(100)

In [11]:
#embedding[0] -> first example /trigram subset "...", "..e", ".em"
#embedding[0, 0,:] -> first 2 dimensional vector representation of the character of 
#embedding[0,0,0] -> first vector value of 2D vector rep of the first character in the first example subset

In [12]:
embedding[:,0,:].shape #the first char of all Xn samples and it's full vector representation

torch.Size([32, 2])

In [13]:
#Now we concatenate embeddings of the different subsets to be one tensor for multiplication against the weights
#torch.cat([embedding[:,0,:], embedding[:,1,:], embedding[:,2,:]],1).shape

#Can also use
#torch.cat(torch.unbind(embedding,1),1).shape 
"""
#unbind removes a tensor dimension and returns a tuple of all slices along the dimension. 
# So here we are removing the 1th dimensionand returning all the tuples that were within that. 
# So here that would be collecting all character embedding groups per char into separate tensors for easier concatenation
"""
#Can also use view() which forces pytorch to change the view of the embedding to match input dimensions
#embedding.view(32,6)

#embedding.view(32,6) == torch.cat(torch.unbind(embedding,1),1), "Double check your array manipulation" #proving that the embeddng view and cat + unbind methods



'\n#unbind removes a tensor dimension and returns a tuple of all slices along the dimension. \n# So here we are removing the 1th dimensionand returning all the tuples that were within that. \n# So here that would be collecting all character embedding groups per char into separate tensors for easier concatenation\n'

#### Hidden Layer creation

In [14]:
#Creating the hidden layer
h = embedding.view(embedding.shape[0],6) @ W1 + b1 

tanh = torch.tanh(h) #activation function

In [17]:
tanh.shape

torch.Size([32, 100])

#### Now create the output layer

In [18]:
W2 = torch.randn([100,27]) #27 possible characters coming out
b2 = torch.randn(27)

In [19]:
#Creating logits in final layer
logits = tanh @ W2 + b2
print(logits.shape)



torch.Size([32, 27])


#### Now create the probabilistic output using softmax

In [20]:
#exponenize the logits to make everything positive
counts = logits.exp()

#calculate probabilities
probs = counts/counts.sum(1, keepdims = True)

In [21]:
#2nd method
probs_torch_softmax = torch.softmax(logits, 1)

In [22]:
#We want to index into prob and pluck out the highest predicted next character for all Xn samples
#For all Xn samples, we gather the index listed in Y so for Xn1, we gather the first classification expected character
loss = -probs[torch.arange(32), Y].log().mean()

In [25]:
loss_torch_softmax = -probs_torch_softmax[torch.arange(X.shape[0]), Y].log().mean()

### reproducicle time!

In [None]:
##LEts make it reproducible
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator=g)
W1 = torch.randn((6,100), generator = g)
b1 = torch.randn(100, generator = g)
W2 = torch.randn((100,27), generator = g)
b2 = torch.randn(27, generator = g)

parameters = [C, W1, b1, W2, b2]

embeddings = C[X]
tanh = torch.tanh(embedding.view(embedding.shape[0],6) @ W1 + b1)#can also use embedding.view(-1,6)
logits = tanh @ W2 + b2
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
loss_deprec = -probs[torch.arange(X.shape[0]), Y].log().mean()
print(f"current loss: {loss_deprec}")

current loss: 20.056188583374023


In [34]:
print(f"Cross entropy loss {F.cross_entropy(logits, Y)}")

Cross entropy loss 20.056188583374023


In [None]:
## training loop 
learning_rate = -0.1


for _ in range(0,10)
    embeddings = C[X]
    tanh = torch.tanh(embedding.view(embedding.shape[0],6) @ W1 + b1)#can also use embedding.view(-1,6)
    logits = tanh @ W2 + b2
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss_deprec = -probs[torch.arange(X.shape[0]), Y].log().mean()
    print(f"current loss: {loss_deprec}")

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data += learning_rate * p.grad