In [232]:
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from IPython.display import Image
%matplotlib inline

In [233]:
# Now we will build the same language model (LM) as we did with the counting-based approach, but using a neural network (NN) instead.

# Previously we counted all the bigrams ("character pairs") in a 2D array and then normalized each row to get
# the probability distribution for any particular character following another particular character.
#
# Now, instead, our neural network will receive a single character as an input and will output the
# probability distribution over all possible next characters.  So we will have 27 outputs, each output standing in for one possible
# next character, and each output being the probability that that particular character comes next after the input.
# 
# We will use the dataset of input characters and output characters (the character that follows each input) as the training
# set, and will train the weights by minimizing the loss function we created earlier (negative log likelihood).


In [234]:
# Read in the file containing the list of names, one name per line.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
  words = names_file.read().splitlines()

In [235]:
# Peek at the first ten names to sanity-check that the file loaded as expected.
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [236]:
# Report how many names the dataset holds.
name_count_message = f"There are {len(words)=} names in this dataset"
print(name_count_message)

There are len(words)=32033 names in this dataset


In [237]:
# Find the shortest and the longest name.
# Measure every name once, then take the extremes of that list of lengths
# (min and max accept any iterable of comparable values).
word_lengths = [len(name) for name in words]

print(f"min word length is: {min(word_lengths)}")
print(f"max word length is: {max(word_lengths)}")

min word length is: 2
max word length is: 15


In [238]:
# Build the vocabulary: concatenate every name into one long string, keep each
# distinct character once (set), and order the survivors alphabetically.
chars = sorted(set(''.join(words)))
print(f"{chars=}")

# Lookup table from character to integer index.  The letters occupy
# 1..len(chars); index 0 is reserved for the special '.' character.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Reverse lookup table from integer index back to character.
itos = dict(zip(stoi.values(), stoi.keys()))

chars=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [239]:
# Create the training set of bigrams (x, y), using only the first name for now.

input_ids = []   # index of the first character of each bigram (network input)
target_ids = []  # index of the character that follows it (target)

for name in words[:1]:
  # Wrap the name in '.' markers so its first and last letters also form bigrams.
  padded = '.' + name + '.'
  for current_ch, next_ch in zip(padded, padded[1:]):
    current_ix, next_ix = stoi[current_ch], stoi[next_ch]
    print(f"{current_ch} {next_ch}  =>  {current_ix}, {next_ix}")
    input_ids.append(current_ix)
    target_ids.append(next_ix)

# Convert the collected index lists into integer tensors.
xs = torch.tensor(input_ids)
ys = torch.tensor(target_ids)
print(f"{xs=}")
print(f"{ys=}")

. e  =>  0, 5
e m  =>  5, 13
m m  =>  13, 13
m a  =>  13, 1
a .  =>  1, 0
xs=tensor([ 0,  5, 13, 13,  1])
ys=tensor([ 5, 13, 13,  1,  0])


In [240]:
# Our input values are single integer indices.  We can't really apply weights to a single input integer index.
# We need a vector of some kind so that we end up with more than one node in the input layer.  The correct way to handle this is
# "one-hot encoding".  If there are 10 possible values, for example, then the one-hot of 5 is [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] and
# the one-hot of 3 is [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].

# (F is torch.nn.functional, imported in the first cell together with the other imports.)

# Take all the xs and create one-hot vectors. Each element can range from 0 to 26 (27 total). In our xs,
# we have 5 integer elements: xs=tensor([ 0,  5, 13, 13,  1]).  For each of the 5, create a one-hot based on its value.
# The first element is 0, so its one-hot looks like [1, 0, 0, 0, 0, 0, 0, ...]; the second element is 5, so its
# one-hot looks like [0, 0, 0, 0, 0, 1, 0, ...].
# .float() converts the encoding to floating point so it can later be multiplied with float weights.
xenc = F.one_hot(xs, num_classes=27).float()
xenc

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [241]:
# One row per bigram example, one column per possible character (num_classes=27).
xenc.shape

torch.Size([5, 27])

In [242]:
# Now let's define our NN.  The inputs are 27-element one-hot vectors, and the final layer must produce 27 outputs.
# The simplest possible network is a single layer of 27 neurons, each receiving all 27 inputs,
# so the weights of that layer form a 27x27 matrix.
# Start from random weights, drawn from a seeded generator so the run is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)
W.shape

torch.Size([27, 27])

In [243]:
# Note that the weights start out as random numbers: each individual weight is a float plucked from a normal
# distribution centered at 0.0, so it can be positive or negative; the larger the absolute value,
# the rarer it is, but there is no hard upper bound.

# When the inputs (in this case one-hots) get multiplied by the random weights, the 27 outputs end up being positive or
# negative numbers whose absolute value can range from 0 to large.  What we want, however, is probabilities.  One way to think
# of it is that the outputs are log(counts).  Let's use the term "logits" for these log(counts).  Being logs of counts, they
# can be positive or negative, with absolute value both below and above one.  To get counts, then, we exponentiate:
# counts = exp(out), i.e. counts = exp(logits).
xenc = F.one_hot(xs, num_classes=27).float()  # input to the network: one-hot encoding

# Forward pass of our 1-layer NN (torch.matmul is the matrix multiplication that the @ operator performs).
logits = torch.matmul(xenc, W)  # predicted log-counts

# Treat the outputs of our NN as log(counts), or logits.  To get the counts, apply exp().  That's equivalent to the 2D array N in which
# we counted the occurrences of each character pair in our simple counting-based example before.
counts = torch.exp(logits)

# Now convert to row probabilities by dividing each row by its row sum.
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals  # probabilities for the next character



In [244]:
# BTW: the last two steps of the forward pass above (exponentiate the logits, then divide each row by its row sum)
# are together called a 'softmax', which is the standard way to generate probabilities.
Image(url="softmax.jpeg", width=300)

In [245]:
# Let's walk through the forward-pass results for each of the inputs from the first word.
nll_terms = []

# i indexes each (input, target) pair.  Iterate over however many pairs xs actually holds
# instead of hard-coding 5, so the walkthrough stays correct if the slice of words used to
# build xs/ys changes.
for i in range(xs.nelement()):
  # i-th bigram:
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('--------')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('the 27 output probabilities from the neural net:', probs[i])
  print('label or target (actual next character):', y)
  # Probability the network assigned to the character that really came next.
  p = probs[i, y]
  print('probability assigned by the net to the the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print('negative log likelihood:', nll.item())
  nll_terms.append(nll)

print('=========')
# Stack the per-example scalar tensors into one 1-D tensor so they can be averaged.
nlls = torch.stack(nll_terms)
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example 1: .e (indexes 0,5)
input to the neural net: 0
the 27 output probabilities from the neural net: tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459])
label or target (actual next character): 5
probability assigned by the net to the the correct character: 0.01228625513613224
log likelihood: -4.399273872375488
negative log likelihood: 4.399273872375488
--------
bigram example 2: em (indexes 5,13)
input to the neural net: 5
the 27 output probabilities from the neural net: tensor([0.0290, 0.0796, 0.0248, 0.0521, 0.1989, 0.0289, 0.0094, 0.0335, 0.0097,
        0.0301, 0.0702, 0.0228, 0.0115, 0.0181, 0.0108, 0.0315, 0.0291, 0.0045,
        0.0916, 0.0215, 0.0486, 0.0300, 0.0501, 0.0027, 0.0118, 0.0022, 0.0472])
label or target (actual next character): 13
probability assigned by the ne

In [246]:
# Create the full dataset: every bigram from every name.
input_ids, target_ids = [], []
for name in words:
  # Encode the '.'-padded name once; each pair of consecutive indices is one bigram.
  token_ids = [stoi[ch] for ch in '.' + name + '.']
  input_ids.extend(token_ids[:-1])   # first character of each bigram
  target_ids.extend(token_ids[1:])   # the character that follows it

xs = torch.tensor(input_ids)
ys = torch.tensor(target_ids)

num = xs.numel()
print('number of examples: ', num)

# Initialize the 'network' with random weights from a seeded generator.
# requires_grad=True asks autograd to track W so that loss.backward() can fill in W.grad.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

number of examples:  228146


In [247]:

# Gradient descent on the average negative log likelihood.

# Loop-invariant work, hoisted out of the loop: neither the one-hot encoded inputs nor the
# row indices change between iterations, so build them once instead of on every step.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
rows = torch.arange(num)

for k in range(100):

  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character

  # Each row of probs belongs to one input example, so for the first 5 inputs we want row 0, row 1, row 2, etc.
  # rows (= torch.arange(num)) is [0, 1, 2, ..., num-1], so probs[rows] would select each row in turn.
  # From each row we want to pluck out the entry at the index of the correct target, i.e. the
  # probability the network currently assigns to the true next character (one out of 27).
  # That selection is probs[rows, ys].  The loss is then the average negative log of those probabilities:
  loss = -1.0 * probs[rows, ys].log().mean()

  if k % 10 == 0:
    print(f"iteration {k:2}: loss={loss.item()}")

  # backward pass
  W.grad = None # reset the gradient so backward() starts from scratch rather than accumulating
  loss.backward()

  # update: step against the gradient with a learning rate of 50.  The in-place update runs
  # under no_grad so that autograd does not record it (preferred over touching W.data).
  with torch.no_grad():
    W -= 50 * W.grad

# Note: this is the loss from the last forward pass, i.e. measured before the final weight update.
print("------------------------------------")
print(f"final loss:        {loss.item()}")

iteration  0: loss=3.758953332901001
iteration 10: loss=2.6890029907226562
iteration 20: loss=2.572789192199707
iteration 30: loss=2.5301806926727295
iteration 40: loss=2.5086867809295654
iteration 50: loss=2.496137857437134
iteration 60: loss=2.488049268722534
iteration 70: loss=2.482424259185791
iteration 80: loss=2.478290319442749
iteration 90: loss=2.475132703781128
------------------------------------
final loss:        2.4728763103485107


In [248]:
# Remember that in the previous counting example we did model smoothing on the counts, to prevent counts of zero that lead
# to probabilities of 0 and to infinities in the later math.  To do the smoothing we just added a number like 1 to each count.  To smooth
# more, we'd add a larger number.  If we add a really large number, we get a uniform distribution across each row: every probability is the
# same and the model has no predictive power at all.  We'd get uniformly random outputs from our inputs.  A uniform distribution is equivalent
# to all the weights being 0: the outputs are then all zero, the exp in the softmax turns them into all 1's, and
# dividing each row by its sum yields a uniform distribution per row.  So a small pull of the weights towards 0 acts like model smoothing.
# We can therefore add a term to the loss function that, when minimized, pushes the elements of W towards zero, but we only want a small amount of this.
# If we sum (or average) the squares of all elements of W and use that as a loss, then minimizing it drives W toward
# zero.  To get only a little of this effect, we multiply that term by 0.01, which gives the new loss below.  We call this regularization.

# Data term: average negative log likelihood of the correct next characters.
data_loss = -probs[torch.arange(num), ys].log().mean()
# Regularization term: mean squared weight, scaled down to 0.01 so it only nudges W towards zero.
reg_loss = 0.01 * (W ** 2).mean()
loss = data_loss + reg_loss

# To use it, replace the older loss in the training loop with this new loss and rerun.


In [249]:
# Finally, sample from the 'neural net' model.
g = torch.Generator().manual_seed(2147483647)

# Sampling only reads W.  Running under no_grad keeps autograd from recording a
# graph for every forward pass; the sampled names are unaffected.
with torch.no_grad():
  for i in range(5):

    out = []
    ix = 0  # index 0 is the '.' token
    while True:

      # ----------
      # BEFORE (counting model): the distribution was a table lookup, p = P[ix]
      # ----------
      # NOW: the distribution comes from a forward pass through the network.
      xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

      # Draw the next character index from that distribution.
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      # Drawing index 0 ('.') ends the current name.
      if ix == 0:
        break
    print(''.join(out))

mor.
axwaninaymoryles.
kondmaisah.
anchshizarie.
odaren.
