# Character Level Language Model Using NN

In [None]:
# Import Libraries
import torch

In [None]:
# Get the dataset: fetch names.txt from the karpathy/makemore GitHub repository.
# The leading "!" is notebook (IPython) syntax that hands the line to the shell.
# NOTE(review): the next cell reads /content/names.txt, so this presumably runs
# with /content as the working directory (e.g. Colab) - confirm.
!wget "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"

In [None]:
# Read the training data into `words`, one list entry per line of the file.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open for the rest of the session.
with open('/content/names.txt', 'r', encoding='utf-8') as f:
  words = f.read().splitlines()

In [None]:
# Build the character vocabulary for the bigram training set.
# Every distinct character in the data gets an index starting at 1 (in
# sorted order); '.' is the start/end-of-word marker and gets index 0.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
# Inspect the two lookup tables.
print(stoi, itos, sep='\n')

# Both are plain dictionaries (hash maps): stoi maps each character (string)
# to its integer index, and itos maps each index back to its character.

In [None]:
xs, ys = [], []

for w in words[:1]:
  chs = ['.', *w, '.']  # wrap the word in the start/end marker
  for ch1, ch2 in zip(chs, chs[1:]):  # every pair of consecutive characters
    xs.append(stoi[ch1])
    ys.append(stoi[ch2])

xs, ys = torch.tensor(xs), torch.tensor(ys)

# Only the first word is used here, to see what the training inputs and the
# corresponding target labels look like.
# Inside the loop, stoi[ch1] is the index of the current character and
# stoi[ch2] is the index of the character that follows it.
# xs is the vector [ 0,  5, 13, 13,  1], which is equivalent to [., e, m, m, a]
# ys is the vector [ 5, 13, 13,  1,  0], which is equivalent to [e, m, m, a, .]
# Read position by position as (input -> target), this means:
  # input is 0(.)  then target is 5(e)
  # input is 5(e)  then target is 13(m)
  # input is 13(m) then target is 13(m)
  # input is 13(m) then target is 1(a)
  # input is 1(a)  then target is 0(.)


In [None]:
# Show the input indices and, on the next line, the target indices.
print(xs, ys, sep='\n')

In [None]:
import torch.nn.functional as F
# One-hot encode the inputs of the first word, i.e. the characters ". e m m a".
# How can a character be represented as a vector?
# Use a 27-dimensional vector with one slot per vocabulary entry (the '.'
# marker plus the letters); the slot at the character's index is 1 and every
# other slot is 0.
# '.' (start of word, index 0) - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# 'e' (index 5)                - [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# 'm' (index 13)               - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# 'm' (index 13)               - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# 'a' (index 1)                - [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
xenc = F.one_hot(xs, num_classes=27)

In [None]:
# Display the one-hot vector of the first input; for the first word this is
# the '.' start marker (index 0), so only position 0 should be set.
xenc[0]

In [None]:
import matplotlib.pyplot as plt
# Render the one-hot matrix as an image: one row per input character,
# one column per class.
plt.imshow(xenc)
# The yellow cells are where the value is 1; each row is a vector of size 27.

In [None]:
# The network works with floating-point numbers (its weights and outputs are
# not guaranteed to be integers), so cast the one-hot encoding to float
# before feeding it in.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc.dtype

In [None]:
# Sample the weight matrix.
W = torch.randn((27, 27))
W
# torch.randn draws from the standard normal distribution, so most values are
# close to zero, nearly all of them lie between -3 and 3, and only very few
# fall above 3 or below -3.

In [None]:
# Expected shape of the matrix product below:
# (5, 27) @ (27, 27) -> (5, 27)
torch.matmul(xenc, W)
# This multiplies the one-hot encoded vectors by the sampled weights.
# The 5 encoded inputs are fed to 27 neurons; each neuron is one column of W
# and therefore has 27 input weights. The product holds the activations of
# all 27 neurons for all 5 inputs, evaluated in parallel.

In [None]:
# Look at a single activation: row 3 is the input at index 3 (the fourth
# input, counting from zero) and column 13 is the neuron at index 13.
torch.matmul(xenc, W)[3, 13]

In [None]:
# The task now is to turn the outputs of these neurons into something that
# represents probabilities.
# Probabilities are always positive and sum to 1.
# The raw outputs are interpreted as log-counts; to make them positive they
# are passed through the exponential function.
# exp maps a negative number to a value between 0 and 1, and a positive
# number to a value above 1 (growing without bound).

torch.exp(xenc @ W)

# The line above applies exp to every element of the xenc @ W tensor.

In [None]:
# The raw outputs xenc @ W are called logits.
logits = torch.matmul(xenc, W)
# Exponentiating gives positive "counts"; dividing each row by its own sum
# turns every row into a probability distribution.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [None]:
# Summary of the code so far.
# Randomly initialise the weights of 27 neurons, each of which receives
# 27 inputs. A seeded generator makes the draw reproducible.
g = torch.Generator()
g.manual_seed(1)
W = torch.randn((27, 27), generator=g)

In [None]:
import torch.nn.functional as F
# Input to the network: one-hot encoding of the character indices.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
# Predicted log-counts.
logits = torch.matmul(xenc, W)
# Counts, the equivalent of N in the bigram probabilities method.
counts = torch.exp(logits)
# Row-normalise the counts into probabilities.
probs = counts / counts.sum(dim=1, keepdim=True)
# The exp followed by the row normalisation is exactly the softmax function:
# it takes the network outputs and returns probabilities that sum to 1.

In [None]:
# Look up which character is stored at index 15 of the vocabulary.
itos[15]

In [None]:
# Walk through the bigrams of the first word (".emma") one example at a time
# and report the negative log likelihood that the untrained network assigns
# to each correct next character.
num_examples = 5

# The weights are identical for every example, so seed the generator and
# sample them once up front rather than on every loop iteration.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27,27,generator=g)

nlls = torch.zeros(num_examples)
for i in range(num_examples):
  ix_in = xs[i].item()  # deliberately not called `input`: that would shadow the builtin
  target = ys[i].item()
  print(f'Bigram example {i+1} : {itos[ix_in],itos[target]}, index{ix_in,target}')
  xenc = F.one_hot(xs[i],num_classes=27).float()  # a single one-hot vector of length 27
  print(f'vector: {xenc}')
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts equivalent to N in our Bigram Probabilities Method
  # xenc is one-dimensional here, so the normalisation runs over dim 0
  probs = counts/counts.sum(0,keepdims=True) # probabilities
  print(f'counts:{counts}')
  print(f'probs:{probs}')
  print('Label of target character', itos[target])
  p = probs[target]  # probability assigned to the correct next character
  print('Neural Net Probability for the label',p.item())
  logp = torch.log(p)
  print('log likelihood', logp.item())
  nll = -logp
  print('negative log likelihood', nll.item() )
  nlls[i] = nll

print("****************************************")
print('The average negative log likelihood was', nlls.mean().item())




In [None]:
# Optimisation ----------------------------------

In [None]:
import torch.nn.functional as F

xs  # input
ys  # target

# One-hot encode the inputs as floats so they can be multiplied by W.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)

# Seeded weight initialisation; requires_grad=True makes autograd track W so
# that a loss can be differentiated with respect to it.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [None]:
# Forward pass: logits -> counts -> row-normalised probabilities.
logits = xenc @ W
counts = logits.exp()
probs = counts/counts.sum(1,keepdims= True)

# Average negative log likelihood of the correct next characters.
# The row index is derived from the number of targets instead of a
# hard-coded 5, so the cell keeps working for any number of examples.
loss = -probs[torch.arange(len(ys)),ys].log().mean()
print(loss.item())

In [None]:
# Backward pass
W.grad = None # discard any previously stored gradient (set to None, not to a zero tensor)
# Backpropagate from the loss; afterwards W.grad holds d(loss)/dW.
loss.backward()


In [None]:
# One gradient-descent step: move W against its gradient with step size 0.1.
W.data -= 0.1 * W.grad

In [None]:
import torch.nn.functional as F

# Build the (input, target) index pairs again, this time from every word.
xs, ys = [], []

for w in words:
  chs = ['.', *w, '.']  # wrap the word in the start/end marker
  for ch1, ch2 in zip(chs, chs[1:]):  # every pair of consecutive characters
    xs.append(stoi[ch1])
    ys.append(stoi[ch2])

xs, ys = torch.tensor(xs), torch.tensor(ys)



In [None]:
# Train the single-layer model with gradient descent on the current xs / ys.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27,27,generator = g, requires_grad= True)

xenc = F.one_hot(xs,num_classes= 27).float()

# Row indices used to pick out the probability of each correct target.
# They do not change between iterations, so build them once outside the loop.
rows = torch.arange(xs.shape[0])

for i in range(100):
  # Forward pass: logits -> counts -> row-normalised probabilities
  logits = xenc @ W
  counts = logits.exp()
  probs = counts/counts.sum(1,keepdims= True)

  # Average negative log likelihood plus a small penalty on the squared weights
  loss = -probs[rows,ys].log().mean() + 0.01*(W**2).mean()
  print(f"Loss on iteration number {i}",loss.item())
  # Backward pass
  W.grad = None # discard the gradient from the previous iteration
  loss.backward()
  # Gradient-descent step with step size 1; no_grad keeps the in-place
  # parameter update out of the autograd graph.
  with torch.no_grad():
    W -= W.grad


In [None]:
# Run 100 more gradient-descent iterations. W is not re-initialised in this
# cell, so training continues from its current values; the squared-weight
# penalty is weighted by 0.1 here.

# Row indices used to pick out the probability of each correct target.
# They do not change between iterations, so build them once outside the loop.
rows = torch.arange(xs.shape[0])

for i in range(100):
  # Forward pass: logits -> counts -> row-normalised probabilities
  logits = xenc @ W
  counts = logits.exp()
  probs = counts/counts.sum(1,keepdims= True)

  # Average negative log likelihood plus the penalty on the squared weights
  loss = -probs[rows,ys].log().mean() + 0.1*(W**2).mean()
  print(f"Loss on iteration number {i}",loss.item())
  # Backward pass
  W.grad = None # discard the gradient from the previous iteration
  loss.backward()
  # Gradient-descent step with step size 1; no_grad keeps the in-place
  # parameter update out of the autograd graph.
  with torch.no_grad():
    W -= W.grad

In [None]:
# Finally, sample five words from the neural-net model.
g = torch.Generator().manual_seed(2147483647)

# Sampling needs no gradients, so do not let autograd record these operations.
with torch.no_grad():
  for _ in range(5):

    out = []
    ix = 0  # every word starts from the '.' marker (index 0)

    while True:
      # A separate name is used for the one-hot input so that the training
      # set's `xenc` is not overwritten and the training cell can be re-run.
      x_onehot = F.one_hot(torch.tensor([ix]),num_classes= 27).float()
      logits = x_onehot @ W
      counts = logits.exp()
      p = counts/counts.sum(1,keepdims= True)  # next-character distribution
      ix = torch.multinomial(p,num_samples=1,replacement=True,generator=g).item()
      out.append(itos[ix])
      if ix == 0:  # sampled the end marker: the word is complete
        break
    print(''.join(out))