## Takeaway from this homework
* Trigram NN loss is better than Bigram NN loss.
* The current Trigram NN solution is unable to achieve the same loss as the couting solution. It's possible just my bugs...
* Instead of using 1-hot encoding to multiply xs with the NN weight (W), we can just index into rows of W directly
* Use F.cross_entropy instead of manually calculate loss like from ex1 to ex4

# Setup

* Watch https://youtu.be/PaCmpygFfXo?si=6_QhkMrB9g09OEpw

In [164]:
import torch
from torch.utils.data import TensorDataset, random_split
import torch.nn.functional as F

In [165]:
# Mount this colab to the Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [166]:
%cd drive/My Drive/Colab Notebooks

[Errno 2] No such file or directory: 'drive/My Drive/Colab Notebooks'
/content/drive/My Drive/Colab Notebooks


In [167]:
words = open('names.txt', 'r').read().splitlines()

In [168]:
len(words)

32033

In [169]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

## Constants

In [170]:
g = torch.Generator().manual_seed(2147483647)
bigram_input_size = 27
bigram_output_size = 27
trigram_input_size = 27 * 27
trigram_output_size = 27

## Indexing

In [171]:
def stoi(c):
  """Maps 1 character to its index"""
  ord1 = 0 if c == '.' else ord(c) - ord('a') + 1
  if not (0 <= ord1 < 27 and 0 <= ord1 <= 27):
    raise ValueError("Invalid characters")
  return ord1

def itos(v):
  """Maps an index to 1 character"""
  if not (0 <= v < 27):
    raise ValueError("Invalid index number")
  return '.' if v == 0 else chr(ord('a') + v - 1)

# We will need a 2-characters index as we need to encode 2 characters
def sstoi(c1, c2):
  """Maps 2 characters to its index"""
  return stoi(c1) * 27 + stoi(c2)

def itoss(v):
  """Maps an index to 2 characters"""
  if not (0 <= v < 27 * 27):
    raise ValueError("Invalid index number")
  ord1, ord2 = v // 27, v % 27
  return itos(ord1) + itos(ord2)

## Core ML

In [172]:
def split_dataset(xs, ys, train_ratio=0.8, dev_ratio=0.1, test_ratio=0.1):
  """Splits dataset into train, dev, test"""

  # Ensure the ratios sum to 1
  assert abs(train_ratio + dev_ratio + test_ratio - 1.0) < 1e-5, "Ratios must sum to 1"

  # Create a TensorDataset
  dataset = TensorDataset(xs, ys)

  # Calculate the sizes for each split
  total_size = len(dataset)
  train_size = int(train_ratio * total_size)
  dev_size = int(dev_ratio * total_size)
  test_size = total_size - train_size - dev_size  # Ensure sizes sum to total_size

  # Split the dataset
  train_dataset, dev_dataset, test_dataset = random_split(
      dataset, [train_size, dev_size, test_size]
  )

  # Extract xs and ys from each split
  train_xs, train_ys = train_dataset[:]
  dev_xs, dev_ys = dev_dataset[:]
  test_xs, test_ys = test_dataset[:]

  return dict({
    'train': {'xs': train_xs, 'ys': train_ys},
    'dev': {'xs': dev_xs, 'ys': dev_ys},
    'test': {'xs': test_xs, 'ys': test_ys}
  })

def _calculate_logits(xs, W):
  xenc = F.one_hot(xs, num_classes=W.shape[0]).float() # input to the network: one-hot encoding
  logits = xenc @ W # predict log-counts
  return logits

def _calculate_probs(xs, W):
  logits = _calculate_logits(xs, W)
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  return probs

def _loss(ys, probs):
  return -probs[torch.arange(ys.nelement()), ys].log().mean()

def _forward(xs, ys, W, reg_matrix=None):
  """Neural net forward pass"""

  logits = _calculate_logits(xs, W)
  probs = _calculate_probs(logits)
  loss = _loss(ys, probs)
  if reg_matrix is not None:
    loss += reg_matrix
  return loss

  return _forward(xs, ys, W).item()

def loss(xs, ys, W):
  return _forward(xs, ys, W).item()

def train(xs, ys, W, passes, model_name, reg_rate=0.01, verbose = False):
  """Trains model"""

  print(f'Training model {model_name}')

  # Gradient descent
  for k in range(passes):

    # forward pass
    loss = _forward(xs, ys, W, reg_rate*(W**2).mean())
    if verbose:
      print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad

## Bigram library

In [173]:
def get_bigram_dataset(words):
  """Converts a list of words into a mapping from a char to the next one."""
  xs, ys = [], []
  for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
      ix1, ix2 = stoi(ch1), stoi(ch2)
      xs.append(ix1)
      ys.append(ix2)
  xs, ys = torch.tensor(xs), torch.tensor(ys)
  return xs, ys

def sample_bigram(W, sample_size=5):
  """Samples words from the bigram model"""
  for i in range(sample_size):
    ix = 0
    word_builder = ['.']
    while True:
      p = _calculate_probs(torch.tensor([ix]), W)

      # Get the next character
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

      # Append the character to the word
      word_builder.append(itos(ix))
      if ix == 0:
        break
    print(''.join(word_builder))

## Trigram library

In [174]:
def get_trigram_dataset(words):
  """Converts a list of words into a mapping from 2 chars to the next one."""
  xs, ys = [], []
  for w in words:
    chs = ['.'] + list(w) + ['.']
    for i in range(2, len(chs)):
      c0, c1, c2 = chs[i-2], chs[i-1], chs[i]
      xs.append(sstoi(c0, c1))
      ys.append(stoi(c2))
  xs, ys = torch.tensor(xs), torch.tensor(ys)
  return xs, ys

def sample_trigram(W, sample_size=5):
  """Samples words from the trigram model"""
  for i in range(sample_size):
    ii = torch.randint(0, 27, (1,)).item()
    word_builder = list(itoss(ii))
    while True:
      p = _calculate_probs(torch.tensor([ii]), W)

      # Get the next character
      i = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

      # Get previous 2 characters
      ii = sstoi(itoss(ii)[1], itos(i))

      # Append the character to the word
      word_builder.append(itos(i))
      if i == 0:
        break
    print(''.join(word_builder))

# Exercise 1

* Train a trigram language model, i.e. take two characters as an input to predict the 3rd one.

* Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model? **Answer**: Yes.


## Bigram Counting Solution

In [175]:
# Compute all frequencies
N = torch.zeros((bigram_input_size, bigram_output_size), dtype=torch.int32)

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1, ix2 = stoi(ch1), stoi(ch2)
    N[ix1, ix2] += 1

# Smoothens N so no frequencies are zeroes
P = (N+1).float()

# Calculates Probability Distribution for P
P = P / P.sum(1, keepdim = True)

In [176]:
# Evaluate loss

log_likelihood = 0.0
n = 0
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi(ch1)
    ix2 = stoi(ch2)
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

print("Counting solution loss")
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

Counting solution loss
log_likelihood=tensor(-559951.5625)
nll=tensor(559951.5625)
2.4543561935424805


In [177]:
# Try sampling a few words from the distribution
for i in range(5):
  ix = 0
  word_builder = [itos(ix)]
  while True:
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    word_builder.append(itos(ix))
    if ix == 0:
      break
  print(''.join(word_builder))

.junide.
.janasah.
.p.
.cony.
.a.


## Trigram Counting Solution

In [178]:
# Compute all frequencies
N = torch.zeros((trigram_input_size, trigram_output_size), dtype=torch.int32)

for w in words:
  chs = ['.'] + list(w) + ['.']
  for i in range(2, len(chs)):
    c0, c1, c2 = chs[i-2], chs[i-1], chs[i]
    N[sstoi(c0, c1), stoi(c2)] += 1

# Smoothens N so no frequencies are zeroes
P = (N+1).float()

# Calculates Probability Distribution for P
P = P / P.sum(1, keepdim = True)

In [179]:
# Evaluate loss

log_likelihood = 0.0
n = 0
for w in words:
  chs = ['.'] + list(w) + ['.']
  for i in range(2, len(chs)):
    c0, c1, c2 = chs[i-2], chs[i-1], chs[i]
    prob = P[sstoi(c0, c1), stoi(c2)]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

print("Couting solution loss")
print(f'{log_likelihood=}')
nll = -log_likelihood / n
print(f'{nll=}')
print(f'{nll/n}')

Couting solution loss
log_likelihood=tensor(-410414.9688)
nll=tensor(2.0927)
1.0671130439732224e-05


In [180]:
# Try sampling a few words from the distribution
for i in range(5):
  word_builder = ['.', 'a']

  # Starts with <start> + a
  ii = sstoi('.', 'a')
  while True:
    # Gets the probability distribution for the 2 characters
    p = P[ii]

    # Get the next character
    i = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

    # Append the character to the word
    word_builder.append(itos(i))

    # Get previous 2 characters
    ii = sstoi(itoss(ii)[1], itos(i))

    if i == 0:
      break

  print(''.join(word_builder))

.anna.
.adi.
.arltoper.
.amaree.
.aviahnia.


## Bigram Neural Net Solution

In [183]:
xs2, ys2 = get_bigram_dataset(words)
split2 = split_dataset(xs2, ys2, 1, 0, 0)
W2 = torch.randn((bigram_input_size, bigram_output_size), generator=g, requires_grad=True)
train(split2['train']['xs'], split2['train']['ys'], W2, 500, 'bigram')
print("Neural network solution loss")
loss(xs2, ys2, W2)

Training model bigram
Neural network solution loss


2.461170196533203

## Trigram Neural Net Solution

In [184]:
xs3, ys3 = get_trigram_dataset(words)
split3 = split_dataset(xs3, ys3, 1, 0, 0)
W3 = torch.randn((trigram_input_size, trigram_output_size), generator=g, requires_grad=True)
train(split3['train']['xs'], split3['train']['ys'], W3, 500, 'trigram')
print("Neural network solution loss")
loss(xs3, ys3, W3)

Training model trigram
Neural network solution loss


2.1555898189544678

# Excercise 2

* Split up the dataset randomly into 80% train set, 10% dev set, 10% test set.
* Train the bigram and trigram models only on the training set.
* Evaluate them on dev and test splits. What can you see? **Answer:** trigram is better.

## Prepare

In [194]:
xs2, ys2 = get_bigram_dataset(words)
xs3, ys3 = get_trigram_dataset(words)
split2 = split_dataset(xs2, ys2)
split3 = split_dataset(xs3, ys3)

## Train

In [196]:
passes = 500

# train bigram
W2 = torch.randn((bigram_input_size, bigram_output_size), generator=g, requires_grad=True)
train(split2['train']['xs'], split2['train']['ys'], W2, passes, 'bigram')

# train trigram
W3 = torch.randn((trigram_input_size, trigram_output_size), generator=g, requires_grad=True)
train(split3['train']['xs'], split3['train']['ys'], W3, passes, 'trigram')

Training model bigram
Training model trigram


## Evaluate

In [197]:
for dataset in ['train', 'dev', 'test']:
  print(f'Comparing bigram vs trigram loss on {dataset}')
  print("bigram: ", loss(split2[dataset]['xs'], split2[dataset]['ys'], W2))
  print("trigram: ", loss(split3[dataset]['xs'], split3[dataset]['ys'], W3))

Comparing bigram vs trigram loss on train
bigram:  2.4602653980255127
trigram:  2.1518821716308594
Comparing bigram vs trigram loss on dev
bigram:  2.467137098312378
trigram:  2.187124252319336
Comparing bigram vs trigram loss on test
bigram:  2.4631457328796387
trigram:  2.189754009246826


## Sample

In [188]:
print("Bigram samples:")
sample_bigram(W2)
print()
print("Trigram samples:")
sample_trigram(W3)

Bigram samples:
.stojafae.
.ll.
.nama.
.ermryny.
.ry.

Trigram samples:
.ka.
.no.
.ham.
.sa.
.prapfgziana.


# Excercise 3

* Use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss.

* What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve? **Answer:** not much. It's probably because I didn't try as many possibilities.

## Tune regularization

In [195]:
reg_rates = torch.linspace(0.001, 50, steps=100)
losses = []
best_reg_rate = None
best_loss = None

for reg_rate in reg_rates:
  W3 = torch.randn((trigram_input_size, trigram_output_size), generator=g, requires_grad=True)
  train(split3['train']['xs'], split3['train']['ys'], W3, 50, 'trigram', reg_rate)
  current_loss = loss(split3['dev']['xs'], split3['dev']['ys'], W3)
  losses.append(current_loss)
  if best_reg_rate is None or current_loss < best_loss:
    best_reg_rate = reg_rate
    best_loss = current_loss
    print(f'new best_reg_rate={best_reg_rate}, new best_loss={best_loss}')

print(f'Best reg_rate, loss is reg_rate={best_reg_rate}, loss={best_loss}')

Training model trigram
new best_reg_rate=0.0010000000474974513, new best_loss=2.5970091819763184
Training model trigram
new best_reg_rate=0.5060403943061829, new best_loss=2.5839364528656006
Training model trigram
new best_reg_rate=1.0110808610916138, new best_loss=2.576690912246704
Training model trigram
new best_reg_rate=1.5161212682724, new best_loss=2.5721275806427
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Training model trigram
Trainin

## Evaluate on test set

In [122]:
W3 = torch.randn((trigram_input_size, trigram_output_size), generator=g, requires_grad=True)
train(split3['train']['xs'], split3['train']['ys'], W3, 50, 'trigram', best_reg_rate)
print("Test loss ", loss(split3['test']['xs'], split3['test']['ys'], W3))

Training model trigram
Test loss  2.6995866298675537


# Exercise 4

* We saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [181]:
# This will override _calculate_logits in the Core ML section

# Explanations:
# Let say W =
#             [1, 2]
#             [3, 4]
#             [5, 6]
#    and xs =
#             [2, 1]
#  => W[xs] =
#             [5, 6]
#             [3, 4]


def _calculate_logits(xs, W):
  return W[xs]

# Exercise 5

* Look up and use F.cross_entropy instead. You should achieve the same result.
* Can you think of why we'd prefer to use F.cross_entropy instead?

In [182]:
# This will override _forward in the Core ML section

def _forward(xs, ys, W, reg_matrix=None):
  """Neural net forward pass with F.cross_entropy"""

  logits = _calculate_logits(xs, W)
  loss = F.cross_entropy(logits, ys)
  if reg_matrix is not None:
    loss += reg_matrix
  return loss

## Trigram Neural Net Solution

In [190]:
xs3, ys3 = get_trigram_dataset(words)
split3 = split_dataset(xs3, ys3, 1, 0, 0)
W3 = torch.randn((trigram_input_size, trigram_output_size), generator=g, requires_grad=True)
train(split3['train']['xs'], split3['train']['ys'], W3, 500, 'trigram')
print("Neural network solution loss")
loss(xs3, ys3, W3)

Training model trigram
Neural network solution loss


2.1552374362945557