In [26]:
### start of solution 

from torch import nn
import torch
from matplotlib import pyplot as plt
import numpy as np
import torch.nn.functional as F
import time as time
from torch import nn
import random

# device = torch.device("cuda:0")
device = torch.device("mps") # training on mac comp, comment if you are too with latest nightly pytorch library
words = open('names.txt', 'r').read().splitlines()

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}


def build_dataset(words, device, block_size):
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X, device = device)
  Y = torch.tensor(Y, device = device)
  return X, Y

def eval_loss(context_size, embedding_size, hidden_size, initial_learning_rate, learning_rate_decay_rate,
              batch_size, num_steps):
    print(context_size)
    print(f"evaluating loss for context size {context_size}, embedding size {embedding_size}, hidden_size {hidden_size}, batch_size {batch_size}")
    random.seed(42)
    random.shuffle(words)
    n1 = int(0.8*len(words))
    n2 = int(0.9*len(words))

    xs_train, ys_train = build_dataset(words[:n1], device, context_size)
    xs_dev, ys_dev = build_dataset(words[n1:n2], device, context_size)
    xs_eval, ys_eval = build_dataset(words[n2:], device, context_size)          

    
    W_emb, W1, b1, W2, b2 = get_weights(xs_train, ys_train, context_size, embedding_size, hidden_size, 
                initial_learning_rate, learning_rate_decay_rate, batch_size, xs_train, ys_train, xs_dev, ys_dev, xs_eval, ys_eval, num_steps)

    print(f"train loss is {get_loss(xs_train, ys_train, W_emb, W1, b1, W2, b2, context_size, embedding_size)}")
    print(f"dev loss is {get_loss(xs_dev, ys_dev, W_emb, W1, b1, W2, b2, context_size, embedding_size)}")
#     print('number of examples: ', num)
#     print(f"size of train {len(xs_train)} size of dev {len(xs_dev)} size of test {len(xs_eval)}")
    
    cel = nn.CrossEntropyLoss()
    curr = time.time()

def get_loss(x, y, W_emb, W1, b1, W2, b2, context_size, embedding_size):
    embs = W_emb[x]
#     embs = embs.view(-1, context_size * embedding_size)
#     hidden = embs @ W1 + b1
#     hidden_tan = (hidden)
    embs = embs.view(-1, context_size * embedding_size)
    h = torch.tanh(embs @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    return loss


def get_weights(xs, ys, context_size, embedding_size, hidden_size, 
                initial_learning_rate, learning_rate_decay_rate, batch_size, xs_train, ys_train, xs_dev, ys_dev, xs_eval, ys_eval, num_steps):
    
    # initialize the 'network'
    
    g = torch.Generator().manual_seed(2147483647)
    W_emb = torch.normal(0, 1, (27, embedding_size), generator=g, requires_grad=True, device = device)
    W1 = torch.normal(0, 0.01, (embedding_size * context_size, hidden_size), generator=g, requires_grad=True, device = device)
    b1 = torch.zeros(hidden_size, requires_grad=True, device = device)
    W2 = torch.normal(0, 0.01, (hidden_size, 27), generator=g, requires_grad=True, device = device) 
    b2 = torch.zeros(27, requires_grad=True, device = device) 
    
    params = [W_emb, W1, b1, W2, b2]
    num_params = sum([p.nelement() for p in params])
    print(f"num params is {num_params}")
    for k in range(0,num_steps):
        ixs = torch.randint(0, xs.shape[0], (batch_size,), generator=g, device=device)
        xs_batch = xs[ixs]
        ys_batch = ys[ixs]
        loss = get_loss(xs_batch, ys_batch, W_emb, W1, b1, W2, b2, context_size, embedding_size)
        # backward pass
        for p in params:
            p.grad = None
        loss.backward()
        # update
        # learning_rate =  initial_learning_rate * learning_rate_decay_rate ** k #
        learning_rate =  0.1 if k < 10000 else 0.01
        W_emb.data += -learning_rate * W_emb.grad
        W1.data += -learning_rate * W1.grad
        b1.data += -learning_rate * b1.grad
        W2.data += -learning_rate * W2.grad
        b2.data += -learning_rate * b2.grad
        if k % 10000 == 0:
            print(f" train loss after {k} steps is {get_loss(xs_train, ys_train, W_emb, W1, b1, W2, b2, context_size, embedding_size).item()}") 
            print(f" dev loss after {k} steps is {get_loss(xs_dev, ys_dev, W_emb, W1, b1, W2, b2, context_size, embedding_size).item()}") 
    print(f" train loss after {k} steps is {get_loss(xs_train, ys_train, W_emb, W1, b1, W2, b2, context_size, embedding_size).item()}") 
    print(f" dev loss after {k} steps is {get_loss(xs_dev, ys_dev, W_emb, W1, b1, W2, b2, context_size, embedding_size).item()}") 
    print(f" dev loss after {k} steps is {get_loss(xs_eval, ys_eval, W_emb, W1, b1, W2, b2, context_size, embedding_size).item()}") 
    

    for _ in range(20):
      out = []
      context = [0] * context_size # initialize with all ...
      while True:
        emb = W_emb[torch.tensor([context], device=device)] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
          break
      print(''.join(itos[i] for i in out))

    return W_emb, W1, b1, W2, b2

eval_loss(9, 160, 200, 0.01, 0.999, 100, 30000)

9
evaluating loss for context size 9, embedding size 160, hidden_size 200, batch_size 100
num params is 297947
 train loss after 0 steps is 3.205723524093628
 dev loss after 0 steps is 3.2051217555999756
 train loss after 10000 steps is 1.9961012601852417
 dev loss after 10000 steps is 2.078627586364746
 train loss after 20000 steps is 1.9111264944076538
 dev loss after 20000 steps is 2.013911485671997
 train loss after 29999 steps is 1.8949158191680908
 dev loss after 29999 steps is 2.0109057426452637
 dev loss after 29999 steps is 2.0106992721557617
kedonnah.
alyshney.
keysha.
szona.
kathlyne.
azalynn.
amila.
jaydence.
orgeniz.
pehna.
pennee.
ricas.
bingay.
dawno.
simphany.
lillaamy.
drahdi.
ivitten.
keli.
skeniah.
train loss is 1.8949158191680908
dev loss is 2.0109057426452637
