### Reimplementation (in Torch) of a 100-Line char-RNN Written by Karpathy (in Numpy)
Code: https://gist.github.com/karpathy/d4dee566867f8291f086


In [1]:
import torch
from torch import nn

In [2]:
# Read data
file_path = "data/shakespear.txt"
with open(file_path) as f:
    data = f.read()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f"Data has {data_size} characters, {vocab_size} unique.")

char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

Data has 4573337 characters, 67 unique.


In [3]:
inputs = [char_to_ix[ch] for ch in data[2021:2029]]

In [4]:
inputs

[16, 56, 15, 35, 16, 12, 16, 26]

In [5]:
# hyperparams
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [6]:
# Define Model Params
scaling_factor = 1e-2
W_xh = torch.randn(hidden_size, vocab_size, requires_grad=True, dtype=torch.float32) # input to hidden
W_hh = torch.randn(hidden_size, hidden_size, requires_grad=True, dtype=torch.float32) # hidden to hidden
W_hy = torch.randn(vocab_size, hidden_size, requires_grad=True, dtype=torch.float32) # hidden to output

for param in [W_xh, W_hh, W_hy]:
    param = param * scaling_factor
    
# biases
bh = torch.randn(hidden_size, 1, requires_grad=True, dtype=torch.float32)
by = torch.randn(vocab_size, 1, requires_grad=True, dtype=torch.float32)

In [11]:
def loss_funct(inputs, targets, hprev):
    """_summary_

    Args:
        inputs (_type_): Inputs are list of integers
        targets (_type_): Targets are list of integers
        hprev (_type_): Prev Hidden State
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = hprev

    loss = torch.tensor(0, dtype=torch.float32)

    softmax = nn.Softmax(dim=0)
    cross_entropy_loss = nn.CrossEntropyLoss()

    # forward pass
    for t in range(len(inputs)):
        # One-Hot encode input
        xs[t] = torch.zeros(vocab_size, 1)
        xs[t, inputs[t]] = 1

        # Calc hidden state & output
        hs[t] = torch.tanh((W_xh @ xs[t]) + (W_hh @ hs[t-1]) + bh)
        ys[t] = (W_hy @ hs[t]) + by
        
        # get softmax for probabilities
        # ps[t] = softmax(ys[t])
        loss += cross_entropy_loss(ys[t].flatten(), torch.tensor(targets[t]))

    # take average
    loss = loss / len(inputs)

    loss.backward(retain_graph=True)
    # for param in [W_xh, W_hh, W_hy, bh, by]:
    #     param.grad = torch.clamp(param.grad, -150, 150)
    
    return loss

In [12]:
# n, p = 0, 0
# # memory vars for Adagrad
# mWxh, mWhh, mWhy = torch.zeros_like(W_xh), torch.zeros_like(W_hh), torch.zeros_like(W_hy)
# mBh, mBy = torch.zeros_like(bh), torch.zeros_like(by)

# smooth_loss = -torch.log(torch.tensor(1.0/vocab_size)) * seq_length

In [13]:
import random

In [16]:
hprev = torch.zeros((hidden_size, 1))
for iter in range(1000):
    
    start_idx = random.randrange(0, 10000)
    # start_idx = 0
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # if p + seq_length + 1 >= len(data) or n == 0: 
    #     hprev = torch.zeros((hidden_size, 1)) # reset RNN memory
    #     p = 0 # go from start of data
    
    inputs = [char_to_ix[ch] for ch in data[start_idx:start_idx+seq_length]]
    targets = [char_to_ix[ch] for ch in data[start_idx+1: start_idx+seq_length+1]]

    # sample from the model now and then
    # if n % 100 == 0:
    #     sample_ix = sample(hprev, inputs[0], 200)
    #     txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    #     print '----\n %s \n----' % (txt, )

    # Set the grad to None
    for param in [W_xh, W_hh, W_hy, bh, by]:
        param.grad = None

    # break
    # forward seq_length characters through the net and fetch gradient
    loss = loss_funct(inputs, targets, hprev)
    print(f'iter {iter}, loss: {loss}') # print progress
    
    # for param, mem in zip([W_xh, W_hh, W_hy, bh, by], [mWxh, mWhh, mWhy, mBh, mBy]):
    #     mem += param.grad * param.grad
    #     param.data += -learning_rate * param.grad / torch.sqrt(mem * 1e-8)

    for param in [W_xh, W_hh, W_hy, bh, by]:
        param.data -= learning_rate * param.grad

iter 0, loss: 5.836549282073975
iter 1, loss: 6.386634349822998
iter 2, loss: 6.419184684753418
iter 3, loss: 8.275949478149414
iter 4, loss: 12.663060188293457
iter 5, loss: 9.86199951171875
iter 6, loss: 5.597454071044922
iter 7, loss: 4.785650253295898
iter 8, loss: 3.7904579639434814
iter 9, loss: 8.963748931884766
iter 10, loss: 5.0990376472473145
iter 11, loss: 3.831874132156372
iter 12, loss: 6.915081977844238
iter 13, loss: 4.876081466674805
iter 14, loss: 4.554648399353027
iter 15, loss: 12.116069793701172
iter 16, loss: 3.2244772911071777
iter 17, loss: 7.1199541091918945
iter 18, loss: 6.548785209655762
iter 19, loss: 8.143160820007324
iter 20, loss: 3.185851812362671
iter 21, loss: 3.38752818107605
iter 22, loss: 7.696544170379639
iter 23, loss: 3.9239418506622314
iter 24, loss: 3.7765305042266846
iter 25, loss: 3.8037850856781006
iter 26, loss: 2.9558029174804688
iter 27, loss: 7.206561088562012
iter 28, loss: 11.795632362365723
iter 29, loss: 12.33906364440918
iter 30, lo