# Homework 2 - Recurrent Neural Networks

In this part of the homework we are going to work with Recurrent Neural Networks, in particular GRU. One of the greatest things that Recurrent Neural Networks can do when working with sequences is retaining data from several timesteps in the past. We are going to explore that property by constructing an 'echo' Recurrent Neural Network.

The goal here is to make a model that given a sequence of letters or digits will output that same sequence, but with a certain delay. Let's say the input is a string 'abacaba', we want the model to not output anything for 3 steps (delay length), and then output the original string step by step, except the last 3 characters. So, target output is then 'XXXabac', where 'X' is empty output.

This is similar to [this notebook](https://github.com/Atcold/pytorch-Deep-Learning/blob/master/09-echo_data.ipynb) (which you should refer to when doing this assignment), except we're working not with a binary string, but with a sequence of integers between 0 and some N. In our case N is 26, which is the number of letters in the alphabet.

## Dataset

Let's implement the dataset. In our case, the data is basically infinite, as we can always generate more examples on the fly, so don't need to load anything from disk.

In [103]:
import numpy as np
import random
class EchoData():

    def __init__(self, series_length=40000, batch_size=32,
                 echo_step=3, truncated_length=10, seed=None, num_classes=2):
        self.num_classes=num_classes
        self.series_length = series_length
        self.truncated_length = truncated_length
        self.n_batches = series_length//truncated_length

        self.echo_step = echo_step
        self.batch_size = batch_size
        if seed is not None:
            np.random.seed(seed)
        self.x_batch = None
        self.y_batch = None
        self.x_chunks = []
        self.y_chunks = []
        self.generate_new_series()
        self.prepare_batches()

    def __getitem__(self, index):
        if index == 0:
            self.generate_new_series()
            self.prepare_batches()
        x = self.x_chunks[index].astype(np.float32)
        y = self.y_chunks[index].astype(np.float32)
        return (x, y)

    def __len__(self):
        return self.n_batches

    def generate_new_series(self):
        x = np.random.choice(
            self.num_classes,
            size=(self.batch_size, self.series_length))
        y = np.roll(x, self.echo_step, axis=1)
        y[:, 0:self.echo_step] = 0
        self.x_batch = x
        self.y_batch = y

    def prepare_batches(self):
        x = np.expand_dims(self.x_batch, axis=-1)
        y = np.expand_dims(self.y_batch, axis=-1)
        self.x_chunks = np.split(x, self.n_batches, axis=1)
        self.y_chunks = np.split(y, self.n_batches, axis=1)
        def turn_into_onehot2(x):
            x_new = np.zeros((x.shape[0], x.shape[1], self.num_classes))
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    x_new[i,j, x[i,j,0]] = 1
            return x_new
        for i in range(len(self.x_chunks)):
            self.x_chunks[i] = turn_into_onehot2(self.x_chunks[i])
            self.y_chunks[i] = turn_into_onehot2(self.y_chunks[i])

import torch
import torch.nn as nn
# import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

torch.manual_seed(1);

batch_size = 5
BATCH_SIZE = batch_size
echo_step = 4
DELAY = echo_step
series_length = 20_000
BPTT_T = 20
total_values_in_one_chunck = batch_size * BPTT_T

NUM_CLASSES = 27
train_data = EchoData(
    echo_step=echo_step,
    batch_size=batch_size,
    series_length=series_length,
    truncated_length=BPTT_T,
    num_classes=NUM_CLASSES
    #total_values_in_one_chunck = batch_size * BPTT_T,
)

train_size = len(train_data)

test_data = EchoData(
    echo_step=echo_step,
    batch_size=batch_size,
    series_length=series_length,
    truncated_length=BPTT_T,
    num_classes=NUM_CLASSES
)
test_size = len(test_data)
train_data.generate_new_series()

def turn_into_onehot(x, num_classes):
    x_new = torch.zeros((x.shape[0], num_classes,))
    for i in range(x.shape[0]):
        x_new[i, x[i]] = 1
    return x_new

class EchoDataset(torch.utils.data.IterableDataset):

  def __init__(self, delay=4, seq_length=15, size=1000):
    self.delay = delay
    self.seq_length = seq_length
    self.size = size
  
  def __len__(self):
    return self.size

  def __iter__(self):
    """ Iterable dataset doesn't have to implement __getitem__.
        Instead, we only need to implement __iter__ to return
        an iterator (or generator).
    """
    for _ in range(self.size):
      seq = torch.tensor([random.choice(range(1, NUM_CLASSES)) for i in range(self.seq_length)], dtype=torch.int64)
      result = torch.cat((torch.zeros(self.delay), seq[:self.seq_length - self.delay])).type(torch.int64)
      seq = turn_into_onehot(seq, NUM_CLASSES)
      result = turn_into_onehot(result, NUM_CLASSES)
      yield seq, result
    
      result = torch.roll(seq, shifts=self.delay, dims=0)#
      result[:self.delay,:] = 0
      result[:self.delay,0] = 1
      yield seq, result

ds = EchoDataset(delay=DELAY, size=series_length)

echo_dataloader = torch.utils.data.DataLoader(ds, batch_size=BATCH_SIZE)

DX, DY = next(iter(echo_dataloader))
print(DX[0], DY[0])
print(test_data[1][0][0])
#print("========")
print(test_data[1][1][0])

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0

In [118]:
class SimpleRNN(torch.nn.Module):
    def __init__(self, input_size, rnn_hidden_size, output_size):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = torch.nn.RNN(
            input_size=input_size,
            hidden_size=rnn_hidden_size,
            num_layers=1,
            nonlinearity='relu',
            batch_first=True
        )
        self.rnn = torch.nn.GRU(input_size=input_size, hidden_size=rnn_hidden_size, num_layers=1, batch_first=True)
        self.linear = torch.nn.Linear(
            in_features=rnn_hidden_size,
            out_features=output_size
        )
        self.fc = torch.nn.Linear(
            in_features=output_size,
            out_features=output_size
        )

    def forward(self, x, hidden):
        x_orig = x
        x, hidden = self.rnn(x, hidden)  
        x = self.linear(x)
        #x = self.fc(x)
        return x, hidden
    
def train_model3(model, train_dataloader, loss_fn, optimizer, num_epochs, hidden):
    model.train()
    for train_idx in range(num_epochs):
        total_loss = 0.0
        num_samples = 0
        correct = 0
        for batch_idx, (BX, BY) in enumerate(train_dataloader):
        #for batch_idx in range(len(train_data)):
            if False:
                data, target = train_data[batch_idx]
                data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).float().to(device)
                BX = data
                BY = target
            else:
                #BX = BX.reshape(BX.shape[0], BX.shape[1], 1)
                #BY = BY.reshape(BY.shape[0], BY.shape[1], 1)
                BX = BX.to(device)
                BY = BY.to(device)
                #print(BX.shape)
            
            optimizer.zero_grad()
            if hidden is not None: hidden.detach_()
            logits, hidden = model(BX, hidden)

            loss = loss_fn(logits, BY)
            loss.backward()
            optimizer.step()
            
            pred = torch.argmax(torch.sigmoid(logits), dim=2)
            # THIS IS THE KEY, it can't learn this for some reason
            pred[:,0:DELAY] = 0
            target2 = torch.argmax(BY.int(), dim=2)
            #print(pred[0], target2[0], torch.all(pred.int() == target2, dim=1)[0])
            correct += torch.all(pred.int() == target2, dim=1).sum().item()
            num_samples += BX.shape[0]
            
            if batch_idx % 100 == 0:
                print("epoch:", train_idx, 
                      "batch", batch_idx, 
                      "loss:", loss.item(),
                      "correct%:", float(correct) * 100/num_samples
                     )
                correct = 0
                num_samples = 0
        print("epoch:", train_idx, "loss:", loss.item())
        #print(pred[0:2].int(), pred[0:2].int().shape)
        print("X:", onehot_to_str(BX[0:2]), "Pred:", onehotints_to_str(pred[0:2]), "Actual", onehot_to_str(BY[:2]))
        print(pred[0], target2[0], torch.all(pred.int() == target2, dim=1)[0])
    

model = SimpleRNN(
    input_size=NUM_CLASSES,
    rnn_hidden_size=NUM_CLASSES * DELAY,
    output_size=NUM_CLASSES
).to(device)
hidden = None
print(NUM_CLASSES)
        
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

train_model3(model, echo_dataloader, criterion, optimizer, num_epochs=2, hidden=hidden)

27
epoch: 0 batch 0 loss: 0.6907798051834106 correct%: 0.0
epoch: 0 batch 100 loss: 0.14423011243343353 correct%: 0.0
epoch: 0 batch 200 loss: 0.14570052921772003 correct%: 0.0
epoch: 0 batch 300 loss: 0.1468046009540558 correct%: 0.0
epoch: 0 batch 400 loss: 0.14651091396808624 correct%: 0.0
epoch: 0 batch 500 loss: 0.14730216562747955 correct%: 0.0
epoch: 0 batch 600 loss: 0.14499692618846893 correct%: 0.0
epoch: 0 batch 700 loss: 0.14782045781612396 correct%: 0.0
epoch: 0 batch 800 loss: 0.14268794655799866 correct%: 0.0
epoch: 0 batch 900 loss: 0.14636695384979248 correct%: 0.0
epoch: 0 batch 1000 loss: 0.14640885591506958 correct%: 0.0
epoch: 0 batch 1100 loss: 0.14622709155082703 correct%: 0.0
epoch: 0 batch 1200 loss: 0.14610882103443146 correct%: 0.0
epoch: 0 batch 1300 loss: 0.14603480696678162 correct%: 0.0
epoch: 0 batch 1400 loss: 0.14889195561408997 correct%: 0.0
epoch: 0 batch 1500 loss: 0.14581379294395447 correct%: 0.0
epoch: 0 batch 1600 loss: 0.14485913515090942 corre

epoch: 1 batch 5200 loss: 0.04702805355191231 correct%: 81.4
epoch: 1 batch 5300 loss: 0.046983685344457626 correct%: 81.6
epoch: 1 batch 5400 loss: 0.053814589977264404 correct%: 84.0
epoch: 1 batch 5500 loss: 0.048401106148958206 correct%: 78.8
epoch: 1 batch 5600 loss: 0.04688302427530289 correct%: 85.2
epoch: 1 batch 5700 loss: 0.04978634789586067 correct%: 82.0
epoch: 1 batch 5800 loss: 0.04922318831086159 correct%: 85.2
epoch: 1 batch 5900 loss: 0.04472767934203148 correct%: 83.0
epoch: 1 batch 6000 loss: 0.048076655715703964 correct%: 86.0
epoch: 1 batch 6100 loss: 0.04874015226960182 correct%: 86.6
epoch: 1 batch 6200 loss: 0.0457429401576519 correct%: 85.0
epoch: 1 batch 6300 loss: 0.048776231706142426 correct%: 84.6
epoch: 1 batch 6400 loss: 0.04978688806295395 correct%: 87.6
epoch: 1 batch 6500 loss: 0.044879473745822906 correct%: 85.0
epoch: 1 batch 6600 loss: 0.04518953710794449 correct%: 86.0


KeyboardInterrupt: 

In [106]:
def char_int_to_str(char_int):
    if char_int == 0:
        return " "
    else:
        char = chr(char_int + 97 - 1)
        return char
    
def str_to_onehot(s):
    mat = torch.zeros((2, len(s), N+1))
    
    for i in range(len(s)):
        char = s[i]
        char_int = ord(char.lower())-97 + 1
        if char == " ":
            char_int = 0
        mat[0,i,char_int] = 1
        mat[1,i,char_int] = 1
    return mat

def onehot_to_str(logits):
    pred_char_ints = logits.argmax(dim=2).cpu()
    return onehotints_to_str(pred_char_ints)
    
def onehotints_to_str(pred_char_ints):
    pred_str = ""
    for i in range(pred_char_ints.shape[1]):
        char_int = pred_char_ints[0,i].item()
        pred_str += char_int_to_str(char_int)
    return pred_str

## Model

Now, we want to implement the model. For our purposes, we want to use GRU. The architecture consists of GRU and a decoder. Decoder is responsible for decoding the GRU hidden state to yield a predicting for the next output. The parts you are responsible for filling with your code are marked with `TODO`. 

In [20]:
import random
import string

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

torch.manual_seed(1);

# Max value of the generated integer. 26 is chosen becuase it's
# the number of letters in English alphabet.


def turn_into_onehot(x, num_classes):
    x_new = torch.zeros((x.shape[0], num_classes,))
    for i in range(x.shape[0]):
        x_new[i, x[i]] = 1
    return x_new

class EchoDataset(torch.utils.data.IterableDataset):

  def __init__(self, delay=4, seq_length=15, size=1000, num_classes=2):
    self.delay = delay
    self.seq_length = seq_length
    self.size = size
    self.num_classes = num_classes
  
  def __len__(self):
    return self.size

  def __iter__(self):
    """ Iterable dataset doesn't have to implement __getitem__.
        Instead, we only need to implement __iter__ to return
        an iterator (or generator).
    """
    for _ in range(self.size):
      seq = torch.tensor([random.choice(range(0, self.num_classes)) for i in range(self.seq_length)], dtype=torch.int64)
      #result = torch.cat((torch.zeros(self.delay), seq[:self.seq_length - self.delay])).type(torch.int64)
      seq = turn_into_onehot(seq, self.num_classes)
      result = torch.roll(seq, self.delay, dims=0)
      result[:self.delay,:] = 0
      result[:self.delay,0] = 1
      #result = turn_into_onehot(result, self.num_classes)

      yield seq, result

#DELAY = 3
#NUM_CLASSES = 27
#DATASET_SIZE = 20000
#BATCH_SIZE = 5
ds = EchoDataset(delay=DELAY, size=DATASET_SIZE, num_classes=NUM_CLASSES)
echo_dataloader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE)

DX, DY = next(iter(echo_dataloader))
print(DX[0], DY[0])

NameError: name 'DATASET_SIZE' is not defined

In [None]:
===================================================================================

In [161]:
import torch.nn.functional as F
N = NUM_CLASSES

class GRUMemory(torch.nn.Module):

  def __init__(self, input_size, hidden_size, output_size=NUM_CLASSES):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    
    self.input_to_reset = torch.nn.Linear(input_size, hidden_size)
    self.hidden_to_reset = torch.nn.Linear(hidden_size, hidden_size)
    
    self.input_to_update = torch.nn.Linear(input_size, hidden_size)
    self.hidden_to_update = torch.nn.Linear(hidden_size, hidden_size)
    
    self.input_to_new = torch.nn.Linear(input_size, hidden_size)
    self.hidden_to_new = torch.nn.Linear(hidden_size, hidden_size)
    
    self.linear = torch.nn.Linear(hidden_size, output_size)
    
    self.x2h = nn.Linear(input_size, 3 * hidden_size)
    self.h2h = nn.Linear(hidden_size, 3 * hidden_size)
    
  def forward_imp(self, x, hidden):
    
    ir = self.input_to_reset(x)
    hr = self.hidden_to_reset(hidden)
    reset_gate_out = F.sigmoid(ir + hr) # R
    
    iu = self.input_to_update(x)
    hu = self.hidden_to_update(hidden)
    update_gate_out = F.sigmoid(iu + hu) # Z
    
    i_n = self.input_to_new(x)

    new_gate = F.tanh(i_n + self.hidden_to_new(reset_gate_out * hidden)) #H_tilda
    hidden_new = (1.0 - update_gate_out) * new_gate + update_gate_out * hidden #H
    return hidden_new

  def forward(self, x, hidden):
    # inputs: x - input tensor of shape (batch_size, seq_length, N+1)
    # returns:
    # logits (scores for softmax) of shape (batch size, seq_length, N + 1)
    # TODO implement forward pass
    outputs = torch.zeros((x.shape[0], x.shape[1], self.output_size)).to(device)
    
    hn = hidden[:,0,:]
    for seq in range(x.shape[1]):
        #hn = hidden[:,seq,:]
        hn = self.forward_imp(x[:,seq,:], hn)
        outputs[:, seq, :] = self.linear(hn)
    
    #output2 = self.linear(output)
    return outputs, hidden

  def init_hidden(self, x):
    hidden = torch.autograd.Variable(torch.zeros(x.shape[0], x.shape[1]+1, self.hidden_size)).to(device)
    return hidden

  @torch.no_grad()
  def test_run(self, s):
    # This function accepts one string s containing lowercase characters a-z. 
    # You need to map those characters to one-hot encodings, 
    # then get the result from your network, and then convert the output 
    # back to a string of the same length, with 0 mapped to ' ', 
    # and 1-26 mapped to a-z.
    mat = torch.zeros((2, len(s), self.input_size))
    
    for i in range(len(s)):
        char = s[i]
        char_int = ord(char.lower())-97 + 1
        if char == " ":
            char_int = 0
        mat[0,i,char_int] = 1
        mat[1,i,char_int] = 1
        
    mat = mat.to(device)
    hidden = self.init_hidden(mat)
    logits, hidden = self.forward(mat, hidden)
    pred_char_ints = logits.argmax(dim=2).cpu()
    
    pred_str = ""
    pred_str2 = ""
    
    for i in range(pred_char_ints.shape[1]):
        char_int = pred_char_ints[0,i].item()
        char_int2 = pred_char_ints[1,i].item()
        pred_str += char_int_to_str(char_int)
        pred_str2 += char_int_to_str(char_int2)
    print("Pred:", pred_str, "2", pred_str2)
    return pred_str

def char_int_to_str(char_int):
    if char_int == 0:
        return " "
    else:
        char = chr(char_int + 97 - 1)
        return char
    
def str_to_onehot(s):
    mat = torch.zeros((2, len(s), N+1))
    
    for i in range(len(s)):
        char = s[i]
        char_int = ord(char.lower())-97 + 1
        if char == " ":
            char_int = 0
        mat[0,i,char_int] = 1
        mat[1,i,char_int] = 1
    return mat

def onehot_to_str(logits):
    pred_char_ints = logits.argmax(dim=2).cpu()
    return onehotints_to_str(pred_char_ints)
    
def onehotints_to_str(pred_char_ints):
    pred_str = ""
    for i in range(pred_char_ints.shape[1]):
        char_int = pred_char_ints[0,i].item()
        pred_str += char_int_to_str(char_int)
    return pred_str
    
model_gru = GRUMemory(input_size=NUM_CLASSES, hidden_size=DELAY*NUM_CLASSES).to(device)

#model_gru.test_run("hello there")

criterion_gru = torch.nn.BCEWithLogitsLoss()
optimizer_gru = torch.optim.RMSprop(model_gru.parameters(), lr=0.001)

hidden=model_gru.init_hidden(next(iter(echo_dataloader))[0])
train_model3(model_gru, echo_dataloader, criterion_gru, optimizer_gru, num_epochs=2, hidden=hidden)

epoch: 0 batch 0 loss: 0.7000271081924438 correct%: 0.0
epoch: 0 batch 100 loss: 0.1381944864988327 correct%: 0.0
epoch: 0 batch 200 loss: 0.12589429318904877 correct%: 0.0
epoch: 0 batch 300 loss: 0.12078474462032318 correct%: 0.0
epoch: 0 batch 400 loss: 0.11878122389316559 correct%: 0.0
epoch: 0 batch 500 loss: 0.11707454919815063 correct%: 0.0
epoch: 0 batch 600 loss: 0.11724920570850372 correct%: 0.0
epoch: 0 batch 700 loss: 0.11528179794549942 correct%: 0.0
epoch: 0 batch 800 loss: 0.11901368945837021 correct%: 0.0
epoch: 0 batch 900 loss: 0.11679990589618683 correct%: 0.0
epoch: 0 batch 1000 loss: 0.11554388701915741 correct%: 0.0
epoch: 0 batch 1100 loss: 0.11665476858615875 correct%: 0.0
epoch: 0 batch 1200 loss: 0.11602777987718582 correct%: 0.0
epoch: 0 batch 1300 loss: 0.11546003818511963 correct%: 0.0
epoch: 0 batch 1400 loss: 0.11599144339561462 correct%: 0.0
epoch: 0 batch 1500 loss: 0.1162181869149208 correct%: 0.0
epoch: 0 batch 1600 loss: 0.11555540561676025 correct%:

epoch: 1 batch 4800 loss: 5.582403173320927e-05 correct%: 100.0
epoch: 1 batch 4900 loss: 2.6146606614929624e-05 correct%: 100.0
epoch: 1 batch 5000 loss: 2.672799200809095e-05 correct%: 100.0
epoch: 1 batch 5100 loss: 2.4974613552330993e-05 correct%: 100.0
epoch: 1 batch 5200 loss: 5.069262260803953e-05 correct%: 97.0
epoch: 1 batch 5300 loss: 4.307605922804214e-05 correct%: 100.0
epoch: 1 batch 5400 loss: 2.8165377443656325e-05 correct%: 100.0
epoch: 1 batch 5500 loss: 2.574163045210298e-05 correct%: 100.0
epoch: 1 batch 5600 loss: 2.4531304006814025e-05 correct%: 100.0
epoch: 1 batch 5700 loss: 0.00016950452118180692 correct%: 96.0
epoch: 1 batch 5800 loss: 0.00019201914255972952 correct%: 99.6
epoch: 1 batch 5900 loss: 3.1066476367414e-05 correct%: 100.0
epoch: 1 batch 6000 loss: 3.433397796470672e-05 correct%: 100.0
epoch: 1 batch 6100 loss: 3.808172550634481e-05 correct%: 100.0
epoch: 1 batch 6200 loss: 2.245393261546269e-05 correct%: 100.0
epoch: 1 batch 6300 loss: 1.91822546184

## Training
Below you need to implement the training of the model. We give you more freedom as for the implementation. The two limitations are that it has to execute within 10 minutes, and that error rate should be below 1%.

In [None]:
D = DELAY
def test_model(model, sequence_length=15):
  """
  This is the test function that runs 100 different strings through your model,
  and checks the error rate.
  """
  total = 0
  correct = 0
  for i in range(500):
    s = ''.join([random.choice(string.ascii_lowercase) for i in range(random.randint(15, 25))])
    result = model.test_run(s)
    assert D > 0, 's[:-D] won\'t work for D=0'
    print("res:", result, s[:-D], result[D:])
    for c1, c2 in zip(s[:-D], result[D:]):
      correct += int(c1 == c2)
    total += len(s) - D

  return correct / total

def train_model(model, train_dataloader, loss_fn, optimizer, num_epochs):
    model.train()
    
    for train_idx in range(num_epochs):
        total_loss = 0.0
        num_samples = 0
        for batch_idx, (BX, BY) in enumerate(train_dataloader):
            #BX = BX.reshape(BX.shape[0], BX.shape[1], 1)
            #BY = BY.reshape(BY.shape[0], BY.shape[1], 1)
            BX = BX.to(device)
            BY = BY.to(device)
            optimizer.zero_grad()
            Yhat = model(BX)
            loss = loss_fn(Yhat, BY)
            total_loss += loss.item()
            num_samples+= BX.shape[0]
            loss.backward()
            optimizer.step()
            
            if batch_idx % 100 == 0:
                print("epoch:", train_idx, "batch", batch_idx, "loss:", total_loss/num_samples)
        print("epoch:", train_idx, "loss:", total_loss/num_samples)
        
    

In [None]:
import time
start_time = time.time()

# TODO: initialize and train your model here.
model = GRUMemory(input_size=N+1, hidden_size=6).to(device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

text = "hello there"
print(model.test_run(text))
train_model(model, echo_dataloader, loss_fn, optimizer, num_epochs=1)

def do_test_model():
    end_time = time.time()
    duration = end_time - start_time
    accuracy = test_model(model)
    assert duration < 600, 'execution took f{duration:.2f} seconds, which longer than 10 mins'
    assert accuracy > 0.99, f'accuracy is too low, got {accuracy}, need 0.99'
    print('tests passed')
    
    
print("After:", model.test_run(text))
#do_test_model()

In [None]:
num_layers = 1

class RNNWrapper(torch.nn.Module):

  def __init__(self, input_size, hidden_size, output_size=1):
    super().__init__()
    self.rnn = torch.nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
    self.linear = torch.nn.Linear(hidden_size, output_size)
    self.linear2 = torch.nn.Linear(output_size, output_size)
    self.hidden=None
    
  def forward(self, x, h0):
    #num_hidden = 4
    #out = torch.roll(x, shifts=num_hidden, dims=1)
    #out[:,:num_hidden,:] = 0
    #out[:,:num_hidden,0] = 1
    #return out
    output, hidden = self.rnn(x, h0)
    return output
    #print("Output shape:", output.shape, "Hidden shape:", hidden.shape)
    return self.linear(torch.sigmoid(output))


hidden_size = 4
rnn = RNNWrapper(input_size=1, hidden_size=hidden_size).to(device)
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.RMSprop(rnn.parameters(), lr=30)
#input = torch.randn(3, 5, 10)
#h0 = torch.randn(1, 3, 20)
#output, hn = rnn(input, h0)
#print(output.shape, hn.shape)
#print([p.shape for p in rnn.parameters()])

def train_model2(model, train_dataloader, loss_fn, optimizer, num_epochs):
    model.train()
    
    for train_idx in range(num_epochs):
        total_loss = 0.0
        num_samples = 0
        for batch_idx, (BX, BY) in enumerate(train_dataloader):
            #BX = BX.reshape(BX.shape[0], BX.shape[1], 1)
            #BY = BY.reshape(BY.shape[0], BY.shape[1], 1)
            BX = BX.to(device)
            BY = BY.to(device)
            print(BX.shape)
            hidden = torch.zeros(num_layers, BX.shape[0], hidden_size).to(device)
            optimizer.zero_grad()
            Yhat = model(BX, hidden)
            #print("Input:", onehot_to_str(BX[0:1]), "Expected:", onehot_to_str(BY[0:1]), "Guess:", onehot_to_str(Yhat[0:1]))
            #loss = loss_fn(Yhat[0:1], BY[0:1])
            #print("Loss should be 0", loss)
            #return
            loss = loss_fn(Yhat, BY)
            total_loss += loss.item()
            num_samples+= BX.shape[0]
            loss.backward()
            optimizer.step()
            
            if batch_idx % 100 == 0:
                print("epoch:", train_idx, "batch", batch_idx, "loss:", total_loss/num_samples)
        print("epoch:", train_idx, "loss:", total_loss/num_samples)
        hidden = torch.zeros(num_layers, 2, hidden_size).to(device)
        pred = rnn(BX[0:2, :, :], hidden)
        #print("Pred:", onehot_to_str(pred), "Actual", onehot_to_str(BY[:2]))
        
hidden = torch.zeros(num_layers, 2, hidden_size).to(device)
#pred = rnn(str_to_onehot("hello there").to(device), hidden)
#print("Before:", onehot_to_str(pred))
train_model2(rnn, echo_dataloader, loss_fn, optimizer, num_epochs=10)
#pred = rnn(str_to_onehot("hello there").to(device), hidden)
#print("After:", onehot_to_str(pred))

In [None]:
a = torch.zeros((2, 5))
a[1] = 2
b = torch.zeros((2, 5))
torch.all(a == b, dim=1)
random.choice(range(0, 2))

## Variable delay model

Now, to make this more complicated, we want to have varialbe delay. So, now, the goal is to transform a sequence of pairs (character, delay) into a character sequence with given delay. Delay stays constant within one sequence.

### Dataset
As before, we first implement the dataset:

In [None]:
class VariableDelayEchoDataset(torch.utils.data.IterableDataset):

  def __init__(self, max_delay=8, seq_length=20, size=1000):
    self.max_delay = max_delay
    self.seq_length = seq_length
    self.size = size
  
  def __len__(self):
    return self.size

  def __iter__(self):
    for _ in range(self.size):
      seq = torch.tensor([random.choice(range(1, N + 1)) for i in range(self.seq_length)], dtype=torch.int64)
      delay = random.randint(0, self.max_delay)
      result = torch.cat((torch.zeros(delay), seq[:self.seq_length - delay])).type(torch.int64)
      yield seq, delay, result

### Model

And the model.

In [None]:
class VariableDelayGRUMemory(torch.nn.Module):

  def __init__(self, hidden_size, max_delay):
    super().__init__()
    #TODO

  def forward(self, x, delays):
    # inputs:
    # x - tensor of shape (batch size, seq length, N + 1)
    # delays - tensor of shape (batch size)
    # returns:
    # logits (scores for softmax) of shape (batch size, seq_length, N + 1)

    # TODO
    pass

  @torch.no_grad()
  def test_run(self, s, delay):
    # This function accepts one string s containing lowercase characters a-z, 
    # and a delay - the desired output delay.
    # You need to map those characters to one-hot encodings, 
    # then get the result from your network, and then convert the output 
    # back to a string of the same length, with 0 mapped to ' ', 
    # and 1-26 mapped to a-z.

    # TODO
    pass


### Train

As before, you're free to do what you want, as long as training finishes within 10 minutes and accuracy is above 0.99 for delays between 0 and 8.

In [None]:
def test_variable_delay_model(model, seq_length=20):
  """
  This is the test function that runs 100 different strings through your model,
  and checks the error rate.
  """
  total = 0
  correct = 0
  for i in range(500):
    s = ''.join([random.choice(string.ascii_lowercase) for i in range(seq_length)])
    d = random.randint(0, model.max_delay)
    result = model.test_run(s, d)
    if d > 0:
      z = zip(s[:-d], result[d:])
    else:
      z = zip(s, result)
    for c1, c2 in z:
      correct += int(c1 == c2)
    total += len(s) - d

  return correct / total

In [None]:
import time
start_time = time.time()

MAX_DELAY = 8
SEQ_LENGTH = 20

# TODO: implement model training here.
model = None

end_time = time.time()
assert end_time - start_time < 600, 'executing took longer than 10 mins'
assert test_variable_delay_model(model) > 0.99, 'accuracy is too low'
print('tests passed')