In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from enum import Enum
import re
import random
import wandb
from tqdm import tqdm

In [2]:
# Replace test-project by experiment
wandb.init(project="test-project", entity="atnlp")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
SOS_token = 0
EOS_token = 1
OOV_token = 2

## Dataloading

In [4]:
# !git clone https://github.com/brendenlake/SCAN

fatal: destination path 'SCAN' already exists and is not an empty directory.


In [3]:
class ScanSplit(Enum):
    SIMPLE_SPLIT = 'simple_split'
    LENGTH_SPLIT = 'length_split'
    FEW_SHOT_SPLIT = 'few_shot_split'
    ADD_PRIM_JUMP_SPLIT = 'add_prim_split'
    ADD_PRIM_TURNLEFT_SPLIT = 'add_prim_split' # shouldn't have same value => if condition won't happen

In [4]:
class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS", OOV_token: 'OOV'}
        self.n_words = len(self.index2word)  # Count tokens

        self.max_length = 0

    def add_sentence(self, sentence):
        """Add sentence to vocab"""
        for word in sentence.split(' '):
            self._add_word(word)

    def _add_word(self, word):
        """Add word to vocab"""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
            self.max_length = max(len(word), self.max_length)
        else:
            self.word2count[word] += 1

    def indexes_from_sentence(self, sentence: str):
        """Get word ids from sentence"""
        indexes = [self.word2index.get(word,OOV_token) for word in sentence.split(' ')]
        return indexes

    def tensor_from_sentence(self, sentence:str):
        """Convert sentence to torch tensor"""
        indexes = self.indexes_from_sentence(sentence)
        return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [5]:
class ScanDataset(Dataset):
    def __init__(self, split: ScanSplit, input_lang: Lang, output_lang: Lang, train: bool = True):
        
        self.input_lang = input_lang
        self.output_lang = output_lang


        self.X, self.y = self._get_data(split, train)


    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


    def convert_to_tensor(self, X, y):
        input_tensor = self.input_lang.tensor_from_sentence(X)
        target_tensor = self.output_lang.tensor_from_sentence(y)
        return (input_tensor, target_tensor)
    

    def _get_data(self, split: ScanSplit, train: bool = True):
        """Retrieve the right data for the selected split"""
        
        if split == ScanSplit.SIMPLE_SPLIT:
            X_train, y_train = self._extract_data_from_file('SCAN/simple_split/tasks_train_simple.txt')
            X_test, y_test = self._extract_data_from_file('SCAN/simple_split/tasks_test_simple.txt')
        elif split == ScanSplit.LENGTH_SPLIT:
            X_train, y_train = self._extract_data_from_file('SCAN/length_split/tasks_train_length.txt')
            X_test, y_test = self._extract_data_from_file('SCAN/length_split/tasks_test_length.txt')
        elif split == ScanSplit.ADD_PRIM_JUMP_SPLIT:
            X_train, y_train = self._extract_data_from_file('SCAN/add_prim_split/tasks_train_addprim_jump.txt')
            X_test, y_test = self._extract_data_from_file('SCAN/add_prim_split/tasks_test_addprim_jump.txt')
        elif split == ScanSplit.ADD_PRIM_TURNLEFT_SPLIT:
            X_train, y_train = self._extract_data_from_file('SCAN/add_prim_split/tasks_train_addprim_turn_left.txt')
            X_test, y_test = self._extract_data_from_file('SCAN/add_prim_split/tasks_test_addprim_turn_left.txt')
        else:
            raise Exception('Split not implemented')
        
        if train:
            X = X_train
            y = y_train

            # Add words to vocabs
            for sen in X:
                self.input_lang.add_sentence(sen)

            for sen in y:
                self.output_lang.add_sentence(sen)
        else:
            X = X_test
            y = y_test

        return X,y
        
    def _extract_data_from_file(self, filepath: str):
        """Get X and y from SCAN file"""
        with open(filepath) as f:
            txt_data = f.readlines()

        # Format is in IN: ... OUT: ...
        lead_token = 'IN:'
        split_token = 'OUT:'

        # Split at OUT and remove IN
        txt_data = [sen.strip(lead_token).split(split_token) for sen in txt_data]

        in_txt = [sen[0] for sen in txt_data]
        out_txt = [sen[1] for sen in txt_data]

        return in_txt, out_txt

In [6]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [7]:
input_lang = Lang()
output_lang = Lang()

train_dataset = ScanDataset(
    split=ScanSplit.SIMPLE_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=True
)

test_dataset = ScanDataset(
    split=ScanSplit.SIMPLE_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=False
)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)

MAX_LENGTH = max(train_dataset.input_lang.max_length, train_dataset.output_lang.max_length)

## Model

In [8]:
def init_hidden(rnn_type, n_layers, hidden_size):
    if rnn_type == 'LSTM':
        return (
            torch.zeros(n_layers, 1, hidden_size, device=device),
            torch.zeros(n_layers, 1, hidden_size, device=device)
        )
    return torch.zeros(n_layers, 1, hidden_size, device=device)

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, config):
        super(EncoderRNN, self).__init__()
        self.hidden_size = config['HIDDEN_SIZE']
        self.n_layers = config['N_LAYERS']

        self.embedding = nn.Embedding(input_size, self.hidden_size)

        self.dropout = nn.Dropout(config['DROPOUT'])

        self.RNN_type = config['RNN_TYPE']

        self.rnn = nn.__dict__[self.RNN_type](
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.n_layers,
            dropout=config['DROPOUT']
        )

    def forward(self, encoder_input, hidden):
        output = self.embedding(encoder_input).view(1, 1, -1)
        output = self.dropout(output)
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def init_hidden(self):
        return init_hidden(self.RNN_type, self.n_layers, self.hidden_size)

In [261]:
class DecoderRNN(nn.Module):
    def __init__(self, output_size, config):
        super(DecoderRNN, self).__init__()
        self.hidden_size = config['HIDDEN_SIZE']
        self.n_layers = config['N_LAYERS']

        self.embedding = nn.Embedding(output_size, self.hidden_size)

        self.dropout = nn.Dropout(config['DROPOUT'])

        self.RNN_type = config['RNN_TYPE']

        self.rnn = nn.__dict__[self.RNN_type](
            input_size=self.hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.n_layers,
            dropout=config['DROPOUT']
        )

        self.out = nn.Linear(self.hidden_size*2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, decoder_input, hidden):
        output = self.embedding(decoder_input).view(1, 1, -1)
        output = self.dropout(output)
        output = F.relu(output)
        print(output.size(), hidden[0].size())
        output, hidden = self.rnn(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        return init_hidden(self.RNN_type, self.n_layers, self.hidden_size)

In [278]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, output_size, config):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = config['HIDDEN_SIZE']
        self.n_layers = config['N_LAYERS']
        self.output_size = output_size
        self.dropout = config['DROPOUT']
        self.RNN_type = config['RNN_TYPE']
        self.max_length = MAX_LENGTH

        self.rnn = nn.__dict__[self.RNN_type](
            input_size=self.hidden_size,
            hidden_size=self.hidden_size*2,
            num_layers=self.n_layers,
            dropout=config['DROPOUT']
        )
        self.W = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
        self.U = nn.Parameter(torch.randn((self.hidden_size, self.hidden_size)))
        self.v = nn.Parameter(torch.randn((self.hidden_size, 1)))

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        self.dropout = nn.Dropout(self.dropout)

        self.out = nn.Linear(self.hidden_size*2, output_size)

    def e(self, g, h):
        """Computes the similarity between the previous decoder hidden state g and an encoder hidden state h"""
        # vT tanh(W g_(i-1) + U h_t)
        return self.v.T @ torch.tanh(self.W * g + self.U * h)

    def alpha(self, encoder_hiddens, input_hidden, t):
        """Computes the attention weight for a given encoder hidden state"""
        # alpha_it = exp(e(g_(i-1), h_t)) / sum(exp(e(g_(i-1), h_j)))
        T = len(encoder_hiddens)
        numerator = torch.exp(self.e(input_hidden, encoder_hiddens[t]))

        denominator = 0

        for j in range(T):
            denominator += torch.exp(self.e(input_hidden, encoder_hiddens[j]))

        return numerator/denominator


    def forward(self, input, input_hidden, encoder_hiddens):

        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # c_i = sum(alpha_it * h_t)
        c_i = 0

        for t in range(len(encoder_hiddens)):
            alpha_it = self.alpha(encoder_hiddens, input_hidden, t)
            h_t = encoder_hiddens[t]
            c_i += alpha_it * h_t

        hidden = torch.concat((input_hidden, c_i), dim=2) # Concatenate the context vector and the decoder hidden state
        output, hidden = self.rnn(embedded, hidden) 
        output = F.log_softmax(self.out(output[0]), dim=0)

        # Seperate the concatenated hidden state into the decoder hidden state and the context vector
        hidden, context = torch.split(hidden, self.hidden_size, dim=2)

        return output, hidden

In [285]:
hidden_size = 256

config = {
    'HIDDEN_SIZE': 256, # 25, 50, 100, 200, or 400
    'RNN_TYPE': 'RNN', # RNN, GRU or LSTM
    'N_LAYERS': 2, # 1 or 2
    'DROPOUT': 0, # 0, 0.1 or 0.5
}

wandb.config = config


encoder1 = EncoderRNN(train_dataset.input_lang.n_words, config).to(device)
# decoder1 = DecoderRNN(train_dataset.output_lang.n_words, config).to(device)
decoder1 = AttnDecoderRNN(train_dataset.output_lang.n_words, config).to(device)

train_iterations(encoder1, decoder1, 1000, print_every=100)

6m 10s (- 55m 30s) (100 10%) 0.0000


KeyboardInterrupt: 

In [283]:
teacher_forcing_ratio = .5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    encoder_hidden = encoder.init_hidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_hidden_all = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        if encoder.RNN_type == 'LSTM':
            encoder_hidden_all[ei] = encoder_hidden[0][0, 0]
        else:
            encoder_hidden_all[ei] = encoder_hidden[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            if decoder.__class__.__name__ == 'AttnDecoderRNN':
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden, encoder_hidden_all)
            else:
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                

            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            if decoder.__class__.__name__ == 'AttnDecoderRNN':
                decoder_output, decoder_hidden = decoder(
                    decoder_input, decoder_hidden, encoder_hidden_all)
            else:
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                

            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [87]:
import math
import time


def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [88]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def show_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [284]:
def train_iterations(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=1e-2):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iteration in range(1, n_iters + 1):
        X, y = train_dataset[random.randrange(len(train_dataset))]
        input_tensor, target_tensor = train_dataset.convert_to_tensor(X, y)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            # wandb.log({"avg_loss": print_loss_avg})
            print('%s (%d %d%%) %.4f' % (time_since(start, iteration / n_iters),
                                         iteration, iteration / n_iters * 100, print_loss_avg))

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    show_plot(plot_losses)

In [90]:
def evaluate(dataset, encoder, decoder, num_layers, hidden_size, EOS_token, device, verbose=False, batch_size=1, shuffle=False):
    encoder.eval()
    decoder.eval()

    accs = []
    
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    with torch.no_grad():
        for i, (x, y) in tqdm(enumerate(dataloader), total=len(dataloader), leave=False, desc="Evaluating"):
            preds = []

            x = x.squeeze()
            x = torch.cat((x, EOS_tensor), dim=0)
            x = x.to(device)

            y = y.squeeze()
            y = torch.cat((y, EOS_tensor), dim=0)
            y = y.to(device)

            encoder_hidden = torch.zeros(num_layers, 1, hidden_size, device=device)
            encoder_cx = torch.zeros((num_layers, 1, hidden_size), device=device)

            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_cx = torch.zeros((num_layers, 1, hidden_size), device=device)

            for j in range(len(x)):
                _, encoder_hidden, encoder_cx = encoder(x[j], encoder_hidden, encoder_cx)

            decoder_hidden = encoder_hidden

            for j in range(len(y)):
                decoder_output, decoder_hidden, decoder_cx = decoder(
                    decoder_input, decoder_hidden, decoder_cx
                )

                decoder_input = decoder_output.topk(1)[1]

                preds.append(decoder_input.item())

                if decoder_input.item() == EOS_token:
                    break

            preds = np.array(preds)
            gts = y.detach().cpu().numpy()
            
            if len(preds) == len(gts):
                accs.append(np.all(preds == gts))
            else:
                accs.append(0)
          
    if verbose:
        print("Accuracy", np.mean(accs))
    
    return np.mean(accs)

In [91]:
def evaluate(dataset, encoder, decoder, device=device, verbose=False, batch_size=1, shuffle=False, max_length=MAX_LENGTH):
    encoder.eval()
    decoder.eval()

    accuracies = []

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    with torch.no_grad():
        for i, (sentence, truth) in tqdm(enumerate(dataloader), total=len(dataloader), leave=False, desc="Evaluating"):
            preds = []

            input_tensor, target_tensor = dataset.convert_to_tensor(sentence[0], truth[0])
            input_length = input_tensor.size()[0]

            encoder_hidden = encoder.init_hidden()
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = decoder.init_hidden()
            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                try:
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                except TypeError:
                    decoder_output, decoder_hidden, decoder_attention = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)

                    decoder_attentions[di - 1] = decoder_attention.data

                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(dataset.output_lang.index2word[topi.item()])

                # decoder_input = topi.squeeze().detach()

                decoder_input = decoder_output.topk(1)[1]
                preds.append(decoder_input.item())

                if decoder_input.item() == EOS_token:
                    break

            preds = np.array(preds)
            gts = target_tensor.detach().cpu().numpy()

            if len(preds) == len(gts):
                accuracies.append(np.all(preds == gts))
            else:
                accuracies.append(0)

    if verbose:
        print("Accuracy", np.mean(accuracies))

    return np.mean(accuracies)

In [92]:
def evaluate_random(encoder, decoder, n=10):
    for i in range(n):
        rand_idx = random.randrange(len(test_dataset))
        pair = test_dataset.X[rand_idx], test_dataset.y[rand_idx]
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print(output_sentence)

In [118]:
hidden_size = 256

config = {
    'HIDDEN_SIZE': 256, # 25, 50, 100, 200, or 400
    'RNN_TYPE': 'LSTM', # RNN, GRU or LSTM
    'N_LAYERS': 2, # 1 or 2
    'DROPOUT': 0, # 0, 0.1 or 0.5
}

wandb.config = config


encoder1 = EncoderRNN(train_dataset.input_lang.n_words, config).to(device)
# decoder1 = DecoderRNN(train_dataset.output_lang.n_words, config).to(device)
decoder1 = AttnDecoderRNN(train_dataset.output_lang.n_words, config).to(device)

train_iterations(encoder1, decoder1, 1000, print_every=100)

Encoder hidden: torch.Size([13, 256])


RuntimeError: The expanded size of the tensor (13) must match the existing size (16) at non-singleton dimension 0.  Target sizes: [13].  Tensor sizes: [16]

### Experiment 1

The top-performing architecture was a LSTM with no attention, 2
layers of 200 hidden units, and no dropout. The best-overall
network achieved 99.7% correct.

SCAN tasks were randomly split into a training set (80%) and a test set (20%).

### Experiment 2

The best result (20.8% on average, again over 5 runs) is achieved
by a GRU with attention, one 50-dimensional hidden layer,
and dropout 0.5

In [35]:
wandb.init(project="experiment-2", entity="atnlp")

0,1
avg_loss,█▆▆▆▅▅▅▄▄▄▄▄▄▃▃▂▃▂▃▃▃▂▂▂▂▂▂▂▁▁▂▂▁▁▂▁▂▁▁▁

0,1
avg_loss,0.9095


In [36]:
input_lang = Lang()
output_lang = Lang()

In [37]:
train_dataset = ScanDataset(
    split=ScanSplit.LENGTH_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=True
)

test_dataset = ScanDataset(
    split=ScanSplit.LENGTH_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=False
)

assert (len(train_dataset) == 16990)
assert (len(test_dataset) == 3920)

In [38]:
experiment_2_config = dict(HIDDEN_SIZE=50, N_LAYERS=1, DROPOUT=.5, RNN_TYPE='GRU')
overall_best_config = dict(HIDDEN_SIZE=200, N_LAYERS=2, DROPOUT=.5, RNN_TYPE='LSTM')

config = experiment_2_config

In [39]:
encoder_exp_2 = EncoderRNN(input_lang.n_words, config=config).to(device)
decoder_exp_2 = DecoderRNN(output_lang.n_words, config=config).to(device)
attn_decoder_exp_2 = AttnDecoderRNN(output_lang.n_words, config=config).to(device)



In [40]:
train_iterations(encoder_exp_2, attn_decoder_exp_2, 10000, print_every=100)

0m 4s (- 7m 30s) (100 1%) 2.1491
0m 9s (- 7m 53s) (200 2%) 1.8391
0m 14s (- 7m 40s) (300 3%) 1.8041
0m 19s (- 7m 37s) (400 4%) 1.7459
0m 24s (- 7m 37s) (500 5%) 1.7441
0m 28s (- 7m 27s) (600 6%) 1.7132
0m 33s (- 7m 20s) (700 7%) 1.7615
0m 38s (- 7m 19s) (800 8%) 1.6810
0m 43s (- 7m 18s) (900 9%) 1.6580
0m 48s (- 7m 13s) (1000 10%) 1.6702
0m 52s (- 7m 6s) (1100 11%) 1.6138
0m 57s (- 7m 0s) (1200 12%) 1.4693
1m 2s (- 6m 56s) (1300 13%) 1.5826
1m 7s (- 6m 53s) (1400 14%) 1.5038
1m 12s (- 6m 52s) (1500 15%) 1.5233
1m 18s (- 6m 49s) (1600 16%) 1.3632
1m 23s (- 6m 46s) (1700 17%) 1.4715
1m 28s (- 6m 43s) (1800 18%) 1.4320
1m 34s (- 6m 40s) (1900 19%) 1.4505
1m 39s (- 6m 36s) (2000 20%) 1.4093
1m 44s (- 6m 32s) (2100 21%) 1.3209
1m 49s (- 6m 26s) (2200 22%) 1.3297
1m 54s (- 6m 21s) (2300 23%) 1.3695
1m 58s (- 6m 15s) (2400 24%) 1.3888
2m 3s (- 6m 10s) (2500 25%) 1.2416
2m 8s (- 6m 7s) (2600 26%) 1.3154
2m 14s (- 6m 3s) (2700 27%) 1.3073
2m 20s (- 6m 0s) (2800 28%) 1.2706
2m 25s (- 5m 55s) (29

In [41]:
evaluate(test_dataset, encoder_exp_2, attn_decoder_exp_2)



0.0

### Experiment 3

The best performance is achieved by
a GRU network with attention, one layer with 100 hidden
units, and dropout of 0.1 (90.3% accuracy). 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ec00d141-8917-4313-a10a-78395d2ec852' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>