In [97]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from enum import Enum
import re
import random
import wandb
from tqdm import tqdm
import helper
import time
import scan_dataset
import models

In [2]:
# Replace test-project by experiment
wandb.init(project="test-project", entity="atnlp")

[34m[1mwandb[0m: Currently logged in as: [33mchristian2903[0m ([33matnlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# !git clone https://github.com/brendenlake/SCAN

In [4]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [5]:
input_lang = scan_dataset.Lang()
output_lang = scan_dataset.Lang()

train_dataset = scan_dataset.ScanDataset(
    split=scan_dataset.ScanSplit.SIMPLE_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=True
)

test_dataset = scan_dataset.ScanDataset(
    split=scan_dataset.ScanSplit.SIMPLE_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=False
)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)

MAX_LENGTH = max(train_dataset.input_lang.max_length, train_dataset.output_lang.max_length)

In [106]:
teacher_forcing_ratio = .5

def train_iteration(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """A single training iteration."""
    # Reset the gradients and loss
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    # Encode the input
    encoder_hidden, encoder_hidden_all = encoder(input_tensor)

    # Prepare the initial decoder input
    decoder_input = torch.tensor([[scan_dataset.SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    target_length = target_tensor.size(0)
    for di in range(target_length):
        # Decode next token
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_hidden_all)
            
        loss += criterion(decoder_output, target_tensor[di])

        # If teacher forcing is used, the next input is the target
        # Otherwise, the next input is the output with the highest probability
        if use_teacher_forcing:
            decoder_input = target_tensor[di]
        else:
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

        # If the decoder input is the EOS token, stop decoding
        if decoder_input.item() == scan_dataset.EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [107]:
def train(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=1e-2):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iteration in range(1, n_iters + 1):
        X, y = train_dataset[random.randrange(len(train_dataset))]
        input_tensor, target_tensor = train_dataset.convert_to_tensor(X, y)

        loss = train_iteration(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            # wandb.log({"avg_loss": print_loss_avg})
            print('%s (%d %d%%) %.4f' % (helper.time_since(start, iteration / n_iters),
                                         iteration, iteration / n_iters * 100, print_loss_avg))

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    helper.show_plot(plot_losses)

In [None]:
def evaluate(dataset, encoder, decoder, max_length, verbose=False, batch_size=1, shuffle=False):
    encoder.eval()
    decoder.eval()
    
    accs = []
    
    with torch.no_grad():
        for input_tensor, target_tensor in tqdm(dataset, total=len(dataset), leave=False, desc="Evaluating"):
            input_tensor, target_tensor = dataset.convert_to_tensor(input_tensor, target_tensor)
            
            preds = []

            encoder_hidden = encoder.init_hidden()

            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(
                    input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[scan_dataset.SOS_token]], device=device)

            decoder_hidden = encoder_hidden

            for di in range(target_length):
                try:
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                except TypeError:
                    decoder_output, decoder_hidden, decoder_attention = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)

                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # detach from history as input

                preds.append(decoder_input.item())

                if decoder_input.item() == scan_dataset.EOS_token:
                    break

            preds = np.array(preds)
            gts = target_tensor.detach().cpu().numpy().squeeze()

            if len(preds) == len(gts):
                accs.append(np.all(preds == gts))
            else:
                accs.append(0)
            
    if verbose:
        print("Accuracy", np.mean(accs))
        
    encoder.train()
    decoder.train()

    return np.mean(accs)

In [109]:
hidden_size = 256

config = {
    'HIDDEN_SIZE': 256, # 25, 50, 100, 200, or 400
    'RNN_TYPE': 'RNN', # RNN, GRU or LSTM
    'N_LAYERS': 2, # 1 or 2
    'DROPOUT': 0, # 0, 0.1 or 0.5
}

wandb.config = config

encoder1 = models.EncoderRNN(train_dataset.input_lang.n_words, config['HIDDEN_SIZE'], device, config['N_LAYERS'], config['RNN_TYPE'], config['DROPOUT']).to(device)
decoder1 = models.DecoderRNN(train_dataset.output_lang.n_words, config['HIDDEN_SIZE'], config['N_LAYERS'], config['RNN_TYPE'], config['DROPOUT']).to(device)
# decoder1 = AttnDecoderRNN(train_dataset.output_lang.n_words, config['HIDDEN_SIZE'], config['N_LAYERS'], config['RNN_TYPE'], config['RNN_TYPE']).to(device)

train(encoder1, decoder1, 10000, print_every=10)

0m 0s (- 2m 3s) (10 0%) 3.3669
0m 0s (- 1m 49s) (20 0%) 7.8019
0m 0s (- 1m 40s) (30 0%) 5.7184
0m 0s (- 1m 38s) (40 0%) 4.1964
0m 0s (- 1m 38s) (50 0%) 3.0679
0m 0s (- 1m 36s) (60 0%) 2.7543
0m 0s (- 1m 36s) (70 0%) 2.7801
0m 0s (- 1m 35s) (80 0%) 2.3384
0m 0s (- 1m 36s) (90 0%) 3.0742
0m 0s (- 1m 35s) (100 1%) 2.8176
0m 1s (- 1m 34s) (110 1%) 2.6878
0m 1s (- 1m 34s) (120 1%) 3.1294
0m 1s (- 1m 34s) (130 1%) 3.4889
0m 1s (- 1m 34s) (140 1%) 3.0169
0m 1s (- 1m 34s) (150 1%) 2.8700
0m 1s (- 1m 35s) (160 1%) 3.6825
0m 1s (- 1m 37s) (170 1%) 2.9920
0m 1s (- 1m 40s) (180 1%) 2.9205
0m 1s (- 1m 39s) (190 1%) 3.4127
0m 2s (- 1m 39s) (200 2%) 3.7497
0m 2s (- 1m 38s) (210 2%) 2.9122
0m 2s (- 1m 38s) (220 2%) 3.0041
0m 2s (- 1m 37s) (230 2%) 2.8855
0m 2s (- 1m 37s) (240 2%) 3.7606
0m 2s (- 1m 38s) (250 2%) 2.6033
0m 2s (- 1m 37s) (260 2%) 2.6463
0m 2s (- 1m 37s) (270 2%) 2.9833
0m 2s (- 1m 37s) (280 2%) 2.9599


KeyboardInterrupt: 

### Experiment 1

The top-performing architecture was a LSTM with no attention, 2
layers of 200 hidden units, and no dropout. The best-overall
network achieved 99.7% correct.

SCAN tasks were randomly split into a training set (80%) and a test set (20%).

### Experiment 2

The best result (20.8% on average, again over 5 runs) is achieved
by a GRU with attention, one 50-dimensional hidden layer,
and dropout 0.5

In [13]:
wandb.init(project="experiment-2", entity="atnlp")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.03348856767018636, max=1.0)…

In [14]:
input_lang = Lang()
output_lang = Lang()

NameError: name 'Lang' is not defined

In [None]:
train_dataset = ScanDataset(
    split=ScanSplit.LENGTH_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=True
)

test_dataset = ScanDataset(
    split=ScanSplit.LENGTH_SPLIT,
    input_lang=input_lang,
    output_lang=output_lang,
    train=False
)

assert (len(train_dataset) == 16990)
assert (len(test_dataset) == 3920)

In [None]:
experiment_2_config = dict(HIDDEN_SIZE=50, N_LAYERS=1, DROPOUT=.5, RNN_TYPE='GRU')
overall_best_config = dict(HIDDEN_SIZE=200, N_LAYERS=2, DROPOUT=.5, RNN_TYPE='LSTM')

config = experiment_2_config

In [None]:
encoder_exp_2 = EncoderRNN(input_lang.n_words, config=config).to(device)
decoder_exp_2 = DecoderRNN(output_lang.n_words, config=config).to(device)
attn_decoder_exp_2 = AttnDecoderRNN(output_lang.n_words, config=config).to(device)



In [None]:
train_iterations(encoder_exp_2, attn_decoder_exp_2, 10000, print_every=100)

0m 4s (- 7m 30s) (100 1%) 2.1491
0m 9s (- 7m 53s) (200 2%) 1.8391
0m 14s (- 7m 40s) (300 3%) 1.8041
0m 19s (- 7m 37s) (400 4%) 1.7459
0m 24s (- 7m 37s) (500 5%) 1.7441
0m 28s (- 7m 27s) (600 6%) 1.7132
0m 33s (- 7m 20s) (700 7%) 1.7615
0m 38s (- 7m 19s) (800 8%) 1.6810
0m 43s (- 7m 18s) (900 9%) 1.6580
0m 48s (- 7m 13s) (1000 10%) 1.6702
0m 52s (- 7m 6s) (1100 11%) 1.6138
0m 57s (- 7m 0s) (1200 12%) 1.4693
1m 2s (- 6m 56s) (1300 13%) 1.5826
1m 7s (- 6m 53s) (1400 14%) 1.5038
1m 12s (- 6m 52s) (1500 15%) 1.5233
1m 18s (- 6m 49s) (1600 16%) 1.3632
1m 23s (- 6m 46s) (1700 17%) 1.4715
1m 28s (- 6m 43s) (1800 18%) 1.4320
1m 34s (- 6m 40s) (1900 19%) 1.4505
1m 39s (- 6m 36s) (2000 20%) 1.4093
1m 44s (- 6m 32s) (2100 21%) 1.3209
1m 49s (- 6m 26s) (2200 22%) 1.3297
1m 54s (- 6m 21s) (2300 23%) 1.3695
1m 58s (- 6m 15s) (2400 24%) 1.3888
2m 3s (- 6m 10s) (2500 25%) 1.2416
2m 8s (- 6m 7s) (2600 26%) 1.3154
2m 14s (- 6m 3s) (2700 27%) 1.3073
2m 20s (- 6m 0s) (2800 28%) 1.2706
2m 25s (- 5m 55s) (29

In [None]:
evaluate(test_dataset, encoder_exp_2, attn_decoder_exp_2)



0.0

### Experiment 3

The best performance is achieved by
a GRU network with attention, one layer with 100 hidden
units, and dropout of 0.1 (90.3% accuracy). 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ec00d141-8917-4313-a10a-78395d2ec852' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>