In [1]:
# This notebook loads the final trained model, checks its performance and gets it to translate some 
# input sentences from the validation dataset.
#
# Possible extensions: Look at attention patterns.
#                     Look at the learned embeddings.
#                     One could train with teacher forcing, instead of Scheduled Sampling, and see if the results seem qualitatively
#                        different, especially for longer sequences.
#

In [4]:
!pwd

/Users/sr_old/Desktop/attention_seq2seq


In [5]:
import sys
print(sys.executable)
print(sys.version)

/Users/sr_old/Desktop/attention_seq2seq/p3.10_attention_seq2seq/bin/python
3.10.13 (main, Aug 24 2023, 22:36:46) [Clang 14.0.3 (clang-1403.0.22.14.1)]


## Imports

In [6]:
import math
import re

In [7]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
#from torchviz import make_dot
from torch.utils.data import Dataset, DataLoader #, TensorDataset
from torch.utils.data.dataset import random_split
import datetime

In [8]:
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [9]:
from torch.distributions.categorical import Categorical

In [10]:
import matplotlib.pyplot as plt
%matplotlib inline    
from matplotlib.pyplot import rcParams
plt.style.use('ggplot')
rcParams['figure.figsize'] = 5,5

In [11]:
import itertools
import random

In [12]:
%load_ext autoreload
%autoreload 2

In [13]:
import pickle

In [14]:
from dataprep_functions import LanguageDataset

In [15]:
from model_functions import *

## Device and seed

In [16]:
# Pick the best available backend: CUDA first, then Apple's MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = 'cpu'

device

'mps'

In [17]:
seed = 42

In [18]:
torch.manual_seed(seed)
#rng = np.random.default_rng(seed)
random.seed(seed)

## Variables

In [19]:
# Name of the folder where data is accessed and saved
path = "datasets"

In [20]:
results_path = "results"

In [21]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

## Loading the data we need

In [22]:
# Unpickling the input- and output-language objects (later cells read their
# n_words and index2word attributes).
# NOTE(review): pickle.load can execute arbitrary code from the file, so only load
# files that were produced by this project.
# NOTE(review): unpickling presumably requires the defining class to be importable
# in this kernel -- confirm where it is defined.
with open(path + "/input_lang.txt", "rb") as y:
    input_lang = pickle.load(y)
with open(path + "/output_lang.txt", "rb") as y:
    output_lang = pickle.load(y)

In [23]:
# Load the saved training and validation datasets.
# weights_only=False makes torch.load unpickle arbitrary Python objects rather than
# only tensors/state dicts, which can run arbitrary code: use it on trusted files only.
train_dataset = torch.load(path + '/train.pt', weights_only=False)
val_dataset = torch.load(path + '/val.pt', weights_only=False)

## Now load the final model and investigate

In [24]:
# Set the hyperparameters to match the saved model (they are encoded in the checkpoint file name).

In [25]:
m = Luong_full(vocab = input_lang.n_words, h_size = 90, dropout = 0.2, n_layers = 2, 
                    att_method = 'general', vocab_out = output_lang.n_words, device = device)

In [26]:
# List the contents of the results folder to find the saved model file.
# `os` is imported explicitly: this notebook never imports it itself, so the call
# otherwise only works if `from model_functions import *` happens to expose it.
import os
os.system("ls " + results_path + "/*");

results/5e-03_0.05_32_{'h_size':90,'dropout':0.2,'n_layers':2,'att_method':'general','c':'final_model'}.png
results/5e-03_1_32_{'h_size':30,'dropout':0,'n_layers':2,'att_method':'dot','c':'first'}.png

results/atlases:
att_method-dropout.png
hsize-layers.png
lr-ratio.png
lr-ratio_old.png

results/models:
5e-03_0.05_32_{'h_size':90,'dropout':0.2,'n_layers':2,'att_method':'general','c':'final_model'}.pt

results/ratio5,ss10:
5e-03_5_32_{'c':'','dropout':0,'h_size':90,'n_layers':3,'att_method':'dot'}.png
5e-03_5_32_{'c':'','dropout':0.1,'h_size':60,'n_layers':2,'att_method':'general'}.png
5e-03_5_32_{'c':'','dropout':0.3,'h_size':60,'n_layers':2,'att_method':'concat'}.png
5e-03_5_32_{'h_size':30,'dropout':0,'n_layers':2,'att_method':'dot','c':''}.png


In [27]:
# Pick the right model:
name = "5e-03_0.05_32_{'h_size':90,'dropout':0.2,'n_layers':2,'att_method':'general','c':'final_model'}.pt"

In [28]:
# Load the trained weights. map_location places them on the CPU at load time,
# independently of the device they were saved from; the model is then moved to
# the active device.
m.load_state_dict(torch.load(results_path + '/models/' + name, map_location=torch.device('cpu')))
m.to(device)
# Switch to evaluation mode so that dropout is disabled for the inference below.
# eval() returns the module itself, so the cell still displays the model summary.
m.eval()

Luong_full(
  (encoder): EncoderRNN(
    (embedding): Embedding(655, 90)
    (gru): GRU(90, 45, num_layers=2, dropout=0.2, bidirectional=True)
  )
  (decoder): LuongDecoder(
    (embedding): Embedding(705, 90)
    (gru): GRU(90, 90, num_layers=2, dropout=0.2)
    (joinerFF): Linear(in_features=180, out_features=90, bias=True)
    (projFF): Linear(in_features=90, out_features=705, bias=True)
    (dropout_layer): Dropout(p=0.2, inplace=False)
    (alignment_vector): Attn(
      (mFF): Linear(in_features=90, out_features=90, bias=False)
    )
  )
)

In [29]:
# Now check that we can replicate the loss obtained.
# Set the epoch parameter to a high number, to make sure there is no teacher forcing.

val_loss = epochend_lcalc(m, lossmaker1, batcher(val_dataset, 800), device, epoch = 28)

In [30]:
val_loss

np.float64(1.277029554049174)

In [None]:
# Close enough.

## Now give it something to translate

In [31]:
val_dataset[1:5]

((tensor([[ 60, 387,   8, 193],
          [ 61, 211,  61,   3],
          [314,  61,  39,  59],
          [285, 511, 557,   6],
          [  6,   6,   9,   2],
          [  2,   2,   2,   0]]),
  [6, 6, 6, 5]),
 (tensor([[ 22, 456,   7, 606],
          [ 80, 539, 209, 209],
          [495, 264, 370,  37],
          [245,   4, 558,   4],
          [  4,   2,   8,   2],
          [  2,   0,   2,   0]]),
  [6, 5, 6, 5]))

In [34]:
def dec(inpt, index2word_dictn):
    """Translate one tokenised sentence into its list of words.

    Parameters
    ----------
    inpt : torch.Tensor
        Token indices of a single sentence, either as a column of shape
        (seq_len, 1) or as a flat tensor of shape (seq_len,).
    index2word_dictn : dict
        Mapping from token index to word.

    Returns
    -------
    list
        The entries of ``index2word_dictn`` for each token, in sequence order.

    Raises
    ------
    AssertionError
        If ``inpt`` holds more than one sequence (second dimension != 1).
    KeyError
        If a token index is missing from ``index2word_dictn``.
    """
    # detach()/cpu() make the conversion safe for tensors that live on an accelerator
    # or are attached to the autograd graph; a plain .numpy() call raises for both.
    inpt = inpt.detach().cpu()

    # A flat tensor is a single sequence: treat it as one column.
    if inpt.dim() == 1:
        inpt = inpt.unsqueeze(1)
    assert inpt.shape[1] == 1, "More than one sequence input"

    return [index2word_dictn[k] for k in inpt.flatten().tolist()]

In [35]:
def translate(i, dataset, beamsize, model, input_index2word_dictn, output_index2word_dictn, device = device,
              max_dec_length = 20):
    """Print a source sentence, its reference translation and the model's beam-search candidates.

    This is mostly a pretty-printer around ``model.beam_decode``.

    Parameters
    ----------
    i : int
        Index of the sentence pair in ``dataset``.
    dataset : indexable
        ``dataset[i][0][0]`` is used as the tokenised input sentence and
        ``dataset[i][1][0]`` as the tokenised reference translation.
    beamsize : int
        Beam size passed to ``model.beam_decode``.
    model : object
        Model exposing ``beam_decode(beamsize, input, max_dec_length=...)``.
    input_index2word_dictn, output_index2word_dictn : dict
        Index-to-word mappings for the input and output languages.
    device : str or torch.device
        Device the input sentence is moved to before decoding.
    max_dec_length : int
        Maximum decoding length passed to ``model.beam_decode`` (default 20).

    Returns
    -------
    None
        Everything is printed.
    """
    sample = dataset[i]
    inp = sample[0][0]
    reference = sample[1][0]

    print("INPUT:")
    print(dec(inp, input_index2word_dictn))
    print("MODEL ANSWER:")
    print(dec(reference, output_index2word_dictn))
    print("")

    # NOTE(review): confirm that beam_decode disables dropout (or that the model is in
    # eval mode) -- otherwise the candidates and their scores are not deterministic.
    beam = model.beam_decode(beamsize, inp.to(device), max_dec_length = max_dec_length)

    # Each beam entry unpacks into (token sequence, score, <unused>). The score is indexable
    # and its first element is printed -- presumably a log-probability, TODO confirm.
    for seq, prob, _ in beam:
        print(dec(torch.tensor(seq).unsqueeze(1), output_index2word_dictn), "     ", round(prob[0], 2))


In [36]:
translate(10, val_dataset, 5, m, input_lang.index2word, output_lang.index2word)

INPUT:
['tu', 'es', 'important', '.', 'EOS']
MODEL ANSWER:
['you', 'are', 'important', '.', 'EOS']

['you', 're', 'important', '.', 'EOS']       -0.6
['you', 're', 'important', 'important', 'EOS']       -1.39
['you', 're', 'important', '.', '.', 'EOS']       -2.94
['you', 're', 'important', 'important', '.', 'EOS']       -3.41
['you', 'are', 'important', '.', 'EOS']       -3.98


In [39]:
# Convert the best beam score printed above (-0.6) into a probability.
# NOTE(review): this assumes the score is a base-2 log-probability. If beam_decode
# accumulates natural-log probabilities (e.g. from log_softmax), the conversion
# should be math.exp(-0.6), roughly 0.55 -- confirm against model_functions.
2**(-0.6)

0.6597539553864471

In [None]:
# The model is fairly confident but (just technically) wrong in its topmost prediction.
# I would have expected ['you', 'are', 'important', '.', 'EOS'] to have a higher log probability.

In [40]:
translate(11, val_dataset, 5, m, input_lang.index2word, output_lang.index2word)

INPUT:
['garde', 'le', '.', 'EOS']
MODEL ANSWER:
['keep', 'that', '.', 'EOS']

['keep', 'it', '.', '.', 'EOS']       -4.16
['keep', 'does', '.', '.', 'EOS']       -4.26
['keep', 'does', 'it', '.', 'EOS']       -4.31
['keep', 'it', '.', 'EOS']       -4.35
['keep', 'saw', 'it', '.', 'EOS']       -4.63


In [None]:
# I see evidence of the model learning that "that" and "it" are semantically related.
# The emergence of "does it" and "saw it" may indicate the model learning typical 2-grams of the English language.
# 

In [42]:
translate(12, val_dataset, 10, m, input_lang.index2word, output_lang.index2word)

INPUT:
['c', 'est', 'pour', 'toi', '.', 'EOS']
MODEL ANSWER:
['that', 's', 'for', 'you', '.', 'EOS']

['it', 's', 'pretty', '.', '.', 'EOS']       -3.53
['that', 's', 'for', '.', '.', 'EOS']       -3.74
['that', 's', 'pretty', '.', '.', 'EOS']       -3.96
['it', 's', 'for', '.', '.', 'EOS']       -4.04
['that', 's', 'for', '.', 'EOS']       -4.28
['it', 'is', 'pretty', '.', '.', 'EOS']       -4.3
['that', 's', 'yourself', '.', 'EOS']       -4.42
['this', 's', 'pretty', '.', '.', 'EOS']       -4.44
['this', 's', 'for', '.', '.', 'EOS']       -4.49
['it', 's', 'for', '.', 'EOS']       -4.5


In [None]:
# Surprised I do not see "for you" as a learned 2-gram here.

In [43]:
translate(221, val_dataset, 5, m, input_lang.index2word, output_lang.index2word)

INPUT:
['soyez', 'honnete', 'avec', 'moi', '.', 'EOS']
MODEL ANSWER:
['be', 'honest', 'with', 'me', '.', 'EOS']

['be', 'with', 'with', '.', 'EOS']       -1.7
['honest', 'with', 'with', '.', 'EOS']       -2.2
['be', 'honest', 'with', '.', 'EOS']       -2.31
['take', 'with', 'with', '.', 'EOS']       -3.36
['honest', 'honest', 'with', '.', 'EOS']       -3.63


In [44]:
translate(50, val_dataset, 25, m, input_lang.index2word, output_lang.index2word)

INPUT:
['je', 'devrais', 'etre', 'heureuse', '.', 'EOS']
MODEL ANSWER:
['i', 'should', 'be', 'happy', '.', 'EOS']

['i', 'should', 'be', 'happy', '.', 'EOS']       -0.79
['i', 'should', 'be', 'happy', 'EOS']       -2.1
['i', 'should', 'be', '.', '.', 'EOS']       -2.51
['i', 'should', 'be', '.', 'EOS']       -3.32
['i', 'can', 'be', 'happy', '.', 'EOS']       -3.69
['i', 'should', 'be', 'friends', '.', 'EOS']       -3.88
['i', 'll', 'be', 'happy', '.', 'EOS']       -4.48
['i', 'should', 'be', 'be', '.', 'EOS']       -4.69
['i', 'can', 'be', 'happy', 'EOS']       -4.78
['i', 'should', 'be', 'wrong', '.', 'EOS']       -4.85
['i', 'should', 'be', 'this', '.', 'EOS']       -4.86
['i', 'should', 'be', 'being', '.', 'EOS']       -5.11
['i', 'can', 'be', '.', '.', 'EOS']       -5.37
['i', 'should', 'be', 'friends', 'EOS']       -5.43
['i', 'should', 'be', 'EOS']       -5.44
['i', 'should', 'be', 'drink', '.', 'EOS']       -5.58
['i', 'll', 'be', 'happy', 'EOS']       -5.61
['i', 'should', 'be