In [None]:
!pip install icecream
from icecream import ic

Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting colorama>=0.3.9 (from icecream)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting executing>=0.3.1 (from icecream)
  Downloading executing-2.0.1-py2.py3-none-any.whl (24 kB)
Collecting asttokens>=2.0.1 (from icecream)
  Downloading asttokens-2.4.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.4.1 colorama-0.4.6 executing-2.0.1 icecream-2.1.3


Copy this notebook (File>Save a copy in Drive) and then work on your copy.
==
To send me your work: use the sharing menu (top-right of the window) to share it with timothee.m.r.bernard@gmail.com.
(I don't check this address very often, so, for questions, please use Moodle or my u-paris.fr address.)


Goal
==
We are about to train a *sequence-to-sequence model* to predict a paragraph of Gustave Flaubert's *Madame Bovary* given the preceding paragraph.
The model (at least in its first version) does not use words as units of text but characters.

*   The encoder part, based on a bidirectional LSTM, reads an input paragraph and turns it into a set of tensors that serves as initial state for the decoder part.
*   The decoder part is based on a (unidirectional) LSTM. The state of the LSTM is used to compute a probability distribution over the alphabet (including space and punctuation marks), and it is updated each time a character is predicted, by having the LSTM read this character's embedding.
*   The goal is to get the best model. It is part of the job to define what this means. It is also part of the job to explain to me how you got your best model.

This is an assignment.
==

*   Work in groups of two or three students.
*   Due date: December 4th (Monday), 23:59
*   Malus: -1 per day of delay.

Loading PyTorch is important.
==

In [None]:
# Imports PyTorch.
import torch

Remarks:
==
*   Follow the instructions very carefully. Do not ignore any comment.
*   Keep in mind all remarks given in previous TPs.
*   Comment your code (including the role of all functions and the type of their arguments). A piece of code not appropriately commented can be considered incorrect (irrespectively of whether it works or not).
*   Indicate the shape of each tensor that you define.
*   Document all the changes that you make. Any work that is not properly explained can be ignored.

Downloading the dataset
==
The dataset we are going to use is there: "https://www.gutenberg.org/cache/epub/14155/pg14155.txt"

We have to pre-process it a little bit in order to remove everything that is not part of the text and to split the actual text into paragraphs.

In [None]:
use_toy_dataset = False # If True, a toy dataset (see below) is used instead of the real one.

In [None]:
# Downloads the dataset into a temporary file and remembers the path of that file.
# `urllib.request` is imported explicitly: a bare `import urllib` does not guarantee that the `request` submodule is loaded.
import urllib.request

# `urlretrieve` returns a pair (local filename, headers); only the filename is needed afterwards.
tmp = urllib.request.urlretrieve("https://www.gutenberg.org/cache/epub/14155/pg14155.txt")
filename = tmp[0]
print(filename)

/tmp/tmp42x1hwae


In [None]:
# Prints the first 200 lines in the file with their line number.
# This shows that we have a little bit of preprocessing to do in order to clean the data.
# The encoding is given explicitly so that the result does not depend on the platform's default encoding.
with open(filename, encoding="utf-8") as f:
  # `zip` stops at the end of the file if it contains fewer than 200 lines.
  for i, line in zip(range(200), f):
    print(f"[{i}] {line}", end='')

[0] ﻿The Project Gutenberg eBook of Madame Bovary
[1]     
[2] This ebook is for the use of anyone anywhere in the United States and
[3] most other parts of the world at no cost and with almost no restrictions
[4] whatsoever. You may copy it, give it away or re-use it under the terms
[5] of the Project Gutenberg License included with this ebook or online
[6] at www.gutenberg.org. If you are not located in the United States,
[7] you will have to check the laws of the country where you are located
[8] before using this eBook.
[9] 
[10] Title: Madame Bovary
[11] 
[12] 
[13] Author: Gustave Flaubert
[14] 
[15] Release date: November 26, 2004 [eBook #14155]
[16]                 Most recently updated: December 18, 2020
[17] 
[18] Language: French
[19] 
[20] 
[21] 
[22] *** START OF THE PROJECT GUTENBERG EBOOK MADAME BOVARY ***
[23] 
[24] 
[25] 
[26] Produced by Ebooks libres et gratuits at http://www.ebooksgratuits.com
[27] 
[28] 
[29] 
[30] 
[31] 
[32] Gustave Flaubert
[33] MADAME BOVARY
[3

In [None]:
import re # Regular expression library
roman_regex = re.compile('^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$') # This regular expression matches Roman numerals but also the empty string.

EOP = '\n' # The end-of-line character will be used to mark the end of paragraphs.

paragraphs = [] # List[str]; one string per paragraph, each terminated by EOP. Note that each dialog line will be considered a separate paragraph.
paragraph_buffer = [] # List[str]; each element corresponds to a line in the original text file + an additional space if necessary.

# The encoding is given explicitly: the markers below contain non-ASCII characters, so the file must not be decoded with a platform-dependent default.
with open(filename, encoding="utf-8") as f:
  # We want to skip everything before the actual text of the novel.
  # The line "PREMIÈRE PARTIE" appears twice: in the table of content and then at the start of the first part of the actual text.
  # The following lines discard everything up to this second occurrence (included).
  skip = 2
  while(skip > 0):
    raw_line = f.readline()
    # 'readline' returns the empty string only at the end of the file (a blank line is "\n"); without this check, a missing marker would cause an infinite loop.
    if(raw_line == ""): raise ValueError('The line "PREMIÈRE PARTIE" was not found twice in the file.')
    if(raw_line.strip() == "PREMIÈRE PARTIE"): skip -= 1

  while(True):
    raw_line = f.readline()
    if(raw_line == ""): break # End of the file reached without the end marker; stops instead of looping forever.
    line = raw_line.strip()
    if("END OF THE PROJECT GUTENBERG EBOOK MADAME BOVARY" in line): break # End of the actual text.

    if(line == ""): # We've reached the end of a paragraph.
      if(len(paragraph_buffer) > 0):
        # The different lines that make up the paragraph are joined into a single string, terminated by EOP.
        paragraphs.append("".join(paragraph_buffer) + EOP)
        paragraph_buffer = []
      continue

    if(roman_regex.match(line)): continue # Ignores the lines that indicate the beginning of a chapter.
    if(line.endswith(" PARTIE")): continue # Ignores the lines that indicate the beginning of a part.

    if((len(paragraph_buffer) > 0) and (paragraph_buffer[-1][-1] != '-')): paragraph_buffer.append(' ') # Adds a space between consecutive lines except when the first one ends with "-" (e.g. if the word "pomme-de-terre" is split with "pomme-de-" at the end of a line and "terre" at the beginning of the next, we do not want to join the two lines with a space).
    paragraph_buffer.append(line)

  # Keeps the last paragraph when the text stops without a blank line after it.
  if(len(paragraph_buffer) > 0):
    paragraphs.append("".join(paragraph_buffer) + EOP)
    paragraph_buffer = []

print(f"{len(paragraphs)} paragraphs read.")
for paragraph in paragraphs[:3]: print(paragraph, end='') # Slicing avoids an IndexError if fewer than 3 paragraphs were read.

2995 paragraphs read.
Nous étions à l'Étude, quand le Proviseur entra, suivi d'un nouveau habillé en bourgeois et d'un garçon de classe qui portait un grand pupitre. Ceux qui dormaient se réveillèrent, et chacun se leva comme surpris dans son travail.
Le Proviseur nous fit signe de nous rasseoir; puis, se tournant vers le maître d'études:
-- Monsieur Roger, lui dit-il à demi-voix, voici un élève que je vous recommande, il entre en cinquième. Si son travail et sa conduite sont méritoires, il passera dans les grands, où l'appelle son âge.


Here, we define a toy dataset on which your model, if correctly implemented, should be able to learn more easily.

In [None]:
if use_toy_dataset:
  import random, string

  # The toy corpus replaces the real one. Each paragraph is a run of one repeated character,
  # followed by an instruction stating which character (and how many of it) makes up the next paragraph.
  paragraphs = []

  characters = list(string.ascii_lowercase + string.ascii_uppercase + "_-/\'[]()")
  random.shuffle(characters)

  # Content of the very first paragraph: between 1 and 10 occurrences of "a".
  a, k = "a", random.randint(1, 10)
  paragraph = a * k

  for _ in range(100): # 100 passes over the (reshuffled) alphabet
    random.shuffle(characters)
    for a in characters:
      k = random.randint(1, 16)
      # Closes the current paragraph with the instruction that describes the next one.
      paragraphs.append(paragraph + f"? Now, please write {k} {a}.{EOP}")
      # The next paragraph starts with what has just been requested.
      paragraph = a * k

  print(f"{len(paragraphs)} paragraphs generated.")

  print(paragraphs[:10])

In [None]:
import collections
import numpy as np

# Counts how many times each character occurs in the dataset (all paragraphs taken together).
char_counts = collections.defaultdict(int)
for char in "".join(paragraphs): char_counts[char] += 1

print(f"{len(char_counts)} different characters found in the dataset.")
# Shows each character with its frequency, most frequent first (ties keep their order of first appearance).
print(sorted(char_counts.items(), key=(lambda item: -item[1])))


94 different characters found in the dataset.
[(' ', 109336), ('e', 76369), ('a', 44997), ('s', 42972), ('t', 38893), ('i', 38322), ('n', 35099), ('r', 34314), ('l', 33711), ('u', 32914), ('o', 27415), ('d', 19187), ('c', 14786), ('m', 14638), ('p', 13790), (',', 12378), ('v', 8441), ('é', 8263), ("'", 7451), ('.', 6225), ('b', 5519), ('q', 5455), ('f', 5406), ('h', 5386), ('g', 4704), ('-', 4243), ('\n', 2995), ('à', 2722), ('x', 2057), ('j', 1728), ('è', 1644), ('y', 1619), ('!', 1512), ('E', 1477), (';', 1425), ('ê', 1188), ('L', 981), ('C', 945), ('I', 769), ('M', 743), ('z', 674), ('A', 543), ('?', 530), (':', 480), ('ç', 470), ('B', 427), ('â', 410), ('P', 394), ('î', 327), ('R', 319), ('D', 313), ('O', 301), ('S', 298), ('ô', 296), ('ù', 293), ('H', 270), ('û', 241), ('Q', 237), ('J', 233), ('T', 211), ('V', 181), ('N', 155), ('U', 122), ('«', 120), ('»', 112), ('À', 84), ('F', 84), ('Y', 80), ('_', 64), ('G', 62), ('(', 55), (')', 55), ('ï', 37), ('É', 25), ('k', 16), ('1', 16)

In [None]:
# Here you have to build a dictionary 'char_vocabulary' that assigns an integer id to each character, along with a list/array 'id_to_char' that implements the reverse mapping.
#################
# Ids follow the order in which characters were first encountered when counting them.
id_to_char = list(char_counts) # List[str]; position = id
char_vocabulary = dict(zip(id_to_char, range(len(id_to_char)))) # Dict[str, int]; character -> id
#################

In [None]:
EOP_id = char_vocabulary[EOP] # Id for the end-of-paragraph symbol

print(char_vocabulary)
print(id_to_char)
print(f"EOP_id = {EOP_id}")

# Here you have to implement a test that proves that your implementations of 'char_vocabulary' and 'id_to_char' are consistent.
#################
# The two mappings are consistent if and only if they are inverses of each other.
# Assertions are used (rather than flags) so that a single inconsistency makes the cell fail loudly.
assert len(char_vocabulary) == len(id_to_char), f"Size mismatch: {len(char_vocabulary)} entries in char_vocabulary vs {len(id_to_char)} in id_to_char"
print(f"Both mappings have the same number of elements (which is {len(id_to_char)})")
print()

# id -> char -> id must give back the initial id.
for char_id, char in enumerate(id_to_char):
  assert char_vocabulary.get(char) == char_id, f"{char!r} has id {char_id} in id_to_char but {char_vocabulary.get(char)} in char_vocabulary"
print("All elements from id_to_char are in char_vocabulary")

# char -> id -> char must give back the initial character.
for char, char_id in char_vocabulary.items():
  assert (0 <= char_id < len(id_to_char)) and (id_to_char[char_id] == char), f"{char!r} (id {char_id}) is missing from id_to_char"
print("All elements from char_vocabulary are in id_to_char")
#################

{'N': 0, 'o': 1, 'u': 2, 's': 3, ' ': 4, 'é': 5, 't': 6, 'i': 7, 'n': 8, 'à': 9, 'l': 10, "'": 11, 'É': 12, 'd': 13, 'e': 14, ',': 15, 'q': 16, 'a': 17, 'P': 18, 'r': 19, 'v': 20, 'h': 21, 'b': 22, 'g': 23, 'ç': 24, 'c': 25, 'p': 26, '.': 27, 'C': 28, 'x': 29, 'm': 30, 'è': 31, '\n': 32, 'L': 33, 'f': 34, ';': 35, 'î': 36, ':': 37, '-': 38, 'M': 39, 'R': 40, 'j': 41, 'S': 42, 'ù': 43, 'â': 44, 'z': 45, 'I': 46, 'Q': 47, 'û': 48, 'ê': 49, 'O': 50, 'y': 51, 'k': 52, 'ï': 53, 'E': 54, 'T': 55, 'U': 56, 'D': 57, '!': 58, '(': 59, ')': 60, 'B': 61, '?': 62, '_': 63, 'G': 64, '1': 65, '8': 66, '2': 67, 'A': 68, 'À': 69, 'H': 70, 'ô': 71, 'V': 72, 'Ê': 73, '«': 74, '»': 75, 'Y': 76, 'F': 77, 'J': 78, 'ë': 79, 'W': 80, 'X': 81, '0': 82, '5': 83, '7': 84, '9': 85, '6': 86, '3': 87, 'w': 88, 'Î': 89, 'Ç': 90, '4': 91, 'ü': 92, '°': 93}
['N', 'o', 'u', 's', ' ', 'é', 't', 'i', 'n', 'à', 'l', "'", 'É', 'd', 'e', ',', 'q', 'a', 'P', 'r', 'v', 'h', 'b', 'g', 'ç', 'c', 'p', '.', 'C', 'x', 'm', 'è', '

modified below

In [None]:
from typing import List
# Turns a list of lists of ids into a list of strings.
# Do not forget that an occurrence of EOP means that the paragraph ends here.
#
# 'ids': a sequence of sequences of character ids. Each id is converted with 'int', so plain Python integers
#        as well as the rows of an integer PyTorch matrix are accepted.
# Returns: one string per sequence of 'ids'; each string stops right before the first occurrence of 'EOP_id' (if any).
# Relies on the global variables 'id_to_char' and 'EOP_id'.
def ids_to_texts(ids:List[List[int]]) -> List[str]:
  # Here you have to turn each list of character ids of 'ids' into a string and then return all strings as a list.
  #################
  texts = []
  for char_id_list in ids: # Iterates over all sequences of ids.
    chars = [] # Characters decoded so far for the current sequence
    for char_id in char_id_list: # Iterates over the ids of the sequence.
      char_id = int(char_id) # Uniform handling of Python integers and 0-dimensional tensors
      if(char_id == EOP_id): break # Whatever follows the end-of-paragraph symbol (e.g. padding) is ignored.
      chars.append(id_to_char[char_id]) # Decodes the id.
    texts.append("".join(chars))
  return texts
  #################

TypeError: ignored

In [None]:
# Round trip: encodes two strings character by character, then decodes them back.
ps = ["Bonjour.", "Comment allez vous ?"]
ids = [list(map(char_vocabulary.__getitem__, p)) for p in ps]
print(ids)
print(ids_to_texts(ids))
print(f"'ids_to_texts(ids) == ps' should be True: {ids_to_texts(ids) == ps}")

In [None]:
# Same round trip as before, but the first sequence is followed by the end-of-paragraph id and padding-like ids, all of which must be ignored when decoding.
ps = ["Bonjour.", "Comment allez vous ?"]
ids = [list(map(char_vocabulary.__getitem__, p)) for p in ps]
ids[0] += [EOP_id] + [EOP_id + 1] * 2 # With the end-of-paragraph token id and additional (padding-like) stuff for the first string.
print(ids)
print(ids_to_texts(ids))
print(f"'ids_to_texts(ids) == ps' should be True: {ids_to_texts(ids) == ps}") # If you have a problem here, remember that EOP indicates the end of the text (this might be related to your problem).

Batch generator
==

In [None]:
# Defines a class of objects that produce batches from the dataset.
# A training instance is composed of a pair of consecutive paragraphs. The goal will be to predict the second given the first.
# TODO: (Possible improvement: As is, ends of chapter are completely ignored: the last paragraph of a chapter and the first of the following chapter form a training instance. We might want to predict the end of the chapter instead, or simply remove these pairs from the dataset.)
class BatchGenerator:
  # 'paragraphs': list of strings; the first 90% are used as the 'train' split, the rest as the 'dev' split.
  # 'char_vocabulary': dictionary that maps each character to its integer id.
  def __init__(self, paragraphs, char_vocabulary):
    tr = int(len(paragraphs)*0.9) # Index of the first paragraph of the dev split
    self.train_paragraphs = paragraphs[:tr]
    self.dev_paragraphs = paragraphs[tr:]

    print(f" ** train ({len(self.train_paragraphs)}) & dev ({len(self.dev_paragraphs)}) split generated ** ")

    self.char_vocabulary = char_vocabulary # Dictionary
    self.padding_idx = len(char_vocabulary) # The padding id is the first integer that is not used as a character id.

  # Returns the list of paragraphs of the split 'split' ('train' or 'dev').
  # Raises a ValueError for any other value, instead of silently producing nothing.
  def _split_paragraphs(self, split):
    if split == 'train':
      return self.train_paragraphs
    if split == 'dev':
      return self.dev_paragraphs
    raise ValueError(f"Unknown split {split!r}: expected 'train' or 'dev'.")

  # Returns the number of training instances (i.e. of pairs of consecutive paragraphs).
  def length(self, split):
    return (len(self._split_paragraphs(split)) - 1)

  # Returns a random training batch (composed of pairs of consecutive paragraphs).
  # If `subset` is an integer, only a subset of the corpus is used. This can be useful when debugging the system.
  def get_batch(self, batch_size, split, subset=None):
    max_i = self.length(split) if(subset is None) else min(subset, self.length(split))
    paragraph_ids = np.random.randint(max_i, size=batch_size) # Randomly picks some paragraph ids.

    return self._ids_to_batch(paragraph_ids, split)

  # Builds the batch that corresponds to the instances 'paragraph_ids' (sequence of integers) of the split 'split'.
  # Returns a pair of integer matrices (first paragraphs, second paragraphs), of shapes (batch size, max length of the first paragraphs) and (batch size, max length of the second paragraphs).
  def _ids_to_batch(self, paragraph_ids, split):
    source = self._split_paragraphs(split)

    firsts = [[self.char_vocabulary[char] for char in source[paragraph_id]] for paragraph_id in paragraph_ids] # First paragraph of each pair
    seconds = [[self.char_vocabulary[char] for char in source[paragraph_id + 1]] for paragraph_id in paragraph_ids] # Second paragraph of each pair

    # Padding
    self.pad(firsts)
    self.pad(seconds)

    firsts = torch.tensor(firsts, dtype=torch.long) # Conversion to a tensor
    seconds = torch.tensor(seconds, dtype=torch.long) # Conversion to a tensor

    return (firsts, seconds)

  # Pads a list of lists (i.e. adds fake word ids so that all sequences in the batch have the same length, so that we can use a matrix to represent them).
  # In place
  def pad(self, sequences):
    max_length = max([len(s) for s in sequences])
    for s in sequences: s.extend([self.padding_idx] * (max_length - len(s)))

  # Returns a generator of training batches for a full epoch. (Note that this function is not used in the training loop implemented below. `get_batch` is used instead.)
  # If `subset` is an integer, only a subset of the corpus is used. This can be useful when debugging the system.
  def all_batches(self, batch_size, split, subset=None):
    max_i = self.length(split) if(subset is None) else min(subset, self.length(split))

    # Loop that generates all full batches (batches of size 'batch_size').
    i = 0
    while((i + batch_size) <= max_i):
      instance_ids = np.arange(i, (i + batch_size))
      yield self._ids_to_batch(instance_ids, split)
      i += batch_size

    # Possibly generates the last (not full) batch.
    if(i < max_i):
      instance_ids = np.arange(i, max_i)
      yield self._ids_to_batch(instance_ids, split)

  # Turns a list of arbitrary paragraphs into a prediction batch.
  # Returns an integer matrix of shape (number of paragraphs, max length).
  # Relies on the global variable 'EOP_id'.
  def turn_into_batch(self, paragraphs):
    firsts = []
    for paragraph in paragraphs:
      # Unknown characters are ignored (removed).
      tmp = [self.char_vocabulary[char] for char in paragraph if(char in self.char_vocabulary)]

      # Adds an end-of-paragraph character if necessary.
      # The emptiness check avoids an IndexError for an empty paragraph or a paragraph that contains only unknown characters.
      if((len(tmp) == 0) or (tmp[-1] != EOP_id)): tmp.append(EOP_id)

      firsts.append(tmp)

    self.pad(firsts)
    return torch.tensor(firsts, dtype=torch.long)

# Builds the batch generator for the whole corpus, then shows the number of instances of each split.
batch_generator = BatchGenerator(paragraphs, char_vocabulary)
print(batch_generator.length(split='train'))
print(batch_generator.length(split='dev'))

In [None]:
# Draws a random batch of 3 pairs of consecutive paragraphs from the dev split and shows it as text.
firsts, seconds = batch_generator.get_batch(batch_size=3, split='dev')
print(ids_to_texts(firsts))
print(ids_to_texts(seconds))

The model
==
For this model, we will not define a `forward` method, but two methods: `trainingLogits` and `predictionStrings`.

*    `trainingLogits` is used at training time, when each batch is split in two parts: input paragraphs and output paragraphs. This function outputs, for each output paragraph of the batch, a log-probability distribution (i.e. a vector of "logits") before each token and after the last one. These distributions depend on the encoding of the corresponding input paragraph. They will then be used to compute a loss value.
*    `predictionStrings` is used at prediction time, when each batch is only composed of input paragraphs. This function outputs, for each input paragraph, a string obtained by decoding the encoding of the paragraph.

(Don't forget to read carefully all comments and to make sure that you understand them.)

Here is a graphical representation of the architecture: https://moodle.u-paris.fr/mod/resource/view.php?id=648001
Before starting the implementation, make sure you understand it.

modified below

In [None]:
# Character-level sequence-to-sequence model: a bidirectional LSTM encodes the input paragraph, and its final states
# (after a small transformation) initialise a unidirectional LSTM that decodes the output paragraph.
class Model(torch.nn.Module):
  # 'size_vocabulary' (int) does not include a padding character, but does include the end-of-paragraph one.
  # 'EOP_id' (int): id of the end-of-paragraph character.
  # 'embedding_dim' (int): size of the character embeddings.
  # 'lstm_hidden_size' (int): hidden size of both LSTMs (per direction for the encoder).
  # 'lstm_layers' (int): number of layers of both LSTMs.
  # 'device': device to which all parameters are sent.
  def __init__(self, size_vocabulary, EOP_id, embedding_dim, lstm_hidden_size, lstm_layers, device='cpu'):
    super().__init__()

    self.device = device

    self.EOP_id = EOP_id # At prediction time, this index is used to stop the generation at the end of the paragraph.

    # Here you have to define:
    #################
    #  (i) an embedding layer 'self.char_embeddings' with 'torch.nn.Embedding' for the characters, including a padding embedding;
    self.char_embeddings = torch.nn.Embedding(num_embeddings = size_vocabulary+1, # One additional embedding for padding
                                              embedding_dim  = embedding_dim,
                                              padding_idx    = size_vocabulary)   # The padding id is the first id after the vocabulary.

    #  (ii) a bidirectional LSTM 'self.encoder_lstm' with a hidden size of 'lstm_hidden_size' and 'lstm_layers' layers (use batch_first=True);
    # Its final hidden and cell states are both of shape (2 * lstm_layers, batch_size, lstm_hidden_size).
    self.encoder_lstm = torch.nn.LSTM(input_size    = embedding_dim,    # The encoder reads character embeddings.
                                      hidden_size   = lstm_hidden_size,
                                      num_layers    = lstm_layers,
                                      batch_first   = True,             # Inputs and outputs have the batch as first dimension.
                                      bidirectional = True)             # The whole input paragraph is available at encoding time.

    #  (iv) a network 'self.decoder_initialiser' meant to turn the final hidden and cell states of the encoder into the initial hidden and cell states of the decoder;
    # It maps the concatenation of the two directions (size 2 * lstm_hidden_size) to the size expected by the decoder (lstm_hidden_size).
    # (Defined before the decoder so that the order in which the parameters are created is unchanged.)
    self.decoder_initialiser = torch.nn.Sequential(torch.nn.Linear(in_features  = 2 * lstm_hidden_size,
                                                                   out_features = lstm_hidden_size),
                                                   torch.nn.ELU())

    #  (iii) a unidirectional LSTM 'self.decoder_lstm' with a hidden size of 'lstm_hidden_size' and 'lstm_layers' layers (use batch_first=True);
    # Its output is of shape (batch_size, length, lstm_hidden_size); its hidden and cell states are of shape (lstm_layers, batch_size, lstm_hidden_size).
    self.decoder_lstm = torch.nn.LSTM(input_size    = embedding_dim,    # The decoder also reads character embeddings.
                                      hidden_size   = lstm_hidden_size,
                                      num_layers    = lstm_layers,
                                      batch_first   = True,
                                      bidirectional = False)            # Only the left context is available at decoding time.

    #  (v) a network 'self.distribution_nn' meant to turn the hidden state of the decoder at each step into the logits of a probability distribution over the vocabulary. The logits of a probability distribution are simply the log-probabilities (you might want to use torch.nn.LogSoftmax).
    self.distribution_nn = torch.nn.Sequential(torch.nn.Linear(in_features  = lstm_hidden_size,
                                                               out_features = size_vocabulary), # No logit for the padding id
                                               torch.nn.LogSoftmax(dim=-1))

    # Sends all parts to 'device', so that we can use a GPU.
    self.to(device)
    #################

  # This function encodes the input paragraphs and turns them into initial states for the decoder. It is used both at training and prediction time.
  # 'in_paragraphs' is a matrix (batch size, max in length) of character ids (Integer).
  # Returns a pair (initial hidden state, initial cell state); both tensors are of shape (num_layers, batch_size, lstm_hidden_size).
  # You might want to understand what is the output of PyTorch's LSTMs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
  def initStates(self, in_paragraphs):
    batch_size = in_paragraphs.size(0)
    num_layers = self.encoder_lstm.num_layers
    hidden_size = self.encoder_lstm.hidden_size

    in_char_embeddings = self.char_embeddings(in_paragraphs) # Shape: (batch_size, max length, embedding size)
    in_lengths = (in_paragraphs != self.char_embeddings.padding_idx).sum(dim=1) # Shape: (batch_size)
    in_char_embeddings = torch.nn.utils.rnn.pack_padded_sequence(input=in_char_embeddings, lengths=in_lengths.cpu(), batch_first=True, enforce_sorted=False) # Enables the biLSTM to ignore padding elements.

    # The input paragraphs are encoded; the final hidden and cell states of the network will be used to initialise the decoder after a little transformation.
    _, (h_n, c_n) = self.encoder_lstm(in_char_embeddings) # 'h_n' and 'c_n' are both of shape (num_layers * 2, batch_size, hidden_size)

    # Concatenates the left-to-right and right-to-left final hidden states of the biLSTM.
    h_n = h_n.view(num_layers, 2, batch_size, hidden_size) # The second dimension (of size 2) of this tensor corresponds to left-to-right (0) and right-to-left (1).
    bi_h_n = torch.cat([h_n[:,0], h_n[:,1]], dim=2) # Shape: (num_layers, batch_size, (2 * hidden_size))

    # Concatenates the left-to-right and right-to-left final cell states of the biLSTM.
    c_n = c_n.view(num_layers, 2, batch_size, hidden_size) # The second dimension (of size 2) of this tensor corresponds to left-to-right (0) and right-to-left (1).
    bi_c_n = torch.cat([c_n[:,0], c_n[:,1]], dim=2) # Shape: (num_layers, batch_size, (2 * hidden_size))

    # What should be the shape of the two tensors of the following pair? Answer: both are of shape (num_layers, batch_size, lstm_hidden_size), because the linear layer only changes the last dimension.
    return (self.decoder_initialiser(bi_h_n), self.decoder_initialiser(bi_c_n))

  # Training time: This function outputs the logits for each time step.
  # Because at training time, the output paragraph is known, there is no need to generate anything sequentially — all positions can be processed at the same time. In fact, there is a loop hidden in the call to the decoder LSTM, but you should not write any explicit loop here.
  # Do not forget the distribution for the first character.
  # 'in_paragraphs' is a matrix (batch size, max in length) of character ids (Integer).
  # 'out_paragraphs' is a matrix (batch size, max out length) of character ids (Integer) at training time. Its last column is never fed to the decoder (nothing has to be predicted after it).
  # Returns a tensor of log-probabilities of shape (batch size, max out length, vocabulary size): the distribution at position i is the prediction for the character at column i of 'out_paragraphs'.
  # You might want to understand what is the output of PyTorch's LSTMs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
  def trainingLogits(self, in_paragraphs, out_paragraphs):
    decoder_init_states = self.initStates(in_paragraphs) # These tensors are not only used to initialise the decoder but also (for the first tensor) to compute the probability distributions for the first character.

    # Feed a packed sequence to the decoder (use 'torch.nn.utils.rnn.pack_padded_sequence' and 'torch.nn.utils.rnn.pad_packed_sequence').
    # You don't need to implement a loop, because at training time, you know in advance the decisions of the system (i.e. the tokens that are generated).
    #################
    batch_size = out_paragraphs.size(0)
    max_out_length = out_paragraphs.size(1)

    # The distribution for the first character is computed from the initial hidden state of the last layer of the decoder.
    first_logits = self.distribution_nn(decoder_init_states[0][-1]).view(batch_size, 1, -1) # Shape: (batch_size, 1, vocabulary_size)

    # With at most one column, there is nothing to feed to the decoder (and packing sequences of length 0 is not possible).
    if(max_out_length <= 1): return first_logits[:, :max_out_length]

    # Teacher forcing: the expected characters are used as inputs of the decoder. The last column is left out, because the distribution computed after reading it would not correspond to any expected character.
    decoder_inputs = out_paragraphs[:, :-1] # Shape: (batch_size, max_out_length-1)
    decoder_embeddings = self.char_embeddings(decoder_inputs) # Shape: (batch_size, max_out_length-1, embedding size)
    decoder_lengths = (decoder_inputs != self.char_embeddings.padding_idx).sum(dim=1) # Shape: (batch_size)

    # Packs the embeddings, which enables the LSTM to ignore padding elements.
    packed_embeddings = torch.nn.utils.rnn.pack_padded_sequence(input=decoder_embeddings,
                                                                lengths=decoder_lengths.cpu(),
                                                                batch_first=True,
                                                                enforce_sorted=False)

    # All positions are processed in a single call; the final hidden and cell states are not needed.
    packed_outputs, _ = self.decoder_lstm(packed_embeddings, decoder_init_states)

    # Turns the packed outputs back into a regular tensor.
    # 'total_length' guarantees max_out_length-1 time steps even if the batch contains more padding than necessary; otherwise the result would be cut at the longest actual sequence and would no longer be aligned with 'out_paragraphs'.
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True, total_length=(max_out_length - 1)) # Shape: (batch_size, max_out_length-1, lstm_hidden_size)

    # Distribution for each output of the decoder
    logits = self.distribution_nn(outputs) # Shape: (batch_size, max_out_length-1, vocabulary_size)

    # Concatenates the distribution for the first character and the following ones.
    return torch.cat((first_logits, logits), dim=1) # Shape: (batch_size, max_out_length, vocabulary_size)
    #################

  # Prediction time: This function generates a text up to 'max_predicted_char' character long for each paragraph in the batch.
  # 'in_paragraphs' is a matrix (batch size, max in length) of character ids (Integer).
  # Returns one string per input paragraph (the conversion from ids to strings is done by the global function 'ids_to_texts').
  # You might want to understand what is the output of PyTorch's LSTMs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
  def predictionStrings(self, in_paragraphs, max_predicted_char=1000) -> List[str]:
    ids = [] # List[List[int]]; one list of predicted character ids per input paragraph
    batch_size = in_paragraphs.size(0)

    # Decode 'decoder_init_states' into a matrix of character ids (one line per input paragraph in the batch) and then convert it to strings of actual characters.
    # You will need to implement a loop at some point.
    # To work with probability distributions, you may use "torch.distributions.Categorical", but not necessarily.
    #################
    with torch.no_grad(): # No gradient is needed at prediction time.
      decoder_init_states = self.initStates(in_paragraphs) # These tensors are not only used to initialise the decoder but also (for the first tensor) to compute the probability distributions for the first character.

      # The first character is the most probable one according to the initial hidden state of the last layer.
      first_logits = self.distribution_nn(decoder_init_states[0][-1]) # Shape: (batch_size, vocabulary_size)
      first_tokens = torch.argmax(first_logits, dim=-1) # Shape: (batch_size)

      # The paragraphs are decoded one at a time (greedy decoding), each as a batch of size 1.
      for batch_index in range(batch_size):
        hidden_state = decoder_init_states[0][:, batch_index:(batch_index + 1), :].contiguous() # Shape: (num_layers, 1, lstm_hidden_size)
        cell_state = decoder_init_states[1][:, batch_index:(batch_index + 1), :].contiguous() # Shape: (num_layers, 1, lstm_hidden_size)
        last_token = first_tokens[batch_index].view(1, 1) # Shape: (1, 1)

        predicted = [last_token.item()]

        # The generation stops when the end-of-paragraph character is generated or when 'max_predicted_char' characters have been generated.
        # 'self.EOP_id' is used (rather than a global variable) so that the model only depends on its own configuration.
        while((len(predicted) < max_predicted_char) and (predicted[-1] != self.EOP_id)):
          # The decoder reads the embedding of the last predicted character and updates its states.
          output, (hidden_state, cell_state) = self.decoder_lstm(self.char_embeddings(last_token), (hidden_state, cell_state)) # 'output' shape: (1, 1, lstm_hidden_size)

          # The next character is the most probable one.
          last_token = torch.argmax(self.distribution_nn(output), dim=-1) # Shape: (1, 1)
          predicted.append(last_token.item())

        ids.append(predicted)

    # Turns the ids into texts.
    return ids_to_texts(ids)
    #################

# Small CPU model, only meant to check that the training and prediction methods run.
model = Model(
  size_vocabulary=len(char_vocabulary),
  EOP_id=EOP_id,
  embedding_dim=19,
  lstm_hidden_size=13,
  lstm_layers=7,
  device='cpu',
)

# Tests the training method.
# A batch that contains only one sentence with no padding.
in_paragraphs = torch.tensor([list(range(5)) + [batch_generator.padding_idx] * 0]).to(model.device)
out_paragraphs = in_paragraphs # The same matrix is used as input and as expected output.
model.trainingLogits(in_paragraphs, out_paragraphs)

# Tests the prediction methods.
batch = batch_generator.get_batch(batch_size=2, split='dev')
model.predictionStrings(batch[0].to(model.device), max_predicted_char=16)

In [None]:
# Tests the training method.
# A batch that contains only one sentence with no padding.
in_paragraphs = torch.tensor([list(range(5)) + [batch_generator.padding_idx] * 0]).to(model.device)
out_paragraphs = in_paragraphs # The same matrix is used as input and as expected output.
model.trainingLogits(in_paragraphs=in_paragraphs, out_paragraphs=out_paragraphs)

In [None]:
# Tests the training method (again).
# A batch that contains two sentences with some padding (more than necessary).
in_paragraphs = torch.tensor([
  list(range(5)) + [batch_generator.padding_idx] * 10,
  list(range(10)) + [batch_generator.padding_idx] * 5,
]).to(model.device)
out_paragraphs = in_paragraphs # The same matrix is used as input and as expected output.
model.trainingLogits(in_paragraphs=in_paragraphs, out_paragraphs=out_paragraphs)

In [None]:
# Tests the prediction methods.
# Only the first paragraphs of a random training batch are used: they are the inputs of the model.
batch = batch_generator.get_batch(batch_size=2, split='train')
model.predictionStrings(batch[0].to(model.device), max_predicted_char=16)

Training
==

In [None]:
# The model is trained on a GPU when one is available; otherwise it falls back to the CPU, so that this cell does not fail on CPU-only runtimes.
model = Model(size_vocabulary=len(char_vocabulary), EOP_id=EOP_id, embedding_dim=256, lstm_hidden_size=512, lstm_layers=1, device=('cuda' if torch.cuda.is_available() else 'cpu'))

import time

model.eval() # Tells Pytorch we are in evaluation/inference mode (can be useful if dropout is used, for instance).

# Training procedure
learning_rate = 0.002
momentum = 0.99
l2_reg = 0.0001 # Passed to the optimizer as 'weight_decay'.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=l2_reg) # Once the backward propagation has been done, call the 'step' method (with no argument) to update the parameters.
batch_size = 64 if(not use_toy_dataset) else 8
subset = None # Use an integer to train on a smaller portion of the training set, otherwise use None.
epoch_size = batch_generator.length('train') if(subset is None) else subset # In number of instances

nb_epoch = 20 if(not use_toy_dataset) else 50
epoch_id = 0 # Id of the current epoch
instances_processed = 0 # Number of instances trained on in the current epoch
epoch_loss = [] # Will contain the loss for each batch of the current epoch
time_0 = time.time() # Reference time, taken just before the training loop starts
while(epoch_id < nb_epoch):
  model.train() # Tells Pytorch we are in training mode (can be useful if dropout is used, for instance).

  model.zero_grad() # Makes sure the gradient is reinitialised to zero.

  batch = batch_generator.get_batch(batch_size, subset=subset, split = 'train')
  #print(ids_to_texts(batch[0])); print(ids_to_texts(batch[1]))
  in_paragraphs = batch[0].to(model.device)
  out_paragraphs = batch[1].to(model.device)

  # You have to (i) compute the prediction of the model, (ii) compute the loss, (iii) call "backward" on the loss and (iv) store the loss in "epoch_loss".
  # For the loss, use torch.nn.functional.nll_loss. Computes an average over all tokens of the batch, but do not take into account distribution logits that corresonds to padding characters. Read the documentation and be careful about the shape of your tensors.
  ###################
  preds = model.trainingLogits(in_paragraphs=in_paragraphs,
                               out_paragraphs=out_paragraphs)

  loss = torch.nn.functional.nll_loss(preds.transpose(1,2), out_paragraphs, ignore_index=batch_generator.padding_idx)

  epoch_loss.append(loss)

  loss.backward()
  ###################

  optimizer.step() # Updates the parameters.

  instances_processed += batch_size
  if(instances_processed > epoch_size):
    print(f"-- END OF EPOCH {epoch_id}.")
    print(f"Average loss: {sum(epoch_loss) / len(epoch_loss)}.")
    duration = time.time() - time_0
    print(f"{duration} s elapsed (i.e. {duration / (epoch_id + 1)} s/epoch)")

    # Example of generation
    batch = batch_generator.get_batch(1, subset=subset, split = 'dev')
    print(ids_to_texts(batch[0])) # Input paragraph
    print(model.predictionStrings(batch[0].to(model.device), max_predicted_char=512)) # Generated output paragraph.

    epoch_id += 1
    instances_processed -= epoch_size
    epoch_loss = []

In [None]:
# Switch between the toy dataset (True) and the real corpus (False) in the cells that test this flag.
# NOTE(review): the training cell earlier in the notebook already reads `use_toy_dataset`
# (batch size and number of epochs); on a fresh kernel it must be defined before that cell runs.
# Confirm it is also set earlier in the notebook.
use_toy_dataset = True

In [None]:
if not use_toy_dataset:
  # Real corpus: send ten copies of one hand-written prompt through the model as a single batch
  # and list the ten generated continuations.
  # NOTE(review): unlike the toy branch, this prompt is not terminated with EOP; confirm whether
  # turn_into_batch adds the end-of-paragraph marker itself.
  prompt = ["Charles Bovary sortit une bonne bouteille de vin et alla chercher des verres pour ses invités."] * 10
  batch = batch_generator.turn_into_batch(prompt)
  gen_texts = model.predictionStrings(batch.to(model.device), max_predicted_char=1024)

  print(prompt[0])
  print()
  for i, gen_text in enumerate(gen_texts):
    print(f"{i}: {gen_text}")
else:
  # Toy dataset: start from a fixed prompt and feed each generated paragraph back as the next prompt.
  prompt = ["AAAA. Now, please write 3 i." + EOP]  # alternative starting point: [paragraphs[0]]
  print(prompt[0])
  print(f"(Is this prompt in the training set? {prompt[0] in paragraphs})\n")

  for _ in range(10):
    batch = batch_generator.turn_into_batch(prompt)
    gen_texts = model.predictionStrings(batch.to(model.device), max_predicted_char=128)

    print(gen_texts[0])
    # The generated paragraph, closed with EOP, becomes the prompt of the next round.
    prompt = [gen_texts[0] + EOP]

If your system does not work as expected, check that you are using a sensible loss function, but also check that your implementation matches the architecture depicted in https://moodle.u-paris.fr/mod/resource/view.php?id=648001.

If you cannot get your model to work even on the toy dataset, then there must be a bug somewhere.

Read the remarks at the beginning of the TP again.

Once you are sure that your system is correctly implemented and generates texts that look a little bit like natural language, find ways to improve the system.
Here are some ideas (ordered arbitrarily):

*   Compute a measure that evaluates the performance of the model.
*   Split your dataset into a training and a development section, and use this split in a relevant way.
*   Implement beam decoding instead of greedy decoding.
*   Use other units of text instead of characters (ex: words, word-pieces).
*   Add more data to the dataset.
*   Use graphs to visualise the training process and the predictions.

Document in a text cell all of the changes that you make to the system and describe their impact (qualitatively **and** quantitatively).

_____________________________
- evaluation: ~BLEU~ / **ROUGE?** / combien de mots ne sont pas de vrais mots (pas dans le dictionnaire) (noter que BLEU a bien marché pour le toy set mais pas pour Flaubert - score > 1.0)
- ~split et tuner sur dev (early stopping~ + random search)
- beam https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/
- tokenization en mots ou BPE pour les subwords (HuggingFace comme d'hab)
- ajouter qqch d'autres de Flaubert je sais pas
- il y a un todo dans le batch generator - fin de chapitre

# Evaluation

In [None]:
!pip install -q sacrebleu

In [None]:
from sacrebleu.metrics import BLEU

# Module-level BLEU scorer used by `evaluate`. tokenize='char' selects sacreBLEU's character-level
# tokenizer, in line with the model, which works on characters rather than words.
bleu = BLEU(tokenize='char')

def evaluate(model, eval_batch_size, split, subset):
  """Compute the corpus-level BLEU score of `model` on one split of the dataset.

  Every batch yielded by `batch_generator.all_batches` is decoded with
  `model.predictionStrings`; the generated paragraphs of all batches are then
  scored together against the expected output paragraphs with the module-level
  `bleu` scorer.

  Args:
    model: model exposing `predictionStrings` and a `device` attribute.
    eval_batch_size: number of instances per evaluation batch.
    split: name of the split, forwarded to `batch_generator.all_batches`
      (the notebook uses 'train' and 'dev').
    subset: forwarded to `batch_generator.all_batches`; None means no subset.

  Returns:
    The BLEU score (`.score` of the sacreBLEU result) as a float, or 0.0 when
    the generator yields no instance.
  """
  predicted = []  # one generated paragraph per evaluated instance
  golden = []     # the matching expected paragraphs, in the same order

  # Evaluation never needs gradients.
  with torch.no_grad():
    for batch in batch_generator.all_batches(eval_batch_size, split=split, subset=subset):
      # Accumulate over all batches (do not overwrite), so that the score covers the whole split.
      predicted.extend(model.predictionStrings(batch[0].to(model.device)))
      # Assumes ids_to_texts returns one string per row of ids, as it does for predictionStrings.
      # NOTE(review): the reference rows are decoded as they come out of the batch; confirm that
      # ids_to_texts drops padding (and EOP) so that references are comparable to the predictions.
      golden.extend(ids_to_texts(batch[1].tolist()))

  if not predicted:
    # Nothing to score (empty split or subset).
    return 0.0

  # sacreBLEU expects a list of reference streams, each aligned with the hypotheses:
  # here there is exactly one reference per hypothesis, hence the single stream.
  return bleu.corpus_score(predicted, [golden]).score


In [None]:
# Sanity check of `evaluate`: score the current model on the whole train split (no subset),
# in batches of 64, and show the resulting BLEU score.
score = evaluate(model, eval_batch_size=64, split='train', subset=None)
print(score)

# Wrapping everything into suitable form

In [None]:
import time
import matplotlib.pyplot as plt
def trainer(nb_epoch=10, learning_rate=0.002,
            momentum=0.99, l2_reg=0.0001, batch_size=16,
            early_stopping=True,
            plot=True,
            device = 'cuda'
            ):
  """Train a fresh Model with SGD and report loss and BLEU after every epoch.

  A new 3-layer model (embedding size 128, hidden size 256) is created and
  trained on batches drawn from the 'train' split. At the end of each epoch the
  average training loss is recorded, the model is scored with `evaluate` on 10
  instances of the train and dev splits, and one generation example is printed.

  Args:
    nb_epoch: maximum number of epochs.
    learning_rate: learning rate of the SGD optimizer.
    momentum: momentum of the SGD optimizer.
    l2_reg: weight decay (L2 regularisation) of the SGD optimizer.
    batch_size: number of instances per training batch.
    early_stopping: if True, stop once the average training loss of an epoch
      has failed to improve for `patience` (4) consecutive epochs.
    plot: if True, plot the training loss and the BLEU scores per epoch.
    device: device name passed to Model (e.g. 'cuda' or 'cpu').

  Returns:
    None. Results are printed and, optionally, plotted.
  """
  model = Model(size_vocabulary=len(char_vocabulary), EOP_id=EOP_id, embedding_dim=128, lstm_hidden_size=256, lstm_layers=3, device=device)

  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=l2_reg) # Once the backward propagation has been done, call the 'step' method (with no argument) to update the parameters.
  subset = None # Use an integer to train on a smaller portion of the training set, otherwise use None.
  epoch_size = batch_generator.length(split='train') if(subset is None) else subset # In number of instances

  # ----------------------------------- # variables used for early stopping # ----------------------------------- #

  patience = 4                         # how many epochs without improvement we are ready to tolerate
  nb_epochs_no_improvement = 0         # how many consecutive epochs without improvement there have been
  best_loss = float('inf')             # best (smallest) average epoch loss so far

  # ------------------------------------------------------------------------------------------------------------- #

  epoch_id = 0 # Id of the current epoch
  instances_processed = 0 # Number of instances trained on in the current epoch
  time_0 = time.time()

  epoch_losses = []   # loss of each batch of the current epoch (Python floats)
  train_losses = []   # average training loss, one value per completed epoch
  train_scores = []   # BLEU on the train split, one value per completed epoch
  dev_scores = []     # BLEU on the dev split, one value per completed epoch

  while(epoch_id < nb_epoch):
    model.train() # Tells Pytorch we are in training mode (can be useful if dropout is used, for instance).

    model.zero_grad() # Makes sure the gradient is reinitialised to zero.

    batch = batch_generator.get_batch(batch_size, subset=subset, split = 'train')
    in_paragraphs = batch[0].to(model.device)
    out_paragraphs = batch[1].to(model.device)

    preds = model.trainingLogits(in_paragraphs=in_paragraphs,
                                 out_paragraphs=out_paragraphs)

    # nll_loss wants the class dimension in position 1, hence the transpose; padding targets are ignored.
    loss = torch.nn.functional.nll_loss(preds.transpose(1,2), out_paragraphs, ignore_index=batch_generator.padding_idx)

    epoch_losses.append(loss.item())

    loss.backward()

    optimizer.step() # Updates the parameters.

    instances_processed += batch_size
    if(instances_processed > epoch_size):
      average_loss = sum(epoch_losses) / len(epoch_losses)
      print(f"\n ---- END OF EPOCH {epoch_id} ----")
      print(f"Average loss: {average_loss}.")
      train_losses.append(average_loss)

      # ------------------------------------------------ evaluation ------------------------------------------------ #

      model.eval()
      with torch.no_grad():
        score_train = evaluate(model, eval_batch_size = 128, split = "train", subset = 10)
        print(f"Score on the train set: {score_train}.")
        train_scores.append(score_train)

        score_dev = evaluate(model, 128, split = "dev", subset = 10)
        print(f"Score on the dev set: {score_dev}.")
        dev_scores.append(score_dev)

        # Example of generation
        print(' * example of the output * ')
        batch = batch_generator.get_batch(1, subset=subset, split='train')
        print(ids_to_texts(batch[0])) # Input paragraph
        print(model.predictionStrings(batch[0].to(model.device), max_predicted_char=16)) # Generated output paragraph.

      # ------------------------------------- # early stopping implementation # ------------------------------------- #

      if early_stopping:
        # Compare the average loss of the epoch (not the loss of its last batch, which is very noisy).
        if average_loss < best_loss:
          best_loss = average_loss
          nb_epochs_no_improvement = 0
        else:
          nb_epochs_no_improvement += 1
          if nb_epochs_no_improvement == patience:
            print(f'Early stopping at epoch {epoch_id}')
            print(f"Smallest loss achieved is {best_loss}")
            break

      # ------------------------------------------------------------------------------------------------------------- #

      epoch_id += 1
      instances_processed -= epoch_size
      epoch_losses = []   # start the next epoch with an empty list so that averages are per epoch

  duration = time.time() - time_0   # elapsed wall-clock time, in seconds
  print(f"\n This config took {duration:.1f} s to train \n")

  # ------------------- # plotting the loss and the scores on the dev and train sets during training # ------------------- #

  if plot and train_losses:
    # One point per completed epoch, whether training ran to the end or stopped early.
    epochs = list(range(1, len(train_losses) + 1))

    fig, (ax1, ax2) = plt.subplots(2)
    ax1.plot(epochs, train_losses, label='Train Losses', marker='o')
    ax2.plot(epochs, train_scores, label='Train Scores', marker='o')
    ax2.plot(epochs, dev_scores, label='Dev Scores', marker='o')

    ax1.set(ylabel = 'Loss', xticks = epochs)
    ax2.set(xlabel = 'Epochs', ylabel = 'BLEU score', xticks = epochs)
    ax1.legend()
    ax2.legend()

    plt.show()

In [None]:
# used for toy corpus

#  def trainer(nb_epoch=10, learning_rate=0.002,
#             momentum=0.99, l2_reg=0.0001, batch_size=16,
#             early_stopping=True,
#             plot=True,
#             device = 'cuda'
#             ):

#   model = Model(size_vocabulary=len(char_vocabulary), EOP_id=EOP_id, embedding_dim=128, lstm_hidden_size=256, lstm_layers=3, device=device)

In [None]:
# Launch a training run of at most 20 epochs with batches of 16 instances and early stopping disabled;
# every other hyperparameter keeps the default value declared in `trainer`.
trainer(nb_epoch=20, batch_size=16, early_stopping=False)