In [1]:
import os
import pickle
import re
import sys

import fasttext
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence, PackedSequence
from torch.utils.data import DataLoader

from poutyne import set_seeds, Model

In [2]:
set_seeds(42)

In [3]:
cuda_device = 0
device = torch.device("cuda:%d" % cuda_device if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
class EmbeddingVectorizer:
    def __init__(self):
        """
        Embedding vectorizer
        """
        self.embedding_model = fasttext.load_model("../alexi/extraction/rnn99.fasttext")

    def __call__(self, address):
        """
        Convert address to embedding vectors
        :param address: The address to convert
        :return: The embeddings vectors
        """
        embeddings = []
        for word in address.split():
            embeddings.append(self.embedding_model[word])
        return embeddings
embedding_model = EmbeddingVectorizer()



In [5]:
from alexi.pagernn import PageDataset, simplify_tags
from pathlib import Path
ds = PageDataset(Path("../data/train/").glob("*.csv"), label_transform=simplify_tags)

In [6]:
batch_size = 16
lr = 0.001
dimension = ds.pages[0][0].shape[1]
num_layer = 1
bidirectional = True
hidden_size = 64

lstm_network = nn.LSTM(
    input_size=dimension,
    hidden_size=hidden_size,
    num_layers=num_layer,
    bidirectional=bidirectional,
    batch_first=True,
)

input_dim = hidden_size * 2 if bidirectional else 1
tag_dimension = max(ds.vocab.values()) + 1
fully_connected_network = nn.Linear(input_dim, tag_dimension)
print(dimension, input_dim, tag_dimension)

11 128 13


In [7]:
train_data, valid_data = torch.utils.data.random_split(ds, [0.9, 0.1])

In [8]:
def pad_collate_fn(batch):
    """
    The collate_fn that can add padding to the sequences so all can have
    the same length as the longest one.

    Args:
        batch (List[List, List]): The batch data, where the first element
        of the tuple are the word idx and the second element are the target
        label.

    Returns:
        A tuple (x, y). The element x is a tensor of packed sequence .
        The element y is a tensor of padded tag indices. The word vectors are
        padded with vectors of 0s and the tag indices are padded with -100s.
        Padding with -100 is done because of the cross-entropy loss and the
        accuracy metric ignores the targets with values -100.
    """

    # This gets us two lists of tensors and a list of integer.
    # Each tensor in the first list is a sequence of word vectors.
    # Each tensor in the second list is a sequence of tag indices.
    # The list of integer consist of the lengths of the sequences in order.
    sequences_vectors, sequences_labels, lengths = zip(
        *[
            (torch.FloatTensor(np.stack(seq_vectors)), torch.LongTensor(labels), len(seq_vectors))
            for (seq_vectors, labels) in sorted(batch, key=lambda x: len(x[0]), reverse=True)
        ]
    )

    lengths = torch.LongTensor(lengths)

    padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=0)
    pack_padded_sequences_vectors = pack_padded_sequence(
        padded_sequences_vectors, lengths.cpu(), batch_first=True
    )  # We pack the padded sequence to improve the computational speed during training

    padded_sequences_labels = pad_sequence(sequences_labels, batch_first=True, padding_value=-100)

    return pack_padded_sequences_vectors, padded_sequences_labels

In [9]:
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_collate_fn,
)
valid_loader = DataLoader(valid_data, batch_size=batch_size, collate_fn=pad_collate_fn)

In [10]:
vectors, labels = next(iter(train_loader))
labels

tensor([[   8,    2,    2,  ...,    2,    2,    2],
        [   5,    2,    2,  ..., -100, -100, -100],
        [   5,    2,    2,  ..., -100, -100, -100],
        ...,
        [   4,    2,    2,  ..., -100, -100, -100],
        [   4,    2,    2,  ..., -100, -100, -100],
        [   4,    2,    2,  ..., -100, -100, -100]])

In [11]:
class FullNetWork(nn.Module):
    def __init__(self, lstm_network, fully_connected_network):
        super().__init__()
        self.hidden_state = None

        self.lstm_network = lstm_network
        self.fully_connected_network = fully_connected_network

    def forward(self, pack_padded_sequences_vectors: PackedSequence):
        """
        Defines the computation performed at every call.
        """
        lstm_out, self.hidden_state = self.lstm_network(pack_padded_sequences_vectors)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

        tag_space = self.fully_connected_network(lstm_out)
        return tag_space.transpose(-1, 1)  # We need to transpose since it's a sequence


full_network = FullNetWork(lstm_network, fully_connected_network)

In [12]:
optimizer = optim.AdamW(full_network.parameters(), lr)
loss_function = nn.CrossEntropyLoss()

model = Model(
    full_network,
    optimizer,
    loss_function,
    batch_metrics=['accuracy'],
    device=device,
)

model.fit_generator(train_loader, valid_loader, epochs=100)

Epoch:   1/100 Train steps: 8 Val steps: 1 1.44s loss: 2.262218 acc: 47.396558 val_loss: 1.842793 val_acc: 86.662842
Epoch:   2/100 Train steps: 8 Val steps: 1 1.41s loss: 1.599728 acc: 85.504379 val_loss: 1.242491 val_acc: 86.873322
Epoch:   3/100 Train steps: 8 Val steps: 1 1.27s loss: 1.077600 acc: 85.212161 val_loss: 0.808188 val_acc: 86.892464
Epoch:   4/100 Train steps: 8 Val steps: 1 1.28s loss: 0.722276 acc: 85.614225 val_loss: 0.554010 val_acc: 86.969002
Epoch:   5/100 Train steps: 8 Val steps: 1 1.27s loss: 0.548160 acc: 85.026287 val_loss: 0.424166 val_acc: 87.198624
Epoch:   6/100 Train steps: 8 Val steps: 1 1.38s loss: 0.460802 acc: 84.737669 val_loss: 0.351254 val_acc: 87.543053
Epoch:   7/100 Train steps: 8 Val steps: 1 1.27s loss: 0.404213 acc: 85.495909 val_loss: 0.304790 val_acc: 88.136246
Epoch:   8/100 Train steps: 8 Val steps: 1 1.27s loss: 0.363777 acc: 86.799857 val_loss: 0.259612 val_acc: 90.604668
Epoch:   9/100 Train steps: 8 Val steps: 1 1.26s loss: 0.337801 

Epoch:  71/100 Train steps: 8 Val steps: 1 1.26s loss: 0.055486 acc: 98.017498 val_loss: 0.022193 val_acc: 99.559891
Epoch:  72/100 Train steps: 8 Val steps: 1 1.42s loss: 0.048630 acc: 98.588574 val_loss: 0.020154 val_acc: 99.483353
Epoch:  73/100 Train steps: 8 Val steps: 1 1.28s loss: 0.048580 acc: 98.289996 val_loss: 0.024494 val_acc: 99.425949
Epoch:  74/100 Train steps: 8 Val steps: 1 1.27s loss: 0.047179 acc: 98.586150 val_loss: 0.017982 val_acc: 99.521622
Epoch:  75/100 Train steps: 8 Val steps: 1 1.28s loss: 0.050885 acc: 98.269296 val_loss: 0.019824 val_acc: 99.502487
Epoch:  76/100 Train steps: 8 Val steps: 1 1.41s loss: 0.054173 acc: 97.989514 val_loss: 0.031833 val_acc: 99.330269
Epoch:  77/100 Train steps: 8 Val steps: 1 1.28s loss: 0.050529 acc: 98.455178 val_loss: 0.017366 val_acc: 99.502487
Epoch:  78/100 Train steps: 8 Val steps: 1 1.27s loss: 0.056244 acc: 97.893055 val_loss: 0.023064 val_acc: 99.483353
Epoch:  79/100 Train steps: 8 Val steps: 1 1.28s loss: 0.045725 

[{'epoch': 1,
  'time': 1.4401972999912687,
  'loss': 2.2622180871727053,
  'acc': 47.39655817835784,
  'val_loss': 1.8427928686141968,
  'val_acc': 86.662841796875},
 {'epoch': 2,
  'time': 1.4064530259929597,
  'loss': 1.5997280591775562,
  'acc': 85.50437889414385,
  'val_loss': 1.2424908876419067,
  'val_acc': 86.87332153320312},
 {'epoch': 3,
  'time': 1.2650642509979662,
  'loss': 1.077599920517157,
  'acc': 85.21216072523889,
  'val_loss': 0.8081880807876587,
  'val_acc': 86.89246368408203},
 {'epoch': 4,
  'time': 1.2774836199969286,
  'loss': 0.7222762575819472,
  'acc': 85.61422477280799,
  'val_loss': 0.5540096163749695,
  'val_acc': 86.96900177001953},
 {'epoch': 5,
  'time': 1.2725920679949922,
  'loss': 0.5481603633273732,
  'acc': 85.02628673206677,
  'val_loss': 0.4241655468940735,
  'val_acc': 87.19862365722656},
 {'epoch': 6,
  'time': 1.375001530002919,
  'loss': 0.46080178490354995,
  'acc': 84.73766888074638,
  'val_loss': 0.35125401616096497,
  'val_acc': 87.54305

In [13]:
test_ds = PageDataset(["../data/train/xx-2020-04-20-RGL-1289-Formation-CCE-adopte_1.csv"], predict=True)
len(test_ds)

2

In [14]:
import pandas as pd
test_df = pd.read_csv("../data/train/xx-2020-04-20-RGL-1289-Formation-CCE-adopte_1.csv")

In [24]:
test_ds[0][0].shape

torch.Size([382, 11])

In [25]:
pages = [page for _, page in test_df.groupby("page")]
pages[0].shape

(382, 16)

In [30]:
def pad_collate_fn_predict(batch):
    # Discard bogus (or unwanted) tags
    sequences_vectors, lengths = zip(
        *[
            (torch.FloatTensor(seq_vectors), len(seq_vectors))
            for seq_vectors, _ in sorted(batch, key=lambda x: len(x[0]), reverse=True)
        ]
    )

    lengths = torch.LongTensor(lengths)

    padded_sequences_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=0)
    pack_padded_sequences_vectors = pack_padded_sequence(padded_sequences_vectors, lengths.cpu(), batch_first=True)

    return pack_padded_sequences_vectors

predict_loader = DataLoader(
    test_ds,
    batch_size=batch_size,
    collate_fn=pad_collate_fn_predict,
)

In [31]:
predictions = model.predict_dataset(
    test_ds,
    batch_size=batch_size,
    collate_fn=pad_collate_fn_predict,
    concatenate_returns=False,
)

Prediction steps: 1 0.03s                          


In [32]:
predictions[0].shape

(2, 13, 383)

In [33]:
idx_predictions = []
for batch in predictions:
    idx_predictions.extend(batch.argmax(axis=1).tolist())

In [34]:
inv_vocab = dict((v, k) for k, v in ds.vocab.items())

In [35]:
tags_predictions = []
for page in idx_predictions:
    tags_predictions.append([inv_vocab.get(tag) for tag in page])
tags_predictions[0]

['B-Section',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Enumeration',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'B-Alinea',
 'I',
 'I',
 'I',
 'I',
 'I',

In [37]:
pages[0].assign(tag=tags_predictions[1][:len(pages[0])]).loc[:, ["tag", "text"]].to_numpy()

array([['I', 'RÈGLEMENT'],
       ['I', '1289'],
       ['I', 'concernant'],
       ['I', 'formation'],
       ['I', 'd’un'],
       ['I', 'Comité'],
       ['I', 'consultatif'],
       ['I', 'en'],
       ['I', 'environnement'],
       ['B-Article', 'Séance'],
       ['O', 'ordinaire'],
       ['O', 'du'],
       ['O', 'conseil'],
       ['O', 'municipal,'],
       ['O', 'tenue'],
       ['O', 'à'],
       ['O', 'huis'],
       ['O', 'clos'],
       ['O', 'le'],
       ['O', '20'],
       ['O', 'avril'],
       ['O', '2020'],
       ['O', 'à'],
       ['O', '19'],
       ['O', 'h,'],
       ['O', 'dans'],
       ['O', 'la'],
       ['O', 'salle'],
       ['O', 'du'],
       ['O', 'conseil'],
       ['O', 'municipal'],
       ['O', 'située'],
       ['I', 'au'],
       ['I', '1386,'],
       ['I', 'rue'],
       ['I', 'Dumouchel,'],
       ['O', 'Sainte-Adèle,'],
       ['O', 'lieu'],
       ['I', 'ordinaire'],
       ['O', 'des'],
       ['O', 'séances'],
       ['O', 'à'],
       ['O