In [1]:
import os
import random
import collections
import operator
import itertools
import pickle
import tqdm

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import simpleclock
import sklearn.metrics

In [3]:
import torch
import torchtext

## Dataset, iterators

In [38]:
data_path = os.path.join(os.path.abspath(''), "data_cine_cleaned.csv")

In [39]:
TEXT = torchtext.data.Field(tokenize = "spacy",
                            tokenizer_language="fr_core_news_sm",
                            include_lengths=True)
LABEL = torchtext.data.LabelField(dtype=torch.float, use_vocab=False, preprocessing=lambda x: float(x) / 5)
# labels are linearly rescaled to a 0-1 range
# todo: test if preprocessing data before isn't faster

In [40]:
dataset = torchtext.data.TabularDataset(path=data_path,
                                        format="CSV",
                                        fields={"critique": ("input", TEXT), "note": ("target", LABEL)})

In [41]:
data_train, data_test = dataset.split()

In [42]:
data_train, data_valid = data_train.split()

In [43]:
print(f"""training data: {len(data_train)} examples.
validation data: {len(data_valid)} examples.
test data: {len(data_test)} examples.""")

training data: 36057 examples.
validation data: 15453 examples.
test data: 22076 examples.


In [None]:
vectors = torchtext.vocab.Vectors("cc.fr.300.vec", os.path.join(os.path.expanduser("~"), "Downloads"))

In [44]:
VOCAB_MAX_SIZE = 50000
TEXT.build_vocab(data_train, max_size=VOCAB_MAX_SIZE, vectors=vectors)

In [17]:
DEVICE = torch.device("cuda")

In [45]:
BATCH_SIZE = 256

In [46]:
iter_train, iter_valid, iter_test = \
    torchtext.data.BucketIterator.splits(datasets=(data_train, data_valid, data_test),
                                         batch_size=BATCH_SIZE,
                                         device=DEVICE,
                                         sort_within_batch=True,
                                         sort_key=lambda example: len(example.input),
                                         sort=False)

## Model definition

In [7]:
class RNN(torch.nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, output_dim, dropout, bidirectional,
                 n_layers, pad_idx):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bidirectional = bidirectional
        num_dir = 2 if bidirectional else 1
        self.embedding = torch.nn.Embedding(n_vocab, embedding_dim, padding_idx=pad_idx)
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim,
                                 bidirectional=bidirectional,
                                 num_layers=n_layers)
        self.fc = torch.nn.Linear(hidden_dim * num_dir, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input_lengths):
        input, lengths = input_lengths
        torch.nn.utils.rnn.pack_padded_sequence(input, lengths)
        embedded = self.embedding(input)  # ((sent_len, batch), emb_dim)
        packed_output, (hidden, cell) = self.rnn(embedded)  # hidden: (num_layers * num_directions,
                                                            #          batch, hidden_size * num_directions)
        hidden = (torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1)
                  if self.bidirectional else hidden).squeeze(0)  # (batch, hidden_size * num_directions)
        return self.sigmoid(self.fc(self.dropout(hidden)))  # (batch, 1)

In [None]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
N_VOCAB = len(TEXT.vocab)

In [12]:
DEFAULT_PARAMS = {
    "n_vocab": N_VOCAB,
    "embedding_dim": 300,
    "hidden_dim": 256,
    "output_dim": 1,
    "dropout": 0.5,
    "bidirectional": True,
    "n_layers": 1,
    "pad_idx": PAD_IDX,
}

def default_model(**kwargs):
    _d = {}
    _d.update(DEFAULT_PARAMS)
    _d.update(kwargs)
    return RNN(**_d)

In [None]:
def pseudo_init(model, criterion, device=DEVICE, learn_embedding_param=True):
    model.embedding.weight.data.copy_(TEXT.vocab.vectors)
    model.embedding.weight.data[UNK_IDX] = torch.zeros(model.embedding_dim)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(model.embedding_dim)
    
    for name, param in model.named_parameters():
        if name == "embedding.weight":
            param.requires_grad = learn_embedding_param
    
    print("The model has {:,} trainable parameters"
         .format(sum(p.numel() for p in model.parameters() if p.requires_grad)))
    
    model = model.to(device)
    criterion = criterion.to(device)
    
    return model, criterion

In [34]:
def output_to_pred(output):
    return (output * 10).round() / 2

## Training utils

In [None]:
def accuracy(preds, y):
    correct = (preds == y).float()  # convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        output = model(batch.input).squeeze(1)
        loss = criterion(output, batch.target)
        loss.backward()
        optimizer.step()
        
        acc = accuracy(output_to_pred(output), batch.target * 5)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
      
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            output = model(batch.input).squeeze(1)
            loss = criterion(output, batch.target)
            acc = accuracy(output_to_pred(output), batch.target * 5)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
class TrainInfo:
    def __init__(self, valid={}, train={}):
        self.valid = collections.defaultdict(lambda: [])
        self.valid.update(valid)
        self.train = collections.defaultdict(lambda: [])
        self.train.update(train)
    
    def save(self, path):
        packed = {
            "valid": dict(self.valid),
            "train": dict(self.train),
        }
        with open(path, "wb") as f:
            pickle.dump(packed, f)
    
    @classmethod
    def load(cls, path):
        with open(path, "rb") as f:
            packed = pickle.load(f)
            return cls(valid=packed["valid"],
                       train=packed["train"])
    
    @staticmethod
    def _dict_to_repr(d):
        return dict(map(lambda k_v: (k_v[0], f"{len(k_v[1])} elements"), d.items()))
    
    def __repr__(self):
        return pprint.pformat({"valid": self._dict_to_repr(self.valid),
                     "train": self._dict_to_repr(self.train),})

In [None]:
def do_training(model, name, iter_train, iter_valid, optimizer, criterion, fun_train,
                fun_eval, n_epochs=100, train_info=None):

    clock = simpleclock.Clock.started()
    torch.cuda.empty_cache()
    train_info = train_info if train_info is not None else TrainInfo()
    best_valid_loss = min(train_info.valid["loss"]) if train_info.valid["loss"] else float("inf")
    
    for epoch in range(n_epochs):

        clock.elapsed_since_start.call()  # meh

        train_loss, train_acc = fun_train(model, iter_train, optimizer, criterion)
        valid_loss, valid_acc = fun_eval(model, iter_valid, criterion)
        
        is_best = valid_loss < best_valid_loss
        print("Epoch: {e:<3}. T, V acc: {train:.1f}%, {valid:.1f}%. Took {t:.2f}s."
             .format(e=epoch + 1,
                     train=100 * train_acc,
                     valid=100 * valid_acc,
                     t=clock.elapsed_since_last_call())
             + (" (+)" if is_best else "")
             )
        
        train_info.train["loss"].append(train_loss)
        train_info.valid["loss"].append(valid_loss)

        if is_best:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f"{name}.pt")

    clock.elapsed_since_start.print(f"Trained {name}, {n_epochs} epochs, for")
    return train_info

In [None]:
class TrainSet:
    def __init__(self, model, name, iter_train, iter_valid,
                 fun_optimizer, fun_criterion, fun_train, fun_eval,
                 device=DEVICE, n_epochs=100):
        self.model = model
        self.name = name
        self.iter_train = iter_train
        self.iter_valid = iter_valid
        self.fun_optimizer = fun_optimizer
        self.fun_criterion = fun_criterion
        self.fun_train = fun_train
        self.fun_eval = fun_eval
        self.n_epochs = n_epochs
        self.device = device
        
        self.optimizer = None
        self.criterion = None
        
    def init(self, learn_embedding_param=True):
        self.model, self.criterion = pseudo_init(self.model, self.fun_criterion(), self.device,
                                                 learn_embedding_param=learn_embedding_param)
        self.optimizer = self.fun_optimizer(self.model.parameters())
    
    def do_training(self):
        if self.optimizer is None or self.criterion is None:
            raise Exception("It looks like an init is needed: optimizer or criterion is None")
        return do_training(model=self.model,
                           name=self.name,
                           iter_train=self.iter_train,
                           iter_valid=self.iter_valid,
                           optimizer=self.optimizer,
                           criterion=self.criterion,
                           fun_train=self.fun_train,
                           fun_eval=self.fun_eval,
                           n_epochs=self.n_epochs)

## Saving/loading utils

In [None]:
def save_vocab_embedding(path, vocab, embedding):
    with open(path, "w") as f:
        for word, vector in tqdm.tqdm(zip(vocab.itos, embedding)):
            
            # skip words with unicode symbols
            if len(word) != len(word.encode()):
                continue
            
            # 'words' like " " or "\n" fail to be loaded
            if word.strip() == "":
                continue

            f.write(f"{word} {' '.join(str(e) for e in vector.tolist())}\n")

In [None]:
def load_context(model, field, path_model, path_vocab, cache_embeddings="cache_embeddings", device=DEVICE):
    
    _vectors = torchtext.vocab.Vectors(path_vocab, cache_embeddings)  # voir unk_init
    field.build_vocab(data_train, max_size=VOCAB_MAX_SIZE, vectors=_vectors)
    
    model.load_state_dict(torch.load(path_model))
    
    model.embedding.weight.data.copy_(field.vocab.vectors)
    
    model = model.to(device)
    
    iter_train, iter_valid, iter_test = \
        torchtext.data.BucketIterator.splits(datasets=(data_train, data_valid, data_test),
                                             batch_size=BATCH_SIZE,
                                             device=DEVICE,
                                             sort_within_batch=True,
                                             sort_key=lambda example: len(example.input),
                                             sort=False)
    
    return model, (iter_train, iter_valid, iter_test)

In [4]:
def save_for_pred(folder_path, model, itos):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    torch.save(model.state_dict(), os.path.join(folder_path, "model.pt"))
    with open(os.path.join(folder_path, "itos"), "wb") as f:
        pickle.dump(itos, f)

In [96]:
def load_for_pred(folder_path, model, device=DEVICE):
    model.load_state_dict(torch.load(os.path.join(folder_path, "model.pt")))
    model = model.to(DEVICE)
    with open(os.path.join(folder_path, "itos"), "rb") as f:
        itos = pickle.load(f)
    stoi = collections.defaultdict(lambda: 0)
    stoi.update(map(lambda t: t[::-1], enumerate(itos)))
    return model, itos, stoi

## Training

In [None]:
train_sets = []
N_EPOCHS = 10

for hidden_dim, n_layers in itertools.product([512], [4]):
    train_sets.append(TrainSet(
        model=default_model(hidden_dim=hidden_dim, n_layers=n_layers),
        name=f"rnn_hidden-{hidden_dim}_nlayers-{n_layers}_nepochs-{N_EPOCHS}",
        iter_train=iter_train,
        iter_valid=iter_valid,
        fun_optimizer=torch.optim.Adam,
        fun_criterion=torch.nn.MSELoss,
        fun_train=train,
        fun_eval=evaluate,
        n_epochs=N_EPOCHS
    ))

In [None]:
for train_set in train_sets:
    train_set.init()
    train_info = train_set.do_training()
    
    train_info.save(f"info_{train_set.name}.pickle")
    
    # plot loss data
    fig, ax = plt.subplots(figsize=(20, 8))
    ax_train = ax.plot(list(range(train_set.n_epochs)), train_info.train["loss"], label="train")
    ax_valid = ax.plot(list(range(train_set.n_epochs)), train_info.valid["loss"], label="valid")
    fig.legend()
    fig.suptitle("Validation loss")
    plt.show()
    
    # save
    save_vocab_embedding("vocab_emb", TEXT.vocab, model.embedding.weight)

## Peeking

In [6]:
import spacy

nlp = spacy.load('fr_core_news_sm')

In [112]:
def predict_tokens(tokens, model, stoi=TEXT.vocab.stoi, device=DEVICE):
    model.eval()
    idxs = [stoi[t] for t in tokens]
    inp = torch.LongTensor(idxs).reshape(-1, 1).to(device)
    output = output_to_pred(model((inp, torch.LongTensor([len(tokens)]))))
    return output.item()


def predict(sentence, model, stoi):
    return predict_tokens(list(map(str, nlp.tokenizer(sentence))), model, stoi)

class Predictor:
    def __init__(self, model, stoi, device=DEVICE):
        self.model = model
        self.stoi = stoi
        self.device = device
    
    def predict(self, sentence):
        return predict(sentence, self.model, self.stoi)
    
    def predict_tokens(self, tokens):
        return predict_tokens(tokens, self.model, self.stoi, self.device)
    
    def __call__(self, sentence):
        return self.predict(sentence)

In [113]:
model, itos, stoi = load_for_pred("testsave", default_model(hidden_dim=512, n_layers=4))
predictor = Predictor(model, stoi)

In [109]:
predictor("Un mauvais film."), \
predictor("Un bon film.")

(1.5, 4.0)

In [115]:
for example in random.sample(list(data_train), 3):
    tokens, note = example.input, float(example.target)
    print(" ".join(tokens))
    print(f"true) {note * 5} - {predictor.predict_tokens(tokens)} (pred")

Un peintre et un jardinier refont le monde ... zzzzz ... pas le cinéma !
true) 2.0 - 3.5 (pred
( ... ) vrai - faux voyeurisme aussi spectaculaire sur le papier qu' il ne l' est pas sur l' écran . Un comble pour un film baptisé Les Acteurs qui regroupe ce qui se fait de mieux en la matière . Blier a tout écrit , il ne leur reste plus rien à faire .
true) 3.0 - 2.0 (pred
Le film , qui parle de survie , de rédemption , d' humanité retrouvée , ménage des scènes de déminage sous très haute tension . De celles que l' on n' est pas près d' oublier .
true) 4.0 - 4.5 (pred


In [142]:
unks = sents = 0
for batch in iter_train:
    sentences = batch.input[0]
    unks += (sentences == 0).sum().item()
    sents += batch.batch_size
print(f"avg number of <unk> per sentence: {unks / sents}")

avg number of <unk> per sentence: 0.03793992844662618


In [145]:
print(itos[10000:10010])
print(TEXT.vocab.itos[10000:10010])

['allergiques', 'alourdie', 'alter', 'altérité', 'amoral', 'anciennes', 'angles', 'aperçoit', 'appels', 'apportant']
['durs', 'dynamiter', 'dystopie', 'débiles', 'débrouille', 'débutants', 'décadent', 'décapante', 'déchiré', 'décorative']


In [148]:
diff = set(itos).symmetric_difference(TEXT.vocab.itos)

In [149]:
len(diff)

18698

## Looser accuracies for evaluation

Is the model really doing bad if it predicts a 4.5 instead of a 5 ? There are at least two ways to allow for forgivable divergence with the test data :
* decrease notation's granularity, e.g. tranform the marks into good/bad, or good/bad/neutral.
* consider a prediction correct if it belongs to a 'small' interval containing the true value.

### Good/Neutral/Bad prediction

In [36]:
def normalized_to_3_way(pred_tensor, bad_treshold=.375, good_treshold=.625):
    """np array with values: 0: bad, 1: neutral, 2: good"""
    return np.digitize(pred_tensor.cpu().detach().numpy(), [bad_treshold, good_treshold])
    
def eval_accuracy_3w(model, iterator):
    n_examples = 0
    n_success = 0
    for batch in iterator:
        predictions = model(batch.input).squeeze(1)
        n_examples += len(batch)
        n_success += sum(normalized_to_3_way(predictions) == normalized_to_3_way(batch.target))
    return n_success / n_examples

def classif_report_3w(model, iterator):
    def preds_and_trues_to_array(predictions, true_targets):
        return np.concatenate([normalized_to_3_way(true_targets).reshape(-1, 1),
                               normalized_to_3_way(predictions).reshape(-1, 1)], axis=1)
    
    array = None
    for batch in iterator:
        predictions = model(batch.input).squeeze(1)
        
        if array is None:
            array = preds_and_trues_to_array(predictions, batch.target)
        else:
            array = np.concatenate([array,
                                    preds_and_trues_to_array(predictions, batch.target)], axis=0)
    print(sklearn.metrics.classification_report(array[:, 0],
                                                array[:, 1],
                                                labels=[0, 1, 2],
                                                target_names=["bad", "neutral", "good"]))

In [47]:
iterator = iter_test

classif_report_3w(model, iterator)


              precision    recall  f1-score   support

         bad       0.11      0.08      0.09      1651
     neutral       0.47      0.37      0.41      9738
        good       0.51      0.63      0.57     10687

    accuracy                           0.47     22076
   macro avg       0.36      0.36      0.36     22076
weighted avg       0.46      0.47      0.46     22076



### Fuzzy accuracy

In [None]:
def eval_accuracy_fuzzy(model, iterator, fuzziness=.1):
    n_examples = 0.
    n_success = 0.
    for batch in iterator:
        predictions = model(batch.input).squeeze(1)
        n_examples += len(batch)
        n_success += sum(torch.abs(predictions - batch.target) <= fuzziness).item()
    return n_success / n_examples

In [None]:
eval_accuracy_fuzzy(model, iter_test, fuzziness=.2)