In [9]:
import pandas as pd
import os

In [10]:
import random
import matplotlib.pyplot as plt
import collections
import numpy as np
import operator
import simpleclock
import sklearn.metrics

In [12]:
import torch
import torchtext

## Dataset, iterators

In [11]:
data_path = os.path.join(os.path.abspath(''), "data_cine_cleaned.csv")

In [13]:
TEXT = torchtext.data.Field(tokenize = "spacy",
                            tokenizer_language="fr_core_news_sm",
                            include_lengths=True)
LABEL = torchtext.data.LabelField(dtype=torch.float, use_vocab=False)

In [14]:
dataset = torchtext.data.TabularDataset(path=data_path,
                                        format="CSV",
                                        fields={"critique": ("critique", TEXT), "note": ("note", LABEL)})

In [15]:
data_train, data_test = dataset.split()

In [16]:
data_train, data_valid = data_train.split()

In [17]:
print(f"""training data: {len(data_train)} examples.
validation data: {len(data_valid)} examples.
test data: {len(data_test)} examples.""")

training data: 36057 examples.
validation data: 15453 examples.
test data: 22076 examples.


In [18]:
vectors = torchtext.vocab.Vectors("cc.fr.300.vec", os.path.join(os.path.expanduser("~"), "Downloads"))

In [19]:
VOCAB_MAX_SIZE = 50000
TEXT.build_vocab(data_train, max_size=VOCAB_MAX_SIZE, vectors=vectors)

In [20]:
device = torch.device("cuda")

In [21]:
BATCH_SIZE = 64

In [22]:
iter_train, iter_valid, iter_test = \
    torchtext.data.BucketIterator.splits(datasets=(data_train, data_valid, data_test),
                                         batch_size=BATCH_SIZE,
                                         device=device,
                                         sort_within_batch=True,
                                         sort_key=lambda example: len(example.critique),
                                         sort=False)

## Model definition and training

In [23]:
class RNN(torch.nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, output_dim, dropout, pad_idx):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(n_vocab, embedding_dim, padding_idx = pad_idx)
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim,
                                 bidirectional=True,
                                 num_layers=2)
        self.fc = torch.nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, input, lengths):
        torch.nn.utils.rnn.pack_padded_sequence(input, lengths)
        embedded = self.embedding(input)  # ((sent_len, batch_size), emb_dim)
        packed_output, (hidden, cell) = self.rnn(embedded)  # hidden: (num_layers * num_directions, batch, hidden_size)
        return self.sigmoid(self.fc(self.dropout(
            torch.cat([hidden[-2, :, :], hidden[-1, :, :]], dim=1).squeeze(0))))  # (batch_size, 1)

In [24]:
N_VOCAB = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(N_VOCAB, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, dropout=DROPOUT, pad_idx=PAD_IDX)

In [25]:
print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters')

The model has 17720857 trainable parameters


In [26]:
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [27]:
criterion = torch.nn.MSELoss()

In [28]:
model = model.to(device)
criterion = criterion.to(device)

In [29]:
optimizer = torch.optim.Adam(model.parameters())

In [30]:
def output_to_pred(output):
    return (output * 10).round() / 2

In [142]:
def accuracy(preds, y):
    correct = (preds == y).float()# convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [32]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        padded_sequences, lengths = batch.critique
        output = model(padded_sequences, lengths).squeeze(1)
        loss = criterion(output, batch.note / 5.)
        acc = accuracy(output_to_pred(output), batch.note)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
      
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            padded_sequences, lengths = batch.critique
            output = model(padded_sequences, lengths).squeeze(1)
            loss = criterion(output, batch.note / 5.)
            acc = accuracy(output_to_pred(output), batch.note)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [179]:
N_EPOCHS = 10

best_valid_loss = float('inf')

clock = simpleclock.Clock.started()

for epoch in range(N_EPOCHS):

    clock.elapsed_since_start.call()  # meh
    
    train_loss, train_acc = train(model, iter_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, iter_valid, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    clock.elapsed_since_last_call.print(f"Epoch: {epoch+1:02} | Epoch Time")
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
clock.elapsed_since_start.print("Total time")

Epoch: 01 | Epoch Time: 15.98s
	Train Loss: 0.000 | Train Acc: 99.10%
	 Val. Loss: 0.034 |  Val. Acc: 39.71%
Epoch: 02 | Epoch Time: 15.70s
	Train Loss: 0.000 | Train Acc: 98.94%
	 Val. Loss: 0.033 |  Val. Acc: 40.63%
Epoch: 03 | Epoch Time: 15.66s
	Train Loss: 0.000 | Train Acc: 98.88%
	 Val. Loss: 0.034 |  Val. Acc: 40.43%
Epoch: 04 | Epoch Time: 15.75s
	Train Loss: 0.000 | Train Acc: 99.22%
	 Val. Loss: 0.033 |  Val. Acc: 40.62%
Epoch: 05 | Epoch Time: 15.61s
	Train Loss: 0.000 | Train Acc: 99.23%
	 Val. Loss: 0.033 |  Val. Acc: 40.30%
Epoch: 06 | Epoch Time: 15.58s
	Train Loss: 0.000 | Train Acc: 99.23%
	 Val. Loss: 0.034 |  Val. Acc: 40.85%
Epoch: 07 | Epoch Time: 15.52s
	Train Loss: 0.000 | Train Acc: 99.27%
	 Val. Loss: 0.033 |  Val. Acc: 41.77%
Epoch: 08 | Epoch Time: 15.59s
	Train Loss: 0.000 | Train Acc: 99.04%
	 Val. Loss: 0.033 |  Val. Acc: 40.90%
Epoch: 09 | Epoch Time: 15.48s
	Train Loss: 0.000 | Train Acc: 99.14%
	 Val. Loss: 0.034 |  Val. Acc: 41.07%
Epoch: 10 | Epoch T

156.3513687650011

## Peeking

In [35]:
import spacy

nlp = spacy.load('fr_core_news_sm')

In [352]:
def predict_tokens(tokens, model):
    model.eval()
    idxs = [TEXT.vocab.stoi[t] for t in tokens]
    inp = torch.LongTensor(idxs).reshape(-1, 1).to(device)
    output = output_to_pred(model(inp, torch.LongTensor([len(tokens)])))
    return output.item()


def predict(sentence, model):
    return predict_tokens(list(map(str, nlp.tokenizer(sentence))), model)

In [39]:
model.load_state_dict(torch.load('best_model.pt'))

<All keys matched successfully>

In [356]:
predict("Du temps perdu.", model), \
predict("Un très bon film, à voir avec toute la famille.", model)

(2.0, 4.0)

In [364]:
for example in random.sample(list(data_test), 3):
    tokens, note = example.critique, float(example.note)
    print(" ".join(tokens))
    print(f"true) {note} - {predict_tokens(tokens, model)} (pred")

La monstrueuse énergie de son [ Na Hong - Jin ] film se transmet littéralement au spectateur , qui en ressort à bout de souffle mais content , comme après un marathon .
true) 4.0 - 4.0 (pred
Etienne Chatilez ( … ) possède un vrai talent : l' art des signes , genre dans lequel il excelle . N ° 404 - février 1988
true) 5.0 - 4.5 (pred
Si la partie comédie à sketchs est fort bien enlevée , pour l' aventure , ça traîne un peu . Mais , au final , on se marre bien !
true) 2.0 - 3.0 (pred


## Testing predictions on extremal marks

Is the model really doing bad if it predicts a 4.5 instead of a 5 ? There are at least two ways to allow for forgivable divergence with the test data :
* decrease notation's granularity, e.g. tranform the marks into good/bad, or good/bad/neutral.
* consider a prediction correct if it belongs to a 'small' interval containing the true value.

### Good/Neutral/Bad prediction

In [300]:
def mark_to_3_way(mark_tensor, bad_treshold=2, good_treshold=3.5):
    """np array with values: 0: bad, 1: neutral, 2: good"""
    return np.digitize(mark_tensor.cpu().detach().numpy(), [bad_treshold, good_treshold])

def pred_to_3_way(pred_tensor, bad_treshold=0.4, good_treshold=0.651):
    """np array with values: 0: bad, 1: neutral, 2: good"""
    return np.digitize(pred_tensor.cpu().detach().numpy(), [bad_treshold, good_treshold])
    
def eval_accuracy_3w(model, iterator):
    n_examples = 0
    n_success = 0
    for batch in iterator:
        padded_sequences, lengths = batch.critique
        predictions = model(padded_sequences, lengths).squeeze(1)
        n_examples += len(batch)
        n_success += sum(pred_to_3_way(predictions) == mark_to_3_way(batch.note))
    return n_success / n_examples

def classif_report_3w(model, iterator):
    def preds_and_trues_to_array(predictions, true_notes):
        return np.concatenate([mark_to_3_way(true_notes).reshape(-1, 1),
                               pred_to_3_way(predictions).reshape(-1, 1)], axis=1)
    
    array = None
    for batch in iterator:
        padded_sequences, lengths = batch.critique
        predictions = model(padded_sequences, lengths).squeeze(1)
        
        if array is None:
            array = preds_and_trues_to_array(predictions, batch.note)
        else:
            array = np.concatenate([array,
                                    preds_and_trues_to_array(predictions, batch.note)], axis=0)
    print(sklearn.metrics.classification_report(array[:, 0],
                                                array[:, 1],
                                                labels=[0, 1, 2],
                                                target_names=["bad", "neutral", "good"]))

In [301]:
iterator = iter_test

print(eval_accuracy_3w(model, iterator))
classif_report_3w(model, iterator)


0.6627106359847799
              precision    recall  f1-score   support

         bad       0.38      0.38      0.38      1743
     neutral       0.64      0.62      0.63      9669
        good       0.73      0.75      0.74     10664

    accuracy                           0.66     22076
   macro avg       0.58      0.58      0.58     22076
weighted avg       0.66      0.66      0.66     22076



### Fuzzy accuracy

In [348]:
def eval_accuracy_fuzzy(model, iterator, fuzziness=.5):
    n_examples = 0.
    n_success = 0.
    for batch in iterator:
        padded_sequences, lengths = batch.critique
        predictions = model(padded_sequences, lengths).squeeze(1)
        n_examples += len(batch)
        n_success += sum(torch.abs(output_to_pred(predictions) - batch.note) <= fuzziness).item()
    return n_success / n_examples

In [349]:
print(eval_accuracy_fuzzy(model, iter_test, fuzziness = 1))

0.8902427976082624
