In [None]:
import pandas as pd
import os

In [None]:
import random
import matplotlib.pyplot as plt
import collections
import numpy as np
import operator
import simpleclock

In [None]:
# Data files live in a "resources" folder located next to the current working directory.
_parent_dir = os.path.dirname(os.path.abspath(''))
resources_path = os.path.join(_parent_dir, "resources")
data_path, cleaned_data_path = (
    os.path.join(resources_path, file_name)
    for file_name in ("data_cine.csv", "data_cine_cleaned.csv")
)

In [None]:
# Load the raw reviews table from the CSV file at `data_path`.
df_all = pd.read_csv(data_path)

In [None]:
# Quick sanity check: dimensions and column names of the raw table.
print(df_all.shape)
print(df_all.columns.values)

## preprocessing

In [None]:
def is_review_empty(s, max_words=16):
    """Tell whether a review is an 'empty' stub that only points to another website.

    Such stubs are meant to be removed from the dataset. A review is a stub when
    it is short (fewer than ``max_words`` whitespace-separated words) AND contains
    one of the known redirection markers.

    Args:
        s: review text.
        max_words: exclusive upper bound on the word count of a stub.

    Returns:
        True if the review is a redirection stub, False otherwise.
    """
    redirect_markers = (
        "voir la critique",
        "voir le site",
        "voir sur le site",
        "voir le magazine",
        "voir le point",
        "express.fr",
        "point.fr",
        "studio ciné live",
        "www.",
    )

    words = s.lower().split()
    if len(words) >= max_words:
        return False

    # Re-join on single spaces so that ANY run of whitespace (several spaces, tabs,
    # newlines) between the words of a marker is collapsed; a single
    # replace("  ", " ") pass leaves runs of three or more spaces un-normalized.
    normalized = " ".join(words)
    return any(marker in normalized for marker in redirect_markers)


# Evaluate the filter once and reuse the boolean mask for both partitions.
is_empty_mask = df_all["critique"].apply(is_review_empty)

df_removed = df_all.loc[is_empty_mask]
empty_reviews = collections.Counter(df_removed["critique"].values)
# `.2%` multiplies the ratio by 100; `.2f` followed by a literal "%" printed the raw fraction.
print(f"{df_removed.shape[0]} ({df_removed.shape[0] / df_all.shape[0]:.2%}) lines removed")

# Take an explicit copy so that stripping the text does not write into a slice of
# `df_all` (which raises SettingWithCopyWarning and may leave the raw frame modified).
df_ = df_all.loc[~is_empty_mask].copy()
df_["critique"] = df_["critique"].str.strip()

df_clean = df_

# empty_reviews  # to check

In [None]:
# How many distinct removed review texts there are for each word count.
collections.Counter(len(text.lower().split()) for text in empty_reviews)
# sorted(set(map(str.lower, empty_reviews)), key=lambda s: len(s.split()), reverse=True)

In [None]:
# `df` is the working name of the cleaned table in the following cells.
df = df_clean

In [None]:
# Persist the cleaned table (index not written) to `cleaned_data_path`.
df.to_csv(cleaned_data_path, index=False)

## exploration

In [None]:
# Number of reviews, number of distinct films, and average number of reviews per film.
n_reviews, n_films = df.shape[0], len(set(df.id))
f"{n_reviews} critiques pour {n_films} films ({n_reviews / n_films:.1f} en moy.)"

In [None]:
# Print five reviews picked at random positions (title, mark, then the text).
sampled_positions = random.sample(range(len(df)), 5)
for row_label, review in df.iloc[sampled_positions].iterrows():  # or: ... in df.sample(5).iterrows()
    print(review.titre, ":", review.note)
    print(review.critique)
    print()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
ax_marks, ax_lengths = axs

# Left panel: normalized histogram of the marks, bin edges every half point from 1 to 5.5.
mark_bins = list(np.arange(1, 6, .5))
ax_marks.hist(df.note.values, density=True, bins=mark_bins, align="left")
ax_marks.set_title("marks distribution")

# Right panel: normalized histogram of review lengths, one bin per word count.
lens = df.critique.apply(lambda text: len(text.split()))
length_bins = range(min(lens), max(lens) + 1)
ax_lengths.hist(lens, density=True, bins=length_bins)
ax_lengths.set_title("nb of words per review")

In [None]:
# Show five random reviews containing the literal "(...)" marker (plain substring, not a regex).
has_ellipsis = df.critique.str.contains("(...)", regex=False)
for review in df.loc[has_ellipsis].sample(5).itertuples():
    print(review.note)
    print(review.critique)
    print()

## learning

In [None]:
import torch
import torchtext

In [None]:
# Text field: reviews are tokenized with spaCy using the "fr_core_news_sm" model.
# `include_lengths=True` makes `batch.critique` a (padded sequences, lengths) pair,
# which is how the train/evaluate loops unpack it.
# NOTE(review): `torchtext.data.Field` is the legacy torchtext API — presumably this
# ties the notebook to an old torchtext release; confirm against the installed version.
TEXT = torchtext.data.Field(tokenize = "spacy",
                            tokenizer_language="fr_core_news_sm",
                            include_lengths=True)
# Label field: the mark is kept as a float value, without building a vocabulary.
LABEL = torchtext.data.LabelField(dtype=torch.float, use_vocab=False)

In [None]:
# Build the dataset from the cleaned CSV, mapping only two columns to fields:
# "critique" (review text) -> TEXT and "note" (mark) -> LABEL.
dataset = torchtext.data.TabularDataset(path=cleaned_data_path,
                                        format="CSV",
                                        fields={"critique": ("critique", TEXT), "note": ("note", LABEL)})

In [None]:
# Hold out a test set using torchtext's default split ratio (presumably 70/30 — confirm).
# NOTE(review): no random state is passed, so the split is presumably not reproducible across runs.
data_train, data_test = dataset.split()

In [None]:
# Carve a validation set out of the training split (same default ratio); `data_train` is rebound
# to the smaller remaining part, so re-running this cell alone shrinks it again.
data_train, data_valid = data_train.split()

In [None]:
# Report the number of examples in each of the three splits.
print(f"""training data: {len(data_train)} examples.
validation data: {len(data_valid)} examples.
test data: {len(data_test)} examples.""")

In [None]:
# Pre-trained word vectors read from the file "cc.fr.300.vec".
# NOTE(review): the directory is hard-coded to ~/Downloads, so the file presumably has to be
# placed there beforehand (looks like the fastText French vectors — confirm).
vectors = torchtext.vocab.Vectors("cc.fr.300.vec", os.path.join(os.path.expanduser("~"), "Downloads"))

In [None]:
# Upper bound on the vocabulary size passed to `build_vocab`.
VOCAB_MAX_SIZE = 50000
# The vocabulary is built from the training split only and attached to the pre-trained vectors.
TEXT.build_vocab(data_train, max_size=VOCAB_MAX_SIZE, vectors=vectors)
# LABEL.build_vocab(dataset)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Number of examples per batch, passed to the iterators built in the next cell.
BATCH_SIZE = 64

In [None]:
# One iterator per split, all yielding batches on `device`.
# Examples are sorted by number of tokens inside each batch (`sort_within_batch=True` with
# `sort_key`), while the dataset as a whole is not sorted (`sort=False`).
iter_train, iter_valid, iter_test = \
    torchtext.data.BucketIterator.splits(datasets=(data_train, data_valid, data_test),
                                         batch_size=BATCH_SIZE,
                                         device=device,
                                         sort_within_batch=True,
                                         sort_key=lambda example: len(example.critique),
                                         sort=False)

In [None]:
class RNN(torch.nn.Module):
    """Single-layer LSTM regressor turning a sequence of token ids into a score in (0, 1).

    Pipeline: embedding -> LSTM -> dropout on the final hidden state -> linear -> sigmoid.

    Args:
        n_vocab: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output units of the final linear layer.
        dropout: dropout probability applied to the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, n_vocab, embedding_dim, hidden_dim, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = torch.nn.Embedding(n_vocab, embedding_dim, padding_idx=pad_idx)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input, lengths=None):
        """Score a batch of padded token-id sequences.

        Args:
            input: LongTensor of token ids, shape (sent_len, batch_size).
            lengths: true (unpadded) length of every sequence of the batch, or None
                to process every position of ``input``, padding included.

        Returns:
            Tensor of shape (batch_size, output_dim) with values in (0, 1).
        """
        embedded = self.embedding(input)  # (sent_len, batch_size, emb_dim)

        if lengths is None:
            rnn_input = embedded
        else:
            # Pack the EMBEDDED sequences and feed the packed result to the LSTM, so that
            # the final hidden state is the one of the last real token and not the one
            # obtained after running over the padding. `pack_padded_sequence` wants the
            # lengths on the CPU; `enforce_sorted=False` lifts the need for a batch sorted
            # by decreasing length.
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            rnn_input = torch.nn.utils.rnn.pack_padded_sequence(
                embedded, cpu_lengths, enforce_sorted=False)

        _, (hidden, _) = self.rnn(rnn_input)  # hidden: (1, batch_size, hidden_dim)
        return self.sigmoid(self.fc(self.dropout(hidden).squeeze(0)))  # (batch_size, output_dim)

In [None]:
# Model hyper-parameters.
N_VOCAB = len(TEXT.vocab)
# Must match the width of the pre-trained vectors later copied into the embedding layer.
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
# A single output unit: the model predicts one score per review.
OUTPUT_DIM = 1
DROPOUT = 0.5
# Index of the padding token in the vocabulary.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(N_VOCAB, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, dropout=DROPOUT, pad_idx=PAD_IDX)

In [None]:
# Count the scalar parameters that will receive gradients.
print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters')

In [None]:
# Start the embedding layer from the pre-trained vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# The unknown and padding tokens get an all-zero initial embedding.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [None]:
# Mean squared error; train/evaluate apply it between the model output and the mark divided by 5.
criterion = torch.nn.MSELoss()

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Adam over all model parameters, with the library's default hyper-parameters.
optimizer = torch.optim.Adam(model.parameters())

In [None]:
def output_to_pred(output):
    """Convert raw model outputs into marks on a half-point grid.

    The output is scaled by 10, rounded to the nearest integer and halved, so an
    output in [0, 1] becomes a mark in [0, 5] that is a multiple of 0.5.
    """
    tenths = output * 10
    return tenths.round() / 2

In [None]:
def accuracy(preds, y):
    """Return the fraction of predictions that are exactly equal to their target.

    Args:
        preds: tensor of predicted marks.
        y: tensor of target marks, same shape as ``preds``.

    Returns:
        A 0-dimensional float tensor: number of exact matches over ``len(preds)``.
    """
    # Cast the boolean matches to float so that the ratio is a true division.
    exact_matches = ((preds - y) == 0).float()
    return exact_matches.sum() / len(exact_matches)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return its mean loss and mean accuracy.

    For every batch, the model output is compared to the mark divided by 5 for
    the loss, and converted back to a half-point mark for the accuracy.

    Args:
        model: network called as ``model(padded_sequences, lengths)``.
        iterator: yields batches exposing ``critique`` (sequences, lengths) and ``note``.
        optimizer: optimizer stepping the parameters of ``model``.
        criterion: loss function applied to (output, target).

    Returns:
        ``(mean loss, mean accuracy)``, both averaged over the batches.
    """
    model.train()

    batch_losses, batch_accs = [], []
    for batch in iterator:
        tokens, token_lengths = batch.critique

        optimizer.zero_grad()
        scores = model(tokens, token_lengths).squeeze(1)
        loss = criterion(scores, batch.note / 5.)
        acc = accuracy(output_to_pred(scores), batch.note)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch of ``iterator`` without updating it.

    The model is switched to eval mode and gradients are disabled; loss and
    accuracy are computed per batch exactly as during training.

    Args:
        model: network called as ``model(padded_sequences, lengths)``.
        iterator: yields batches exposing ``critique`` (sequences, lengths) and ``note``.
        criterion: loss function applied to (output, target).

    Returns:
        ``(mean loss, mean accuracy)``, both averaged over the batches.
    """
    model.eval()

    batch_losses, batch_accs = [], []
    with torch.no_grad():
        for batch in iterator:
            tokens, token_lengths = batch.critique
            scores = model(tokens, token_lengths).squeeze(1)
            loss = criterion(scores, batch.note / 5.)
            acc = accuracy(output_to_pred(scores), batch.note)

            batch_losses.append(loss.item())
            batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

# Lowest validation loss seen so far; +inf guarantees that the first epoch is checkpointed.
best_valid_loss = float('inf')

# NOTE(review): `simpleclock` is an external helper whose exact semantics are not visible
# in this notebook; the timing comments below are assumptions to be confirmed.
clock = simpleclock.Clock.started()

for epoch in range(N_EPOCHS):

    # Presumably registers a "call" so that the per-epoch time printed below only
    # covers the current epoch — confirm against simpleclock's documentation.
    clock.elapsed_since_start.call()  # meh
    
    train_loss, train_acc = train(model, iter_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, iter_valid, criterion)
    
    # Keep on disk only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    clock.elapsed_since_last_call.print(f"Epoch: {epoch+1:02} | Epoch Time")
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
clock.elapsed_since_start.print("Total time")

In [None]:
import spacy

# Same spaCy model name as the one given to the TEXT field; its tokenizer is used by the predict helpers.
nlp = spacy.load('fr_core_news_sm')

In [None]:
def predict_tokens(tokens, model):
    """Predict the mark of a single, already tokenized review.

    Args:
        tokens: list of token strings; trailing padding tokens are allowed and
            are not counted in the sequence length.
        model: network called as ``model(input, lengths)``.

    Returns:
        The predicted mark as a Python float (half-point grid, see ``output_to_pred``).

    Raises:
        ValueError: if ``tokens`` contains no real (non-padding) token.
    """
    model.eval()
    idxs = [TEXT.vocab.stoi[t] for t in tokens]

    # True length = everything before the trailing padding, the same information the
    # batches carry alongside their padded sequences.
    pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
    n_real = len(idxs)
    while n_real > 0 and idxs[n_real - 1] == pad_idx:
        n_real -= 1
    if n_real == 0:
        raise ValueError("cannot predict a mark for an empty token sequence")

    inp = torch.LongTensor(idxs).reshape(-1, 1).to(device)  # (sent_len, batch_size=1)
    # forward() takes the lengths as second argument; calling model(inp) alone raised
    # a TypeError. The lengths stay on the CPU, as sequence packing expects.
    lengths = torch.LongTensor([n_real])

    # Inference only: no need to track gradients.
    with torch.no_grad():
        output = output_to_pred(model(inp, lengths))
    return output.item()

In [None]:
def padd_artif(tokens, length=64):
    """Return a new list made of ``tokens`` right-padded with "<pad>" up to ``length`` items.

    A list that already holds ``length`` items or more is returned as an unchanged copy.
    """
    n_missing = max(length - len(tokens), 0)
    return tokens + ["<pad>" for _ in range(n_missing)]

In [None]:
def predict(sentence, model):
    """Tokenize ``sentence`` with the spaCy tokenizer and return the mark predicted by ``model``."""
    tokens = [str(token) for token in nlp.tokenizer(sentence)]
    return predict_tokens(tokens, model)

In [None]:
def predict_padded(sentence, model):
    """Same as ``predict``, but the tokens are first right-padded by ``padd_artif``."""
    raw_tokens = [str(token) for token in nlp.tokenizer(sentence)]
    return predict_tokens(padd_artif(raw_tokens), model)

In [None]:
# Try the model on a short, artificially padded sentence.
# NOTE(review): this cell runs before 'best_model.pt' is reloaded in a later cell, so it uses
# the weights left by the last training epoch.
predict_padded("Une merveille.", model)

In [None]:
# Restore the weights saved by the training loop for the epoch with the lowest validation loss.
model.load_state_dict(torch.load('best_model.pt'))

In [None]:
# Take the first training example for a manual check.
ex = data_train[0]

In [None]:
# Its tokens and its true mark, converted to float.
tokens, note = ex.critique, float(ex.note)

In [None]:
# Display the predicted mark next to the true one.
predict_tokens(tokens, model), note