In [1]:
# Install the libraries
!pip install transformers plotly==5.8.0 pyyaml==5.4.1 datasets pytorch-lightning > /dev/null 2>&1

In [56]:
# Import the libraries
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm

from datasets import Dataset
from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig

from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, f1_score

import pandas as pd
import numpy as np

from pprint import pprint
import functools

In [3]:
# Use camembert as pre trained model
camembert = CamembertForMaskedLM.from_pretrained('camembert-base')

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [4]:
batch_sentences = [
    "Vous savez où est la <mask> la plus proche?",
    "La Seine est un <mask>.",
    "Je cherche urgemment un endroit où retirer de l'<mask>.",
]

In [5]:
# Create the tokenizer and tokenizer output
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

tokenizer_output = tokenizer(
    batch_sentences,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)
pprint(tokenizer_output, width=150)

Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[   5,  158, 2591,  ...,    1,    1,    1],
        [   5,   61, 4458,  ...,    1,    1,    1],
        [   5,  100, 1066,  ...,    1,    1,    1]])}


In [6]:
with torch.no_grad():
    model_output = camembert(**tokenizer_output, output_hidden_states=True)
    model_output

In [7]:
model_output.logits.shape 

torch.Size([3, 512, 32005])

In [33]:
def average_embeddings(embeddings, attention_mask):
    return (attention_mask[..., None] * embeddings).mean(1)

In [8]:
# Read all previous predictions
df_0 = pd.read_csv("submission_18_12.csv")
df_1 = pd.read_csv("submission_18_12_v1.csv")
df_2 = pd.read_csv("submission_18_12_v2.csv")
df_3 = pd.read_csv("submission_18_12_v3.csv")

In [None]:
# Read the unlabelled sentences 
df_sentence = pd.read_csv("unlabelled_test_data.csv")

In [9]:
# Combine those sentences to their predictions
df_0["sentence"] = df_sentence.sentence
len(df_0)

1200

In [10]:
# Assume the common predictions from the previous submissions are very likely correct predcitions, use them to enrich the training data
same_difficulty = []
same_sentence = []

for i in range(len(df_0)):
  # Use those data as addiontal training data if all previous predictions are the same
  if df_0["difficulty"][i] == df_1["difficulty"][i] == df_2["difficulty"][i] == df_3["difficulty"][i]:
    # Construct the column sentence
    same_sentence.append(df_0["sentence"][i])
    # Construct the column difficulty
    same_difficulty.append(df_0["difficulty"][i])

len(same_difficulty)
len(same_sentence)

540

In [11]:
# Create the additional data set
data_new = {'sentence': same_sentence, 'difficulty': same_difficulty}
 
# Create the data frame out of it
df_new = pd.DataFrame(data_new)

In [12]:
df_new

Unnamed: 0,sentence,difficulty
0,Nous dûmes nous excuser des propos que nous eû...,C2
1,"Le corps de Golo lui-même, d'une essence aussi...",C2
2,"Je ne peux pas vous laisser dire cela, Madame.",A2
3,Est-ce que ça suffit?,A1
4,Tu as rempli le pot d'eau et tu as mis des gla...,A2
...,...,...
535,C'est un phénomène qui trouve une accélération...,B1
536,Je vais parler au serveur et voir si on peut d...,A2
537,Il n'était pas comme tant de gens qui par pare...,C2
538,Ils deviennent dangereux pour notre économie.,B2


In [14]:
# Read the given training data 
df_existing_train = pd.read_csv("training_data.csv")
# Drop the column id for the upcoming combination with the additional training data
df_tmp = df_existing_train.drop('id', axis=1)
df_tmp

Unnamed: 0,sentence,difficulty
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


In [15]:
# Combine the given training data and the additional training data to make the total training data
total_train = pd.concat([df_new, df_tmp], axis=0, ignore_index=True)

In [16]:
# Re-assign the column id 
total_train["id"] = total_train.index

In [17]:
# Check if the size of the total training data is correct
total_train

Unnamed: 0,sentence,difficulty,id
0,Nous dûmes nous excuser des propos que nous eû...,C2,0
1,"Le corps de Golo lui-même, d'une essence aussi...",C2,1
2,"Je ne peux pas vous laisser dire cela, Madame.",A2,2
3,Est-ce que ça suffit?,A1,3
4,Tu as rempli le pot d'eau et tu as mis des gla...,A2,4
...,...,...,...
5335,"C'est pourquoi, il décida de remplacer les hab...",B2,5335
5336,Il avait une de ces pâleurs splendides qui don...,C1,5336
5337,"Et le premier samedi de chaque mois, venez ren...",A2,5337
5338,Les coûts liés à la journalisation n'étant pas...,C2,5338


In [18]:
# Re-arrange the columns 
cols = total_train.columns.tolist()
cols = cols[-1:] + cols[:-1]
total_train = total_train[cols]
total_train

Unnamed: 0,id,sentence,difficulty
0,0,Nous dûmes nous excuser des propos que nous eû...,C2
1,1,"Le corps de Golo lui-même, d'une essence aussi...",C2
2,2,"Je ne peux pas vous laisser dire cela, Madame.",A2
3,3,Est-ce que ça suffit?,A1
4,4,Tu as rempli le pot d'eau et tu as mis des gla...,A2
...,...,...,...
5335,5335,"C'est pourquoi, il décida de remplacer les hab...",B2
5336,5336,Il avait une de ces pâleurs splendides qui don...,C1
5337,5337,"Et le premier samedi de chaque mois, venez ren...",A2
5338,5338,Les coûts liés à la journalisation n'étant pas...,C2


In [22]:
# Map the language difficulty to integer labels
total_train.difficulty = total_train.difficulty.map({'A1':0, 'A2':1, 'B1':2, 'B2':3, 'C1':4, 'C2':5})

In [23]:
total_train

Unnamed: 0,id,sentence,difficulty
0,0,Nous dûmes nous excuser des propos que nous eû...,5
1,1,"Le corps de Golo lui-même, d'une essence aussi...",5
2,2,"Je ne peux pas vous laisser dire cela, Madame.",1
3,3,Est-ce que ça suffit?,0
4,4,Tu as rempli le pot d'eau et tu as mis des gla...,1
...,...,...,...
5335,5335,"C'est pourquoi, il décida de remplacer les hab...",3
5336,5336,Il avait une de ces pâleurs splendides qui don...,4
5337,5337,"Et le premier samedi de chaque mois, venez ren...",1
5338,5338,Les coûts liés à la journalisation n'étant pas...,5


In [24]:
# Use the entire data set for training
dataset_unil = Dataset.from_pandas(total_train)

In [25]:
dataset_unil

Dataset({
    features: ['id', 'sentence', 'difficulty'],
    num_rows: 5340
})

In [27]:
# Split train test data sets 
train_test_dataset = dataset_unil.train_test_split(test_size=0.1)

In [28]:
# Have a look at the split
train_test_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence', 'difficulty'],
        num_rows: 4806
    })
    test: Dataset({
        features: ['id', 'sentence', 'difficulty'],
        num_rows: 534
    })
})

In [29]:
# Have a look at the test data set
train_test_dataset["test"]

Dataset({
    features: ['id', 'sentence', 'difficulty'],
    num_rows: 534
})

In [30]:
# Fine tune the tokenizer based on the given training data
def tokenize_batch(samples, tokenizer):
    #text = [sample["Utterance"] for sample in samples]
    #labels = torch.tensor([sample["Label"] for sample in samples])
    #str_labels = [sample["Dialogue_Act"] for sample in samples]
    text = [sample["sentence"] for sample in samples]
    labels = torch.tensor([sample["difficulty"] for sample in samples])
    str_labels = [sample["difficulty"] for sample in samples]
    # The tokenizer handles
    # - Tokenization (amazing right?)
    # - Padding (adding empty tokens so that each example has the same length)
    # - Truncation (cutting samples that are too long)
    # - Special tokens (in CamemBERT, each sentence ends with a special token </s>)
    # - Attention mask (a binary vector which tells the model which tokens to look at. For instance it will not compute anything if the token is a padding token)
    tokens = tokenizer(text, padding="longest", return_tensors="pt")

    return {"input_ids": tokens.input_ids, "attention_mask": tokens.attention_mask, "labels": labels, "str_labels": str_labels, "sentences": text}

In [31]:
# Tokenize the test data set
unil_test_dataloader = DataLoader(train_test_dataset["test"], collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer), batch_size=16)
next(iter(unil_test_dataloader))

{'input_ids': tensor([[    5,   436,    45,  ...,     1,     1,     1],
         [    5,   574,     7,  ...,     1,     1,     1],
         [    5,    69, 16006,  ...,     1,     1,     1],
         ...,
         [    5,  1196,   200,  ...,     1,     1,     1],
         [    5,   100,   103,  ...,     1,     1,     1],
         [    5,   158,   349,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([3, 1, 5, 4, 0, 4, 3, 4, 4, 2, 3, 4, 2, 3, 5, 1]),
 'str_labels': [3, 1, 5, 4, 0, 4, 3, 4, 4, 2, 3, 4, 2, 3, 5, 1],
 'sentences': ['Ils ne connaissaient des contrées voisines que ce que leur révélaient les visites rares de voyageurs égarés ou de hardis marchands, et ne désiraient pas en connaître davantage, leur sol étant fécond et suffisant à leurs bes

In [34]:
camembert = camembert.cuda()

sentences = []
labels = []
str_labels = []
all_representations = torch.tensor([], device='cuda')

with torch.no_grad():
    #for tokenized_batch in tqdm(val_dataloader):
    for tokenized_batch in tqdm(unil_test_dataloader):       
        model_output = camembert(
            input_ids = tokenized_batch["input_ids"].cuda(),
            attention_mask = tokenized_batch["attention_mask"].cuda(),
            output_hidden_states=True
        )
        batch_representations = average_embeddings(model_output["hidden_states"][-1], tokenized_batch["attention_mask"].cuda())
        sentences.extend(tokenized_batch["sentences"])
        labels.extend(tokenized_batch["labels"])
        str_labels.extend(tokenized_batch["str_labels"])
        all_representations = torch.cat((all_representations, batch_representations), 0)


  0%|          | 0/34 [00:00<?, ?it/s]

In [35]:
tsne = TSNE()
all_representations_2d = tsne.fit_transform(all_representations.cpu())



In [37]:
# Set the number of the labels
num_labels = 6

In [38]:
# Load both train and test data
train_dataloader = DataLoader(
    #dataset["train"], 
    dataset_unil, 
    batch_size=16, 
    shuffle=True, 
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)
val_dataloader = DataLoader(
    #dataset["validation"], 
    train_test_dataset["test"], 
    batch_size=16, 
    shuffle=False, 
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)

In [39]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fa807fcf8e0>

In [40]:
batch = next(iter(train_dataloader))

In [41]:
print("\n".join(tokenizer.batch_decode(batch["input_ids"])))
batch["labels"]

<s> Les uns, les pires et dont elle s'était débarrassé les premiers, étaient ceux qui lui conseillaient de ne pas "s'écouter" et professaient, fût-ce négativement et en ne la manifestant que par certains silences de désapprobation ou par certains sourires de doute, la doctrine subversive qu'une petite promenade au soleil et un bon bifteck saignant (quand elle gardait quatorze heures sur l'estomac deux méchantes gorgées d'eau de Vichy!) lui feraient plus de bien que son lit et ses médecines.</s>
<s> J'attribuais à son charme cet effet presque magique : j'en aurais joui plus complètement encore sans l'engagement que j'avais pris envers mon amour-propre.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

tensor([5, 5, 0, 1, 2, 3, 0, 2, 0, 4, 4, 5, 4, 3, 3, 1])

In [42]:
# Define the class LightningModel
class LightningModel(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        self.save_hyperparameters()
        if from_scratch:
            # Si `from_scratch` est vrai, on charge uniquement la config (nombre de couches, hidden size, etc.) et pas les poids du modèle 
            config = AutoConfig.from_pretrained(
                model_name, num_labels=num_labels
            )
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            # Cette méthode permet de télécharger le bon modèle pré-entraîné directement depuis le Hub de HuggingFace sur lequel sont stockés de nombreux modèles
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels

    def forward(self, batch):
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch):
        out = self.forward(batch)

        logits = out.logits
        # -------- MASKED --------
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, self.num_labels), batch["labels"].view(-1))

        # ------ END MASKED ------

        self.log("train/loss", loss)

        return loss

    def validation_step(self, batch, batch_index):
        labels = batch["labels"]
        out = self.forward(batch)

        preds = torch.max(out.logits, -1).indices
        # -------- MASKED --------
        acc = (batch["labels"] == preds).float().mean()
        # ------ END MASKED ------
        self.log("valid/acc", acc)

        f1 = f1_score(batch["labels"].cpu().tolist(), preds.cpu().tolist(), average="macro")
        self.log("valid/f1", f1)

    def predict_step(self, batch, batch_idx):
        """La fonction predict step facilite la prédiction de données. Elle est 
        similaire à `validation_step`, sans le calcul des métriques.
        """
        out = self.forward(batch)

        return torch.max(out.logits, -1).indices

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

In [43]:
# Create a ligntning model
lightning_model = LightningModel("camembert-base", num_labels, lr=3e-5, weight_decay=0.)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

In [44]:
# Initialize the camembert trainer
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")

camembert_trainer = pl.Trainer(
    max_epochs=20,
    gpus=1,
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max"),
        model_checkpoint,
    ]
)


Setting `Trainer(gpus=1)` is deprecated in v1.7 and will be removed in v2.0. Please use `Trainer(accelerator='gpu', devices=1)` instead.

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [45]:
# Fit the trainer
camembert_trainer.fit(lightning_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                               | Params
-------------------------------------------------------------
0 | model | CamembertForSequenceClassification | 110 M 
-------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.506   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]


Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 6. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [46]:
lightning_model = LightningModel.load_from_checkpoint(checkpoint_path=model_checkpoint.best_model_path)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weig

In [47]:
# Get prediction of the model
def get_preds(model, tokenizer, sentence):
    tokenized_sentence = tokenizer(sentence, return_tensors="pt")
    input_ids, attention_mask = tokenized_sentence.input_ids, tokenized_sentence.attention_mask

    out = model(
        input_ids=tokenized_sentence.input_ids,
        attention_mask=tokenized_sentence.attention_mask
    )

    logits = out.logits

    probas = torch.softmax(logits, -1).squeeze()

    pred = torch.argmax(probas)

    #return ID_TO_LABEL[pred], probas[pred].item()
    return pred, probas[pred].item()

In [48]:
# Predict a ramdom sentence
test_sentence = "Bonjour, vous allez bien ?"
label_predicted, proba = get_preds(lightning_model.model, tokenizer, test_sentence)
print(f"Label: {label_predicted}, confidence: {proba:.2f}")

Label: 0, confidence: 0.99


In [54]:
# Convert the data frame of unlabelled sentences to a data set 
ds = Dataset.from_pandas(df_sentence)
ds

Dataset({
    features: ['id', 'sentence'],
    num_rows: 1200
})

In [57]:
# Generate dummy difficulties for those sentences
random_data = np.random.randint(0, 6, size=len(ds))
random_data[:40]

array([5, 5, 2, 3, 2, 3, 2, 1, 4, 0, 1, 0, 2, 3, 0, 4, 0, 4, 0, 0, 4, 0,
       4, 2, 2, 5, 3, 0, 3, 4, 2, 3, 4, 4, 1, 5, 4, 5, 5, 0])

In [58]:
# Create the column difficulty to complete the data set
ds = ds.add_column("difficulty", random_data)

In [59]:
ds

Dataset({
    features: ['id', 'sentence', 'difficulty'],
    num_rows: 1200
})

In [60]:
# Load the unlabelled sentences
val_dataloader_unil = DataLoader(
    #dataset["validation"], 
    #train_test_dataset["test"], 
    ds,
    batch_size=16, 
    shuffle=False, 
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)

In [61]:
# Predict the difficulties
camembert_preds = camembert_trainer.predict(lightning_model, dataloaders=val_dataloader_unil)
camembert_preds = torch.cat(camembert_preds, -1)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 334it [00:00, ?it/s]

In [62]:
# Have a look at the predictions
difficulty = camembert_preds.tolist()
difficulty[:10]

[5, 2, 2, 1, 5, 4, 1, 1, 5, 1]

In [64]:
# Replace the dummy predictions by the actual predictions
df_pred = pd.read_csv("sample_submission.csv")
df_pred.difficulty = difficulty
df_pred

Unnamed: 0,id,difficulty
0,0,5
1,1,2
2,2,2
3,3,1
4,4,5
...,...,...
1195,1195,2
1196,1196,1
1197,1197,5
1198,1198,3


In [65]:
# Map the integer labels back to the language difficulties
df_pred.difficulty = df_pred.difficulty.map({0:'A1', 1:'A2', 2:'B1', 3:'B2', 4:'C1', 5:'C2'})
df_pred

Unnamed: 0,id,difficulty
0,0,C2
1,1,B1
2,2,B1
3,3,A2
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A2
1197,1197,C2
1198,1198,B2


In [66]:
# Write the prediction to a csv file 
df_pred.to_csv("submission_22_12_camembert_base.csv", index=False)