## Fine Tuning CamemBERT pour prédire les décisions de Justice

In [7]:
pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.7.2-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

In [8]:
# Installation des librairies libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import logging
logging.basicConfig(level=logging.ERROR)

In [9]:
# Vérification de la disponibilité du GPU
from torch import cuda
device = 'cuda'
if cuda.is_available():
    print("ok")
else: 'cpu'

ok


In [11]:
# Article  700

base = pd.read_excel('ART_700_clean.xlsx')
base_f = base[['art_700', 'phrase pretention']] # phrase qui fait un résumé du litige entre les deux parties et qui dit que la personne peut prétendre à l'article 700 ou nom

base_f = base_f.copy()
base_f.dropna(how='any', inplace=True)

base_f.describe()

Unnamed: 0,art_700
count,1956.0
mean,0.50818
std,0.500061
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [12]:
# renomer les labels

base_f.rename({'art_700': 'Sentiment', 'phrase pretention': 'Phrase'}, axis=1, inplace=True)

In [13]:
# Chargement de la classe Automodel
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoConfig

model = AutoModel.from_pretrained("camembert-base", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("camembert-base", truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [14]:
#
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase       # nom de la colonne texte dans la base
        self.targets = self.data.Sentiment # nom de la variable y dans le base (binaire 1/0)
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split()) # jointure des séquences de texte
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len, # 512 pour CamemBERT
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"] # Used to differentiate segments in tasks like Next Sentence Prediction (NSP) in BERT. For single-sentence tasks, it is usually all zeros.

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [15]:
MAX_LEN = 512 # taille de la séquence
TRAIN_BATCH_SIZE = 2 # le texte / commentaires courts => 8 ou 16 / 32/ 64 (tenir compte du GPU)
VALID_BATCH_SIZE = 2
LEARNING_RATE = 1e-05 # paramètre d'apprentissage de la descente du gradient pour favoriser une meilleur convergence

train_size = 0.7
train_data = base_f.sample(frac=train_size,random_state=200)
test_data = base_f.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(base_f.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1956, 2)
TRAIN Dataset: (1369, 2)
TEST Dataset: (587, 2)


In [16]:
train_data.describe()

Unnamed: 0,Sentiment
count,1369.0
mean,0.509131
std,0.500099
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [17]:
#paramètres d'entrainements et de tests
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0 # permet de charger les données un peu plus vite (tenir compte du matériel)
                }

test_params = {'batch_size': VALID_BATCH_SIZE, # 2 décisions par 2 décisions
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Création d'une couche de neurone pour le fine tuning

In [18]:
# Ajoute d'une couche de neurones à CamemBERT

class CamembertClass(torch.nn.Module): # définition d’un modèle personnalisé basé sur torch.nn.Module
    def __init__(self):
        super(CamembertClass, self).__init__()
        self.layer_final = model # model CamemBERT-base
        self.pre_classifier = torch.nn.Linear(768, 768) # ajout d'une couche intermédiare afin d'apprendre une meilleur représentation
        self.dropout = torch.nn.Dropout(0.3) # désaction aléatoire des neurones pour éviter le surapprentissage
        self.classifier = torch.nn.Linear(768, 2) # ajout d'un classifieur final pour avoir une sortie binaire ARTICLE_700 qui est binaire (à personnaliser en fonction de votre output : 3 pour l'analyse de sentiment (positif négatif neutre)... )

    def forward(self, input_ids, attention_mask, token_type_ids): #réorganisation du réseau
        output_1 = self.layer_final(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0, :]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [19]:
model = CamembertClass()

In [20]:
model

CamembertClass(
  (layer_final): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Fine Tuning the Model


In [21]:
# Fonction de coût à rééquilibrer pour les données déséquilibrées
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [22]:
# Calcul de la précision du classement

def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [23]:
# Entrainement

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)): # tqdm = progress bar
        ids = data['ids'].to(dtype = torch.long)  # to(device, dtype = torch.long)
        mask = data['mask'].to(dtype = torch.long)# to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(dtype = torch.long)# to(device, dtype = torch.long)
        targets = data['targets'].to(dtype = torch.long)# to(device, dtype = torch.long)
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids) #prédiction de l'output Article_700 1/0
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)
        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%5000==0:# multiple of 5000 => à changer à 500 par exemple
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")


        loss.backward()  # retro-propagation
        optimizer.step() # mise à jour des coefficients

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [24]:
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
0it [00:00, ?it/s]


TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'pad_to_max_length'

## Validation de Model

In [None]:
import torchmetrics
from torchmetrics.classification import BinaryStatScores, MulticlassConfusionMatrix

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(dtype = torch.long) #to(device, dtype = torch.long)
            mask = data['mask'].to(dtype = torch.long) #to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(dtype=torch.long) #to(device, dtype = torch.long)
            targets = data['targets'].to(dtype = torch.long) #to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)#.squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples

    return epoch_accu


In [None]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

In [None]:
output_model_file = 'Modele_CamemBERT.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')