In [1]:
import json
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score,precision_score, recall_score
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from transformers import CamembertForSequenceClassification, CamembertTokenizer, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [3]:
def extract_info(json_file):
    """Parse a JSON file of marker-annotated text into a DataFrame.

    The file must contain a JSON object whose values are strings. Every value
    is split on newline characters and each resulting line yields one row.
    Within a line, a field starts at one of the circled-letter markers
    (Ⓟ Ⓞ Ⓕ Ⓜ Ⓐ Ⓒ Ⓚ Ⓑ Ⓘ Ⓗ Ⓔ) and runs up to the next marker or the end of the
    line. The marker is removed from the stored value.

    Args:
        json_file: Path of the JSON file. It is read as UTF-8 so that the
            circled-letter markers decode identically on every platform.

    Returns:
        pandas.DataFrame with the columns 'Target', 'Nom', 'Prenom',
        'Profession', 'Relation', 'Age', 'Année', 'Nationalité', 'Ville' and
        'Plus'. A field missing from a line is the empty string. 'Target' is
        True when the name is introduced by Ⓟ, False when introduced by Ⓞ, and
        "" when the line has neither. The columns are present even when the
        file yields no row.
    """
    columns = ['Target', 'Nom', 'Prenom', 'Profession', 'Relation',
               'Age', 'Année', 'Nationalité', 'Ville', 'Plus']

    # Markers whose value is stored unchanged once the marker is removed.
    simple_fields = {
        'Ⓕ': 'Prenom',
        'Ⓜ': 'Profession',
        'Ⓐ': 'Age',
        'Ⓚ': 'Nationalité',
        'Ⓑ': 'Année',
        'Ⓘ': 'Ville',
        'Ⓔ': 'Plus',
    }

    # Lazily consume text up to (but not including) the next marker, so every
    # non-empty match holds at most one of the listed markers, at its start.
    splitter = re.compile(r"[\S\s]*?(?=Ⓟ|Ⓞ|Ⓕ|Ⓜ|Ⓐ|Ⓒ|Ⓚ|Ⓑ|Ⓘ|Ⓗ|Ⓔ|$)")

    # Explicit encoding: the locale default may not be able to decode the markers.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    data_list = []
    for info in data.values():
        for line in info.split("\n"):
            record = dict.fromkeys(columns, "")
            for chunk in splitter.findall(line):
                element = chunk.strip()
                if not element:
                    # findall also returns the empty matches between fields.
                    continue
                if 'Ⓟ' in element:
                    record['Target'] = True
                    record['Nom'] = element.replace('Ⓟ', '')
                elif 'Ⓞ' in element:
                    record['Target'] = False
                    record['Nom'] = element.replace('Ⓞ', '')
                elif 'Ⓗ' in element or 'Ⓒ' in element:
                    relation = element.replace('Ⓗ', '').replace('Ⓒ', '')
                    # Ⓛ is not a splitting marker: anything from Ⓛ onwards is
                    # dropped from the relation (no-op when Ⓛ is absent).
                    record['Relation'] = relation.split('Ⓛ', 1)[0]
                else:
                    for marker, column in simple_fields.items():
                        if marker in element:
                            record[column] = element.replace(marker, '')
                            break
            data_list.append(record)

    return pd.DataFrame(data_list, columns=columns)

In [3]:
# Path to the JSON file
json_file = "entities.json"

# Extract the fields from the JSON file and build the dataset
df=extract_info(json_file)

# Preview the first rows
df.head()

Unnamed: 0,Target,Nom,Prenom,Profession,Relation,Age,Année,Nationalité,Ville,Plus
0,False,Breton,Cyrille,menuisier,Garçon,25,,française,,
1,True,Ferazzi,Auguste,vitrier,Garçon,30,,Piémontaise,,
2,True,Machol,Pierre,vitrier,Garçon,24,,Piémontaise,,
3,True,Desbois,Alexandre,prop re,Homme marié,48,,française,,
4,False,Vignat,Zélie,prop re,sa fe,30,,française,,


In [4]:
# Keep only the rows that have a first name and an explicit True/False label
has_first_name = df['Prenom'] != ''
has_label = (df['Target'] == True) | (df['Target'] == False)
df = df[has_first_name & has_label]

In [5]:
# Drop the rows labelled False whose relation starts with "ch" or "Ch"
relation = df['Relation'].str
starts_with_ch = relation.startswith('ch') | relation.startswith('Ch')
index_to_drop = df[(df['Target'] == False) & starts_with_ch].index
df = df.drop(index_to_drop)

In [6]:
def create_description_column(df):
    """Add a 'Description' column made of each row's non-empty fields.

    The description is "<Prenom> <Nom> " (only when 'Prenom' is non-empty)
    followed by 'Age', 'Nationalité', 'Profession', 'Relation' and 'Ville',
    in that order. Each non-empty value is followed by a single space.

    Args:
        df: DataFrame with the columns 'Prenom', 'Nom', 'Age', 'Nationalité',
            'Profession', 'Relation' and 'Ville'. Missing values are expected
            to be empty strings.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    optional_columns = ('Age', 'Nationalité', 'Profession', 'Relation', 'Ville')

    descriptions = []
    for _, row in df.iterrows():
        # Reset on every row: a row with an empty first name must neither fail
        # nor inherit the text built for the previous row.
        description = ""
        if row['Prenom'] != '':
            description = row['Prenom'] + " " + row['Nom'] + " "
        for column in optional_columns:
            if row[column] != '':
                description += str(row[column]) + " "
        descriptions.append(description)

    # One column assignment instead of a per-row .at write.
    df['Description'] = descriptions
    return df

# Build the new column, then force it to the str type
df = create_description_column(df)
df['Description'] = df['Description'].astype(str)

In [7]:
# Preview the first rows, now including the 'Description' column
df.head()

Unnamed: 0,Target,Nom,Prenom,Profession,Relation,Age,Année,Nationalité,Ville,Plus,Description
0,False,Breton,Cyrille,menuisier,Garçon,25,,française,,,Cyrille Breton 25 française menuisier Garçon
1,True,Ferazzi,Auguste,vitrier,Garçon,30,,Piémontaise,,,Auguste Ferazzi 30 Piémontaise vitrier Garçon
2,True,Machol,Pierre,vitrier,Garçon,24,,Piémontaise,,,Pierre Machol 24 Piémontaise vitrier Garçon
3,True,Desbois,Alexandre,prop re,Homme marié,48,,française,,,Alexandre Desbois 48 française prop re Homme m...
4,False,Vignat,Zélie,prop re,sa fe,30,,française,,,Zélie Vignat 30 française prop re sa fe


In [8]:
# Split the data into training and test sets (20% held out, fixed seed)
# NOTE(review): the split is not stratified on 'Target' — confirm this is intended.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Load the 'camembert-base' tokenizer and a sequence classifier with two labels
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Créer notre jeu de données du modèle
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the 'Description' of a DataFrame row on access.

    Args:
        data: DataFrame with a 'Description' column (text) and a 'Target'
            column (label convertible with ``int``, e.g. a bool).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``.
            Assumed to return 'input_ids' and 'attention_mask' tensors with a
            leading batch dimension of size 1 — TODO confirm for other tokenizers.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sample at positional index ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (batch dimension
            removed) and 'labels' (0-d ``torch.long`` tensor).
        """
        # Positional lookup done once; both fields come from the same row.
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            row['Description'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # int() turns Python or numpy booleans into 0/1 before tensor creation.
            'labels': torch.tensor(int(row['Target']), dtype=torch.long),
        }

In [11]:
# Data preparation: wrap both splits in the torch dataset
max_length = 128
# Only the first 10000 rows of the training split are used for training
train_subset = train_data.iloc[:10000]
train_dataset = CustomDataset(train_subset, tokenizer, max_length)
test_dataset = CustomDataset(test_data, tokenizer, max_length)

In [12]:
# Training configuration
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate once per epoch.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
)

In [16]:
# Metric computation
metric_name_list = ['accuracy', 'f1', 'precision', 'recall']

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}

In [17]:
# Build the Trainer; compute_metrics is applied to the evaluation predictions.
# The test split is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [18]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1172,0.161354,0.948211,0.887686,0.867725,0.908587
2,0.1086,0.193073,0.947379,0.884948,0.871864,0.89843
3,0.0846,0.17905,0.947379,0.885365,0.869217,0.902124


TrainOutput(global_step=3750, training_loss=0.1466537136832873, metrics={'train_runtime': 2945.728, 'train_samples_per_second': 10.184, 'train_steps_per_second': 1.273, 'total_flos': 1973332915200000.0, 'train_loss': 0.1466537136832873, 'epoch': 3.0})

In [19]:
# Evaluate the model on the test dataset
results = trainer.evaluate(eval_dataset=test_dataset)

In [21]:
# Report the evaluation metrics, one per line
for label, key in (
    ("Accuracy:", 'eval_accuracy'),
    ("F1-score:", 'eval_f1'),
    ("Precision:", 'eval_precision'),
    ("Recall:", 'eval_recall'),
):
    print(label, results[key])

Accuracy: 0.9473793677204659
F1-score: 0.8853647485274128
Precision: 0.8692170818505338
Recall: 0.9021237303785781


## Piste d'amélioration: Description contextuelle

In [4]:
# Path to the JSON file
json_file = "entities.json"

# Extract the fields from the JSON file and build the dataset
df=extract_info(json_file)

In [5]:
# Keep only the rows that have a first name and an explicit True/False label
has_first_name = df['Prenom'] != ''
has_label = (df['Target'] == True) | (df['Target'] == False)
df = df[has_first_name & has_label]

# Drop the rows labelled False whose relation starts with "ch" or "Ch"
relation = df['Relation'].str
starts_with_ch = relation.startswith('ch') | relation.startswith('Ch')
index_to_drop = df[(df['Target'] == False) & starts_with_ch].index
df = df.drop(index_to_drop)

In [6]:
def create_description_column(df):
    """Add a 'Description' column holding a French sentence built from each row.

    The sentence is assembled from the non-empty fields, in this order:
    'Prenom' + 'Nom', 'Age', 'Nationalité', 'Profession', 'Relation', 'Ville'.
    Each field contributes its own fixed clause; clauses of empty fields are
    omitted.

    Args:
        df: DataFrame with the columns 'Prenom', 'Nom', 'Age', 'Nationalité',
            'Profession', 'Relation' and 'Ville'. Missing values are expected
            to be empty strings.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    descriptions = []
    for _, row in df.iterrows():
        # Reset on every row: a row with an empty first name must neither fail
        # nor inherit the sentence built for the previous row.
        description = ""
        if row['Prenom'] != '':
            description = "La personne s'appelle " + row['Prenom'] + " " + row['Nom'] + ","
        if row['Age'] != '':
            description += " âgé de " + str(row['Age']) + " ans,"
        if row['Nationalité'] != '':
            description += " de nationalité " + row['Nationalité'] + ","
        if row['Profession'] != '':
            description += " exerçant la profession de " + row['Profession'] + ","
        if row['Relation'] != '':
            description += " étant " + row['Relation'] + " du ménage,"
        if row['Ville'] != '':
            description += " habitant à " + row['Ville'] + "."
        descriptions.append(description)

    # One column assignment instead of a per-row .at write.
    df['Description'] = descriptions
    return df

# Build the new 'Description' column
df = create_description_column(df)

In [7]:
# Preview the first rows, now including the 'Description' column
df.head()

Unnamed: 0,Target,Nom,Prenom,Profession,Relation,Age,Année,Nationalité,Ville,Plus,Description
0,False,Breton,Cyrille,menuisier,Garçon,25,,française,,,"La personne s'appelle Cyrille Breton, âgé de 2..."
1,True,Ferazzi,Auguste,vitrier,Garçon,30,,Piémontaise,,,"La personne s'appelle Auguste Ferazzi, âgé de ..."
2,True,Machol,Pierre,vitrier,Garçon,24,,Piémontaise,,,"La personne s'appelle Pierre Machol, âgé de 24..."
3,True,Desbois,Alexandre,prop re,Homme marié,48,,française,,,"La personne s'appelle Alexandre Desbois, âgé d..."
4,False,Vignat,Zélie,prop re,sa fe,30,,française,,,"La personne s'appelle Zélie Vignat, âgé de 30 ..."


In [8]:
# Split the data into training and test sets (20% held out, fixed seed)
# NOTE(review): the split is not stratified on 'Target' — confirm this is intended.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Load the 'camembert-base' tokenizer and a fresh sequence classifier with two labels
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Créer notre jeu de données du modèle
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the 'Description' of a DataFrame row on access.

    Args:
        data: DataFrame with a 'Description' column (text) and a 'Target'
            column (label convertible with ``int``, e.g. a bool).
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``.
            Assumed to return 'input_ids' and 'attention_mask' tensors with a
            leading batch dimension of size 1 — TODO confirm for other tokenizers.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sample at positional index ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (batch dimension
            removed) and 'labels' (0-d ``torch.long`` tensor).
        """
        # Positional lookup done once; both fields come from the same row.
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            row['Description'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # int() turns Python or numpy booleans into 0/1 before tensor creation.
            'labels': torch.tensor(int(row['Target']), dtype=torch.long),
        }

In [11]:
# Data preparation: wrap both splits in the torch dataset
max_length = 128
# Only the first 10000 rows of the training split are used for training
train_subset = train_data.iloc[:10000]
train_dataset = CustomDataset(train_subset, tokenizer, max_length)
test_dataset = CustomDataset(test_data, tokenizer, max_length)

In [12]:
# Training configuration
training_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate once per epoch.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
)

In [13]:
# Metric computation
metric_name_list = ['accuracy', 'f1', 'precision', 'recall']

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}

In [14]:
# Build the Trainer; compute_metrics is applied to the evaluation predictions.
# The test split is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [15]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2749,0.176351,0.944468,0.875292,0.885633,0.865189
2,0.1143,0.148509,0.949875,0.890901,0.87389,0.908587
3,0.1153,0.177642,0.948419,0.888389,0.86655,0.911357


TrainOutput(global_step=3750, training_loss=0.1630631441195806, metrics={'train_runtime': 3060.1175, 'train_samples_per_second': 9.804, 'train_steps_per_second': 1.225, 'total_flos': 1973332915200000.0, 'train_loss': 0.1630631441195806, 'epoch': 3.0})

In [16]:
# Evaluate the model on the test dataset
results = trainer.evaluate(eval_dataset=test_dataset)

In [17]:
# Report the evaluation metrics, one per line
for label, key in (
    ("Accuracy:", 'eval_accuracy'),
    ("F1-score:", 'eval_f1'),
    ("Precision:", 'eval_precision'),
    ("Recall:", 'eval_recall'),
):
    print(label, results[key])

Accuracy: 0.9484193011647255
F1-score: 0.8883888388838884
Precision: 0.8665496049165935
Recall: 0.9113573407202216
