In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Data 

### First name with sex

In [None]:
# The file is read as a single column named 'firstname;male;female'
# (the fields are separated by ';', not by the default ',').
raw_column = 'firstname;male;female'
firstname = pd.read_csv('firstname_with_sex.csv')

# Split the packed column into its three fields: ['firstname', 'male', 'female']
new_columns = firstname[raw_column].str.split(';', expand=True)
firstname[['firstname', 'male', 'female']] = new_columns

# Drop the packed column and cast both counts to integers
firstname = firstname.drop(columns=[raw_column]).astype({'male': int, 'female': int})

firstname.head()

### Transcriptions with sex

In [None]:
# Load the transcriptions together with their 'sex' annotation
transcriptions = pd.read_csv('transcriptions_with_sex.csv')

# Keep only the rows whose label is not 'ambigu'. The explicit .copy() makes the
# result an independent DataFrame, so the column assignments performed further
# down on `transcriptions` do not raise pandas' SettingWithCopyWarning.
transcriptions = transcriptions[transcriptions['sex'] != 'ambigu'].copy()

def extract_info_groundtruth(text):
    """Extract the labelled fields of a ground-truth string.

    The text is searched for ``label: value`` pairs for the labels
    surname, firstname, occupation, link, employer, age, birth_date and lob.
    Only the first whitespace-delimited token following a label is captured
    (digits only for ``age`` and ``birth_date``).

    Parameters
    ----------
    text : str
        The ground-truth string. Any non-string input (e.g. NaN for a
        missing cell) yields empty values instead of raising.

    Returns
    -------
    pd.Series
        One entry per label, in the order listed above; ``''`` when the
        label is absent or has no value.
    """
    labels = ('surname', 'firstname', 'occupation', 'link', 'employer', 'age', 'birth_date', 'lob')
    digit_fields = ('age', 'birth_date')
    results = {key: '' for key in labels}

    # A missing cell is not searchable; treat it as "no field found".
    if not isinstance(text, str):
        return pd.Series(results)

    # Negative lookahead: the captured token must not itself be a label.
    # Without it, an empty field such as "surname: firstname: Marie" would
    # yield surname == "firstname:".
    not_a_label = r'(?!(?:' + '|'.join(labels) + r'):)'

    for key in labels:
        if key in digit_fields:
            pattern = key + r':\s*(\d+)'
        else:
            pattern = key + r':\s*' + not_a_label + r'(\S+)'
        match = re.search(pattern, text)
        if match:
            results[key] = match.group(1)
    return pd.Series(results)

# Parse every ground-truth string and store each extracted field in its own column
info_columns = ['surname', 'firstname', 'occupation', 'link', 'employer', 'age', 'birth_date', 'lob']
transcriptions[info_columns] = transcriptions['groundtruth'].apply(extract_info_groundtruth)

# Make sure every textual column holds str values
text_columns = ['surname', 'firstname', 'occupation', 'link', 'employer', 'lob']
transcriptions[text_columns] = transcriptions[text_columns].astype(str)

# Make sure 'age' and 'birth_date' are numeric (unparsable values become NaN)
for numeric_column in ('age', 'birth_date'):
    transcriptions[numeric_column] = pd.to_numeric(transcriptions[numeric_column], errors='coerce')

# Drop the columns that are probably of little use for the prediction
unused_columns = ['subject_line', 'groundtruth', 'prediction', 'surname', 'employer', 'lob']
transcriptions.drop(columns=unused_columns, inplace=True)
transcriptions.head(5)

# I - Présentation et description des données

## I.1 - Statistiques descriptives des données disponibles

### Fichier firstname_with_sex.csv

#### Distribution des prénoms : Nombre total de prénoms uniques, les prénoms les plus fréquents.

In [None]:
# Count the distinct first names (to check that each row corresponds to one name)
n_unique_firstnames = firstname['firstname'].nunique()
print("Nombre total de prénoms uniques :", n_unique_firstnames)

In [None]:
# Five most frequent first names among men
male_counts = firstname[['firstname', 'male']]
male_counts.sort_values('male', ascending=False, ignore_index=True).head(5)

In [None]:
# Five most frequent first names among women
female_counts = firstname[['firstname', 'female']]
female_counts.sort_values('female', ascending=False, ignore_index=True).head(5)

#### Répartition du sexe : Pourcentage de prénoms majoritairement masculins, féminins et neutres selon les fréquences données.

In [None]:
# For each first name: total number of occurrences, then the share (in %)
# of male and of female occurrences
firstname['total'] = firstname['male'].add(firstname['female'])
for gender in ('male', 'female'):
    firstname[f'percent_{gender}'] = firstname[gender].div(firstname['total']).mul(100)
firstname.head(5)

In [None]:
# Histogram of the male share (in %) across first names
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(firstname['percent_male'], bins=50, color='blue', alpha=0.7, label='Homme', ax=ax)
ax.set_xlabel('Percentage')
ax.set_ylabel('Frequence')
ax.set_title('Distribution des prénoms chez les Hommes en %')
ax.legend()
plt.show()

In [None]:
# Histogram of the female share (in %) across first names
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(firstname['percent_female'], bins=50, color='red', alpha=0.7, label='Female', ax=ax)
ax.set_xlabel('Percentage')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Names by Gender Percentage')
ax.legend()
plt.show()

### Fichier transcriptions_with_sex.csv

In [None]:
# Number of records per 'sex' label
sex_counts = transcriptions['sex'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=sex_counts.index, y=sex_counts.values, palette='pastel', ax=ax)
ax.set_title('Répartition par sexe')
ax.set_ylabel('Nombre')
plt.show()

# Distribution of the ages
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(transcriptions['age'], bins=50, color='purple', ax=ax)
ax.set_title('Distribution des âges')
ax.set_xlabel('Âge')
ax.set_ylabel('Fréquence')
plt.show()

# Ten most frequent occupations
occupation_counts = transcriptions['occupation'].value_counts().head(10)
sns.set_palette('deep')
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=occupation_counts.values, y=occupation_counts.index, palette='viridis', ax=ax)
ax.set_title('Top 10 des professions')
ax.set_xlabel('Fréquence')
ax.set_ylabel('Profession')
plt.show()

# Ten most frequent values of the 'link' field
link_counts = transcriptions['link'].value_counts().head(10)
sns.set_palette('deep')
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=link_counts.values, y=link_counts.index, palette='magma', ax=ax)
ax.set_title('Top 10 des liens')
ax.set_xlabel('Fréquence')
ax.set_ylabel('Liens')
plt.show()


## I.2 Estimation de la taille des données cibles

In [None]:
# Print a summary of the cleaned table: columns, non-null counts, dtypes and memory usage
transcriptions.info()

# III - Expérimentation

## III.1 - Protocole Expérimental

### III.1.a - Classifier SKLearn with naive_bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

Préprocessing - Le modèle que l'on utilise prend en entrée une liste de chaînes de caractères contenant toutes les informations, ainsi que les étiquettes à prédire associées.

Nous allons donc revenir à un format de données textuelles similaire à "groundtruth", celui que l'on avait avant le traitement de notre base de données, mais en ayant tout de même retiré les informations peu pertinentes.

Ce code a été adapté depuis la ressource officielle : https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
def get_texts(transcriptions):
    """Build one descriptive sentence per row of ``transcriptions``.

    Parameters
    ----------
    transcriptions : pd.DataFrame
        Must contain the columns 'firstname', 'occupation', 'link', 'age'
        and 'birth_date'.

    Returns
    -------
    list of str
        One string per row, in row order, of the form
        "firstname: ..., occupation: ..., link: ..., age: ..., birth_date: ...".
    """
    columns = ['firstname', 'occupation', 'link', 'age', 'birth_date']
    # Walk the five columns in lock-step instead of doing five scalar
    # .iloc lookups per row; the formatted values are the same.
    rows = zip(*(transcriptions[column].to_numpy() for column in columns))
    return [
        f"firstname: {firstname}, occupation: {occupation}, link: {link}, age: {age}, birth_date: {birth_date}"
        for firstname, occupation, link, age, birth_date in rows
    ]

In [None]:
# One text per record, and the binary target (1 when 'sex' is 'femme', 0 otherwise)
all_texts = get_texts(transcriptions)
all_labels = (transcriptions['sex'] == 'femme').astype(int).to_list()

# Hold out 20% of the records for testing; the fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=.20, random_state=42
)

# Token counts -> TF-IDF weighting -> multinomial naive Bayes.
# The pipeline fits the vectoriser and the transformer itself, so no separate
# CountVectorizer / TfidfTransformer fitting is needed beforehand.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(train_texts, train_labels)

In [None]:
# Accuracy of the pipeline on the held-out texts
predicted = text_clf.predict(test_texts)
(predicted == np.asarray(test_labels)).mean()

### III.1.b - Réseau de Neurones Récurrents

Pour cette approche, nous allons encoder les prénoms en séquences de caractères et utiliser un RNN pour prédire le sexe.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
# Character vocabulary: every distinct character found in the first names gets
# an index starting at 1 (0 is used for padding). sorted() makes the mapping
# deterministic: iterating a bare set of strings can change order between kernels.
char_to_index = {char: idx + 1 for idx, char in enumerate(sorted(set(''.join(transcriptions['firstname']))))}
max_length = max(len(name) for name in transcriptions['firstname'])

def encode_names(name):
    """Encode one name as a list of character indices, right-padded with 0 up to ``max_length``."""
    encoded = [char_to_index[char] for char in name]
    return encoded + [0] * (max_length - len(name))

# Build the tensors
X_rnn = torch.tensor([encode_names(name) for name in transcriptions['firstname']])
# Binary target (1 when 'sex' is 'femme', 0 otherwise). The previous expression
# referenced a variable `y` that is not defined in this notebook.
y_rnn = torch.tensor((transcriptions['sex'] == 'femme').astype(int).values)

# Build the DataLoader
dataset = TensorDataset(X_rnn, y_rnn)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# RNN model definition
class RNN(nn.Module):
    """Character-level RNN classifier returning one probability per name.

    ``forward`` takes a ``(batch, seq_len)`` tensor of character indices in
    which 0 marks trailing padding, and returns a ``(batch, 1)`` tensor of
    sigmoid outputs.
    """

    def __init__(self):
        super(RNN, self).__init__()
        # padding_idx=0 keeps the padding embedding at the zero vector.
        self.embedding = nn.Embedding(num_embeddings=len(char_to_index) + 1, embedding_dim=8, padding_idx=0)
        self.rnn = nn.RNN(input_size=8, hidden_size=16, batch_first=True)
        self.fc = nn.Linear(16, 1)

    def forward(self, x):
        # Number of real (non-padding) characters per name; at least 1 so that
        # an all-padding row still selects a valid position.
        lengths = (x != 0).sum(dim=1).clamp(min=1)
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        # Read the hidden state at the last real character rather than at the
        # last time step, which for short names is reached only after
        # processing padding and thus depends on the padding width.
        batch_idx = torch.arange(outputs.size(0), device=outputs.device)
        last_valid = outputs[batch_idx, lengths - 1]
        return torch.sigmoid(self.fc(last_valid))

In [None]:
# Device selection: GPU when available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Default hyper-parameters
batch_size = 32
num_epochs = 10
lr = 0.001

# Dataset creation
def encode_name(name):
    """Encode a name as letter ranks (a=1 ... z=26) after lower-casing.

    Characters outside a-z (accented letters, hyphens, spaces, ...) are dropped.
    """
    return [ord(char) - ord('a') + 1 for char in name.lower() if 'a' <= char <= 'z']

max_length = max(len(name) for name in transcriptions['firstname'])
# Encode each name once, then right-pad with 0 up to max_length.
encoded_names = [encode_name(name) for name in transcriptions['firstname']]
names_encoded = torch.tensor(
    [codes + [0] * (max_length - len(codes)) for codes in encoded_names],
    dtype=torch.long,
)
# Class index per record: 1 when 'sex' is 'femme', 0 otherwise (long dtype, as
# required for class-index targets).
sex_encoded = torch.tensor((transcriptions['sex'] == 'femme').astype(int).values, dtype=torch.long)

# 80% / 20% train/test split; the seeded generator makes it reproducible.
dataset = TensorDataset(names_encoded, sex_encoded)
n_train = int(0.8 * len(dataset))
train_dataset, test_dataset = random_split(
    dataset,
    [n_train, len(dataset) - n_train],
    generator=torch.Generator().manual_seed(42),
)


def rnn(batch_size, num_epochs, lr):
    """Train a character-level GRU classifier and evaluate it.

    Uses the module-level ``train_dataset``, ``test_dataset`` and ``device``.
    Inputs are expected to be ``(batch, seq_len)`` index tensors with values
    in 0..26 where 0 marks trailing padding.

    Parameters
    ----------
    batch_size : int
        Batch size of both data loaders.
    num_epochs : int
        Number of passes over the training set.
    lr : float
        Learning rate of the Adam optimiser.

    Returns
    -------
    float
        Test accuracy as a fraction in [0, 1] (``nan`` if the test set is
        empty). The accuracy is also printed as a percentage.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Model definition
    class NameRNN(nn.Module):
        def __init__(self):
            super(NameRNN, self).__init__()
            # 27 indices: 0 for padding, 1..26 for the letters a..z.
            self.embedding = nn.Embedding(27, 8, padding_idx=0)
            self.rnn = nn.GRU(8, 16, batch_first=True)
            self.fc = nn.Linear(16, 2)

        def forward(self, x):
            # Read the hidden state at the last real character, not at the
            # last (possibly padded) time step.
            lengths = (x != 0).sum(dim=1).clamp(min=1)
            outputs, _ = self.rnn(self.embedding(x))
            batch_idx = torch.arange(outputs.size(0), device=outputs.device)
            return self.fc(outputs[batch_idx, lengths - 1])

    model = NameRNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    def train_model(model, train_loader, criterion, optimizer, num_epochs=num_epochs):
        model.train()
        for epoch in range(num_epochs):
            loss = None
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            # The reported value is the loss of the last batch of the epoch.
            if loss is not None:
                print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

    # Evaluation loop
    def evaluate_model(model, test_loader):
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total if total else float('nan')
        print(f'Accuracy: {100 * accuracy}%')
        return accuracy

    train_model(model, train_loader, criterion, optimizer)
    return evaluate_model(model, test_loader)


### III.1.c - HuggingFace : Modèle BERT

In [None]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

In [None]:
# transcriptions = transcriptions.sample(frac = 1).reset_index(drop=True)
# transcriptions

In [None]:
# Positional split without shuffling: the first fifth of the rows is the
# test set, the remaining rows are the training set.
n_obs = len(transcriptions)
n_test = n_obs // 5
transcriptions_test = transcriptions.iloc[:n_test]
transcriptions_train = transcriptions.iloc[n_test:]

Le code utilisé a été adapté depuis la ressource officielle pour l'utilisation des modèles d'HuggingFace : https://huggingface.co/transformers/v3.2.0/custom_datasets.html#sequence-classification-with-imdb-reviews

In [None]:
def get_texts(transcriptions):
    """Build one descriptive sentence per row of ``transcriptions``.

    Parameters
    ----------
    transcriptions : pd.DataFrame
        Must contain the columns 'firstname', 'occupation', 'link', 'age'
        and 'birth_date'.

    Returns
    -------
    list of str
        One string per row, in row order, of the form
        "firstname: ..., occupation: ..., link: ..., age: ..., birth_date: ...".
    """
    columns = ['firstname', 'occupation', 'link', 'age', 'birth_date']
    # Walk the five columns in lock-step instead of doing five scalar
    # .iloc lookups per row; the formatted values are the same.
    rows = zip(*(transcriptions[column].to_numpy() for column in columns))
    return [
        f"firstname: {firstname}, occupation: {occupation}, link: {link}, age: {age}, birth_date: {birth_date}"
        for firstname, occupation, link, age, birth_date in rows
    ]

class IMDbDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values are indexable per sample;
    ``labels`` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Every encoding field of sample `idx` as a tensor, plus its label
        # under the 'labels' key.
        item = {}
        for key, values in self.encodings.items():
            item[key] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
def compute_accuracy(model, dataloader):
    """Return the fraction of samples of ``dataloader`` that ``model`` classifies correctly.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=...)``; its output must
        expose a ``logits`` attribute.
    dataloader : iterable
        Yields dict batches with the keys 'input_ids', 'attention_mask'
        and 'labels'.

    Returns
    -------
    float
        Accuracy in [0, 1]; ``nan`` when the dataloader yields no sample.
    """
    model.eval()

    # Run on the device the model lives on instead of relying on a global
    # `device` variable defined in another cell.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            correct_predictions += torch.sum(predictions == labels).item()
            total_predictions += len(labels)

    if total_predictions == 0:
        return float('nan')
    return correct_predictions / total_predictions

def BERT(num_epochs, batch_size, wd, lr):
    """Fine-tune DistilBERT on the transcription texts and report its accuracies.

    Uses the module-level ``transcriptions_train`` and ``transcriptions_test``
    frames; 20% of the training texts are held out as a validation set.

    Parameters
    ----------
    num_epochs : int
        Number of passes over the training set.
    batch_size : int
        Batch size used for training and evaluation.
    wd : float
        Weight decay of the AdamW optimiser.
    lr : float
        Learning rate of the AdamW optimiser.

    Returns
    -------
    list of float
        ``[train_accuracy, val_accuracy, test_accuracy]``.
    """
    # Texts and binary labels (1 when 'sex' is 'femme', 0 otherwise)
    train_texts, train_labels = get_texts(transcriptions_train), (transcriptions_train['sex'] == 'femme').astype(int).to_list()
    test_texts, test_labels = get_texts(transcriptions_test), (transcriptions_test['sex'] == 'femme').astype(int).to_list()

    # Validation / training split. The fixed random_state gives every
    # hyper-parameter combination the same split, so results are comparable.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=.20, random_state=42
    )

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

    train_dataset = IMDbDataset(train_encodings, train_labels)
    val_dataset = IMDbDataset(val_encodings, val_labels)
    test_dataset = IMDbDataset(test_encodings, test_labels)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # A single model is trained and evaluated. Training a first model with
    # Trainer and then evaluating a second, freshly loaded one meant that
    # num_epochs and wd had no effect on the reported accuracies.
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)

    for epoch in range(num_epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # The first element of the output is the loss when labels are given.
            loss = outputs[0]
            loss.backward()
            optimizer.step()

    # Unshuffled loaders for the evaluation of the three sets
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Accuracy on the training set
    train_accuracy = compute_accuracy(model, train_loader)
    print(f"Train Accuracy: {train_accuracy:.4f}")

    # Accuracy on the validation set
    val_accuracy = compute_accuracy(model, val_loader)
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Accuracy on the test set
    test_accuracy = compute_accuracy(model, test_loader)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    return [train_accuracy, val_accuracy, test_accuracy]

In [None]:
# Candidate values of each hyper-parameter
num_epochs_list = [3, 5, 10, 15]
batch_size_list = [8, 16, 32, 64]
wd_list = [0.1, 0.01, 0.001, 0.0001]
lr_list = [1e-5, 2e-5, 5e-5, 1e-4]

accuracy_list, hyperparameters_list, results_list = [], [], []

# Every combination, enumerated in the same order as four nested loops
# (epochs outermost, learning rate innermost)
grid = [
    (num_epochs, batch_size, wd, lr)
    for num_epochs in num_epochs_list
    for batch_size in batch_size_list
    for wd in wd_list
    for lr in lr_list
]

for num_epochs, batch_size, wd, lr in grid:
    hyperparameters = [num_epochs, batch_size, wd, lr]
    accuracy = BERT(num_epochs, batch_size, wd, lr)

    hyperparameters_list.append(hyperparameters)
    accuracy_list.append(accuracy)
    # One result row: the four hyper-parameters followed by the three accuracies
    results_list.append(hyperparameters + accuracy)

# Store all the runs in a single CSV file
column_names = ['epochs', 'batch', 'wd', 'lr', 'train_accuracy', 'val_accuracy', 'test_accuracy']
results_data = pd.DataFrame(results_list, columns=column_names)
results_data.to_csv('resultats.csv', index=False)

## III.2 - Analyse des résultats obtenus

In [None]:
# Read the results file 'resultats.csv' back from disk and display it
df_resultats = pd.read_csv('resultats.csv')
df_resultats

In [None]:
# Accuracy as a function of the number of epochs. Runs sharing the same number
# of epochs are aggregated, so the error band reflects the differences obtained
# when the other hyper-parameters change.
ax = sns.lineplot(data=df_resultats, x='epochs', y='val_accuracy', label='Validation accuracy')
sns.lineplot(data=df_resultats, x='epochs', y='test_accuracy', label='Test accuracy', ax=ax)

ax.set_xlabel('Nombre d\'epochs')
ax.set_ylabel('Précision')

ax.set_title('Accuracy en fontion du nombre d\'epochs')

In [None]:
# Accuracy as a function of the batch size
ax = sns.lineplot(data=df_resultats, x='batch', y='val_accuracy', label='Validation accuracy')
sns.lineplot(data=df_resultats, x='batch', y='test_accuracy', label='Test accuracy', ax=ax)

ax.set_xlabel('Taille des Batchs')
ax.set_ylabel('Précision')

ax.set_title('Accuracy en fontion de la taille des batchs')

In [None]:
# Accuracy as a function of the weight decay (logarithmic x axis)
ax = sns.lineplot(data=df_resultats, x='wd', y='val_accuracy', label='Validation accuracy')
sns.lineplot(data=df_resultats, x='wd', y='test_accuracy', label='Test accuracy', ax=ax)

ax.set_xscale('log')

ax.set_xlabel('Weight Decay')
ax.set_ylabel('Précision')

ax.set_title('Accuracy en fontion du weight decay')

In [None]:
# Accuracy as a function of the learning rate
ax = sns.lineplot(data=df_resultats, x='lr', y='val_accuracy', label='Validation accuracy')
sns.lineplot(data=df_resultats, x='lr', y='test_accuracy', label='Test accuracy', ax=ax)

ax.set_xlabel('Learning Rate')
ax.set_ylabel('Précision')

ax.set_title('Accuracy en fontion du learning-rate')