In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [33]:
name_data = pd.read_csv("firstname_with_sex.csv", sep=";")
df_2 = pd.read_csv("transcriptions_with_sex.csv")

In [34]:
def extract_groundtruth(df):
    # Créer un ensemble vide pour stocker les noms de colonnes uniques
    unique_columns = set()

    # Parcourir chaque ligne de la colonne "groundtruth"
    for entry in df["groundtruth"]:
        # Utiliser une expression régulière pour extraire les noms de colonnes
        matches = re.findall(r"(\w+):", entry)
        unique_columns.update(matches)

    # Créer des dictionnaires vides pour stocker les données extraites
    extracted_data = {col: [] for col in unique_columns}

    # Parcourir chaque ligne de la colonne "groundtruth" et extraire les données
    for entry in df["groundtruth"]:
        matches = re.findall(r"(\w+): (\w+)", entry)
        entry_data = {col: None for col in unique_columns}
        for match in matches:
            col_name, value = match
            entry_data[col_name] = value
        for col, value in entry_data.items():
            extracted_data[col].append(value)

    # Créer un DataFrame avec les données extraites
    new_df = pd.DataFrame(extracted_data)

    return new_df

In [35]:
df_2.shape

(241, 4)

In [36]:
data = df_2.drop("prediction", axis=1)

In [37]:
new_dataframe = extract_groundtruth(df_2[["groundtruth"]])

In [38]:
new_dataframe

Unnamed: 0,civil_status,link,occupation,observation,lob,birth_date,surname,age,employer,firstname
0,,fille,idem,,,,Chardon,30,,Marie
1,,chef,sp,,,,Lhopital,67,,Louis
2,,idem,idem,,,,Papin,15,idem,Marie
3,,femme,,,Rigny,1875,Lavocat,,,Marie
4,,,,,,,Benne,78,,Marguerite
...,...,...,...,...,...,...,...,...,...,...
236,,épouse,sans,,,,Burlurut,61,,Pétronille
237,,son,idem,,,,Combey,39,,Alexandre
238,,épouse,idem,,,,Collin,38,idem,Marguerite
239,,fils,,,idem,1900,Dumont,,,Etienne


In [39]:
for column in new_dataframe.columns:
    print(new_dataframe[column].unique())

[None 'Homme' 'Garçon']
['fille' 'chef' 'idem' 'femme' None 'fils' 'enfant' 'Sa' 'enf' 'Chef'
 'épouse' 'domestique' 'ép' 'père' 'mère' 'Domestique' 'belle' 'Fils'
 'petite' 'sa' 'Leur' 'petit' 'bru' 'pensionnaire' 'leur' 'mére' 'assisté'
 'ouvrier' 'frère' 'Schouer' 'Son' 'ch' 'son' 'domest']
['idem' 'sp' None 'Métayer' 'néant' 'journalier' 'cantonnier'
 'Cultivateur' 'sans' 'domestique' 'couvreur' 'roulier' 's' 'cultivateur'
 'garde' 'journalière' 'manoeuvre' 'forgeron' 'ouvrier' 'propriétaire'
 'Domestique' 'argentière' 'voiturier' 'jardinier' 'cult' 'métayer'
 'femme' 'culivateur' 'boulanger' 'buraliste' 'clerc' 'quincaillier'
 'charcutier' 'sellier' 'Sans' 'imprimeur' 'patissier' 'cullotière'
 'employé' 'blanchiseuse' 'nourrisson' 'cultivatrice' 'cultivat'
 'charretier' 'rentière' 'repasseuse' 'receveur' 'Garde' 'domest'
 'menuisier' 'tourneur' 'agent' 'couturière' 'déposit' 'tisserand'
 'employée' 'Propriétaire' 'coiffeur' 'cultiv']
[None 'x']
[None 'Rigny' 'Paris' 'Saint' 'Coula

In [40]:
for column in new_dataframe.columns:
    data[column] = new_dataframe[column]
    

In [41]:
preprocessed_data = data.drop("groundtruth", axis=1)

In [42]:
preprocessed_data

Unnamed: 0,subject_line,sex,civil_status,link,occupation,observation,lob,birth_date,surname,age,employer,firstname
0,ebb26ada-044c-4c62-9dbc-a9c8d505d31c,femme,,fille,idem,,,,Chardon,30,,Marie
1,338496f5-e4ca-43ac-aa5c-429cb3f6ac00,homme,,chef,sp,,,,Lhopital,67,,Louis
2,e6a279da-9b6f-4f49-b498-64857bc50d1e,femme,,idem,idem,,,,Papin,15,idem,Marie
3,7534deca-39e8-4f00-be17-c12460015de1,femme,,femme,,,Rigny,1875,Lavocat,,,Marie
4,ef334a66-a504-418a-9872-e7c9db923488,femme,,,,,,,Benne,78,,Marguerite
...,...,...,...,...,...,...,...,...,...,...,...,...
236,1d92738a-cffe-4aee-ab10-db04c37f7405,femme,,épouse,sans,,,,Burlurut,61,,Pétronille
237,09440055-7972-4135-a537-e6c5a1f2aeb2,homme,,son,idem,,,,Combey,39,,Alexandre
238,4d387278-12c3-410c-bdf0-c5c603479764,femme,,épouse,idem,,,,Collin,38,idem,Marguerite
239,623b95de-f87c-4844-a7f7-361088eed83a,homme,,fils,,,idem,1900,Dumont,,,Etienne


In [43]:
preprocessed_data.occupation.isna().sum()

42

In [44]:
preprocessed_data["firstname"] = preprocessed_data["firstname"].str.lower()

In [45]:
jointure_data = pd.merge(
    preprocessed_data,
    name_data,
    left_on=["firstname"],
    right_on=["firstname"],
    how="left",
)

In [46]:
jointure_data.drop("subject_line", axis=1)

Unnamed: 0,sex,civil_status,link,occupation,observation,lob,birth_date,surname,age,employer,firstname,male,female
0,femme,,fille,idem,,,,Chardon,30,,marie,10145.0,2390322.0
1,homme,,chef,sp,,,,Lhopital,67,,louis,750498.0,2720.0
2,femme,,idem,idem,,,,Papin,15,idem,marie,10145.0,2390322.0
3,femme,,femme,,,Rigny,1875,Lavocat,,,marie,10145.0,2390322.0
4,femme,,,,,,,Benne,78,,marguerite,1441.0,813859.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,femme,,épouse,sans,,,,Burlurut,61,,pétronille,30.0,13067.0
237,homme,,son,idem,,,,Combey,39,,alexandre,90238.0,413.0
238,femme,,épouse,idem,,,,Collin,38,idem,marguerite,1441.0,813859.0
239,homme,,fils,,,idem,1900,Dumont,,,etienne,211297.0,898.0


In [47]:
texts = [" ".join([str(x) for x in row]) for row in jointure_data]

In [51]:
jointure_data

Unnamed: 0,subject_line,sex,civil_status,link,occupation,observation,lob,birth_date,surname,age,employer,firstname,male,female
0,ebb26ada-044c-4c62-9dbc-a9c8d505d31c,femme,,fille,idem,,,,Chardon,30,,marie,10145.0,2390322.0
1,338496f5-e4ca-43ac-aa5c-429cb3f6ac00,homme,,chef,sp,,,,Lhopital,67,,louis,750498.0,2720.0
2,e6a279da-9b6f-4f49-b498-64857bc50d1e,femme,,idem,idem,,,,Papin,15,idem,marie,10145.0,2390322.0
3,7534deca-39e8-4f00-be17-c12460015de1,femme,,femme,,,Rigny,1875,Lavocat,,,marie,10145.0,2390322.0
4,ef334a66-a504-418a-9872-e7c9db923488,femme,,,,,,,Benne,78,,marguerite,1441.0,813859.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,1d92738a-cffe-4aee-ab10-db04c37f7405,femme,,épouse,sans,,,,Burlurut,61,,pétronille,30.0,13067.0
237,09440055-7972-4135-a537-e6c5a1f2aeb2,homme,,son,idem,,,,Combey,39,,alexandre,90238.0,413.0
238,4d387278-12c3-410c-bdf0-c5c603479764,femme,,épouse,idem,,,,Collin,38,idem,marguerite,1441.0,813859.0
239,623b95de-f87c-4844-a7f7-361088eed83a,homme,,fils,,,idem,1900,Dumont,,,etienne,211297.0,898.0


In [53]:
df = jointure_data.drop("subject_line", axis=1)

In [58]:
def concatenate_column_names(row):
    text_parts = []
    for col_name, value in row.iteritems():
        if pd.notna(value) and value != "None":
            text_parts.append(f"{col_name}_{value}")
    return " ".join(text_parts)

In [59]:
df["texte"] = df.apply(concatenate_row_values_with_column_names, axis=1)

  for col_name, value in row.iteritems():


In [60]:
df

Unnamed: 0,sex,civil_status,link,occupation,observation,lob,birth_date,surname,age,employer,firstname,male,female,texte
0,femme,,fille,idem,,,,Chardon,30,,marie,10145.0,2390322.0,sex_femme link_fille occupation_idem surname_C...
1,homme,,chef,sp,,,,Lhopital,67,,louis,750498.0,2720.0,sex_homme link_chef occupation_sp surname_Lhop...
2,femme,,idem,idem,,,,Papin,15,idem,marie,10145.0,2390322.0,sex_femme link_idem occupation_idem surname_Pa...
3,femme,,femme,,,Rigny,1875,Lavocat,,,marie,10145.0,2390322.0,sex_femme link_femme lob_Rigny birth_date_1875...
4,femme,,,,,,,Benne,78,,marguerite,1441.0,813859.0,sex_femme surname_Benne age_78 firstname_margu...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,femme,,épouse,sans,,,,Burlurut,61,,pétronille,30.0,13067.0,sex_femme link_épouse occupation_sans surname_...
237,homme,,son,idem,,,,Combey,39,,alexandre,90238.0,413.0,sex_homme link_son occupation_idem surname_Com...
238,femme,,épouse,idem,,,,Collin,38,idem,marguerite,1441.0,813859.0,sex_femme link_épouse occupation_idem surname_...
239,homme,,fils,,,idem,1900,Dumont,,,etienne,211297.0,898.0,sex_homme link_fils lob_idem birth_date_1900 s...


In [25]:
final_data = jointure_data[["employer","age","birth_date","civil_status","link","occupation","observation","lob","sex"]]

In [26]:
final_data

Unnamed: 0,employer,age,birth_date,civil_status,link,occupation,observation,lob,sex
0,,30,,,fille,idem,,,femme
1,,67,,,chef,sp,,,homme
2,idem,15,,,idem,idem,,,femme
3,,,1875,,femme,,,Rigny,femme
4,,78,,,,,,,femme
...,...,...,...,...,...,...,...,...,...
236,,61,,,épouse,sans,,,femme
237,,39,,,son,idem,,,homme
238,idem,38,,,épouse,idem,,,femme
239,,,1900,,fils,,,idem,homme


In [34]:
# Séparation des données en features (X) et target (y)
X = final_data.drop(columns=["sex"])  # Features
y = final_data["sex"]  # Target

# Effectuer un stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Afficher les tailles des ensembles d'entraînement et de test
print("Taille de l'ensemble d'entraînement :", len(X_train))
print("Taille de l'ensemble de test :", len(X_test))

Taille de l'ensemble d'entraînement : 192
Taille de l'ensemble de test : 49


In [None]:
X_train.head(10)

: 

In [40]:
from sklearn.preprocessing import OrdinalEncoder

# Instancier l'encodeur ordinal avec la gestion des inconnus
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)

# Appliquer l'encodage ordinal sur les colonnes catégorielles
X_train_encoded = ordinal_encoder.fit_transform(
    X_train[
        [
            "employer",
            "civil_status",
            "link",
            "occupation",
            "observation",
            "lob",
        ]
    ]
)
X_test_encoded = ordinal_encoder.transform(
    X_test[
        [
            "employer",
            "civil_status",
            "link",
            "occupation",
            "observation",
            "lob",
        ]
    ]
)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Instancier le modèle RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)

# Entraîner le modèle sur X_train_encoded
random_forest.fit(X_train_encoded, y_train)

# Prédire sur X_test_encoded
y_pred = random_forest.predict(X_test_encoded)

# Calculer les métriques
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
conf_matrix = confusion_matrix(y_test, y_pred)

# Afficher les métriques
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.7551020408163265
F1 Score: 0.7573738929534078
Confusion Matrix:
 [[ 1  0  1]
 [ 1 15  6]
 [ 1  3 21]]


In [44]:
# Obtenir les modalités de classe prédites par le modèle
class_labels = random_forest.classes_

# Afficher les modalités de classe
print("Classes:", class_labels)

Classes: ['ambigu' 'femme' 'homme']


In [49]:
from sklearn.preprocessing import LabelEncoder

# Instancier le label encoder
label_encoder = LabelEncoder()

# Fit sur y_train pour obtenir les classes uniques
label_encoder.fit(y_train)

# Transformer y_train et y_test en entiers
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [50]:
from xgboost import XGBClassifier

# Instancier le modèle XGBClassifier
xgb_model = XGBClassifier(random_state=42)

# Entraîner le modèle sur X_train_encoded
xgb_model.fit(X_train_encoded, y_train_encoded)

# Prédire sur X_test_encoded
y_pred_xgb = xgb_model.predict(X_test_encoded)

# Obtenir les modalités de classe prédites par le modèle
class_labels_xgb = xgb_model.classes_

# Calculer les métriques
accuracy_xgb = accuracy_score(y_test_encoded, y_pred_xgb)
f1_xgb = f1_score(y_test_encoded, y_pred_xgb, average="weighted")
conf_matrix_xgb = confusion_matrix(y_test_encoded, y_pred_xgb)

# Afficher les métriques
print("Accuracy (XGBoost):", accuracy_xgb)
print("F1 Score (XGBoost):", f1_xgb)
print("Confusion Matrix (XGBoost):\n", conf_matrix_xgb)
print("Classes (XGBoost):", class_labels_xgb)

Accuracy (XGBoost): 0.7551020408163265
F1 Score (XGBoost): 0.7547357645313072
Confusion Matrix (XGBoost):
 [[ 0  0  2]
 [ 1 16  5]
 [ 1  3 21]]
Classes (XGBoost): [0 1 2]


In [48]:
from sklearn.svm import SVC

# Instancier le modèle SVM
svm_model = SVC(random_state=42)

# Entraîner le modèle sur X_train_encoded
svm_model.fit(X_train_encoded, y_train)

# Prédire sur X_test_encoded
y_pred_svm = svm_model.predict(X_test_encoded)

# Obtenir les modalités de classe prédites par le modèle
class_labels_svm = svm_model.classes_

# Calculer les métriques
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm, average="weighted")
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)

# Afficher les métriques
print("Accuracy (SVM):", accuracy_svm)
print("F1 Score (SVM):", f1_svm)
print("Confusion Matrix (SVM):\n", conf_matrix_svm)
print("Classes (SVM):", class_labels_svm)

Accuracy (SVM): 0.6326530612244898
F1 Score (SVM): 0.6153219602696954
Confusion Matrix (SVM):
 [[ 0  1  1]
 [ 0 12 10]
 [ 0  6 19]]
Classes (SVM): ['ambigu' 'femme' 'homme']
