# Hackathon Ekimetrics 2023

L'objectif de ce Hackathon est d'implémenter un algorithme permettant à une entreprise d'automatiser son processus de recrutement, en s'assurant que l'algorithme de sélection ne reproduise aucun biais discriminatoire.

### Biais a priori

Lors de la mise en œuvre d'un système d'IA, l'équité et les préjugés doivent être un élément important lors de la conception, en particulier lorsqu'il s'agit d'informations sensibles, et/ou d'informations personnelles identifiables (PII), et/ou d'informations personnelles sur la santé (PHI). En effet, non seulement ces informations sont soumises à la loi (par exemple, le GDPR en Europe), mais elles sont également soumises à un défi en termes d'image de marque.
L'exemple d'aujourd'hui vise à attribuer un risque aux données de recrutement.
Avant de mettre en œuvre un système d'IA pour prédire la probabilité d'embauche d'un candidat, les ingénieurs en IA et les parties prenantes de l'entreprise devraient :

- Se situer et identifier les sources potentielles de biais
- Définir une ou plusieurs mesures qui quantifieront les biais du système d'IA.

https://www.hirevue.com/blog/hiring/what-is-adverse-impact-and-why-measuring-it-matters

### Dataset

L'enquête annuelle de StackOverflow auprès des développeurs (plus de 70 000 réponses provenant de plus de 180 pays) examine tous les aspects de l'expérience des développeurs, de l'apprentissage du code aux technologies préférées, en passant par le contrôle des versions et l'expérience professionnelle.

Nous disposons des données suivantes : 


- Age: age of the applicant, >35 years old or <35 years old (categorical)

- EdLevel: education level of the applicant (Undergraduate, Master, PhD...) (categorical)

- Gender: gender of the applicant, (Man, Woman, or NonBinary) (categorical)

- MainBranch: whether the applicant is a professional developer (categorical)

- YearsCode: how long the applicant has been coding (integer)

- YearsCodePro: how long the applicant has been coding in a professional context, (integer)

- PreviousSalary: the applicant's previous job salary (float)

- ComputerSkills: number of computer skills known by the applicant (integer)

- Employed: target variable, whether the applicant has been hired (categorical)

In [None]:
pip install dalex

In [None]:
#Importation des librairies nécessaires 

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import dalex as dx
from dalex.fairness import resample, reweight, roc_pivot
from copy import copy
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff

# Exploratory Data Analysis

In [None]:
# Load the dataset.
# The URL intentionally carries no `?token=...` query string: raw.githubusercontent.com
# tokens are short-lived credentials and must not be committed to a notebook.
# If the repository is private, download the CSV and point DATA_URL at the local copy.
DATA_URL = 'https://raw.githubusercontent.com/cominho/Hackathon-Ekimetrics/main/stackoverflow_full.csv'

# index_col=0 mirrors the earlier local-path load of the same file: the first CSV
# column is presumably the saved row index (TODO confirm) and must not end up as a
# numeric feature in the models below.
df = pd.read_csv(DATA_URL, index_col=0)
df

In [None]:
#Nombre d'observations
nb_obs = df.shape[0]
nb_obs

In [None]:
#Existence de valeurs manquantes 
miss_val = df.isna().any().any()
miss_val

In [None]:
#Nombre de valeurs manquantes par colonne 
miss_val_col = df.isna().sum()
miss_val_col

## Statistiques descriptives

In [None]:
# Variables shown in the descriptive statistics
variables_to_describe = ['Age', 'PreviousSalary', 'EdLevel', 'Employment', 'Gender', 'ComputerSkills']

# 2 x 3 grid of subplots, one histogram per variable
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('Statistiques descriptives')

# axes.flat walks the grid row by row, which matches the order of the list above
for ax, variable in zip(axes.flat, variables_to_describe):
    sns.histplot(data=df, x=variable, ax=ax, kde=False)
    ax.set_title(f'{variable} Distribution')

plt.tight_layout()
# Leave room at the top for the figure-level title
plt.subplots_adjust(top=0.9)

plt.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Box plots of the previous salary, one box per gender value
colors = px.colors.sequential.Agsunset
fig = make_subplots(rows=1, cols=1)

for position, gender_label in enumerate(df['Gender'].unique()):
    subset = df[df['Gender'] == gender_label]
    gender_box = go.Box(
        x=subset['Gender'],
        y=subset['PreviousSalary'],
        name=gender_label,
        marker=dict(color=colors[position]),
    )
    fig.add_trace(gender_box, row=1, col=1)

fig.update_layout(title='Distribution des salaires par genre')
fig.show()

In [None]:
# Box plots of the previous salary, one box per age bracket
colors = px.colors.sequential.Plasma
fig = make_subplots(rows=1, cols=1)

# Sorted so the boxes are drawn in a stable order
unique_ages = np.sort(df['Age'].unique())

for rank, bracket in enumerate(unique_ages):
    in_bracket = df[df['Age'] == bracket]
    fig.add_trace(
        go.Box(
            x=in_bracket['Age'],
            y=in_bracket['PreviousSalary'],
            name=str(bracket),
            # Wrap around the palette if there are more brackets than colors
            marker={'color': colors[rank % len(colors)]},
        ),
        row=1,
        col=1,
    )

fig.update_layout(title='Distribution des salaires en fonction de l âge')
fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Box plots of the previous salary, one box per education level
colors = px.colors.sequential.Plasma
fig = make_subplots(rows=1, cols=1)

# Sorted so the boxes are drawn in a stable order
ed_levels = np.sort(df['EdLevel'].unique())

for rank, level in enumerate(ed_levels):
    at_level = df[df['EdLevel'] == level]
    level_box = go.Box(
        x=at_level['EdLevel'],
        y=at_level['PreviousSalary'],
        name=level,
        marker={'color': colors[rank % len(colors)]},
    )
    fig.add_trace(level_box, row=1, col=1)

# Tilt the tick labels of the x axis
fig.update_layout(title="Distribution des salaires par niveau d'éducation", xaxis_tickangle=-45)
fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Box plots of the previous salary, one box per MentalHealth value
colors = px.colors.sequential.Plasma
fig = make_subplots(rows=1, cols=1)

# Sorted so the boxes are drawn in a stable order
mental_health_statuses = np.sort(df['MentalHealth'].unique())

for rank, mh_value in enumerate(mental_health_statuses):
    with_value = df[df['MentalHealth'] == mh_value]
    mh_box = go.Box(
        x=with_value['MentalHealth'],
        y=with_value['PreviousSalary'],
        name=mh_value,
        marker={'color': colors[rank % len(colors)]},
    )
    fig.add_trace(mh_box, row=1, col=1)

fig.update_layout(title='Distribution des salaires et maladie mentale')
fig.show()

In [None]:
# Box plots of the previous salary, one box per value of the target `Employed`
colors = px.colors.sequential.Plasma
fig = make_subplots(rows=1, cols=1)

employed_statuses = np.sort(df['Employed'].unique())

for rank, outcome in enumerate(employed_statuses):
    with_outcome = df[df['Employed'] == outcome]
    outcome_box = go.Box(
        # Cast to str so the x axis uses the values as category labels
        x=with_outcome['Employed'].astype(str),
        y=with_outcome['PreviousSalary'],
        name=str(outcome),
        marker={'color': colors[rank % len(colors)]},
    )
    fig.add_trace(outcome_box, row=1, col=1)

fig.update_layout(title='Distribution des salaires en fonction du statut salarial')
fig.show()

In [None]:
# Violin plots of the previous salary, one violin per MainBranch value
colors = px.colors.sequential.Inferno
fig = make_subplots(rows=1, cols=1)

main_branches = np.sort(df['MainBranch'].unique())

for rank, branch_value in enumerate(main_branches):
    in_branch = df[df['MainBranch'] == branch_value]
    violin = go.Violin(
        x=in_branch['MainBranch'].astype(str),
        y=in_branch['PreviousSalary'],
        name=str(branch_value),
        line_color=colors[rank % len(colors)],
    )
    fig.add_trace(violin, row=1, col=1)

# Tilt the tick labels of the x axis
fig.update_layout(title='Distribution des salaires par branche de travail', xaxis_tickangle=-45)
fig.show()

On explore par ailleurs notre variable cible pour s'assurer que les 2 catégories ne sont pas numériquement déséquilibrées, car un tel déséquilibre pourrait nous poser problème pendant la modélisation.

In [None]:
# Share of each value of the target variable (fractions summing to 1)
employed_proportion = df['Employed'].value_counts(normalize=True)

bar_options = dict(
    labels={'x': 'Employed', 'y': 'Proportion'},
    title='Proportion de personnes Employed',
)
fig = px.bar(x=employed_proportion.index, y=employed_proportion.values, **bar_options)

# Hide the legend, then color the bars from the Aggrnyl palette
fig.update_layout(showlegend=False).update_traces(marker_color=px.colors.sequential.Aggrnyl)
fig.show()

# Identification des biais 

In [None]:
def _counts_by(column):
    """Return the number of rows per (column value, Employed value), missing pairs as 0."""
    return df.groupby([column, 'Employed']).size().unstack().fillna(0)


# Data preparation: one count table per attribute of interest
gender_employed = _counts_by('Gender')
mental_health_employed = _counts_by('MentalHealth')
age_employed = _counts_by('Age')
previous_employed = _counts_by('Employment')

colors = px.colors.sequential.Plasma

# 2 x 2 grid of subplots, one per attribute
subplot_titles = ("Par genre", "Par maladie mentale", "Par tranche d'âge", "Par emploi précédent")
fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

# (table, row, col) in the same order as the titles above
panels = [
    (gender_employed, 1, 1),
    (mental_health_employed, 1, 2),
    (age_employed, 2, 1),
    (previous_employed, 2, 2),
]

# One bar series per value of `Employed` in every panel; the palette is read from index 2
for table, panel_row, panel_col in panels:
    for i, employed in enumerate(table.columns):
        bars = go.Bar(x=table.index, y=table[employed], name=str(employed), marker_color=colors[i + 2])
        fig.add_trace(bars, row=panel_row, col=panel_col)

# Global title, stacked bars, no legend
fig.update_layout(
    title_text="Comparaison du recrutement en fonction de différentes caractéristiques",
    barmode='stack',
    showlegend=False
)

# Render the figure
fig.show()

Pour avoir une idée plus précise de la présence de biais, il faut comparer les taux d'offres d'emploi en fonction des caractéristiques. En effet, prendre en compte la valeur absolue risquerait de ne pas refléter la présence d'un biais, car certaines variables ont des catégories sur-représentées par rapport à d'autres. 

In [None]:
def _hire_rate(column):
    """Return the mean of `Employed` per value of `column`, multiplied by 100."""
    return df.groupby(column)['Employed'].mean() * 100


# Percentage of `Employed` per group
gender_percentage = _hire_rate('Gender')
mental_health_percentage = _hire_rate('MentalHealth')
age_percentage = _hire_rate('Age')
previous_employed_percentage = _hire_rate('Employment')

colors = px.colors.sequential.Plasma

# 2 x 2 grid of subplots, one per attribute
subplot_titles = ("Par genre", "Par maladie mentale", "Par tranche d'âge", "Par emploi précédent")
fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

# Panel k goes to row k // 2 + 1, column k % 2 + 1 and uses palette color k
rate_panels = (gender_percentage, mental_health_percentage, age_percentage, previous_employed_percentage)
for k, rates in enumerate(rate_panels):
    grid_row, grid_col = divmod(k, 2)
    fig.add_trace(
        go.Bar(x=rates.index, y=rates.values, marker_color=colors[k]),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Global title, no legend
fig.update_layout(
    title_text="Comparaison du recrutement en fonction de différentes caractéristiques normalisées",
    showlegend=False
)

# Render the figure
fig.show()

#### Biais croisés

Après avoir identifié les différentes sources de biais, nous avons constaté que l'âge et le genre sont les deux variables susceptibles de produire des discriminations. Ainsi, il convient de regarder si ces 2 sources de biais se cumulent : 

In [None]:
# Percentage of `Employed` for every (Gender, Age) combination
employed_by_gender_age = df.groupby(['Gender', 'Age'])['Employed']
cross_biases = employed_by_gender_age.mean().mul(100)

In [None]:
# Rates as a table: one row per gender, one column per age bracket
cross_biases_df = cross_biases.unstack()

colors = px.colors.sequential.Plasma

fig = go.Figure()

# One bar series per age bracket
for i, column in enumerate(cross_biases_df.columns):
    fig.add_trace(go.Bar(
        x=cross_biases_df.index,
        y=cross_biases_df[column],
        name=str(column),
        marker_color=colors[i % len(colors)]
    ))

# Rates are not additive: stacking them would draw bars whose height is the sum of
# several percentages. Side-by-side bars plus a legend keep each rate readable and
# make it possible to tell the age brackets apart.
fig.update_layout(
    barmode='group',
    title_text="Biais croisés entre Genre et Âge",
    xaxis_title="Genre",
    yaxis_title="Pourcentage employé",
    legend_title_text="Âge",
    showlegend=True
)

# Render the figure
fig.show()

In [None]:
# Combined "<Gender>_<Age>" label, stored as a new column of df
df['Category'] = df['Gender'].str.cat(df['Age'], sep='_')

# Percentage of `Employed` within each combined category
category_percentage = df.groupby('Category')['Employed'].mean() * 100

colors = px.colors.qualitative.Plotly

# Single bar series, one color per category
category_bars = go.Bar(
    x=category_percentage.index,
    y=category_percentage.values,
    marker_color=colors[:len(category_percentage)],
)
fig = go.Figure()
fig.add_trace(category_bars)

fig.update_layout(
    title_text="Taux d'emploi pour chaque catégorie",
    xaxis_title="Catégorie",
    yaxis_title="Pourcentage employé",
    showlegend=False
)

fig.show()

# Modélisation 

#### Arbre de décision 

In [None]:
# Target column and feature frame (every other column of df at this point)
y = df['Employed']
X = df.drop(columns=['Employed'])

In [None]:
# `HaveWorkedWith` holds ';'-separated entries; expand it into one indicator column per entry
technologies = df['HaveWorkedWith']
technologies_encoded = technologies.str.get_dummies(sep=';')

# Append the indicator columns to df (the original column is kept)
df = pd.concat((df, technologies_encoded), axis='columns')

In [None]:
target = "Employed"

# 80 / 20 split with a fixed seed
features, labels = df.drop(columns=target), df[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Subgroup label per row: 'f' when Gender is "Woman", 'hnb' otherwise, followed by '_' and the Age value
gender_code = {True: 'f', False: 'hnb'}

protected = X_test["Gender"].eq("Woman").map(gender_code) + '_' + X_test.Age

protected_train = X_train["Gender"].eq("Woman").map(gender_code).astype(str) + '_' + X_train.Age

# Subgroup used as the reference in the fairness computations
privileged = 'hnb_<35'

In [None]:
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

In [None]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', 'passthrough', numerical_features)
])

In [None]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=7, random_state=123))
])


In [None]:
model.fit(X_train, y_train)

In [None]:
# dalex explainer wrapping the fitted model, with the test split as reference data
explainer = dx.Explainer(model, data=X_test, y=y_test)

# Class predictions on the test split
y_pred = model.predict(X_test)

In [None]:
# Performance metrics on the test split, gathered in a one-row summary table
positive_class_scores = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is computed from the scores of the second class column, not from hard predictions
roc_auc = roc_auc_score(y_test, positive_class_scores)

summary_table = pd.DataFrame([{
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'ROC AUC': roc_auc,
}])

print("Summary Table:")
print(summary_table)

In [None]:
# Confusion matrix of the decision tree (y_test and y_pred come from the cells above)
conf_matrix = confusion_matrix(y_test, y_pred)

# Annotated heatmap drawn with Plotly
class_names = ['Not Employed', 'Employed']
fig = ff.create_annotated_heatmap(
    conf_matrix,
    x=list(class_names),
    y=list(class_names),
    colorscale='Blues',
    showscale=True,
)
fig.update_layout(
    title_text='Confusion Matrix',
    xaxis_title='Predictions',
    yaxis_title='True Labels',
)
fig.show()

#### Fairness metrics 

In [None]:
# Fairness object of the baseline model for the protected subgroups defined above
fairness_object = explainer.model_fairness(protected=protected, privileged=privileged)

# fairness_check reports its verdict on its own and (as far as the dalex docs show)
# returns nothing, so capturing and printing its result only emitted a stray "None".
# epsilon=0.8 is the threshold named in the original comment, now passed explicitly.
fairness_object.fairness_check(epsilon=0.8)

In [None]:
# Display the `result` attribute of the baseline fairness object
# (presumably the metrics expressed relative to the privileged subgroup — confirm in the dalex docs)
fairness_object.result

In [None]:
# Display the `metric_scores` attribute of the baseline fairness object
# (presumably the raw per-subgroup metric values — confirm in the dalex docs)
fairness_object.metric_scores

## Bias detection Plots 

### Fairness Check plot

In [None]:
# Default plot of the baseline fairness object
fairness_object.plot()

In [None]:
# Same object, plotted with type='metric_scores'
fairness_object.plot(type = 'metric_scores')

### Autres modèles 

#### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
numeric_features = make_column_selector(dtype_include=np.number)

In [None]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

In [None]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)])


In [None]:
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=123, max_depth=5))])

In [None]:
model_rf.fit(X_train, y_train)

In [None]:
# Predict on the test split
y_pred_rf = model_rf.predict(X_test)

# dalex explainer for the random forest
explainer_rf = dx.Explainer(model_rf, X_test, y_test, verbose = False)

# Accuracy and per-class report on the test split
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# The header names the model actually evaluated here (it used to say "Decision Tree")
print("Random Forest Model:")
print("Accuracy:", accuracy_rf)
print("\nClassification Report:\n", classification_rep_rf)

In [None]:
# Confusion matrix of the random forest on the test split
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)


fig = ff.create_annotated_heatmap(conf_matrix_rf, x=['Not Employed', 'Employed'], y=['Not Employed', 'Employed'], colorscale='Blues', showscale=False)
# The title names the model actually plotted here (it used to say "Decision Tree")
fig.update_layout(title_text='Random Forest Confusion Matrix', xaxis_title='Predictions', yaxis_title='True Labels')
fig.show()

In [None]:
# Fairness object of the random forest, built with the same protected / privileged definitions
fairness_object_rf = explainer_rf.model_fairness(protected, privileged)

#### Régression logistique 

In [None]:
model_logit = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=123))])

In [None]:
model_logit.fit(X_train, y_train)

In [None]:
# dalex explainer for the logistic regression
explainer_logit = dx.Explainer(model_logit, data=X_test, y=y_test, verbose=False)

# Class predictions on the test split
y_pred_logit = model_logit.predict(X_test)

# Accuracy and per-class report on the test split
classification_rep_logit = classification_report(y_test, y_pred_logit)
accuracy_logit = accuracy_score(y_test, y_pred_logit)

print("Logistic Regression Model:")
print("Accuracy:", accuracy_logit)
print("\nClassification Report:\n", classification_rep_logit)

In [None]:
# Fairness object of the logistic regression, built with the same protected / privileged definitions
fairness_object_logit = explainer_logit.model_fairness(protected, privileged)

In [None]:
# Confusion matrix of the logistic regression on the test split
conf_matrix_logit = confusion_matrix(y_test, y_pred_logit)

class_names = ['Not Employed', 'Employed']
fig = ff.create_annotated_heatmap(
    conf_matrix_logit,
    x=list(class_names),
    y=list(class_names),
    colorscale='Blues',
    showscale=False,
)
fig.update_layout(
    title_text='Logistic Regression Confusion Matrix',
    xaxis_title='Predictions',
    yaxis_title='True Labels',
)
fig.show()

### Comparaison des modèles 

In [None]:
# Default fairness plot of the decision tree together with the logistic regression and random forest objects
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf])

In [None]:
# Same three models, plotted with type="metric_scores"
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf], type = "metric_scores")

## Parity loss plots 

In [None]:
# Display the `parity_loss` attribute of the baseline (decision tree) fairness object
fairness_object.parity_loss

#### Radar plot 

In [None]:
# Three-model comparison, plotted with type="radar"
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf], type = "radar")

#### Heatmap 

In [None]:
# Three-model comparison, plotted with type="heatmap"
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf], type = "heatmap")

#### Stacked 

In [None]:
# Three-model comparison, plotted with type="stacked"
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf], type = "stacked")

In [None]:
# Three-model comparison, plotted with type="performance_and_fairness"
fairness_object.plot(objects=[fairness_object_logit, fairness_object_rf], type = "performance_and_fairness")

#### Ceteris Paribus cut-off

In [None]:
# Ceteris-paribus cutoff plot for the 'hnb_<35' subgroup
# (the same label as the `privileged` value defined earlier in the notebook)
fairness_object.plot(objects=[fairness_object_rf], #for better visibility only one additional model
             type = "ceteris_paribus_cutoff", 
             subgroup = 'hnb_<35') 

# Correction des biais 

#### Ce que permet de faire Dalex : 

There are a few possible solutions to overcome bias affecting classification models. In dalex, there are 3 mitigation techniques:
1. resample - returns indices that may be used to pick relevant samples of data
2. reweight - returns sample (case) weights for model training
3. roc-pivot - returns the Explainer with changed y_hat

### Preprocessing

#### Supprimer les variables identifiées comme source de biais 

#### Resampling 

In [None]:
from copy import deepcopy

# Independent copies of the baseline pipeline.
# copy() is shallow: the copies would share the very same preprocessor and classifier
# objects as `model`, so fitting one of them would silently refit all three.
# deepcopy gives each copy its own estimators.
model_u = deepcopy(model)
model_p = deepcopy(model)

In [None]:
# Resampling mitigation, carried out entirely on the TRAINING split.
# resample() returns positions into the data it was given, so the indices must be
# computed from, and applied to, the same split. Using the test-split indices on the
# full X / y picked unrelated rows (including test rows) for training.

# The preferential variant needs the baseline model's scores for the rows being resampled
train_probs = model.predict_proba(X_train)[:, 1]

indices_uniform = resample(protected_train, y_train, verbose = False)
indices_preferential = resample(protected_train,
                                y_train,
                                type = 'preferential',
                                probs = train_probs,
                                verbose = False)

# Positional selection (.iloc) on both features and target keeps the rows aligned
model_u.fit(X_train.iloc[indices_uniform, :], y_train.iloc[indices_uniform])
model_p.fit(X_train.iloc[indices_preferential, :], y_train.iloc[indices_preferential])

In [None]:
# Explainers for the two resampled models, evaluated on the test split
explainer_u, explainer_p = (
    dx.Explainer(fitted_model, X_test, y_test, verbose = False)
    for fitted_model in (model_u, model_p)
)

# Fairness objects for the baseline and both resampled models, each with its own label
labelled_explainers = (
    (explainer, 'base'),
    (explainer_u, 'res_unif'),
    (explainer_p, 'res_pref'),
)
fobject_base, fobject_unif, fobject_pref = (
    exp.model_fairness(protected, privileged, label=tag)
    for exp, tag in labelled_explainers
)

# Compare the three on one plot
fobject_base.plot([fobject_unif, fobject_pref])

#### Performance des modèles 

In [None]:
# Performance table of the uniformly resampled model
explainer_u.model_performance().result

In [None]:
# Performance table of the preferentially resampled model
explainer_p.model_performance().result

In [None]:
# Fairness check of the uniformly resampled model
fobject_unif.fairness_check()

In [None]:
# Fairness check of the preferentially resampled model
fobject_pref.fairness_check()

### Inprocessing ###

#### Reweighing

In [None]:
weights = reweight(protected, y_test, verbose = False)

In [None]:
model_weighted = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier(random_state=123))])


In [None]:
kwargs = {model_weighted.steps[-1][0] + '__sample_weight': weights}

In [None]:
model_weighted.fit(X_test,y_test, **kwargs)

In [None]:
# Explainer for the reweighted model, evaluated on the test split
explainer_weighted = dx.Explainer(model_weighted, X_test, y_test, verbose = False)

In [None]:
# Fairness object of the reweighted model, labelled 'weighted'
fobject_weighted = explainer_weighted.model_fairness(protected, privileged, label='weighted')

In [None]:
# Fairness check of the reweighted model
fobject_weighted.fairness_check()

In [None]:
# Compare the baseline with the two resampled models and the reweighted model
fobject_base.plot([fobject_unif, fobject_pref, fobject_weighted])

### Postprocessing ###

#### ROC-pivot

In [None]:
# Shallow copy of the baseline explainer, passed to roc_pivot with theta=0.05.
# The notebook text above describes roc-pivot as returning "the Explainer with changed y_hat",
# so the mitigated explainer is the return value stored in `roc_pivot_results`.
# NOTE(review): whether `explainer_copy` itself is modified is not visible here — confirm in the dalex docs.
explainer_copy = copy(explainer)
roc_pivot_results = roc_pivot(explainer_copy, protected, privileged, theta=0.05, verbose=False)

In [None]:
# Build the fairness object from the explainer RETURNED by roc_pivot: that is the one
# carrying the pivoted predictions. Using the explainer that was passed in risks
# evaluating the unmitigated baseline under the 'roc' label.
fobject_roc = roc_pivot_results.model_fairness(protected, privileged, label='roc')

In [None]:
# Fairness check of the ROC-pivot result
fobject_roc.fairness_check()

In [None]:
# Compare the baseline with the two resampled models and the ROC-pivot result
# (the reweighted model is not part of this plot)
fobject_base.plot([fobject_unif, fobject_pref, fobject_roc])