# Rendu Fairness avec pivot de Gausse :

Dans ce notebook, nous utilisons le jeu de donnée des accidents de la route. <br>
Tout d'abord en ananlysant les données brutes.<br>
Puis en les transformant en un modèle à l'aide du **Pivot de Gausse**.<br>
Pour finir, nous analysons ce modèle.

On souhaite prédire si un véhicule donné est impliqué dans un accident **mortel** ou non.

In [1]:
#Les imports 

%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from utils import *

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
#import imblearn

## Préparation du dataframe
Les attributs présents dans la base de données ne sont pas tous utiles et certains doivent être transformer avant d'être utilisé.

In [2]:
label = "mortal"

# Lecture datasets
df1 = pd.read_csv("dataset/usagers-2022.csv", sep=';')
df2 = pd.read_csv("dataset/lieux-2022.csv", sep=';')
df3 = pd.read_csv("dataset/carcteristiques-2022.csv", sep=';')
df4 = pd.read_csv("dataset/vehicules-2022.csv", sep=';')

df4 = df4.drop(columns=['id_vehicule', 'num_veh'])


df = df1.join(df2.set_index('Num_Acc'), on='Num_Acc')
df = df.join(df3.set_index('Accident_Id'), on='Num_Acc')
df = df.join(df4.set_index('Num_Acc'), on='Num_Acc', lsuffix='_')

  df2 = pd.read_csv("dataset/lieux-2022.csv", sep=';')


In [3]:
# Suppression colonnes inutiles
df = df.drop(columns=['voie', 'v1', 'v2', 'pr', 'pr1', 'lartpc', 'larrout'
                      , 'num_veh', 'occutc', 'adr', 'senc','etatp','actp', 
                      'manv', 'jour', 'com', 'hrmn', 'motor', 'place', 'vosp', 'locp'])

df = df.drop_duplicates(subset=['id_usager']) # retire les doublons dans les usagers

# Remplacement des valeurs NaN
df['an_nais'] = df['an_nais'].fillna(df['an_nais'].mode()[0])

# Convertir en entier
df['id_vehicule'] = df['id_vehicule'].apply(lambda l: l[0:3] + l[4:7])
df['id_vehicule'] = df['id_vehicule'].astype(int)
df['sexe'] = df['sexe'].astype(int)
df

Unnamed: 0,Num_Acc,id_usager,id_vehicule,catu,grav,sexe,an_nais,trajet,secu1,secu2,...,agg,int,atm,col,lat,long,catv,obs,obsm,choc
0,202200000001,1 099 700,813952,1,3,1,2008.0,5,2,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
1,202200000001,1 099 701,813953,1,1,1,1948.0,5,1,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
2,202200000002,1 099 698,813950,1,4,1,1988.0,9,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
3,202200000002,1 099 699,813951,1,1,1,1970.0,4,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
4,202200000003,1 099 696,813948,1,1,1,2002.0,0,1,0,...,2,6,1,2,484931620000,-27604390000,7,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126657,202200055301,968 230,715631,1,1,2,2002.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126658,202200055301,968 231,715631,2,3,2,2004.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126659,202200055301,968 232,715632,1,4,2,1953.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126660,202200055302,968 228,715629,1,3,1,1992.0,1,2,6,...,2,1,1,2,475944040000,13533290000,33,0,2,1


In [4]:
# On affiche s'il y a des valeurs nulles
for column in df.columns:
  if df[column].isnull().values.any() == True:
    print(column, df[column].isnull().values.any()) 

In [5]:
# On préserve le df d'origine
df_2 = df.copy()

# On crée un attribut pour les accidents mortels
df_2['mortal'] = extract_mortal(df)

# Accident impliquant un piéton
has_pedestrian = df[df['catu'] == 3]
p = to_attribute(df, has_pedestrian, 1, 0)
df_2['pieton'] = p

# Sexe du conducteur
driver = df[(df['catu'] == 1) & (df['sexe'] == 1)]
dr = to_attribute(df, driver, 1, 0)
df_2['sexe_conducteur'] = dr


df_2 = df_2.drop_duplicates(subset=['id_vehicule'])

Pour simplifier le code, les fonction qui uniformisent les types vers ceux souhaités ont été plcées dans **utils.py**

In [6]:
# On réduit les carégories de véhicules
df_2 = simplify_catv(df_2)

# On enlève la catégorie peu repésentées qu'on ajoute dans une catégorie autre (identifiant 5)
df_2 = simplify_catr(df_2)

# Découpage en 4 catégories de vitesse
df_2 = split_vma(df_2)

        
# Département en entiers
df_2['dep'] = df_2['dep'].apply(transforme_dpt)
df_2['dep'] = pd.to_numeric(df_2['dep'], errors='coerce', downcast='integer')

# nbv en entier
df_2['nbv'] = pd.to_numeric(df_2['nbv'], errors='coerce', downcast='integer')
df_2['nbv'].fillna(2, inplace=True) # only one entry 
df_2['nbv'] = pd.to_numeric(df_2['nbv'], errors='coerce', downcast='integer')

# Lat et long en float :
df_2['lat'] = pd.to_numeric(df_2['lat'], errors='coerce')
df_2['long'] = pd.to_numeric(df_2['long'], errors='coerce')

# Age du conducteur du véhicule
df_2['age'] = get_driver_age(df_2)
df_2['age'] = df_2['age'].fillna(df_2['age'].mode()[0])
df_2['age'] = pd.to_numeric(df_2['age'], errors='coerce', downcast='integer')

# Réduire les valeurs de trajet
df_2['trajet'] = reduce_trajet_values(df_2)

# Réduire les valeurs de surf
df_2['surf'] = reduce_surf_values(df_2)

# obs prend pour valeur 0 si obstacle 1 si pas d'obstacle
df_2['obs'] = reduce_obs_values(df_2)

# On enlève les attributs qui ne sont plus utiles
df_2 = df_2.drop(columns=['an_nais','grav', 'sexe','catu', 'id_usager', 'id_vehicule',
                          'secu1','secu2','secu3', 'an', 'lat', 'long'])

Tri manuel des valeurs catégorielles/numériques

In [7]:
# valeurs catégorielles
categorical_features = ['trajet', 'catr', 'circ', 'nbv', 'prof',
                        'plan', 'surf', 'vma', 'lum', 'agg', 
                        'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton',
                        'sexe_conducteur', 'infra', 'situ']
# valeurs numériques
numerical_features = ['mois', 'dep','age']

print("numerical : ", numerical_features)
print("categorical : ", categorical_features)

numerical :  ['mois', 'dep', 'age']
categorical :  ['trajet', 'catr', 'circ', 'nbv', 'prof', 'plan', 'surf', 'vma', 'lum', 'agg', 'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton', 'sexe_conducteur', 'infra', 'situ']


## Apprentissage avec le pivot de Gausse

In [8]:
from sklearn.model_selection import train_test_split

# On découpe le jeu de données tout en conservant ensemble les véhicules impliqué dans un même accident
# Les données d'entrainement et de test n'ont donc pas de rapport direct

unique_accidents = df_2['Num_Acc'].unique() # Num_Acc uniques

df_3 = df_2.drop(columns=['mortal'])

# Création des train et test set à partir des numéros d'accident
X_train, X_test = train_test_split(unique_accidents, test_size=0.33, random_state=42)

# On peut ensuite récupérer les véhicules correspondants aux accidents
train_df = df_2[df_2['Num_Acc'].isin(X_train)]
test_df = df_2[df_2['Num_Acc'].isin(X_test)]
y_train = train_df['mortal']
y_test = test_df['mortal']
X_train = train_df.drop(columns=['mortal', 'Num_Acc'])
X_test = test_df.drop(columns=['mortal', 'Num_Acc'])
df_2 = df_2.drop(columns='Num_Acc')

In [9]:
from sklearn import tree, naive_bayes

clf = naive_bayes.GaussianNB()

clf = clf.fit(X_train, y_train)

preds = clf.predict(X_test)

clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.8734201153591579, 0.8677148103998196)

In [18]:
from sklearn import tree, ensemble, naive_bayes
from sklearn.svm import SVC
import imblearn

dt = naive_bayes.GaussianNB()
clf = imblearn.pipeline.Pipeline(
    [
        ("resample", imblearn.over_sampling.SMOTE()),
        ('classifier', dt)
    ])
clf = clf.fit(X_train, y_train)

preds = clf.predict(X_test)

clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.7307183156302203, 0.7220593446953832)

### Afficher la matrice de confusion

In [19]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
tn, fp, fn, tp

(21471, 7864, 763, 941)

In [20]:
import plotly.express as px

fig = px.imshow([[tn, fp], [fn, tp]], text_auto=True, labels=dict(y="Truth", x="Pred"),
                x=["False", "True"],
                y=["False", "True"]
               )
fig.show()

## Audit du modèle

In [57]:
import pickle

data_test = X_test.copy(deep=True)
data_test["Y"] = y_test

data_test.to_csv("test_data.csv",
          index=False)

data_train = X_train.copy(deep=True)
data_train["Y"] = y_train

data_train.to_csv("train_data.csv",
          index=False)

with open( 'clf.pickle', 'wb' ) as f:
    pickle.dump(clf, f )

### Trouver des contrefactuels avec Dice

In [16]:
import dice_ml
from dice_ml.utils import helpers

In [17]:
train_dataset = df_2.copy()
train_dataset[label] = y_train
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical_features, outcome_name=label)

m = dice_ml.Model(model=clf, backend="sklearn")

exp = dice_ml.Dice(d, m)

In [18]:
# Génération des contrefactuels
for i in range(5):
  query_instance = data_test.drop(columns="Y")[i:i+1]
  dice_exp = exp.generate_counterfactuals(
      query_instance,
      total_CFs=10,
      desired_class="opposite")
  # Visualize counterfactual explanation
  dice_exp.visualize_as_dataframe(show_only_changes=True)
  # Instance local feature importance
  imp = exp.local_feature_importance(query_instance, cf_examples_list=dice_exp.cf_examples_list)
  print(imp.local_importance)

100%|██████████| 1/1 [00:00<00:00,  6.34it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,4,2,2,1,1,1,0,1,2,...,1,3,3,0,2,8,0,1,34,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,6,-,-,-,...,-,-,-,-,-,-,1,-,-,1
1,-,-,-,-,-,-,8,-,-,-,...,-,-,-,-,-,-,-,-,-,1
2,-,-,-,-,-,-,6,-,-,-,...,-,-,0,-,-,-,-,-,-,1
3,-,-,-,-,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1
4,-,-,-,-,-,-,8,-,-,-,...,-,2,-,-,-,-,-,-,-,1
5,-,-,-,-,-,-,8,-,-,-,...,-,-,-,-,-,-,-,-,45,1
6,-,-,-,-,-,-,5,2,-,-,...,-,-,-,-,-,-,-,-,-,1
7,-,-,-,-,-,-,5,-,-,-,...,-,2,-,-,-,-,-,-,-,1
8,-,-,-,-,-,-,8,-,-,4,...,-,-,-,-,-,-,-,-,-,1
9,-,-,-,-,-,-,4,-,-,-,...,-,-,-,-,9,-,-,-,-,1


[{'surf': 1.0, 'col': 0.2, 'infra': 0.1, 'vma': 0.1, 'catv': 0.1, 'obsm': 0.1, 'pieton': 0.1, 'mois': 0.1, 'age': 0.1, 'trajet': 0.0, 'catr': 0.0, 'circ': 0.0, 'nbv': 0.0, 'prof': 0.0, 'plan': 0.0, 'situ': 0.0, 'lum': 0.0, 'agg': 0.0, 'int': 0.0, 'atm': 0.0, 'obs': 0.0, 'choc': 0.0, 'sexe_conducteur': 0.0, 'dep': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  4.36it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,4,4,2,2,1,1,1,0,1,2,...,1,3,3,0,2,8,0,1,52,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,4,-,-,-,-,...,-,-,-,1,-,-,-,-,-,1
1,-,-,-,-,4,-,-,-,-,3,...,-,-,-,-,-,-,-,-,-,1
2,-,-,-,-,-,-,4,-,-,-,...,-,-,-,-,-,-,-,-,-,1
3,-,-,-,-,-,-,8,-,-,-,...,-,-,-,-,-,-,-,-,-,1
4,-,-,-,-,-,-,8,-,-,4,...,-,-,-,-,-,-,-,-,-,1
5,-,-,-,-,-,-,5,-,-,-,...,-,-,0,-,-,-,-,-,-,1
6,-,-,-,-,-,-,8,1,-,-,...,-,-,-,-,-,-,-,-,-,1
7,-,-,-,-,-,4,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
8,-,-,-,-,-,-,6,-,-,-,...,-,-,-,-,-,-,-,-,-,1
9,-,-,-,-,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1


[{'surf': 0.7, 'plan': 0.2, 'vma': 0.2, 'prof': 0.1, 'infra': 0.1, 'lum': 0.1, 'agg': 0.1, 'catv': 0.1, 'obs': 0.1, 'trajet': 0.0, 'catr': 0.0, 'circ': 0.0, 'nbv': 0.0, 'situ': 0.0, 'int': 0.0, 'atm': 0.0, 'col': 0.0, 'obsm': 0.0, 'choc': 0.0, 'pieton': 0.0, 'sexe_conducteur': 0.0, 'mois': 0.0, 'dep': 0.0, 'age': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  5.95it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,3,2,2,1,1,1,0,1,3,...,1,2,3,0,2,4,0,0,47,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,4,-,-,-,...,-,-,-,-,-,-,-,-,-,1
1,-,-,-,-,4,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
2,-,-,-,-,-,2,-,-,-,-,...,-,-,-,-,9,-,-,-,-,1
3,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,96,1
4,-,-,-,-,-,-,6,-,-,-,...,-,-,-,-,-,-,-,-,-,1
5,-,-,-,-,-,4,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1
6,-,-,-,-,-,4,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
7,3,-,-,-,-,-,-,-,-,-,...,-,-,-,-,9,-,-,-,-,1
8,-,-,-,-,3,-,3,-,-,-,...,-,-,-,-,-,-,-,-,-,1
9,-,-,-,-,-,-,5,-,-,-,...,4,-,-,-,-,-,-,-,-,1


[{'surf': 0.6, 'plan': 0.3, 'prof': 0.2, 'obsm': 0.2, 'trajet': 0.1, 'atm': 0.1, 'pieton': 0.1, 'mois': 0.1, 'age': 0.1, 'catr': 0.0, 'circ': 0.0, 'nbv': 0.0, 'infra': 0.0, 'situ': 0.0, 'vma': 0.0, 'lum': 0.0, 'agg': 0.0, 'int': 0.0, 'col': 0.0, 'catv': 0.0, 'obs': 0.0, 'choc': 0.0, 'sexe_conducteur': 0.0, 'dep': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  5.90it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,3,2,2,1,1,1,0,1,3,...,1,2,3,0,2,4,0,0,26,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,-,-,8,-,...,-,-,-,-,-,-,1,-,-,1
1,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,-,1
2,5,-,-,-,4,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
3,3,-,-,-,-,-,6,-,-,-,...,-,-,-,-,-,-,-,-,-,1
4,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,7,-,-,-,1
5,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,-,1
6,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,-,1
7,-,-,3,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,-,1
8,-,-,-,-,-,4,-,-,-,-,...,-,-,-,-,-,-,-,1,-,1
9,-,-,-,-,-,-,4,-,-,-,...,-,-,-,-,-,-,-,1,-,1


[{'surf': 0.7, 'trajet': 0.2, 'lum': 0.2, 'sexe_conducteur': 0.2, 'circ': 0.1, 'prof': 0.1, 'plan': 0.1, 'situ': 0.1, 'choc': 0.1, 'pieton': 0.1, 'catr': 0.0, 'nbv': 0.0, 'infra': 0.0, 'vma': 0.0, 'agg': 0.0, 'int': 0.0, 'atm': 0.0, 'col': 0.0, 'catv': 0.0, 'obs': 0.0, 'obsm': 0.0, 'mois': 0.0, 'dep': 0.0, 'age': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  5.70it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,4,1,1,1,1,1,0,1,2,...,1,1,3,0,2,1,0,1,37,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1
1,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,-,-,-,-,-,1
2,-,-,-,-,-,-,8,-,-,-,...,-,-,-,-,-,-,-,-,-,1
3,-,-,-,-,-,-,6,-,-,-,...,-,-,-,-,-,-,-,-,-,1
4,-,-,-,-,-,-,5,-,-,-,...,-,4,-,-,-,-,-,-,-,1
5,-,-,-,-,-,-,8,-,-,-,...,6,-,-,-,-,-,-,-,-,1
6,-,-,-,-,-,-,7,-,-,-,...,-,-,-,-,0,-,-,-,-,1
7,-,-,-,-,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1
8,-,-,-,5,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,-,1
9,-,-,-,-,-,-,8,-,-,-,...,-,-,-,-,-,6,-,-,-,1


[{'surf': 1.0, 'nbv': 0.1, 'int': 0.1, 'atm': 0.1, 'col': 0.1, 'obsm': 0.1, 'choc': 0.1, 'dep': 0.1, 'trajet': 0.0, 'catr': 0.0, 'circ': 0.0, 'prof': 0.0, 'plan': 0.0, 'infra': 0.0, 'situ': 0.0, 'vma': 0.0, 'lum': 0.0, 'agg': 0.0, 'catv': 0.0, 'obs': 0.0, 'pieton': 0.0, 'sexe_conducteur': 0.0, 'mois': 0.0, 'age': 0.0}]


### BlackBoxAuditing - Gaussien
Les résultats de l'audit sont **disponibles** dans le répertoire

In [19]:
from BlackBoxAuditing.data import load_from_file
from BlackBoxAuditing.model_factories.AbstractModelFactory import AbstractModelFactory
from BlackBoxAuditing.model_factories.AbstractModelVisitor import AbstractModelVisitor

import BlackBoxAuditing as BBA


(_, train_BBA, _, _, _, _) = load_from_file("train_data.csv",
                      correct_types = np.repeat([int], [len(data_test.columns)]),
                                response_header = 'Y',
                               train_percentage = 1.0)
(headers, _, test_BBA, response_header, features_to_ignore, correct_types) = load_from_file("test_data.csv",
                      correct_types = np.repeat([int], [len(data_test.columns)]),
                                response_header = 'Y',
                               train_percentage = 0.0)
BBA_data = (headers, train_BBA, test_BBA, response_header, features_to_ignore, correct_types)

In [20]:
class HirePredictorBuilder(AbstractModelFactory):
    def __init__(self, *args, **kwargs):
        AbstractModelFactory.__init__(self, *args, **kwargs)
        self.verbose_factory_name = "HirePredictor"
    def build(self, train_set):
        return HirePredictor()

class HirePredictor(AbstractModelVisitor):
    def __init__(self):
        with open( 'clf.pickle', 'rb' ) as f:
            self.clf = pickle.load(f)

    def test(self, test_set, test_name=""):
        df_test = pd.DataFrame(
            test_set, columns =data_test.columns.to_list())
        targets = df_test['Y']
        preds = self.clf.predict(df_test.drop('Y', axis=1))
        return [[a,b] for (a,b) in zip(targets, preds)]

In [21]:
auditor = BBA.Auditor()
auditor.ModelFactory = HirePredictorBuilder
auditor(BBA_data, output_dir = "audit-output")

Training initial model. (16:15:29)
Calculating original model statistics on test data:
	Training Set:
		Conf-Matrix: {0: {0: 54402, 1: 5673}, 1: {0: 2359, 1: 1020}}
		accuracy: 0.8734201153591579
		BCR: 0.6037162484449927
	Testing Set:
		Conf-Matrix {0: {0: 26425, 1: 2910}, 1: {0: 1196, 1: 508}}
		accuracy: 0.8677148103998196
		BCR: 0.5994615782874052
Auditing: 'trajet' (1/24). (16:15:30)
Auditing: 'catr' (2/24). (16:15:49)
Auditing: 'circ' (3/24). (16:16:08)
Auditing: 'nbv' (4/24). (16:16:27)
Auditing: 'prof' (5/24). (16:16:50)
Auditing: 'plan' (6/24). (16:17:12)
Auditing: 'surf' (7/24). (16:17:34)
Auditing: 'infra' (8/24). (16:17:56)
Auditing: 'situ' (9/24). (16:18:18)
Auditing: 'vma' (10/24). (16:18:41)
Auditing: 'mois' (11/24). (16:19:00)
Auditing: 'lum' (12/24). (16:19:16)
Auditing: 'dep' (13/24). (16:19:36)
Auditing: 'agg' (14/24). (16:19:55)
Auditing: 'int' (15/24). (16:20:15)
Auditing: 'atm' (16/24). (16:20:37)
Auditing: 'col' (17/24). (16:20:59)
Auditing: 'catv' (18/24). (16:2

## Interprétation du modèle

### Interprétation avec ShapKit
L'image résultat de l'interprétation est **disponible** dans le répertoire sous le nom de **shap-gaussien.png**

In [22]:
cols = data_test.columns.to_list()
cols.remove('Y')
cols

['trajet',
 'catr',
 'circ',
 'nbv',
 'prof',
 'plan',
 'surf',
 'infra',
 'situ',
 'vma',
 'mois',
 'lum',
 'dep',
 'agg',
 'int',
 'atm',
 'col',
 'catv',
 'obs',
 'obsm',
 'choc',
 'pieton',
 'sexe_conducteur',
 'age']

In [23]:
def predictproba_fn(data, clf=clf, columns=cols):
  df_test = pd.DataFrame(
            [data], columns=columns )
  preds = clf.predict_proba(df_test)
  return np.squeeze(preds, axis=0)
fc = lambda x: predictproba_fn(x)[1]

In [24]:
query_instance = data_test.drop(columns="Y")[0:1]
x_class = int(clf.predict(query_instance))
query_instance=query_instance.squeeze()
print(query_instance)
print()
print("Prediction for x: {0:.0f}".format(x_class))

trajet              0
catr                4
circ                2
nbv                 2
prof                1
plan                1
surf                1
infra               0
situ                1
vma                 2
mois               10
lum                 1
dep                25
agg                 2
int                 3
atm                 1
col                 3
catv                3
obs                 0
obsm                2
choc                8
pieton              0
sexe_conducteur     1
age                34
Name: 2, dtype: int64

Prediction for x: 0


In [25]:
fc(query_instance.values)

0.0010517372278898898

In [26]:
X_opposite_class = X_train[clf.predict(X_train) != x_class].copy()
reference = X_opposite_class.sample()
ref_class = int(clf.predict(reference))
reference=reference.squeeze()
print(reference)
print()
print("Prediction for this reference: {0:.0f}".format(ref_class))

trajet              1
catr                3
circ                2
nbv                 2
prof                2
plan                3
surf                1
infra               0
situ                3
vma                 3
mois                9
lum                 1
dep                74
agg                 1
int                 1
atm                 1
col                 6
catv                3
obs                 1
obsm                0
choc                1
pieton              0
sexe_conducteur     1
age                30
Name: 22261, dtype: int64

Prediction for this reference: 1


In [27]:
fc(reference.values)

0.9397252805466315

In [28]:
from shapkit.shapley_values import ShapleyValues
from shapkit.inspector import inspector
from shapkit.monte_carlo_shapley import MonteCarloShapley, MonteCarloShapleyBatch
from shapkit.sgd_shapley import SGDshapley
from shapkit.plots import plot_shapley

true_shap = MonteCarloShapley(x=query_instance, fc=fc, ref=reference, n_iter=2000)

new dimension 15


100%|██████████| 2000/2000 [00:54<00:00, 36.63it/s]


In [29]:
true_shap

trajet            -0.017239
catr              -0.036520
circ               0.000000
nbv                0.000000
prof              -0.057392
plan              -0.202455
surf               0.000000
infra              0.000000
situ              -0.036011
vma               -0.070385
mois               0.001104
lum                0.000000
dep                0.001030
agg               -0.162232
int               -0.029375
atm                0.000000
col               -0.049647
catv               0.000000
obs               -0.207373
obsm              -0.047726
choc              -0.026556
pieton             0.000000
sexe_conducteur    0.000000
age                0.002103
dtype: float64

In [52]:
fig =  plot_shapley(x=query_instance, fc=fc, ref=reference, shapley_values=true_shap, n_attributes=24)


FixedFormatter should only be used together with FixedLocator


Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.

