In [94]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from utils import *

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
#import imblearn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


L'objectif est de prédire si un véhicule donné est impliqué dans un accident mortel ou non.

## Préparation du dataframe
Les attributs présents dans la base de données ne sont pas tous utiles et certains doivent être transformer avant d'être utilisé.

In [105]:
label = "mortal"

# Lecture datasets
df1 = pd.read_csv("dataset/usagers-2022.csv", sep=';')
df2 = pd.read_csv("dataset/lieux-2022.csv", sep=';')
df3 = pd.read_csv("dataset/carcteristiques-2022.csv", sep=';')
df4 = pd.read_csv("dataset/vehicules-2022.csv", sep=';')

df4 = df4.drop(columns=['id_vehicule', 'num_veh'])


df = df1.join(df2.set_index('Num_Acc'), on='Num_Acc')
df = df.join(df3.set_index('Accident_Id'), on='Num_Acc')
df = df.join(df4.set_index('Num_Acc'), on='Num_Acc', lsuffix='_')


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



In [106]:
# Suppression colonnes inutiles
df = df.drop(columns=['voie', 'v1', 'v2', 'pr', 'pr1', 'lartpc', 'larrout'
                      , 'num_veh', 'occutc', 'adr', 'senc','etatp','actp', 
                      'manv', 'jour', 'com', 'hrmn', 'motor', 'place', 'vosp', 'locp'])

df = df.drop_duplicates(subset=['id_usager']) # retire les doublons dans les usagers

# Remplacement des valeurs NaN
df['an_nais'] = df['an_nais'].fillna(df['an_nais'].mode()[0])

# Convertir en entier
df['id_vehicule'] = df['id_vehicule'].apply(lambda l: l[0:3] + l[4:7])
df['id_vehicule'] = df['id_vehicule'].astype(int)
df['sexe'] = df['sexe'].astype(int)
df

Unnamed: 0,Num_Acc,id_usager,id_vehicule,catu,grav,sexe,an_nais,trajet,secu1,secu2,...,agg,int,atm,col,lat,long,catv,obs,obsm,choc
0,202200000001,1 099 700,813952,1,3,1,2008.0,5,2,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
1,202200000001,1 099 701,813953,1,1,1,1948.0,5,1,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
2,202200000002,1 099 698,813950,1,4,1,1988.0,9,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
3,202200000002,1 099 699,813951,1,1,1,1970.0,4,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
4,202200000003,1 099 696,813948,1,1,1,2002.0,0,1,0,...,2,6,1,2,484931620000,-27604390000,7,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126657,202200055301,968 230,715631,1,1,2,2002.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126658,202200055301,968 231,715631,2,3,2,2004.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126659,202200055301,968 232,715632,1,4,2,1953.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126660,202200055302,968 228,715629,1,3,1,1992.0,1,2,6,...,2,1,1,2,475944040000,13533290000,33,0,2,1


In [107]:
for column in df.columns:
  if df[column].isnull().values.any() == True:
    print(column, df[column].isnull().values.any()) # afficher s'il y a des valeurs nulles

In [108]:
df_2 = df.copy()

# On crée un attribut pour les accidents mortels
df_2['mortal'] = extract_mortal(df)

# Accident impliquant un piéton
has_pedestrian = df[df['catu'] == 3]
p = to_attribute(df, has_pedestrian, 1, 0)
df_2['pieton'] = p

# Sexe du conducteur
driver = df[(df['catu'] == 1) & (df['sexe'] == 1)]
dr = to_attribute(df, driver, 1, 0)
df_2['sexe_conducteur'] = dr


df_2 = df_2.drop_duplicates(subset=['id_vehicule'])

Il faut transformer les types de 'nbv', 'hrmn', 'dep', 'com', 'lat', 'long' de objets à respectivement : int, int, int, int, float, float

Pour dep et com : transformer les dpt corses pour leur donner un nom en chiffre et non en lettre (2A/2B)

In [109]:
# On réduit les carégories de véhicules
df_2 = simplify_catv(df_2)

# On enlève la catégorie peu repésentées qu'on ajoute dans une catégorie autre (identifiant 5)
df_2 = simplify_catr(df_2)

# Découpage en 4 catégories de vitesse
df_2 = split_vma(df_2)

        
# Département en entiers
df_2['dep'] = df_2['dep'].apply(transforme_dpt)
df_2['dep'] = pd.to_numeric(df_2['dep'], errors='coerce', downcast='integer')

# nbv en entier
df_2['nbv'] = pd.to_numeric(df_2['nbv'], errors='coerce', downcast='integer')
df_2['nbv'].fillna(2, inplace=True) # only one entry 
df_2['nbv'] = pd.to_numeric(df_2['nbv'], errors='coerce', downcast='integer')

# Lat et long en float :
df_2['lat'] = pd.to_numeric(df_2['lat'], errors='coerce')
df_2['long'] = pd.to_numeric(df_2['long'], errors='coerce')

# Age du conducteur du véhicule
df_2['age'] = get_driver_age(df_2)
df_2['age'] = df_2['age'].fillna(df_2['age'].mode()[0])
df_2['age'] = pd.to_numeric(df_2['age'], errors='coerce', downcast='integer')

# Réduire les valeurs de trajet
df_2['trajet'] = reduce_trajet_values(df_2)

# Réduire les valeurs de surf
df_2['surf'] = reduce_surf_values(df_2)

# obs prend pour valeur 0 si obstacle 1 si pas d'obstacle
df_2['obs'] = reduce_obs_values(df_2)

# On enlève les attributs qui ne sont plus utiles
df_2 = df_2.drop(columns=['an_nais','grav', 'sexe','catu', 'id_usager', 'id_vehicule',
                          'secu1','secu2','secu3', 'an', 'lat', 'long', 'infra', 'situ'])

In [110]:
# Tri manuel des valeurs catégorielles/numériques

# valeurs catégorielles
categorical_features = ['trajet', 'catr', 'circ', 'nbv', 'prof',
                        'plan', 'surf', 'vma', 'lum', 'agg', 
                        'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton',
                        'sexe_conducteur']
# valeurs numériques
numerical_features = ['mois', 'dep','age']

print("numerical : ", numerical_features)
print("categorical : ", categorical_features)

numerical :  ['mois', 'dep', 'age']
categorical :  ['trajet', 'catr', 'circ', 'nbv', 'prof', 'plan', 'surf', 'vma', 'lum', 'agg', 'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton', 'sexe_conducteur']


## Analyse de données

In [101]:
val = [len(df_2[df_2.mortal == 1]), len(df_2[df_2.mortal == 0])]
labels = ['Accident mortel', 'Accident non mortel']
px.pie(values=val, names=labels)

In [102]:
val = [len(df_2[df_2.pieton == 1]), len(df_2[df_2.pieton == 0])]
labels = ['Implique piéton', 'N\'implique pas de piéton']
px.pie(values=val, names=labels)

In [111]:
fig = px.histogram(df_2, x="catv")
fig.show()

fig = px.box(df_2, x="age")
fig.show()

fig = px.histogram(df_2, x="sexe_conducteur")
fig.show()

fig = px.histogram(df_2, x="catr")
fig.show()

fig = px.histogram(df_2, x="col")
fig.show()

fig = px.histogram(df_2, x='obs')
fig.show()

In [None]:
features = ['age', 'mois', 'catr', 'mortal', 'sexe_conducteur']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("catv", exp, df_2)

In [None]:
features = ['age', 'mois', 'catr', 'mortal']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("vma", exp, df_2)

In [None]:
features = ['age', 'vma', 'mois', 'catr', 'mortal']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("col", exp, df_2)

## One hot encoding

In [8]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

## Apprentissage avec un arbre de décision

In [9]:
from sklearn.model_selection import train_test_split

# On découpe le jeu de données tout en conservant ensemble les véhicules impliqué dans un même accident
# Les données d'entrainement et de test n'ont donc pas de rapport direct

unique_accidents = df_2['Num_Acc'].unique() # Num_Acc uniques

df_3 = df_2.drop(columns=['mortal'])
# Création des train et test set à partir des numéros d'accident
X_train, X_test = train_test_split(unique_accidents, test_size=0.33, random_state=42)

# On peut ensuite récupérer les véhicules correspondants aux accidents
train_df = df_2[df_2['Num_Acc'].isin(X_train)]
test_df = df_2[df_2['Num_Acc'].isin(X_test)]
y_train = train_df['mortal']
y_test = test_df['mortal']
X_train = train_df.drop(columns=['mortal', 'Num_Acc'])
X_test = test_df.drop(columns=['mortal', 'Num_Acc'])
df_2 = df_2.drop(columns='Num_Acc')

In [10]:
# Ici on autorise la spération des véhicules impliqué dans un même accident
# Source de biais

#df_3 = df_2.drop(columns=['mortal'])
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(df_3, df_2[label], test_size=0.33, random_state=42)

In [11]:
from sklearn import tree

dt = tree.DecisionTreeClassifier(random_state=42)
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', dt)])
clf = clf.fit(X_train, y_train)

preds = clf.predict(X_test)

clf.score(X_train, y_train), clf.score(X_test, y_test)

(1.0, 0.9021231354102903)

In [12]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
tn, fp, fn, tp

(27718, 1617, 1421, 283)

In [13]:
import plotly.express as px


fig = px.imshow([[tn, fp], [fn, tp]], text_auto=True, labels=dict(y="Truth", x="Pred"),
                x=["False", "True"],
                y=["False", "True"]
               )
fig.show()

## Générer des contrefactuels

In [14]:
import pickle

data_test = X_test.copy(deep=True)
data_test["Y"] = y_test

data_test.to_csv("test_data.csv",
          index=False)

data_train = X_train.copy(deep=True)
data_train["Y"] = y_train

data_train.to_csv("train_data.csv",
          index=False)

with open( 'clf.pickle', 'wb' ) as f:
    pickle.dump(clf, f )

In [64]:
import dice_ml
from dice_ml.utils import helpers

In [65]:
train_dataset = df_2.copy()
train_dataset[label] = y_train
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical_features, outcome_name=label)

m = dice_ml.Model(model=clf, backend="sklearn")

exp = dice_ml.Dice(d, m)

In [66]:
# Génération des contrefactuels
for i in range(5):
  query_instance = data_test.drop(columns="Y")[i:i+1]
  dice_exp = exp.generate_counterfactuals(
      query_instance,
      total_CFs=10,
      desired_class="opposite")
  # Visualize counterfactual explanation
  dice_exp.visualize_as_dataframe(show_only_changes=True)
  # Instance local feature importance
  imp = exp.local_feature_importance(query_instance, cf_examples_list=dice_exp.cf_examples_list)
  print(imp.local_importance)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  6.72it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,9,4,2,2,1,1,1,0,1,2,...,1,3,3,0,2,8,0,1,34,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,3,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,98,1
2,-,-,-,-,-,-,-,-,-1,-,...,-,-,-,-,-,-,-,-,-,1
3,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
4,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
5,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
6,-,-,-,-1,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
7,-,-,-,-,-,-,-,-,-,-,...,3,-,-,-,-,-,-,-,-,1
8,-,-,1,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
9,-,-,-,-,-,-,-,6,-,-,...,-,-,-,-,-,-,-,-,-,1


[{'pieton': 0.2, 'trajet': 0.1, 'circ': 0.1, 'nbv': 0.1, 'infra': 0.1, 'situ': 0.1, 'int': 0.1, 'atm': 0.1, 'age': 0.1, 'catr': 0.0, 'prof': 0.0, 'plan': 0.0, 'surf': 0.0, 'vma': 0.0, 'lum': 0.0, 'agg': 0.0, 'col': 0.0, 'catv': 0.0, 'obs': 0.0, 'obsm': 0.0, 'choc': 0.0, 'sexe_conducteur': 0.0, 'mois': 0.0, 'dep': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  6.29it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,4,4,2,2,1,1,1,0,1,2,...,1,3,3,0,2,8,0,1,52,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,-,-,-,-,...,-,-,2,-,-,-,1,-,-,1
1,-,-,-,-,-,-,-,6,-,-,...,-,-,-,-,-,-,-,-,-,1
2,-,-,1,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1
3,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
4,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
5,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,-,1
6,-,-,-,-,-,-,-,-,-,-,...,-,1,-,-,-,-,-,-,-,1
7,-,-,-,-,-,-,-,-,-,-,...,-,-,5,-,-,-,1,-,-,1
8,-,-,-,-,-,-,-,7,-,-,...,-,-,-,-,-,-,1,-,-,1
9,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,1


[{'pieton': 0.6, 'infra': 0.2, 'catv': 0.2, 'circ': 0.1, 'col': 0.1, 'mois': 0.1, 'trajet': 0.0, 'catr': 0.0, 'nbv': 0.0, 'prof': 0.0, 'plan': 0.0, 'surf': 0.0, 'situ': 0.0, 'vma': 0.0, 'lum': 0.0, 'agg': 0.0, 'int': 0.0, 'atm': 0.0, 'obs': 0.0, 'obsm': 0.0, 'choc': 0.0, 'sexe_conducteur': 0.0, 'dep': 0.0, 'age': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  5.87it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,3,2,2,1,1,1,0,1,3,...,1,2,3,0,2,4,0,0,47,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,2,-,-,-,-,-,...,-,-,1,-,-,-,-,-,77,1
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,5,1
2,-,-,-,-,3,-,-,-,-,-,...,-,-,-,-,-,-,-,-,14,1
3,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,97,1
4,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,97,1
5,-,-,3,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,95,1
6,-,-,-,-,-,-,-,-,-,-,...,7,-,-,-,-,-,-,-,17,1
7,-,-,-,-,-,-,-,-,-,1,...,-,-,-,-,-,-,-,-,90,1
8,-,-,-1,-,-,-,-,-,-,-,...,-,-,-,13,-,-,-,-,99,1
9,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,5,1


[{'age': 1.0, 'mois': 0.4, 'int': 0.3, 'circ': 0.2, 'prof': 0.2, 'vma': 0.1, 'atm': 0.1, 'catv': 0.1, 'obs': 0.1, 'trajet': 0.0, 'catr': 0.0, 'nbv': 0.0, 'plan': 0.0, 'surf': 0.0, 'infra': 0.0, 'situ': 0.0, 'lum': 0.0, 'agg': 0.0, 'col': 0.0, 'obsm': 0.0, 'choc': 0.0, 'pieton': 0.0, 'sexe_conducteur': 0.0, 'dep': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  5.20it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,0,3,2,2,1,1,1,0,1,3,...,1,2,3,0,2,4,0,0,26,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,11,1
1,-,-,-,-,-,-,-,-,6,-,...,-,-,-,-,-,-,-,1,17,1
2,-,-,-,-,-,-,-,-,-,-,...,-,1,-,-,-,-,-,-,52,1
3,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,98,1
4,-,-,-,-,-,-,5,-,-,-,...,-,-,-,-,-,-,-,-,96,1
5,-,-,-,-,-,-,-,6,-,-,...,-,-,-,-,-,-,-,-,84,1
6,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,1,-,1,1
7,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,5,-,-,67,1
8,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,92,1
9,-,-,-,-,3,-,-,-,-,-,...,-,-,-,-,-,-,-,-,86,1


[{'age': 1.0, 'lum': 0.3, 'int': 0.3, 'mois': 0.3, 'dep': 0.3, 'pieton': 0.2, 'prof': 0.1, 'surf': 0.1, 'infra': 0.1, 'situ': 0.1, 'agg': 0.1, 'col': 0.1, 'choc': 0.1, 'sexe_conducteur': 0.1, 'trajet': 0.0, 'catr': 0.0, 'circ': 0.0, 'nbv': 0.0, 'plan': 0.0, 'vma': 0.0, 'atm': 0.0, 'catv': 0.0, 'obs': 0.0, 'obsm': 0.0}]


100%|██████████| 1/1 [00:00<00:00,  3.61it/s]

Query instance (original outcome : 0)





Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-1,4,1,1,1,1,1,0,1,2,...,1,1,3,0,2,1,0,1,37,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,pieton,sexe_conducteur,age,mortal
0,-,-,-,-,-,-,-,9,-,-,...,-,-,5,-,-,-,-,0,74,1
1,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,74,1
2,-,-,-,-,4,-,-,-,-,-,...,-,-,0,-,4,-,-,0,97,1
3,-,-,-,-,-,2,-,-,-,3,...,-,-,-,2,-1,6,-,-,97,1
4,-,-,-,-,-,-,-,-1,-,-,...,-,-,-,-,-,-,-,-,74,1
5,-,-,-,-,4,-,-,-,-,-,...,-,-,0,-,4,-,-,0,97,1
6,-,-,-,-,4,-,-,-,-,-,...,-,-,-,-,4,-,-,-,97,1
7,-,-,-,-,-,-,-,-,-,-,...,3,-,-,-,-,-,-,-,52,1
8,-,-,-,-,4,-,-,-,-,-,...,-,-,-,-,4,-,-,0,97,1
9,-,-,-,-,-,-,-,-1,-,-,...,-,-,-,-,-,7,-,-,74,1


[{'age': 1.0, 'mois': 0.6, 'obsm': 0.5, 'prof': 0.4, 'sexe_conducteur': 0.4, 'infra': 0.3, 'catv': 0.3, 'int': 0.2, 'choc': 0.2, 'plan': 0.1, 'vma': 0.1, 'atm': 0.1, 'obs': 0.1, 'trajet': 0.0, 'catr': 0.0, 'circ': 0.0, 'nbv': 0.0, 'surf': 0.0, 'situ': 0.0, 'lum': 0.0, 'agg': 0.0, 'col': 0.0, 'pieton': 0.0, 'dep': 0.0}]


In [67]:
from BlackBoxAuditing.data import load_from_file
from BlackBoxAuditing.model_factories.AbstractModelFactory import AbstractModelFactory
from BlackBoxAuditing.model_factories.AbstractModelVisitor import AbstractModelVisitor

import BlackBoxAuditing as BBA


(_, train_BBA, _, _, _, _) = load_from_file("train_data.csv",
                      correct_types = np.repeat([int], [len(data_test.columns)]),
                                response_header = 'Y',
                               train_percentage = 1.0)
(headers, _, test_BBA, response_header, features_to_ignore, correct_types) = load_from_file("test_data.csv",
                      correct_types = np.repeat([int], [len(data_test.columns)]),
                                response_header = 'Y',
                               train_percentage = 0.0)
BBA_data = (headers, train_BBA, test_BBA, response_header, features_to_ignore, correct_types)

In [68]:
class HirePredictorBuilder(AbstractModelFactory):
    def __init__(self, *args, **kwargs):
        AbstractModelFactory.__init__(self, *args, **kwargs)
        self.verbose_factory_name = "HirePredictor"
    def build(self, train_set):
        return HirePredictor()

class HirePredictor(AbstractModelVisitor):
    def __init__(self):
        with open( 'clf.pickle', 'rb' ) as f:
            self.clf = pickle.load(f)

    def test(self, test_set, test_name=""):
        df_test = pd.DataFrame(
            test_set, columns =data_test.columns.to_list())
        targets = df_test['Y']
        preds = self.clf.predict(df_test.drop('Y', axis=1))
        return [[a,b] for (a,b) in zip(targets, preds)]

In [69]:
auditor = BBA.Auditor()
auditor.ModelFactory = HirePredictorBuilder
auditor(BBA_data, output_dir = "audit-output")

Training initial model. (14:55:20)
Calculating original model statistics on test data:
	Training Set:
		Conf-Matrix: {0: {0: 60075}, 1: {1: 3379}}
		accuracy: 1.0
		BCR: 1.0
	Testing Set:
		Conf-Matrix {0: {0: 27718, 1: 1617}, 1: {0: 1421, 1: 283}}
		accuracy: 0.9021231354102903
		BCR: 0.5554789720654476
Auditing: 'trajet' (1/24). (14:55:20)
Auditing: 'catr' (2/24). (14:55:29)
Auditing: 'circ' (3/24). (14:55:39)
Auditing: 'nbv' (4/24). (14:55:50)
Auditing: 'prof' (5/24). (14:56:01)
Auditing: 'plan' (6/24). (14:56:13)
Auditing: 'surf' (7/24). (14:56:25)
Auditing: 'infra' (8/24). (14:56:38)
Auditing: 'situ' (9/24). (14:56:50)
Auditing: 'vma' (10/24). (14:57:02)
Auditing: 'mois' (11/24). (14:57:13)
Auditing: 'lum' (12/24). (14:57:21)
Auditing: 'dep' (13/24). (14:57:32)
Auditing: 'agg' (14/24). (14:57:40)
Auditing: 'int' (15/24). (14:57:51)
Auditing: 'atm' (16/24). (14:58:04)
Auditing: 'col' (17/24). (14:58:17)
Auditing: 'catv' (18/24). (14:58:27)
Auditing: 'obs' (19/24). (14:58:39)
Auditi

In [60]:
cols = data_test.columns.to_list()
cols.remove('Y')
cols

['trajet',
 'catr',
 'circ',
 'nbv',
 'prof',
 'plan',
 'surf',
 'infra',
 'situ',
 'vma',
 'mois',
 'lum',
 'dep',
 'agg',
 'int',
 'atm',
 'col',
 'catv',
 'obs',
 'obsm',
 'choc',
 'pieton',
 'sexe_conducteur',
 'age']

In [61]:
def predictproba_fn(data, clf=clf, columns=cols):
  df_test = pd.DataFrame(
            [data], columns=columns )
  preds = clf.predict_proba(df_test)
  return np.squeeze(preds, axis=0)
fc = lambda x: predictproba_fn(x)[1]

In [62]:
query_instance = data_test.drop(columns="Y")[0:1]
x_class = int(clf.predict(query_instance))
query_instance=query_instance.squeeze()
print(query_instance)
print()
print("Prediction for x: {0:.0f}".format(x_class))

trajet              9
catr                4
circ                2
nbv                 2
prof                1
plan                1
surf                1
infra               0
situ                1
vma                 2
mois               10
lum                 1
dep                25
agg                 2
int                 3
atm                 1
col                 3
catv                3
obs                 0
obsm                2
choc                8
pieton              0
sexe_conducteur     1
age                34
Name: 2, dtype: int64

Prediction for x: 0



Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)



In [63]:
fc(query_instance.values)

0.0

In [19]:
X_opposite_class = X_train[clf.predict(X_train) != x_class].copy()
reference = X_opposite_class.sample()
ref_class = int(clf.predict(reference))
reference=reference.squeeze()
print(reference)
print()
print("Prediction for this reference: {0:.0f}".format(ref_class))

trajet              5
catr                3
circ                2
nbv                 2
prof                1
plan                1
surf                2
infra               0
situ                1
vma                 2
mois                1
lum                 3
dep                72
agg                 2
int                 1
atm                 2
col                 1
catv                2
obs                 0
obsm                2
choc                1
pieton              0
sexe_conducteur     1
age                24
Name: 112982, dtype: int64

Prediction for this reference: 1



Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)



In [20]:
fc(reference.values)

1.0

In [21]:
from shapkit.shapley_values import ShapleyValues
from shapkit.inspector import inspector
from shapkit.monte_carlo_shapley import MonteCarloShapley, MonteCarloShapleyBatch
from shapkit.sgd_shapley import SGDshapley
from shapkit.plots import plot_shapley

true_shap = ShapleyValues(x=query_instance, fc=fc, ref=reference)

 29%|██▉       | 7/24 [21:58:45<53:22:42, 11303.68s/it]


KeyboardInterrupt: 