In [None]:
#Les imports 

%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from utils import *
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing

from dataset_prepare import load_dataset, pred_thres

In [None]:
# Name of the outcome column; reused later as the DiCE `outcome_name`.
label = "mortal"

# Load the prepared dataset through the project helper.
df_2 = load_dataset()

In [None]:
# Categorical features (21 columns)
categorical_features = [
    'trajet', 'catr', 'circ', 'nbv', 'prof',
    'plan', 'surf', 'vma', 'lum', 'agg',
    'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'pieton',
    'sexe_conducteur', 'infra', 'situ',
]

# Numerical features (3 columns); passed to DiCE as the continuous features
numerical_features = ['mois', 'dep', 'age']

# Echo both lists so the reader can check the feature configuration
for caption, columns in (("numerical : ", numerical_features),
                         ("categorical : ", categorical_features)):
    print(caption, columns)

In [None]:
from sklearn.model_selection import train_test_split

# The split is made on accident numbers rather than on rows, so that all the
# vehicles involved in the same accident end up on the same side: no accident
# is shared between the training and the test data.
# NOTE: this cell drops 'Num_Acc' from df_2 at the end, so it cannot be re-run
# without first re-running the cell that loads df_2.

unique_accidents = df_2['Num_Acc'].unique()  # distinct accident numbers

df_3 = df_2.drop(columns=['mortal'])

# Draw the train / test accident numbers
train_accidents, test_accidents = train_test_split(
    unique_accidents,
    test_size=0.33,
    random_state=42,
)

# Select the vehicle rows that belong to each group of accidents
train_df = df_2[df_2['Num_Acc'].isin(train_accidents)]
test_df = df_2[df_2['Num_Acc'].isin(test_accidents)]

# Targets, then feature frames (without the target and the accident id)
y_train, y_test = train_df['mortal'], test_df['mortal']
X_train = train_df.drop(columns=['mortal', 'Num_Acc'])
X_test = test_df.drop(columns=['mortal', 'Num_Acc'])

# The accident id is no longer needed on the full frame
df_2 = df_2.drop(columns='Num_Acc')

In [None]:
# Presumably required so that pickle can resolve the model's class when
# deserialising -- TODO confirm.
from utils import custom_RFC

# Load the trained model. The file is opened in a `with` block so the handle
# is closed as soon as the model is read.
# NOTE(security): pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open('models/rfc_model.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

# Class predictions on the test set, derived from the predicted probabilities
# with the project helper `pred_thres` and a 0.2 threshold.
preds = pred_thres(clf.predict_proba(X_test), 0.2)

# Displayed as the cell output: model score on the train and test sets.
# These come from clf.score, not from the thresholded `preds` above.
clf.score(X_train, y_train), clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Flatten the confusion matrix of the thresholded predictions into its four
# cells; the 4-way unpacking requires a 2x2 (binary) matrix.
tn, fp, fn, tp = confusion_matrix(y_test, preds).flatten()

# Cell output: the four counts
(tn, fp, fn, tp)

In [None]:
import plotly.express as px

# Heatmap of the confusion matrix: rows are the true class, columns the
# predicted class, and each cell shows its count.
fig = px.imshow(
    [[tn, fp], [fn, tp]],
    x=["False", "True"],
    y=["False", "True"],
    labels={"y": "Truth", "x": "Pred"},
    text_auto=True,
)
fig.show()

- RandomForest (tn, fp, fn, tp): 26326, 3009, 1002, 702

In [None]:
import pickle

# Test features with the target appended as column "Y", written to CSV
# (re-read later by the BlackBoxAuditing loader).
data_test = X_test.assign(Y=y_test)
data_test.to_csv("test_data.csv", index=False)

# Same layout for the training data.
data_train = X_train.assign(Y=y_train)
data_train.to_csv("train_data.csv", index=False)

# Persist the classifier so that it can be reloaded from disk.
with open('clf.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
import dice_ml
from dice_ml.utils import helpers
from IPython.core.display import HTML

In [None]:
# Build the DiCE dataset from the training rows only.
# The frame is built from X_train (not df_2): df_2 holds every row, train and
# test, so assigning y_train to it would align on the index and overwrite the
# outcome with NaN for every row that is not in the training set.
train_dataset = X_train.copy()
train_dataset[label] = y_train
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical_features, outcome_name=label)

# Wrap the fitted classifier for DiCE (scikit-learn backend)
m = dice_ml.Model(model=clf, backend="sklearn")

# Counterfactual explainer built from the data and model wrappers
exp = dice_ml.Dice(d, m)

In [None]:
# Counterfactual generation for the first 5 test instances

# Set the display options once, before anything is rendered, so that every
# iteration (including the first) is shown without row/column truncation.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Feature-only view of the test data, computed once instead of on every iteration
query_features = data_test.drop(columns="Y")

for i in range(5):
    # One-row DataFrame holding the i-th test instance
    query_instance = query_features.iloc[i:i + 1]
    dice_exp = exp.generate_counterfactuals(
        query_instance,
        total_CFs=5,
        sample_size=100000,
        desired_class="opposite")
    # Visualize counterfactual explanation
    dice_exp.visualize_as_dataframe(show_only_changes=True)
    # Instance local feature importance
    imp = exp.local_feature_importance(query_instance, cf_examples_list=dice_exp.cf_examples_list)
    print(imp.local_importance)

In [None]:
# t  : positions in `preds` whose prediction equals 1
# t2 : all the remaining positions
t = [position for position, prediction in enumerate(preds) if prediction == 1]
t2 = [position for position, prediction in enumerate(preds) if not (prediction == 1)]

In [None]:
# Set up the parallel SHAP computations with n_iter=10
# (presumably 10 explained instances per run; the plotting cells read results 0..9 -- TODO confirm).
# `s` works on the indices in `t` (predictions equal to 1), `s2` on the indices in `t2` (the rest).
# NOTE(review): data_test still contains the "Y" column; confirm that parallel_shap expects it.
s = parallel_shap(data_test, X_train, clf, t, n_iter=10)
s2 = parallel_shap(data_test, X_train, clf, t2, n_iter=10)

In [None]:
# Start the computations for both runs (the plotting cells then read `s.results` / `s2.results`)
s.start_shap()
s2.start_shap()

In [None]:
from shapkit.plots import plot_shapley

# Plot the first 10 results of the run `s` (indices whose prediction is 1)
for i in range(10):
    true_shap, query_instance, reference = s.results[i]
    fig = plot_shapley(
        x=query_instance,
        fc=s.fc,
        ref=reference,
        shapley_values=true_shap,
        n_attributes=24,
    )

In [None]:
from shapkit.plots import plot_shapley

# Plot the first 10 results of the run `s2` (indices whose prediction is not 1)
for i in range(10):
    true_shap, query_instance, reference = s2.results[i]
    fig = plot_shapley(
        x=query_instance,
        fc=s2.fc,
        ref=reference,
        shapley_values=true_shap,
        n_attributes=24,
    )

In [None]:
from BlackBoxAuditing.data import load_from_file
from BlackBoxAuditing.model_factories.AbstractModelFactory import AbstractModelFactory
from BlackBoxAuditing.model_factories.AbstractModelVisitor import AbstractModelVisitor

import BlackBoxAuditing as BBA


# Training rows: load train_data.csv with train_percentage=1.0 and keep only
# the second element of the returned tuple. Every column of data_test
# (features and 'Y') is declared as int.
train_BBA = load_from_file(
    "train_data.csv",
    correct_types=np.repeat([int], [len(data_test.columns)]),
    response_header='Y',
    train_percentage=1.0,
)[1]

# Test rows: load test_data.csv with train_percentage=0.0; the metadata
# (headers, response header, ignored features, types) is taken from this call.
test_split = load_from_file(
    "test_data.csv",
    correct_types=np.repeat([int], [len(data_test.columns)]),
    response_header='Y',
    train_percentage=0.0,
)
headers, _, test_BBA, response_header, features_to_ignore, correct_types = test_split

# Tuple in the same layout as load_from_file returns, handed to the auditor
BBA_data = (headers, train_BBA, test_BBA, response_header, features_to_ignore, correct_types)

In [None]:
class HirePredictorBuilder(AbstractModelFactory):
    """Model factory assigned to the BlackBoxAuditing auditor.

    No training happens here: `build` returns a `HirePredictor`, which
    loads the already-fitted classifier from disk.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base factory and set the factory name."""
        super().__init__(*args, **kwargs)
        self.verbose_factory_name = "HirePredictor"

    def build(self, train_set):
        """Return a new `HirePredictor`; `train_set` is not used."""
        return HirePredictor()

class HirePredictor(AbstractModelVisitor):
    """Model visitor wrapping the classifier stored in 'clf.pickle'."""

    def __init__(self):
        """Load the pickled classifier from the working directory."""
        with open('clf.pickle', 'rb') as model_file:
            self.clf = pickle.load(model_file)

    def test(self, test_set, test_name=""):
        """Predict on `test_set` and pair each target with its prediction.

        `test_set` is turned into a DataFrame using the column names of the
        notebook-level `data_test` (features plus 'Y'); `test_name` is unused.

        Returns a list of `[target, prediction]` pairs, one per row.
        """
        column_names = data_test.columns.to_list()
        frame = pd.DataFrame(test_set, columns=column_names)
        actual = frame['Y']
        predicted = self.clf.predict(frame.drop('Y', axis=1))
        return [[truth, guess] for truth, guess in zip(actual, predicted)]

In [None]:
# Run the BlackBoxAuditing audit: create the auditor, set its model factory
# (must be done before the call), then run it on the data tuple with
# "audit-output" as the output directory.
auditor = BBA.Auditor()
auditor.ModelFactory = HirePredictorBuilder
auditor(BBA_data, output_dir = "audit-output")