In [84]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from utils import *
#import imblearn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


L'objectif est de prédire si un véhicule donné est impliqué dans un accident mortel ou non.

## Préparation du dataframe
Les attributs présents dans la base de données ne sont pas tous utiles et certains doivent être transformer avant d'être utilisé.

In [85]:
label = "mortal"

# Lecture datasets
df1 = pd.read_csv("dataset/usagers-2022.csv", sep=';')
df2 = pd.read_csv("dataset/lieux-2022.csv", sep=';')
df3 = pd.read_csv("dataset/carcteristiques-2022.csv", sep=';')
df4 = pd.read_csv("dataset/vehicules-2022.csv", sep=';')

df4 = df4.drop(columns=['id_vehicule', 'num_veh'])


df = df1.join(df2.set_index('Num_Acc'), on='Num_Acc')
df = df.join(df3.set_index('Accident_Id'), on='Num_Acc')
df = df.join(df4.set_index('Num_Acc'), on='Num_Acc', lsuffix='_')


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



In [86]:
# Suppression colonnes inutiles
df = df.drop(columns=['voie', 'v1', 'v2', 'pr', 'pr1', 'lartpc', 'larrout'
                      , 'num_veh', 'occutc', 'adr', 'senc','etatp','actp', 
                      'manv', 'jour', 'com', 'hrmn', 'motor', 'place', 'vosp', 'locp'])

df = df.drop_duplicates(subset=['id_usager']) # retire les doublons dans les usagers

# Remplacement des valeurs NaN
df['an_nais'] = df['an_nais'].fillna(df['an_nais'].mode()[0])

# Convertir en entier
df['id_vehicule'] = df['id_vehicule'].apply(lambda l: l[0:3] + l[4:7])
df['id_vehicule'] = df['id_vehicule'].astype(int)
df['sexe'] = df['sexe'].astype(int)
df

Unnamed: 0,Num_Acc,id_usager,id_vehicule,catu,grav,sexe,an_nais,trajet,secu1,secu2,...,agg,int,atm,col,lat,long,catv,obs,obsm,choc
0,202200000001,1 099 700,813952,1,3,1,2008.0,5,2,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
1,202200000001,1 099 701,813953,1,1,1,1948.0,5,1,8,...,2,3,1,3,445594200000,47257200000,2,0,2,1
2,202200000002,1 099 698,813950,1,4,1,1988.0,9,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
3,202200000002,1 099 699,813951,1,1,1,1970.0,4,1,0,...,2,3,1,3,469258100000,63462000000,7,0,2,8
4,202200000003,1 099 696,813948,1,1,1,2002.0,0,1,0,...,2,6,1,2,484931620000,-27604390000,7,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126657,202200055301,968 230,715631,1,1,2,2002.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126658,202200055301,968 231,715631,2,3,2,2004.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126659,202200055301,968 232,715632,1,4,2,1953.0,5,1,-1,...,1,3,1,3,439272650000,19156370000,7,0,0,8
126660,202200055302,968 228,715629,1,3,1,1992.0,1,2,6,...,2,1,1,2,475944040000,13533290000,33,0,2,1


In [87]:
for column in df.columns:
  if df[column].isnull().values.any() == True:
    print(column, df[column].isnull().values.any()) # afficher s'il y a des valeurs nulles

In [88]:
df_2 = df.copy()

# On crée un attribut pour les accidents mortels
df_2['mortal'] = extract_mortal(df)

# Accident impliquant un piéton
has_pedestrian = df[df['catu'] == 3]
p = to_attribute(df, has_pedestrian, 1, 0)
df_2['pieton'] = p

# Sexe du conducteur
driver = df[(df['catu'] == 1) & (df['sexe'] == 1)]
dr = to_attribute(df, driver, 1, 0)
df_2['sexe_conducteur'] = dr


df_2 = df_2.drop_duplicates(subset=['id_vehicule'])

Il faut transformer les types de 'nbv', 'hrmn', 'dep', 'com', 'lat', 'long' de objets à respectivement : int, int, int, int, float, float

Pour dep et com : transformer les dpt corses pour leur donner un nom en chiffre et non en lettre (2A/2B)

In [89]:
# On réduit les carégories de véhicules
df_2 = simplify_catv(df_2)

# On enlève la catégorie peu repésentées qu'on ajoute dans une catégorie autre (identifiant 5)
df_2 = simplify_catr(df_2)

# Découpage en 4 catégories de vitesse
df_2 = split_vma(df_2)
        
# Département en entiers
df_2['dep'] = df_2['dep'].apply(transforme_dpt)
df_2['dep'] = pd.to_numeric(df_2['dep'], errors='coerce', downcast='integer')

# nbv en entier
df_2['nbv'] = pd.to_numeric(df_2['nbv'], errors='coerce', downcast='integer')
df_2['nbv'].fillna(2, inplace=True) # only one entry 

In [90]:
# Lat et long en float :
df_2['lat'] = pd.to_numeric(df_2['lat'], errors='coerce')
df_2['long'] = pd.to_numeric(df_2['long'], errors='coerce')

In [91]:
# Age du conducteur du véhicule
df_2['age'] = get_driver_age(df_2)
df_2['age'] = df_2['age'].fillna(df_2['age'].mode()[0])

# On enlève les attributs qui ne sont plus utiles
df_2 = df_2.drop(columns=['an_nais','grav', 'sexe','catu', 'Num_Acc', 'id_usager', 'id_vehicule',
                          'secu1','secu2','secu3', 'an', 'lat', 'long'])

In [92]:
# Tri manuel des valeurs catégorielles/numériques

# valeurs catégorielles
categorical_features = ['trajet', 'catr', 'circ', 'nbv', 'prof',
                        'plan', 'surf', 'infra', 'situ', 'vma', 'mois', 'lum', 'dep', 'agg', 
                        'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'mortal', 'pieton',
                        'sexe_conducteur']
# valeurs numériques
numerical_features = ['age']

print("numerical : ", numerical_features)
print("categorical : ", categorical_features)

numerical :  ['age']
categorical :  ['trajet', 'catr', 'circ', 'nbv', 'prof', 'plan', 'surf', 'infra', 'situ', 'vma', 'mois', 'lum', 'dep', 'agg', 'int', 'atm', 'col', 'catv', 'obs', 'obsm', 'choc', 'mortal', 'pieton', 'sexe_conducteur']


In [93]:
df_2

Unnamed: 0,trajet,catr,circ,nbv,prof,plan,surf,infra,situ,vma,...,atm,col,catv,obs,obsm,choc,mortal,pieton,sexe_conducteur,age
0,5,4,2,2.0,1,1,1,0,1,2,...,1,3,2,0,2,1,0,0,1,14.0
1,5,4,2,2.0,1,1,1,0,1,2,...,1,3,2,0,2,1,0,0,1,74.0
2,9,4,2,2.0,1,1,1,0,1,2,...,1,3,3,0,2,8,0,0,1,34.0
3,4,4,2,2.0,1,1,1,0,1,2,...,1,3,3,0,2,8,0,0,1,52.0
4,0,3,-1,2.0,1,1,1,5,1,2,...,1,2,3,0,2,1,0,0,1,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126656,5,3,2,2.0,1,2,7,0,3,3,...,9,6,3,2,0,1,1,0,1,27.0
126657,5,3,2,2.0,1,1,1,0,1,3,...,1,3,3,0,0,8,0,0,0,20.0
126659,5,3,2,2.0,1,1,1,0,1,3,...,1,3,3,0,0,8,0,0,0,69.0
126660,1,3,3,4.0,1,1,1,0,1,2,...,1,2,5,0,2,1,0,0,1,30.0


## Analyse de données

In [None]:
val = [len(df_2[df_2.mortal == 1]), len(df_2[df_2.mortal == 0])]
labels = ['Accident mortel', 'Accident non mortel']
px.pie(values=val, names=labels)

val = [len(df_2[df_2.pieton == 1]), len(df_2[df_2.pieton == 0])]
labels = ['Implique piéton', 'N\'implique pas de piéton']
px.pie(values=val, names=labels)

fig = px.histogram(df_2, x="catv")
fig.show()

fig = px.box(df_2, x="age")
fig.show()

fig = px.histogram(df_2, x="sexe_conducteur")
fig.show()

fig = px.histogram(df_2, x="catr")
fig.show()

fig = px.histogram(df_2, x="col")
fig.show()

In [None]:
features = ['age', 'mois', 'catr', 'mortal', 'sexe_conducteur']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("catv", exp, df_2)

In [None]:
features = ['age', 'mois', 'catr', 'mortal']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("vma", exp, df_2)

In [None]:
features = ['age', 'vma', 'mois', 'catr', 'mortal']

for exp in features:
    print(exp)
    analyse_bi_quali_quanti("col", exp, df_2)

## One hot encoding

In [94]:
def oneHotShapingColumns(indicesCat, name, x) :
    indiceTrue = int(name[-1])
    zeroes = []
    uns = []
    for key in indicesCat.keys() :
        if indicesCat[key] == indiceTrue :
            uns.append(key)
        else :
            zeroes.append(key)
    x[name].replace(to_replace=zeroes, value=0, inplace=True)
    x[name].replace(to_replace=uns, value=1, inplace=True)


### Lum :
1. Plein jour
2. Crépuscule ou aube
3. Nuit sans éclairage (fusion 3-4)
4. Nuit avec éclairage public allumé (ancien 5)

In [95]:
def oneHotLum(x) :
    x['lum1'] = x['lum']
    x['lum2'] = x['lum']
    x['lum3'] = x['lum']
    x['lum4'] = x['lum']
    indicesCat = {1:1, 2:2, 3:3, 4:3, 5:4}
    for i in range(1, 5) :
        oneHotShapingColumns(indicesCat, 'lum'+str(i), x)
    x.drop(columns=['lum'], inplace=True)

oneHotLum(df_2)
    

### Col - Type de collision :
1. Deux véhicules - frontale
2. Deux véhicules – par l’arrière 
3. Deux véhicules – par le coté
4. Trois véhicules et plus – en chaîne
5. Trois véhicules et plus - collisions multiples 
6. Autre collision

In [96]:
def oneHotCol(x) :
    x['col1'] = x['col']
    x['col2'] = x['col']
    x['col3'] = x['col']
    x['col4'] = x['col']
    x['col5'] = x['col']
    x['col6'] = x['col']
    indicesCat = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6}
    for i in range(1, 7) :
        oneHotShapingColumns(indicesCat, 'col'+str(i), x)
    x.drop(columns=['col'], inplace=True)

oneHotCol(df_2)

### VMA - Vitesse sur les lieux :
1. v< 50
2. v>=50 && v<=70
3. v> 70 && v< 100
4. v>=100

In [97]:
def oneHotVma(x) :
    x['vma1'] = x['vma']
    x['vma2'] = x['vma']
    x['vma3'] = x['vma']
    x['vma4'] = x['vma']
    indicesCat = {1:1, 2:2, 3:3, 4:4}
    for i in range(1, 5) :
        oneHotShapingColumns(indicesCat, 'vma'+str(i), x)
    x.drop(columns=['vma'], inplace=True)

oneHotVma(df_2)

In [98]:
cat_features = ['trajet', 'catr', 'circ', 'prof', 'plan', 'surf', 'situ', 'atm', 'catv', 'choc']

one_hot_data = pd.get_dummies(data=df_2, columns=cat_features)
df_2 = one_hot_data

df_2

Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,mortal,pieton,...,choc_0,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9
0,2.0,0,10,26,2,3,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2.0,0,10,26,2,3,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2.0,0,10,25,2,3,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2.0,0,10,25,2,3,0,2,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2.0,5,10,22,2,6,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126656,2.0,0,1,74,1,1,2,0,1,0,...,0,1,0,0,0,0,0,0,0,0
126657,2.0,0,1,81,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
126659,2.0,0,1,81,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
126660,4.0,0,3,41,2,1,0,2,0,0,...,0,1,0,0,0,0,0,0,0,0


## Apprentissage avec un arbre de décision

In [99]:
df_3 = df_2.drop(columns=['mortal'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_3, df_2[label], test_size=0.33, random_state=42)

In [100]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

preds = clf.predict(X_test)

clf.score(X_test, y_test)

0.9288073629862426

In [101]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
tn, fp, fn, tp

(28341, 1125, 1095, 622)

In [102]:
import plotly.express as px


fig = px.imshow([[tn, fp], [fn, tp]], text_auto=True, labels=dict(y="Truth", x="Pred"),
                x=["False", "True"],
                y=["False", "True"]
               )
fig.show()

## Générer des contrefactuels

In [103]:
import dice_ml
from dice_ml.utils import helpers

In [104]:
train_dataset = df_2.copy()
train_dataset[label] = y_train
d = dice_ml.Data(dataframe=train_dataset, continuous_features=numerical_features, outcome_name=label)

m = dice_ml.Model(model=clf, backend="sklearn")

exp = dice_ml.Dice(d, m)

In [105]:
# Generate counterfactual examples
query_instance = train_dataset.drop(columns=label)[0:4]
dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class="opposite")
# Visualize counterfactual explanation
dice_exp.visualize_as_dataframe()

100%|██████████| 4/4 [00:02<00:00,  1.78it/s]

Query instance (original outcome : 0)





Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
1,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
2,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
3,2.0,0,10,26,2,7,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1


Query instance (original outcome : 0)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
1,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
2,11.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1
3,2.0,0,10,26,2,3,0,2,0,1,...,1,0,0,0,0,0,0,0,0,1


Query instance (original outcome : 0)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
1,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
2,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
3,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1


Query instance (original outcome : 0)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,nbv,infra,mois,dep,agg,int,obs,obsm,pieton,sexe_conducteur,...,choc_1,choc_2,choc_3,choc_4,choc_5,choc_6,choc_7,choc_8,choc_9,mortal
0,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
1,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
2,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,0,1
3,2.0,0,10,25,2,3,0,2,0,1,...,0,0,0,0,0,0,0,1,1,1
