# Accidents corporels de la circulation routière (ACCR) en France

Pour chaque accident corporel (soit un accident survenu sur une voie ouverte à la circulation publique, impliquant au moins un véhicule et ayant fait au moins une victime ayant nécessité des soins), des saisies d’information décrivant l’accident sont effectuées par l’unité des forces de l’ordre (police, gendarmerie, etc.) qui est intervenue sur le lieu de l’accident. Ces saisies sont rassemblées dans une fiche intitulée bulletin d’analyse des accidents corporels. L’ensemble de ces fiches constitue le fichier national des accidents corporels de la circulation dit « Fichier BAAC » administré par l’Observatoire national interministériel de la sécurité routière "ONISR".

In [22]:
from pathlib import Path
import pandas as pd
import numpy as np

In [23]:
HERE = Path.cwd().parent.parent
DATASET_PATH = HERE / 'dataset'
DATASET_PATH

PosixPath('/home/goamegah/Documents/workspace/develop/esgi/s1/hackaton/automl-road-accident-gravity/dataset')

In [24]:
import sys
sys.path.append(str(HERE))

## Dataset 

In [25]:
caracteristiques = pd.read_csv(
    f'{DATASET_PATH}/caract-2023.csv',
    sep=';',
)

lieux = pd.read_csv(
    f'{DATASET_PATH}/lieux-2023.csv', 
    sep=';',
    dtype={'lartpc': str}
)
vehicules = pd.read_csv(
    f'{DATASET_PATH}/vehicules-2023.csv',
    sep=';'
    )

usagers = pd.read_csv(
    f'{DATASET_PATH}/usagers-2023.csv',
    sep=';'
    )

In [26]:
print(caracteristiques.shape)
caracteristiques.head()

(54822, 15)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,col,adr,lat,long
0,202300000001,7,5,2023,06:00,1,75,75101,2,4,2,7,RUE DE RIVOLI,4886638600,232347100
1,202300000002,7,5,2023,05:30,5,94,94080,2,1,3,6,Avenue de Paris,4884547782,242868146
2,202300000003,7,5,2023,20:50,1,94,94022,2,3,2,1,Avenue du Général Leclerc,4876240000,240655000
3,202300000004,6,5,2023,23:57,5,94,94078,2,1,3,5,Rue de Paris,4873248432,244687575
4,202300000005,7,5,2023,00:50,5,94,94068,2,2,3,3,56bis Avenue Raspail,4878581000,249217000


In [27]:
print(vehicules.shape)
vehicules.head()

(93585, 11)


Unnamed: 0,Num_Acc,id_vehicule,num_veh,senc,catv,obs,obsm,choc,manv,motor,occutc
0,202300000001,155 680 557,A01,1,30,0,0,5,1,1,
1,202300000002,155 680 556,A01,2,7,0,1,1,1,1,
2,202300000003,155 680 554,B01,1,2,0,2,1,16,1,
3,202300000003,155 680 555,A01,2,7,0,2,2,15,1,
4,202300000004,155 680 551,B01,1,7,0,2,9,2,4,


In [28]:
print(usagers.shape)
usagers.head()

(125789, 16)


Unnamed: 0,Num_Acc,id_usager,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,203 851 184,155 680 557,A01,1,1,4,1,1978.0,5,2,0,-1,-1,-1,-1
1,202300000002,203 851 182,155 680 556,A01,1,1,1,2,1997.0,9,1,0,-1,-1,-1,-1
2,202300000002,203 851 183,155 680 556,A01,10,3,3,1,1997.0,9,0,-1,-1,2,3,1
3,202300000003,203 851 180,155 680 554,B01,1,1,3,1,1987.0,0,2,6,0,0,0,-1
4,202300000003,203 851 181,155 680 555,A01,1,1,1,2,1984.0,0,1,0,0,0,0,-1


In [29]:
print(lieux.shape)
lieux.head()

(70860, 18)


Unnamed: 0,Num_Acc,catr,voie,v1,v2,circ,nbv,vosp,prof,pr,pr1,plan,lartpc,larrout,surf,infra,situ,vma
0,202300000001,4,RUE DE RIVOLI,0,,1,2,0,1,-1,-1,1,,-1,2,0,1,30
1,202300000001,4,RUE SAINT FLORENTIN,0,,1,1,0,1,-1,-1,1,,-1,2,0,1,30
2,202300000002,3,120,0,,2,3,2,1,-1,-1,1,,-1,2,0,1,50
3,202300000003,3,5,0,,2,4,0,1,1,0,1,,-1,2,5,1,50
4,202300000003,3,87,0,,2,4,0,1,1,0,1,,-1,2,5,1,50


## Data modeling

In [30]:
caract_lieux = pd.merge(caracteristiques, lieux, on='Num_Acc', how='inner')
caract_lieux_veh = pd.merge(caract_lieux, vehicules, on='Num_Acc', how='inner')
final_df = pd.merge(caract_lieux_veh, usagers, on=['Num_Acc', 'id_vehicule'], how='inner')

print(final_df.shape)
final_df.head()

(163683, 56)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,7,5,2023,06:00,1,75,75101,2,4,...,4,1,1978.0,5,2,0,-1,-1,-1,-1
1,202300000001,7,5,2023,06:00,1,75,75101,2,4,...,4,1,1978.0,5,2,0,-1,-1,-1,-1
2,202300000002,7,5,2023,05:30,5,94,94080,2,1,...,1,2,1997.0,9,1,0,-1,-1,-1,-1
3,202300000002,7,5,2023,05:30,5,94,94080,2,1,...,3,1,1997.0,9,0,-1,-1,2,3,1
4,202300000003,7,5,2023,20:50,1,94,94022,2,3,...,3,1,1987.0,0,2,6,0,0,0,-1


In [31]:
final_df.dtypes

Num_Acc          int64
jour             int64
mois             int64
an               int64
hrmn            object
lum              int64
dep             object
com             object
agg              int64
int              int64
atm              int64
col              int64
adr             object
lat             object
long            object
catr             int64
voie            object
v1               int64
v2              object
circ             int64
nbv             object
vosp             int64
prof             int64
pr              object
pr1             object
plan             int64
lartpc          object
larrout         object
surf             int64
infra            int64
situ             int64
vma              int64
id_vehicule     object
num_veh_x       object
senc             int64
catv             int64
obs              int64
obsm             int64
choc             int64
manv             int64
motor            int64
occutc         float64
id_usager       object
num_veh_y  

## Traitement des données

In [32]:
from accr.data.processing import Processing

processing = Processing()

In [33]:
np.unique(final_df.actp)

array([' -1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'],
      dtype=object)

In [34]:
# final_df.dtypes

In [35]:
# remplacer les valeurs vides, -1, 0, (.) par np.nan
clean_df = processing.replace_missing_values(final_df)

In [36]:
print(clean_df.shape)
clean_df.head()

(163683, 56)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,7,5,2023,06:00,1.0,75,75101,2,4.0,...,4.0,1.0,1978.0,5.0,2.0,,,,,
1,202300000001,7,5,2023,06:00,1.0,75,75101,2,4.0,...,4.0,1.0,1978.0,5.0,2.0,,,,,
2,202300000002,7,5,2023,05:30,5.0,94,94080,2,1.0,...,1.0,2.0,1997.0,9.0,1.0,,,,,
3,202300000002,7,5,2023,05:30,5.0,94,94080,2,1.0,...,3.0,1.0,1997.0,9.0,,,,2.0,3.0,1.0
4,202300000003,7,5,2023,20:50,1.0,94,94022,2,3.0,...,3.0,1.0,1987.0,,2.0,6.0,,,,


In [37]:
np.unique(clean_df.actp.to_string)

array([<bound method Series.to_string of 0         NaN
       1         NaN
       2         NaN
       3           3
       4         NaN
                ...
       163678    NaN
       163679    NaN
       163680    NaN
       163681    NaN
       163682    NaN
       Name: actp, Length: 163683, dtype: object>     ], dtype=object)

In [38]:
clean_df.columns

Index(['Num_Acc', 'jour', 'mois', 'an', 'hrmn', 'lum', 'dep', 'com', 'agg',
       'int', 'atm', 'col', 'adr', 'lat', 'long', 'catr', 'voie', 'v1', 'v2',
       'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan', 'lartpc', 'larrout',
       'surf', 'infra', 'situ', 'vma', 'id_vehicule', 'num_veh_x', 'senc',
       'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'occutc', 'id_usager',
       'num_veh_y', 'place', 'catu', 'grav', 'sexe', 'an_nais', 'trajet',
       'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp'],
      dtype='object')

In [39]:
# lean_df.to_csv(f'{DATASET_PATH}/cleaned_data.csv', index=False)

In [40]:
col_to_delete = [
    'Num_Acc', 'jour', 'mois', 'an','hrmn', 'dep', 'com', 'agg', 'adr', 'lat', 'long', 'voie', 'v1',
    'v2', 'pr', 'pr1', 'lartpc', 'larrout', 'id_vehicule', 'senc', 'motor', 'occutc', 'id_usager',
    'place', 'catu', 'sexe', 'an_nais', 'trajet', 'num_veh_x', 'num_veh_y', 'actp', 'nbv', 'secu2', 'secu3'
]

In [41]:
all_columns = set(clean_df.columns)
col_to_delete_set = set(col_to_delete)
col_to_keep_set = all_columns - col_to_delete_set
col_to_keep_list = list(col_to_keep_set)

In [42]:
col_to_keep_list

['vma',
 'surf',
 'etatp',
 'grav',
 'lum',
 'locp',
 'catv',
 'manv',
 'col',
 'choc',
 'infra',
 'situ',
 'int',
 'plan',
 'atm',
 'prof',
 'obsm',
 'catr',
 'vosp',
 'circ',
 'obs',
 'secu1']

In [43]:
final_cleaned_df = clean_df[col_to_keep_list]
final_cleaned_df

Unnamed: 0,vma,surf,etatp,grav,lum,locp,catv,manv,col,choc,...,int,plan,atm,prof,obsm,catr,vosp,circ,obs,secu1
0,30.0,2.0,,4.0,1.0,,30.0,1.0,7.0,5.0,...,4.0,1.0,2.0,1.0,,4,,1.0,,2.0
1,30.0,2.0,,4.0,1.0,,30.0,1.0,7.0,5.0,...,4.0,1.0,2.0,1.0,,4,,1.0,,2.0
2,50.0,2.0,,1.0,5.0,,7.0,1.0,6.0,1.0,...,1.0,1.0,3.0,1.0,1.0,3,2.0,2.0,,1.0
3,50.0,2.0,1.0,3.0,5.0,2.0,7.0,1.0,6.0,1.0,...,1.0,1.0,3.0,1.0,1.0,3,2.0,2.0,,
4,50.0,2.0,,3.0,1.0,,2.0,16.0,1.0,1.0,...,3.0,1.0,2.0,1.0,2.0,3,,2.0,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163678,90.0,1.0,,4.0,1.0,,30.0,9.0,3.0,3.0,...,6.0,1.0,1.0,1.0,2.0,2,1.0,,,2.0
163679,90.0,1.0,,1.0,1.0,,7.0,9.0,3.0,2.0,...,6.0,1.0,1.0,1.0,2.0,2,1.0,,,1.0
163680,30.0,1.0,,1.0,1.0,,7.0,22.0,3.0,7.0,...,1.0,1.0,6.0,1.0,2.0,4,2.0,2.0,,1.0
163681,30.0,1.0,,1.0,1.0,,7.0,22.0,3.0,7.0,...,1.0,1.0,6.0,1.0,2.0,4,2.0,2.0,,1.0


In [44]:
processing.summary(final_cleaned_df)

{'Missing Values': vma       10360
 surf        505
 etatp    150800
 grav        156
 lum           3
 locp     150789
 catv        438
 manv       9930
 col          28
 choc       9264
 infra    138030
 situ        478
 int          10
 plan        399
 atm           7
 prof        518
 obsm      28366
 catr          0
 vosp     143222
 circ      11247
 obs      139989
 secu1     18171
 dtype: int64,
 'Constant Features': []}

In [45]:
preprocessing_df = processing.preprocessing(final_cleaned_df, method="drop", subset=["grav"])
preprocessing_df.shape

(163527, 22)

In [46]:
preprocessing_df = preprocessing_df.drop_duplicates()
preprocessing_df = preprocessing_df.reset_index(drop=True)

In [47]:
preprocessing_df.shape

(124776, 22)

In [48]:
preprocessing_df = processing.preprocessing(preprocessing_df, method="imputation", strategy="mean")
preprocessing_df

Unnamed: 0,vma,surf,etatp,grav,lum,locp,catv,manv,col,choc,...,int,plan,atm,prof,obsm,catr,vosp,circ,obs,secu1
0,30.0,2.0,1.254388,4.0,1.0,3.334744,30.0,1.0,7.0,5.0,...,4.0,1.0,2.0,1.0,1.960718,4,1.872982,1.000000,7.118126,2.00000
1,50.0,2.0,1.254388,1.0,5.0,3.334744,7.0,1.0,6.0,1.0,...,1.0,1.0,3.0,1.0,1.000000,3,2.000000,2.000000,7.118126,1.00000
2,50.0,2.0,1.000000,3.0,5.0,2.000000,7.0,1.0,6.0,1.0,...,1.0,1.0,3.0,1.0,1.000000,3,2.000000,2.000000,7.118126,2.39607
3,50.0,2.0,1.254388,3.0,1.0,3.334744,2.0,16.0,1.0,1.0,...,3.0,1.0,2.0,1.0,2.000000,3,1.872982,2.000000,7.118126,2.00000
4,50.0,2.0,1.254388,1.0,1.0,3.334744,7.0,15.0,1.0,2.0,...,3.0,1.0,2.0,1.0,2.000000,3,1.872982,2.000000,7.118126,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124771,30.0,1.0,1.254388,4.0,1.0,3.334744,7.0,26.0,7.0,4.0,...,1.0,2.0,1.0,2.0,9.000000,4,1.872982,2.000000,7.118126,1.00000
124772,90.0,1.0,1.254388,4.0,1.0,3.334744,30.0,9.0,3.0,3.0,...,6.0,1.0,1.0,1.0,2.000000,2,1.000000,1.928738,7.118126,2.00000
124773,90.0,1.0,1.254388,1.0,1.0,3.334744,7.0,9.0,3.0,2.0,...,6.0,1.0,1.0,1.0,2.000000,2,1.000000,1.928738,7.118126,1.00000
124774,30.0,1.0,1.254388,1.0,1.0,3.334744,7.0,22.0,3.0,7.0,...,1.0,1.0,6.0,1.0,2.000000,4,2.000000,2.000000,7.118126,1.00000


In [49]:
processing.summary(preprocessing_df)

{'Missing Values': vma      0
 surf     0
 etatp    0
 grav     0
 lum      0
 locp     0
 catv     0
 manv     0
 col      0
 choc     0
 infra    0
 situ     0
 int      0
 plan     0
 atm      0
 prof     0
 obsm     0
 catr     0
 vosp     0
 circ     0
 obs      0
 secu1    0
 dtype: int64,
 'Constant Features': []}

In [50]:
Y = preprocessing_df['grav']
X = preprocessing_df.drop('grav', axis=1)
X

Unnamed: 0,vma,surf,etatp,lum,locp,catv,manv,col,choc,infra,...,int,plan,atm,prof,obsm,catr,vosp,circ,obs,secu1
0,30.0,2.0,1.254388,1.0,3.334744,30.0,1.0,7.0,5.0,5.364261,...,4.0,1.0,2.0,1.0,1.960718,4,1.872982,1.000000,7.118126,2.00000
1,50.0,2.0,1.254388,5.0,3.334744,7.0,1.0,6.0,1.0,5.364261,...,1.0,1.0,3.0,1.0,1.000000,3,2.000000,2.000000,7.118126,1.00000
2,50.0,2.0,1.000000,5.0,2.000000,7.0,1.0,6.0,1.0,5.364261,...,1.0,1.0,3.0,1.0,1.000000,3,2.000000,2.000000,7.118126,2.39607
3,50.0,2.0,1.254388,1.0,3.334744,2.0,16.0,1.0,1.0,5.000000,...,3.0,1.0,2.0,1.0,2.000000,3,1.872982,2.000000,7.118126,2.00000
4,50.0,2.0,1.254388,1.0,3.334744,7.0,15.0,1.0,2.0,5.000000,...,3.0,1.0,2.0,1.0,2.000000,3,1.872982,2.000000,7.118126,1.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124771,30.0,1.0,1.254388,1.0,3.334744,7.0,26.0,7.0,4.0,5.364261,...,1.0,2.0,1.0,2.0,9.000000,4,1.872982,2.000000,7.118126,1.00000
124772,90.0,1.0,1.254388,1.0,3.334744,30.0,9.0,3.0,3.0,5.364261,...,6.0,1.0,1.0,1.0,2.000000,2,1.000000,1.928738,7.118126,2.00000
124773,90.0,1.0,1.254388,1.0,3.334744,7.0,9.0,3.0,2.0,5.364261,...,6.0,1.0,1.0,1.0,2.000000,2,1.000000,1.928738,7.118126,1.00000
124774,30.0,1.0,1.254388,1.0,3.334744,7.0,22.0,3.0,7.0,5.364261,...,1.0,1.0,6.0,1.0,2.000000,4,2.000000,2.000000,7.118126,1.00000


In [52]:
X.dtypes

vma      float64
surf     float64
etatp    float64
lum      float64
locp     float64
catv     float64
manv     float64
col      float64
choc     float64
infra    float64
situ     float64
int      float64
plan     float64
atm      float64
prof     float64
obsm     float64
catr       int64
vosp     float64
circ     float64
obs      float64
secu1    float64
dtype: object

In [51]:
processing.df_to_numerical(X, drop="first")

ValueError: No objects to concatenate

In [53]:
preprocessing_df.to_csv('../../dataset/preprocessed_data-v2.csv', index=False)