# Accidents corporels de la circulation routière (ACCR) en France

Pour chaque accident corporel (soit un accident survenu sur une voie ouverte à la circulation publique, impliquant au moins un véhicule et ayant fait au moins une victime ayant nécessité des soins), des saisies d’information décrivant l’accident sont effectuées par l’unité des forces de l’ordre (police, gendarmerie, etc.) qui est intervenue sur le lieu de l’accident. Ces saisies sont rassemblées dans une fiche intitulée bulletin d’analyse des accidents corporels. L’ensemble de ces fiches constitue le fichier national des accidents corporels de la circulation dit « Fichier BAAC » administré par l’Observatoire national interministériel de la sécurité routière "ONISR".

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
HERE = Path.cwd().parent.parent
DATASET_PATH = HERE / 'dataset'
DATASET_PATH

WindowsPath('C:/Users/csalhab/OneDrive/Online Sessions/4iabd1/hackaton/hackaton-accr/dataset')

In [3]:
import sys
sys.path.append(str(HERE))

## Dataset 

In [4]:
caracteristiques = pd.read_csv(
    f'{DATASET_PATH}/caract-2023.csv',
    sep=';',
)

lieux = pd.read_csv(
    f'{DATASET_PATH}/lieux-2023.csv', 
    sep=';',
    dtype={'lartpc': str}
)
vehicules = pd.read_csv(
    f'{DATASET_PATH}/vehicules-2023.csv',
    sep=';'
    )

usagers = pd.read_csv(
    f'{DATASET_PATH}/usagers-2023.csv',
    sep=';'
    )

In [5]:
print(caracteristiques.shape)
caracteristiques.head()

(54822, 15)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,col,adr,lat,long
0,202300000001,7,5,2023,06:00,1,75,75101,2,4,2,7,RUE DE RIVOLI,4886638600,232347100
1,202300000002,7,5,2023,05:30,5,94,94080,2,1,3,6,Avenue de Paris,4884547782,242868146
2,202300000003,7,5,2023,20:50,1,94,94022,2,3,2,1,Avenue du Général Leclerc,4876240000,240655000
3,202300000004,6,5,2023,23:57,5,94,94078,2,1,3,5,Rue de Paris,4873248432,244687575
4,202300000005,7,5,2023,00:50,5,94,94068,2,2,3,3,56bis Avenue Raspail,4878581000,249217000


In [6]:
print(vehicules.shape)
vehicules.head()

(93585, 11)


Unnamed: 0,Num_Acc,id_vehicule,num_veh,senc,catv,obs,obsm,choc,manv,motor,occutc
0,202300000001,155 680 557,A01,1,30,0,0,5,1,1,
1,202300000002,155 680 556,A01,2,7,0,1,1,1,1,
2,202300000003,155 680 554,B01,1,2,0,2,1,16,1,
3,202300000003,155 680 555,A01,2,7,0,2,2,15,1,
4,202300000004,155 680 551,B01,1,7,0,2,9,2,4,


In [7]:
print(usagers.shape)
usagers.head()

(125789, 16)


Unnamed: 0,Num_Acc,id_usager,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,203 851 184,155 680 557,A01,1,1,4,1,1978.0,5,2,0,-1,-1,-1,-1
1,202300000002,203 851 182,155 680 556,A01,1,1,1,2,1997.0,9,1,0,-1,-1,-1,-1
2,202300000002,203 851 183,155 680 556,A01,10,3,3,1,1997.0,9,0,-1,-1,2,3,1
3,202300000003,203 851 180,155 680 554,B01,1,1,3,1,1987.0,0,2,6,0,0,0,-1
4,202300000003,203 851 181,155 680 555,A01,1,1,1,2,1984.0,0,1,0,0,0,0,-1


In [8]:
print(lieux.shape)
lieux.head()

(70860, 18)


Unnamed: 0,Num_Acc,catr,voie,v1,v2,circ,nbv,vosp,prof,pr,pr1,plan,lartpc,larrout,surf,infra,situ,vma
0,202300000001,4,RUE DE RIVOLI,0,,1,2,0,1,-1,-1,1,,-1,2,0,1,30
1,202300000001,4,RUE SAINT FLORENTIN,0,,1,1,0,1,-1,-1,1,,-1,2,0,1,30
2,202300000002,3,120,0,,2,3,2,1,-1,-1,1,,-1,2,0,1,50
3,202300000003,3,5,0,,2,4,0,1,1,0,1,,-1,2,5,1,50
4,202300000003,3,87,0,,2,4,0,1,1,0,1,,-1,2,5,1,50


## Data modeling

In [9]:
caract_lieux = pd.merge(caracteristiques, lieux, on='Num_Acc', how='inner')
caract_lieux_veh = pd.merge(caract_lieux, vehicules, on='Num_Acc', how='inner')
final_df = pd.merge(caract_lieux_veh, usagers, on=['Num_Acc', 'id_vehicule'], how='inner')

print(final_df.shape)
final_df.head()

(163683, 56)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,7,5,2023,06:00,1,75,75101,2,4,...,4,1,1978.0,5,2,0,-1,-1,-1,-1
1,202300000001,7,5,2023,06:00,1,75,75101,2,4,...,4,1,1978.0,5,2,0,-1,-1,-1,-1
2,202300000002,7,5,2023,05:30,5,94,94080,2,1,...,1,2,1997.0,9,1,0,-1,-1,-1,-1
3,202300000002,7,5,2023,05:30,5,94,94080,2,1,...,3,1,1997.0,9,0,-1,-1,2,3,1
4,202300000003,7,5,2023,20:50,1,94,94022,2,3,...,3,1,1987.0,0,2,6,0,0,0,-1


In [10]:
final_df.dtypes

Num_Acc          int64
jour             int64
mois             int64
an               int64
hrmn            object
lum              int64
dep             object
com             object
agg              int64
int              int64
atm              int64
col              int64
adr             object
lat             object
long            object
catr             int64
voie            object
v1               int64
v2              object
circ             int64
nbv             object
vosp             int64
prof             int64
pr              object
pr1             object
plan             int64
lartpc          object
larrout         object
surf             int64
infra            int64
situ             int64
vma              int64
id_vehicule     object
num_veh_x       object
senc             int64
catv             int64
obs              int64
obsm             int64
choc             int64
manv             int64
motor            int64
occutc         float64
id_usager       object
num_veh_y  

## Traitement des données

In [11]:
from accr.data.processing import Processing

processing = Processing()

In [12]:
np.unique(final_df.actp)

array([' -1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'],
      dtype=object)

In [13]:
# final_df.dtypes

In [14]:
# remplacer les valeurs vides, -1, 0, (.) par np.nan
clean_df = processing.replace_missing_values(final_df)

In [15]:
print(clean_df.shape)
clean_df.head()

(163683, 56)


Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,...,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202300000001,7,5,2023,06:00,1.0,75,75101,2,4.0,...,4.0,1.0,1978.0,5.0,2.0,,,,,
1,202300000001,7,5,2023,06:00,1.0,75,75101,2,4.0,...,4.0,1.0,1978.0,5.0,2.0,,,,,
2,202300000002,7,5,2023,05:30,5.0,94,94080,2,1.0,...,1.0,2.0,1997.0,9.0,1.0,,,,,
3,202300000002,7,5,2023,05:30,5.0,94,94080,2,1.0,...,3.0,1.0,1997.0,9.0,,,,2.0,3.0,1.0
4,202300000003,7,5,2023,20:50,1.0,94,94022,2,3.0,...,3.0,1.0,1987.0,,2.0,6.0,,,,


In [16]:
np.unique(clean_df.actp.to_string)

array([<bound method Series.to_string of 0         NaN
       1         NaN
       2         NaN
       3           3
       4         NaN
                ...
       163678    NaN
       163679    NaN
       163680    NaN
       163681    NaN
       163682    NaN
       Name: actp, Length: 163683, dtype: object>     ], dtype=object)

In [17]:
# col_to_delete = [
#     'Num_Acc', 'jour', 'mois', 'an','hrmn', 'dep', 'com', 'agg', 'adr', 'lat', 'long', 'voie', 'v1',
#     'v2', 'pr', 'pr1', 'lartpc', 'larrout', 'id_vehicule', 'senc', 'motor', 'occutc', 'id_usager',
#     'place', 'catu', 'sexe', 'an_nais', 'trajet', 'num_veh_x', 'num_veh_y', 'actp', 'nbv'
# ]

col_to_delete = ['Num_Acc', 'id_vehicule', 'id_usager', 'jour', 'mois', 'an','hrmn', 'adr', 
                 'num_veh_x',   'num_veh_y', 'sexe', 'an_nais', 'voie']

In [18]:
all_columns = set(clean_df.columns)
col_to_delete_set = set(col_to_delete)
col_to_keep_set = all_columns - col_to_delete_set
col_to_keep_list = list(col_to_keep_set)

In [19]:
final_cleaned_df = clean_df[col_to_keep_list]
final_cleaned_df

Unnamed: 0,v2,locp,lartpc,dep,catv,secu3,senc,int,secu2,occutc,...,etatp,infra,lum,vma,nbv,circ,catr,plan,manv,obs
0,,,,75,30.0,,1.0,4.0,,,...,,,1.0,30.0,2,1.0,4,1.0,1.0,
1,,,,75,30.0,,1.0,4.0,,,...,,,1.0,30.0,1,1.0,4,1.0,1.0,
2,,,,94,7.0,,2.0,1.0,,,...,,,5.0,50.0,3,2.0,3,1.0,1.0,
3,,2.0,,94,7.0,,2.0,1.0,,,...,1.0,,5.0,50.0,3,2.0,3,1.0,1.0,
4,,,,94,2.0,,1.0,3.0,6.0,,...,,5.0,1.0,50.0,4,2.0,3,1.0,16.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163678,,,,973,30.0,,1.0,6.0,,,...,,,1.0,90.0,2,,2,1.0,9.0,
163679,,,,973,7.0,,1.0,6.0,,,...,,,1.0,90.0,2,,2,1.0,9.0,
163680,,,,69,7.0,,2.0,1.0,,,...,,,1.0,30.0,2,2.0,4,1.0,22.0,
163681,,,,69,7.0,,2.0,1.0,,,...,,,1.0,30.0,2,2.0,4,1.0,22.0,


In [20]:
processing.summary(final_cleaned_df)

{'Missing Values': v2         150060
 locp       150789
 lartpc     163605
 dep             0
 catv          438
 secu3      161620
 senc         9404
 int            10
 secu2      129955
 occutc     160958
 larrout    121905
 motor       13425
 trajet      46273
 catu            0
 obsm        28366
 secu1       18171
 col            28
 lat             0
 v1         163462
 surf          505
 place           2
 vosp       143222
 pr1        101117
 grav          156
 actp       150796
 pr          99075
 com             0
 situ          478
 agg             0
 prof          518
 choc         9264
 atm             7
 long            0
 etatp      150800
 infra      138030
 lum             3
 vma         10360
 nbv         14185
 circ        11247
 catr            0
 plan          399
 manv         9930
 obs        139989
 dtype: int64,
 'Constant Features': []}

In [21]:
preprocessing_df = processing.preprocessing(final_cleaned_df, method="drop", subset=["grav"])
preprocessing_df.shape

(163527, 43)

In [22]:
preprocessing_df = processing.preprocessing(preprocessing_df, method="imputation", strategy="constant", fill_value=0)
preprocessing_df

Unnamed: 0,v2,locp,lartpc,dep,catv,secu3,senc,int,secu2,occutc,...,etatp,infra,lum,vma,nbv,circ,catr,plan,manv,obs
0,D,0.0,3,75,30.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,1.0,4,1.0,1.0,0.0
1,D,0.0,3,75,30.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,1.0,30.0,1,1.0,4,1.0,1.0,0.0
2,D,0.0,3,94,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,5.0,50.0,3,2.0,3,1.0,1.0,0.0
3,D,2.0,3,94,7.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,5.0,50.0,3,2.0,3,1.0,1.0,0.0
4,D,0.0,3,94,2.0,0.0,1.0,3.0,6.0,0.0,...,0.0,5.0,1.0,50.0,4,2.0,3,1.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163678,D,0.0,3,973,30.0,0.0,1.0,6.0,0.0,0.0,...,0.0,0.0,1.0,90.0,2,0.0,2,1.0,9.0,0.0
163679,D,0.0,3,973,7.0,0.0,1.0,6.0,0.0,0.0,...,0.0,0.0,1.0,90.0,2,0.0,2,1.0,9.0,0.0
163680,D,0.0,3,69,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,2.0,4,1.0,22.0,0.0
163681,D,0.0,3,69,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,2.0,4,1.0,22.0,0.0


In [23]:
processing.summary(preprocessing_df)

{'Missing Values': v2         0
 locp       0
 lartpc     0
 dep        0
 catv       0
 secu3      0
 senc       0
 int        0
 secu2      0
 occutc     0
 larrout    0
 motor      0
 trajet     0
 catu       0
 obsm       0
 secu1      0
 col        0
 lat        0
 v1         0
 surf       0
 place      0
 vosp       0
 pr1        0
 grav       0
 actp       0
 pr         0
 com        0
 situ       0
 agg        0
 prof       0
 choc       0
 atm        0
 long       0
 etatp      0
 infra      0
 lum        0
 vma        0
 nbv        0
 circ       0
 catr       0
 plan       0
 manv       0
 obs        0
 dtype: int64,
 'Constant Features': []}

In [24]:
Y = preprocessing_df['grav']
X = preprocessing_df.drop('grav', axis=1)
X

Unnamed: 0,v2,locp,lartpc,dep,catv,secu3,senc,int,secu2,occutc,...,etatp,infra,lum,vma,nbv,circ,catr,plan,manv,obs
0,D,0.0,3,75,30.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,1.0,4,1.0,1.0,0.0
1,D,0.0,3,75,30.0,0.0,1.0,4.0,0.0,0.0,...,0.0,0.0,1.0,30.0,1,1.0,4,1.0,1.0,0.0
2,D,0.0,3,94,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,5.0,50.0,3,2.0,3,1.0,1.0,0.0
3,D,2.0,3,94,7.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,5.0,50.0,3,2.0,3,1.0,1.0,0.0
4,D,0.0,3,94,2.0,0.0,1.0,3.0,6.0,0.0,...,0.0,5.0,1.0,50.0,4,2.0,3,1.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163678,D,0.0,3,973,30.0,0.0,1.0,6.0,0.0,0.0,...,0.0,0.0,1.0,90.0,2,0.0,2,1.0,9.0,0.0
163679,D,0.0,3,973,7.0,0.0,1.0,6.0,0.0,0.0,...,0.0,0.0,1.0,90.0,2,0.0,2,1.0,9.0,0.0
163680,D,0.0,3,69,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,2.0,4,1.0,22.0,0.0
163681,D,0.0,3,69,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,1.0,30.0,2,2.0,4,1.0,22.0,0.0


In [25]:
processing.df_to_numerical(X, drop="first")

AttributeError: 'PreProcessing' object has no attribute 'one_hot_encode'

In [None]:
preprocessing_df.to_csv('../../dataset/preprocessed_data-v2.csv', index=False)