### Imports

In [129]:
import os
import pandas as pd
import json
import folium
import matplotlib.pyplot as plt
import numpy as np
import re
import math
import numpy as np

### Constants

In [130]:
# Folder whose files are each read with pd.read_csv into their own DataFrame.
DATA_FOLDER = 'data_clean'
# Matches a string that is exactly four digits, the first being 1 or 2
# (e.g. '1992', '2017'); used to recognise the year columns of the tables.
IS_DATE = re.compile("^[1-2]{1}[0-9]{3}$")

### Load Dataframe

In [137]:
def describeDf(df,name="DESCRIPTION",level=1):
    """Print a textual overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe; its column labels are matched as strings.
    name : str
        Title shown in the banner line.
    level : int
        With 1, every descriptive column is printed with its distinct values,
        followed by the list of value columns (labels matched by IS_DATE or
        starting with "<digits> an"). With any other value only the column
        names are printed.
    """
    print("_________________ " + name+" _________________")

    # Short form: column names only.
    if level != 1:
        print(df.columns.values)
        print("\n\n\n\n")
        return

    age_label = re.compile("[0-9]+ an")
    value_columns = []
    print("\n")
    for column in df.columns:
        # Year / yearly-age columns hold measurements, not attributes: just list them.
        if IS_DATE.match(column) or age_label.match(column):
            value_columns.append(column)
            continue
        print("         ATTRIBUTE: "+column)
        print("   "+str(df[column].unique()))
        print("\n")
    print("         VALUES: "+str(value_columns))
    print("\n\n\n\n")

print("Dataframes available:\n\n")
# os.listdir returns entries in arbitrary order; sorting makes the cell output
# identical on every run.
for filename in sorted(os.listdir(DATA_FOLDER)):
    path = os.path.join(DATA_FOLDER, filename)
    # Skip sub-directories (e.g. Jupyter's `.ipynb_checkpoints`), which
    # pd.read_csv cannot open.
    if not os.path.isfile(path):
        continue
    # Up to two extensions are stripped: `name.csv.gz` -> `df_name`.
    tablename = "df_" + os.path.splitext(os.path.splitext(filename)[0])[0]
    # 'Unnamed: 0' is presumably the index written when the CSV was saved;
    # it carries no information and is dropped.
    table = pd.read_csv(path).drop(columns=['Unnamed: 0'])
    # Each table is published as a notebook-level variable named after its file.
    globals()[tablename] = table
    describeDf(table, name=tablename, level=0)
    

Dataframes available:


_________________ df_Accident_acteurs _________________
['voiture_index' 'voiture' 'sexe_index' 'sexe' 'role_index' 'role'
 'age_index' 'age' 'consequence_index' 'consequence' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015' '2016' '2017']





_________________ df_Accident_circonstances _________________
['mois_index' 'mois' 'day_index' 'day' 'hours_index' 'hours'
 'gravite_index' 'gravite' 'lieu_index' 'lieu' 'contexte_index' 'contexte'
 'circonstance_index' 'circonstance' '1992' '1993' '1994' '1995' '1996'
 '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004' '2005' '2006'
 '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014' '2015' '2016'
 '2017']





_________________ df_Accident_objets _________________
['voiture_index' 'voiture' 'age_index' 'age' 'sexe_index' 'sexe'
 'annee_permis_index' 'annee_permis' 'faute_index' 'faute'
 'co

### Describe a DataFrame

In [174]:
describeDf(df_Accident_acteurs)

_________________ DESCRIPTION _________________


         ATTRIBUTE: voiture_index
   [210. 211. 214. 220. 221. 222. 223. 224. 227. 228. 229.]


         ATTRIBUTE: voiture
   ['Voiture de tourisme' 'Véhicule de transport de personnes'
 'Véhicule de transport de choses' 'Cycle' 'Cyclomoteur' 'Motocycle léger'
 "Motocycle jusqu'à 125 cm3" 'Motocycle de plus de 125 cm3' 'Piéton'
 'Autre véhicule non motorisé' 'Autre et inconnu']


         ATTRIBUTE: sexe_index
   [1. 2. 9.]


         ATTRIBUTE: sexe
   ['Homme' 'Femme' 'Inconnu']


         ATTRIBUTE: role_index
   [300. 301. 303.]


         ATTRIBUTE: role
   ['Conducteur' 'Piéton' 'Passager']


         ATTRIBUTE: age_index
   [  0.   5.   6.  10.  15.  18.  20.  21.  30.  40.  50.  60.  70. 999.]


         ATTRIBUTE: age
   ["jusqu'à 4 ans" '5 ans' '6-9 ans' '10-14 ans' '15-17 ans' '18-19 ans'
 '20 ans' '21-29 ans' '30-39 ans' '40-49 ans' '50-59 ans' '60-69 ans'
 '70 ans et +' 'Inconnu']


         ATTRIBUTE: consequence_index
  

## A. Prepare Data

### A.1. Get the Swiss population

In [138]:
def build_age(intervales):
    """Expand consecutive age bounds into the yearly age labels they cover.

    Parameters
    ----------
    intervales : sequence of numbers
        Lower bounds of the age ranges, in the order they should be paired.
        The last value only closes the previous range and gets no entry of
        its own.

    Returns
    -------
    list of list
        One list per range: the lower bound exactly as given, followed by a
        label for every whole year from that bound up to (excluding) the next
        bound -- '1 an' for year 1, '<k> ans' for any other year below 100,
        and a final '100 ans ou plus' once year 100 is reached.
    """
    groups = []
    for position in range(1, len(intervales)):
        lower, upper = intervales[position - 1], intervales[position]
        labels = [lower]
        for year in range(int(lower), int(upper)):
            if year == 100:
                # Every age from 100 upwards shares one label, so stop here.
                labels.append('100 ans ou plus')
                break
            if year > 100:
                # Years above 100 have no label of their own.
                continue
            labels.append(('{} an' if year == 1 else '{} ans').format(year))
        groups.append(labels)
    return groups

In [151]:
# Age ranges used by the accident data, each paired with the yearly age columns
# it covers. build_age pairs consecutive bounds, so they are sorted explicitly
# rather than relying on the order in which they appear in the table.
intervales = build_age(np.sort(df_Accident_acteurs.age_index.unique()))

# Add one column per age range to the population table, named after the
# range's lower bound (e.g. '0.0') and holding the sum of its yearly ages.
for elem in intervales:
    df_Population_canton[str(elem[0])] = df_Population_canton.loc[:,elem[1:]].sum(axis=1)

toKeep = [str(elem[0]) for elem in intervales] + ['annee', 'sexe']

# Whole of Switzerland, split by sex (the 'Sexe - Total' aggregate is excluded),
# all marital statuses together.
is_swiss_by_sex = (
    (df_Population_canton['canton'] == 'Suisse')
    & (df_Population_canton['sexe'] != 'Sexe - Total')
    & (df_Population_canton.etat_civil == "Etat civil - Total")
)
# .copy() gives an independent frame, so the 'annee' assignment below does not
# write into a slice of df_Population_canton (SettingWithCopyWarning).
df_pop_Suisse = df_Population_canton.loc[is_swiss_by_sex, toKeep].copy()
df_pop_Suisse['annee'] = df_pop_Suisse['annee'].map(int).map(str)

# Rows become age ranges (as floats, to compare with `age_index`),
# columns become (annee, sexe) pairs.
df_pop_Suisse = df_pop_Suisse.groupby(['annee','sexe']).sum().transpose()
df_pop_Suisse.index = df_pop_Suisse.index.map(float)
display(df_pop_Suisse.head(2))

annee,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,2017,2017
sexe,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme
0.0,191197.0,202097.0,195313.0,206364.0,198173.0,210120.0,201280.0,213036.0,204828.0,216474.0,207157.0,219664.0,210312.0,222765.0,212201.0,224251.0
5.0,37402.0,39653.0,37807.0,40014.0,38743.0,40693.0,39450.0,42127.0,40359.0,42763.0,41844.0,43688.0,41177.0,43629.0,41704.0,44528.0


> `df_pop_Suisse` contains the Swiss population, with **year** and **sex** (`sexe`) as *columns* and **age range** as *rows*.

### A.2. Normalize the data by the population

In [140]:
# Rows with sexe_index 9 ('Inconnu') have no population counterpart and are
# left out. .copy() yields an independent frame so the .loc assignments below
# do not write into a slice of df_Accident_acteurs (SettingWithCopyWarning).
df_Accident_acteurs_normalized = df_Accident_acteurs[df_Accident_acteurs['sexe_index'] != 9].copy()

# Turn each yearly count into a rate per 10,000 inhabitants of the same
# age range and sex.
for year, sexe in df_pop_Suisse.columns.values:
    for age in df_pop_Suisse.index.values:
        population = df_pop_Suisse.loc[df_pop_Suisse.index == age, year][sexe].values[0]
        rows = (df_Accident_acteurs_normalized.age_index == age) \
            & (df_Accident_acteurs_normalized.sexe == sexe)
        df_Accident_acteurs_normalized.loc[rows, year] = \
            df_Accident_acteurs_normalized.loc[rows, year] * 10000 / population

# Year columns without population data were not normalized and still hold raw
# counts; drop them instead of hard-coding which years those are.
normalized_years = set(df_pop_Suisse.columns.get_level_values(0))
toDrop = [col for col in df_Accident_acteurs_normalized.columns
          if IS_DATE.match(col) and col not in normalized_years]
df_Accident_acteurs_normalized = df_Accident_acteurs_normalized.drop(columns=toDrop)

# age_index 999 ('Inconnu') has no matching population range, so those rows
# were never normalized and are removed.
df_Accident_acteurs_normalized = df_Accident_acteurs_normalized[df_Accident_acteurs_normalized.age_index != 999]
display(df_Accident_acteurs_normalized.head(2))

Unnamed: 0,voiture_index,voiture,sexe_index,sexe,role_index,role,age_index,age,consequence_index,consequence,2010,2011,2012,2013,2014,2015,2016,2017
0,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,0.0,jusqu'à 4 ans,315.0,Blessé léger,0.0,0.0,0.047592,0.0,0.0,0.0,0.0,0.0
1,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,0.0,jusqu'à 4 ans,316.0,Blessé grave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,0.0,jusqu'à 4 ans,320.0,Tué,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,5.0,5 ans,315.0,Blessé léger,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,5.0,5 ans,316.0,Blessé grave,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> `df_Accident_acteurs_normalized` contains, for each **age range** and **sex** (`sexe`), the number of people involved in accidents per 10,000 inhabitants of that group.

In [150]:
# Rows with sexe_index 9 (presumably unknown sex) have no population
# counterpart and are left out. .copy() yields an independent frame so the
# .loc assignments below do not write into a slice of df_Accident_objets
# (SettingWithCopyWarning).
df_Accident_objets_normalized = df_Accident_objets[df_Accident_objets['sexe_index'] != 9].copy()

# Turn each yearly count into a rate per 10,000 inhabitants of the same
# age range and sex.
for year, sexe in df_pop_Suisse.columns.values:
    for age in df_pop_Suisse.index.values:
        population = df_pop_Suisse.loc[df_pop_Suisse.index == age, year][sexe].values[0]
        rows = (df_Accident_objets_normalized.age_index == age) \
            & (df_Accident_objets_normalized.sexe == sexe)
        df_Accident_objets_normalized.loc[rows, year] = \
            df_Accident_objets_normalized.loc[rows, year] * 10000 / population

# Year columns without population data were not normalized and still hold raw
# counts; drop them instead of hard-coding which years those are.
normalized_years = set(df_pop_Suisse.columns.get_level_values(0))
toDrop = [col for col in df_Accident_objets_normalized.columns
          if IS_DATE.match(col) and col not in normalized_years]
df_Accident_objets_normalized = df_Accident_objets_normalized.drop(columns=toDrop)

# age_index 999 (presumably unknown age) has no matching population range, so
# those rows were never normalized and are removed.
df_Accident_objets_normalized = df_Accident_objets_normalized[df_Accident_objets_normalized.age_index != 999]
display(df_Accident_objets_normalized.head(2))

Unnamed: 0,voiture_index,voiture,age_index,age,sexe_index,sexe,annee_permis_index,annee_permis,faute_index,faute,consequence_index,consequence,2010,2011,2012,2013,2014,2015,2016,2017
0,210.0,Voiture de tourisme,0.0,Jusqu'à 4 ans,1.0,Homme,0.0,0 an,0.0,Objet sans faute ni influence,315.0,Accidents avec blessés légers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,210.0,Voiture de tourisme,0.0,Jusqu'à 4 ans,1.0,Homme,0.0,0 an,0.0,Objet sans faute ni influence,316.0,Accidents avec blessés graves,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> `df_Accident_objets_normalized` contains, for each **age range** and **sex** (`sexe`), the number of objects (vehicles) involved in accidents per 10,000 inhabitants of that group.

## B. Analysis of the actors in accidents

### B.1. Sex and age of drivers

##### Using `df_Accident_objets_normalized`

In [171]:
# Year columns are read from the source frame itself. The previous version read
# them from `conducteur`, which does not exist yet on a fresh kernel (NameError
# on Restart & Run All).
year_columns = [col for col in df_Accident_objets_normalized.columns if IS_DATE.match(col)]

# Passenger cars (voiture_index 210, 'Voiture de tourisme'), excluding
# annee_permis_index 0 ('0 an') and 999 (presumably unknown).
is_car_with_known_licence = (
    (df_Accident_objets_normalized.annee_permis_index != 0)
    & (df_Accident_objets_normalized.annee_permis_index != 999)
    & (df_Accident_objets_normalized.voiture_index == 210)
)

# Only the year columns are aggregated; summing the text columns as well would
# be wasted work (and concatenates strings on recent pandas versions).
conducteur = (
    df_Accident_objets_normalized[is_car_with_known_licence]
    .groupby(['sexe', 'age'])[year_columns]
    .sum()
    .sort_values(['2010'], ascending=False)
)
display(conducteur.head(10))


Unnamed: 0_level_0,Unnamed: 1_level_0,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Homme,20 ans,60.862079,56.936537,53.686185,48.666776,48.905259,36.807831,44.557734,41.966548
Homme,21-29 ans,57.985585,54.012681,48.313519,45.961388,42.669766,41.561941,41.601469,39.33932
Homme,30-39 ans,43.948825,41.711458,40.272102,38.930899,36.923781,33.857228,34.213227,34.741186
Homme,40-49 ans,42.76234,38.157735,37.095152,34.921224,33.867883,32.116146,31.554196,31.349074
Homme,50-59 ans,40.635182,35.750283,34.096272,32.277577,32.620497,32.241042,31.179018,29.243624
Femme,21-29 ans,37.211739,34.689406,32.344718,31.19215,31.071365,29.557699,29.01588,29.609281
Homme,60-69 ans,33.565207,32.212362,31.389332,30.340141,29.221557,27.395775,28.27717,26.888664
Femme,20 ans,31.85907,23.004705,27.907554,26.002259,29.219663,27.577656,25.347733,22.259078
Homme,70 ans et plus,29.545209,29.776206,27.465131,28.226431,26.067983,27.05915,27.791348,26.134918
Femme,30-39 ans,29.091675,26.202055,26.530645,24.899495,23.738512,23.241012,24.008127,22.990344


##### Using `df_Accident_acteurs_normalized`

In [172]:
# Year columns are read from the source frame itself rather than from the
# `conducteur` left behind by another cell, so this cell gives the same result
# whatever was run before it.
year_columns = [col for col in df_Accident_acteurs_normalized.columns if IS_DATE.match(col)]

# Drivers (role_index 300, 'Conducteur') of passenger cars
# (voiture_index 210, 'Voiture de tourisme').
is_car_driver = (
    (df_Accident_acteurs_normalized['role_index'] == 300)
    & (df_Accident_acteurs_normalized.voiture_index == 210)
)

# Only the year columns are aggregated; summing the text columns as well would
# be wasted work (and concatenates strings on recent pandas versions).
conducteur = (
    df_Accident_acteurs_normalized[is_car_driver]
    .groupby(['sexe', 'age'])[year_columns]
    .sum()
    .sort_values(['2010'], ascending=False)
)
display(conducteur.head(10))

Unnamed: 0_level_0,Unnamed: 1_level_0,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Homme,20 ans,31.432061,34.122245,31.732727,21.470636,24.554942,21.235287,23.48313,20.070958
Homme,18-19 ans,29.945647,32.527253,26.504472,21.985492,24.036688,21.276596,19.480993,18.359528
Femme,20 ans,26.236882,18.652463,21.705875,21.100194,22.129598,19.881566,22.15264,21.822626
Homme,21-29 ans,24.691722,23.313785,20.991673,19.219483,18.170092,16.782357,16.573584,15.940908
Femme,21-29 ans,23.638871,20.971517,18.786925,19.113753,18.272432,17.542208,17.573575,17.436577
Homme,30-39 ans,17.17253,15.2689,15.043276,14.918088,13.564525,12.398887,12.352053,12.910794
Femme,18-19 ans,15.794224,13.43088,18.510672,14.199463,12.916444,12.471883,15.02706,13.273222
Femme,30-39 ans,14.636636,12.559663,13.229615,12.749105,11.981966,12.320125,12.197813,12.061601
Homme,40-49 ans,14.315094,12.790132,13.238475,12.131176,11.330628,11.407203,10.943467,10.50879
Femme,40-49 ans,13.885566,12.291048,11.675021,10.062917,10.649627,11.36534,11.167246,10.044575


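> **The results are not the same :(** — the two datasets give different rates for the same group (e.g. `Homme`, `20 ans`, 2010: 60.86 with `df_Accident_objets_normalized` vs. 31.43 with `df_Accident_acteurs_normalized`). The two tables come from different source datasets and use different filters, so they may not be counting the same thing. **TODO:** investigate.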