### Imports

In [284]:
import os
import pandas as pd
import json
import folium
import matplotlib.pyplot as plt
import numpy as np
import re
import math
import numpy as np

%pylab inline
pylab.rcParams['figure.figsize'] = (20,12)

Populating the interactive namespace from numpy and matplotlib


### Constants

In [2]:
# Folder containing the cleaned CSV files to load.
DATA_FOLDER = 'data_clean'
# Matches labels made of exactly four digits whose first digit is 1 or 2
# (e.g. '1992', '2017'); used to tell year columns from attribute columns.
IS_DATE = re.compile("^[1-2]{1}[0-9]{3}$")

### Load Dataframe

In [68]:
def describeDf(df,name="DESCRIPTION",level=1):
    """Print a quick textual overview of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str
        Title printed in the banner line.
    level : int
        ``1`` prints, for every attribute column, its distinct values, then
        the list of value columns (year labels matched by ``IS_DATE`` and
        age labels such as ``'12 ans'``). Any other value only prints the
        column labels.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    print("_________________ " + name+" _________________")
    if level != 1:
        print(df.columns.values)
        print("\n\n\n\n")
        return

    # Compiled once per call instead of once per column.
    age_label = re.compile("[0-9]+ an")
    date = []
    print("\n")
    for col in df.columns:
        # Column labels may not be strings (e.g. integer years); regexes need text.
        label = str(col)
        if IS_DATE.match(label) or age_label.match(label):
            date.append(col)
            continue
        print("         ATTRIBUTE: "+label)
        print("   "+str(df[col].unique()))
        print("\n")
    print("         VALUES: "+str(date))
    print("\n\n\n\n")

# Load every CSV of DATA_FOLDER into a module-level variable named
# "df_<file name without its (double) extension>" and print its columns.
print("Dataframes available:\n\n")
for file in sorted(os.listdir(DATA_FOLDER)):  # sorted: deterministic order on every platform
    filename = os.fsdecode(file)
    path = os.path.join(DATA_FOLDER, filename)
    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store)
    # that pd.read_csv cannot parse.
    if filename.startswith('.') or not os.path.isfile(path):
        continue
    tablename = "df_"+os.path.splitext(os.path.splitext(filename)[0])[0]
    # 'Unnamed: 0' is the index column written by a previous to_csv call;
    # errors='ignore' keeps files saved without it loadable.
    globals()[tablename] = pd.read_csv(path).drop(columns=['Unnamed: 0'], errors='ignore')
    describeDf(globals()[tablename], name=tablename,level=0)
    

Dataframes available:


_________________ df_Accident_cantons _________________
['canton_index' 'canton' 'gravite_index' 'gravite' 'ruralite_index'
 'ruralite' '1992' '1993' '1994' '1995' '1996' '1997' '1998' '1999' '2000'
 '2001' '2002' '2003' '2004' '2005' '2006' '2007' '2008' '2009' '2010'
 '2011' '2012' '2013' '2014' '2015' '2016' '2017']





_________________ df_Accident_circonstances _________________
['mois_index' 'mois' 'day_index' 'day' 'hours_index' 'hours'
 'gravite_index' 'gravite' 'ruralite_index' 'ruralite' 'contexte_index'
 'contexte' 'circonstance_index' 'circonstance' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015' '2016' '2017']





_________________ df_Accident_objets _________________
['voiture_index' 'voiture' 'age_index' 'age' 'sexe_index' 'sexe'
 'annee_permis_index' 'annee_permis' 'faute_index' 'faute' 'gravite_index'
 'gravite' '1992' '19

## Useful functions

#### Function to restrict the dataframe to a fixed period of years

In [319]:
def get_period(df, start=None,end=None):
    """Restrict a dataframe to the year columns lying in ``[start, end]``.

    Columns whose label matches ``IS_DATE`` are treated as years; every other
    column is always kept and placed first, followed by the selected year
    columns in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    start : int, optional
        First year to keep (inclusive). Defaults to the earliest year column.
    end : int, optional
        Last year to keep (inclusive). Defaults to the latest year column.

    Returns
    -------
    pandas.DataFrame
        ``df`` limited to the attribute columns and the selected years.
    """
    columns_to_keep = []
    years = []
    for label in df.columns:
        if IS_DATE.match(label):
            years.append(int(label))
        else:
            columns_to_keep.append(label)

    # Without any year column there is nothing to bound (min/max of an empty
    # list would raise): return the attribute columns only.
    if not years:
        return df[columns_to_keep]

    if start is None:
        start = min(years)
    if end is None:
        end = max(years)

    columns_to_keep.extend(str(year) for year in years if start <= year <= end)
    return df[columns_to_keep]

#### Function to build the list of individual age labels covered by each input interval

In [318]:
def build_age(intervales):
    """Expand consecutive interval bounds into per-year age labels.

    For every pair of neighbouring bounds ``(lower, upper)`` in ``intervales``
    a list is produced whose first item is ``lower`` itself, followed by one
    label per integer age in ``[int(lower), int(upper))``: ``'1 an'`` for 1,
    ``'<k> ans'`` for the other ages below 100 and ``'100 ans ou plus'`` for
    100, after which the expansion of that interval stops.

    Returns the list of those lists (empty when fewer than two bounds are given).
    """
    def _label(year):
        # Label text for a single age (only called for ages <= 100).
        if year == 1:
            return '{} an'.format(year)
        if year == 100:
            return '100 ans ou plus'
        return '{} ans'.format(year)

    groups = []
    for position in range(1, len(intervales)):
        lower = intervales[position - 1]
        upper = intervales[position]
        # Ages above 100 never produce a label, so the range is capped at 100.
        last_exclusive = min(int(upper), 101)
        labels = [_label(year) for year in range(int(lower), last_exclusive)]
        groups.append([lower] + labels)
    return groups

## A. Prepare Data

### A.1. Get the Swiss population

In [321]:
# build_age pairs each boundary with the next one, so the age-class boundaries
# must be in ascending order; unique() only guarantees order of appearance.
ages = build_age(np.sort(df_Accident_objets.age_index.unique()))

# Add one column per accident age class holding the summed population of the
# individual ages it covers (the column is named after the class lower bound).
for age in ages:
    df_Population_2010[str(age[0])] = df_Population_2010.loc[:,age[1:]].sum(axis=1)

toKeep = [str(age[0]) for age in ages]
toKeep.append('annee')
toKeep.append('sexe')

# Whole-country rows, split by sex, all marital statuses together.
# .copy() makes the selection independent from df_Population_2010 so that the
# 'annee' rewrite below does not assign into a slice (SettingWithCopyWarning).
df_pop_Suisse = df_Population_2010.loc[
    (df_Population_2010['canton'] == 'Suisse')
    & (df_Population_2010['sexe'] != 'Sexe - Total')
    & (df_Population_2010.etat_civil == "Etat civil - Total"),
    toKeep,
].copy()

# Years become strings so they match the year column labels of the accident tables.
df_pop_Suisse['annee'] = df_pop_Suisse['annee'].map(int).map(str)
df_pop_Suisse = df_pop_Suisse.groupby(['annee','sexe']).sum()
df_pop_Suisse = df_pop_Suisse.transpose()
# Rows are now the age classes; index them by their numeric lower bound.
df_pop_Suisse.index = df_pop_Suisse.index.map(float)

In [322]:
# Drop the aggregate rows, then pivot so that (age, sexe) pairs become the
# columns (the remaining rows are presumably the year columns - TODO confirm).
df_Suisse_1992 = df_Population_age_1992[(df_Population_age_1992.sexe != 'Sexe - Total') & (df_Population_age_1992.age != 'Age - Total')]
df_Suisse_1992 = df_Suisse_1992.groupby(['age','sexe']).sum().transpose()

# Sum the individual ages of each accident age class, separately for each sex.
sexes_1992 = df_Population_age_1992[df_Population_age_1992.sexe != 'Sexe - Total'].sexe.unique()
for age in ages:
    for s in sexes_1992:
        # Keep only the labels that really exist: selecting missing labels with
        # .loc is deprecated and raises KeyError in recent pandas. Missing
        # labels contributed nothing to the sum before, so totals are unchanged.
        index_to = [(label, s) for label in age[1:] if (label, s) in df_Suisse_1992.columns]
        df_Suisse_1992[(str(age[0]),s)] = df_Suisse_1992.loc[:,index_to].sum(axis=1)
df_Suisse_1992 = df_Suisse_1992.transpose().reset_index().set_index('age').transpose()

# Keep only the aggregated age-class columns, then put the classes back on rows.
toKeep = [str(age[0]) for age in ages]
df_Suisse_1992 = df_Suisse_1992[toKeep]
df_Suisse_1992 = df_Suisse_1992.transpose()

# Append the 1992-2009 populations to df_pop_Suisse, one (year, sexe) column at a time.
# NOTE(review): the assignment aligns on the row index; verify that the age
# labels of df_Suisse_1992 line up with the float index of df_pop_Suisse.
for year in range(1992,2010):
    for sexe in np.unique([column[1] for column in df_pop_Suisse.columns.values]):
        df_pop_Suisse[(str(year),sexe)] = df_Suisse_1992.loc[df_Suisse_1992.sexe==sexe][str(year)]
display(df_pop_Suisse.head(10))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_nested_tuple(tup)


annee,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,...,2005,2005,2006,2006,2007,2007,2008,2008,2009,2009
sexe,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,...,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme,Femme,Homme
0.0,191197.0,202097.0,195313.0,206364.0,198173.0,210120.0,201280.0,213036.0,204828.0,216474.0,...,177363,187819,177587,187995,178473,188725,181376,191869,184239,195325
5.0,37402.0,39653.0,37807.0,40014.0,38743.0,40693.0,39450.0,42127.0,40359.0,42763.0,...,36886,39202,36023,38385,35991,38393,36173,38361,36626,38626
6.0,149641.0,157585.0,150877.0,158790.0,152374.0,160288.0,154842.0,162938.0,157261.0,166440.0,...,157083,166095,154375,163502,151269,160537,149927,159496,148587,158177
10.0,203641.0,214404.0,200464.0,211509.0,198210.0,209411.0,196714.0,207509.0,196515.0,207080.0,...,211676,223411,209411,221047,206647,218410,205007,217162,203635,215747
15.0,128790.0,135406.0,128497.0,135222.0,127946.0,134486.0,127218.0,134069.0,125857.0,132988.0,...,129832,137137,132068,139716,133365,141070,133177,140852,131770,139313
18.0,93072.0,97510.0,91580.0,96227.0,89678.0,94701.0,89440.0,94153.0,89808.0,94855.0,...,85249,88665,86233,90156,88049,92274,89829,94435,91116,95830
20.0,48024.0,49949.0,48251.0,50407.0,48374.0,50106.0,46919.0,48904.0,46544.0,48870.0,...,43426,45072,43869,45154,44305,45733,45376,47107,46442,48183
21.0,461951.0,475463.0,468731.0,483405.0,474266.0,489718.0,481852.0,495851.0,485978.0,500823.0,...,422641,424019,425771,429269,430845,435356,437933,443900,446237,452073
30.0,550673.0,560197.0,554155.0,566511.0,560107.0,574343.0,567883.0,583855.0,576700.0,594197.0,...,576491,580128,566667,570566,558424,563241,554725,560246,552920,559031
40.0,639513.0,655951.0,641117.0,658320.0,638971.0,656420.0,634011.0,652039.0,627252.0,645154.0,...,588111,605861,600599,619487,611068,630098,622291,641052,631489,649656


> `df_pop_Suisse` contains the Swiss population by **year** and **sexe** on *columns* and by **age** range on *rows*

### A.2. Normalize the data by the population

In [268]:
# Victim counts expressed per 10,000 inhabitants of the same sex and age class.
# Rows with sexe_index 9 are left out; the result is an independent copy, so
# df_Accident_victimes keeps its raw counts.
df_Accident_victimes_normalized = df_Accident_victimes[df_Accident_victimes['sexe_index']!=9].copy()

for year,sexe in df_pop_Suisse.columns.values:
    for age in df_pop_Suisse.index.values:
        # Population of this (year, sexe, age class) cell.
        population = df_pop_Suisse.loc[df_pop_Suisse.index==age,year][sexe].values[0]
        # Rows of the accident table belonging to the same cell.
        rows = (df_Accident_victimes_normalized.age_index==age) \
            & (df_Accident_victimes_normalized.sexe==sexe)
        df_Accident_victimes_normalized.loc[rows,year] = \
            df_Accident_victimes_normalized[year]*10000 / population
reg = re.compile("200[0-9]|1[0-9]{3}")

# Rows with age_index 999 have no matching population class: discard them.
df_Accident_victimes_normalized = df_Accident_victimes_normalized[df_Accident_victimes_normalized.age_index != 999]
display(df_Accident_victimes_normalized.head(2))

Unnamed: 0,voiture_index,voiture,sexe_index,sexe,role_index,role,age_index,age,consequence_index,consequence,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,0.0,jusqu'à 4 ans,315.0,Blessé léger,...,0.0,0.0,0.0,0.0,0.047592,0.0,0.0,0.0,0.0,0.0
1,210.0,Voiture de tourisme,1.0,Homme,300.0,Conducteur,0.0,jusqu'à 4 ans,316.0,Blessé grave,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> `df_Accident_victimes_normalized` contains, for each **age range** and **sexe**, the number of accident victims per 10,000 inhabitants of that group

In [269]:
# Counts of persons involved in accidents expressed per 10,000 inhabitants of
# the same sex and age class. Rows with sexe_index 9 are left out; the result
# is an independent copy, so df_Accident_objets keeps its raw counts.
df_Accident_objets_normalized = df_Accident_objets[df_Accident_objets['sexe_index']!=9].copy()

for year,sexe in df_pop_Suisse.columns.values:
    for age in df_pop_Suisse.index.values:
        # Population of this (year, sexe, age class) cell.
        population = df_pop_Suisse.loc[df_pop_Suisse.index==age,year][sexe].values[0]
        # Rows of the accident table belonging to the same cell.
        rows = (df_Accident_objets_normalized.age_index==age) \
            & (df_Accident_objets_normalized.sexe==sexe)
        df_Accident_objets_normalized.loc[rows,year] = \
            df_Accident_objets_normalized[year]*10000 / population

# Rows with age_index 999 have no matching population class: discard them.
df_Accident_objets_normalized = df_Accident_objets_normalized[df_Accident_objets_normalized.age_index != 999]
display(df_Accident_objets_normalized.head(2))

Unnamed: 0,voiture_index,voiture,age_index,age,sexe_index,sexe,annee_permis_index,annee_permis,faute_index,faute,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,210.0,Voiture de tourisme,0.0,Jusqu'à 4 ans,1.0,Homme,0.0,0 an,0.0,Objet sans faute ni influence,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,210.0,Voiture de tourisme,0.0,Jusqu'à 4 ans,1.0,Homme,0.0,0 an,0.0,Objet sans faute ni influence,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


> `df_Accident_objets_normalized` contains, for each **age range** and **sexe**, the number of persons involved in accidents per 10,000 inhabitants of that group

## B. Analyse on the actors of accidents relative to the population

### B.1. Sexe and Age of drivers

#### B.1.1. Sexe and Age together

In [326]:
# Rows with faute_index 1 and voiture_index 210 ('Voiture de tourisme').
df_conducteur = df_Accident_objets_normalized[(df_Accident_objets_normalized.faute_index==1)\
                                           & (df_Accident_objets_normalized.voiture_index==210)]
# Aggregate the year columns only: summing the text labels and the *_index
# codes is meaningless and they were discarded right after anyway.
year_columns = [col for col in df_conducteur.columns if IS_DATE.match(col)]
df_conducteur = df_conducteur.groupby(['sexe','age'])[year_columns].sum().sort_values(['2017'], ascending=False)
display(df_conducteur.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Homme,20 ans,127.971524,117.85295,125.859697,115.426207,92.977202,111.426699,117.792183,130.883766,118.059614,131.8827,...,76.846329,68.90397,60.461671,61.102625,55.482377,45.599542,47.268263,40.245925,43.554182,39.736442
Homme,18-19 ans,91.262445,85.548212,79.800978,79.263615,84.056135,78.972986,86.307553,87.905889,97.575091,99.309354,...,68.300948,65.115308,55.686596,59.962381,49.946674,47.051076,43.54014,37.905391,37.816045,35.445563
Homme,21-29 ans,79.562532,75.56845,75.104204,74.622609,67.304099,67.516639,73.536188,76.999389,74.377638,72.578555,...,50.191485,48.996512,42.968643,39.242457,35.122254,33.92148,31.927447,29.644892,30.092583,28.468726
Femme,20 ans,41.204299,47.217153,39.882947,42.637725,36.78649,33.910984,38.310314,41.47097,47.5809,48.580501,...,33.497884,37.035442,32.691987,24.248202,27.287386,26.002259,31.368168,26.294974,25.347733,25.532472
Homme,30-39 ans,43.85258,41.868027,41.974276,41.392829,39.754567,40.106226,39.883317,44.419721,41.375975,42.328752,...,28.826623,28.567289,27.008356,24.289025,23.592174,23.070797,22.080219,19.292668,20.161949,21.087092


#### B.1.2. Sexe only

In [328]:
# Collapse the (sexe, age) table onto sex and rank by the 2017 column.
# NOTE(review): this adds the per-age-class rates together, so the totals are
# sums of rates, not a rate over the whole population of each sex.
display(
    df_conducteur
    .groupby(['sexe'])
    .sum()
    .sort_values(['2017'], ascending=False)
)

Unnamed: 0_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Homme,472.150909,452.03961,453.100949,442.028498,405.744536,421.970174,439.099231,462.567421,464.760519,472.880121,...,318.479298,304.020417,275.12025,267.219327,244.140696,226.37227,219.320518,200.363493,205.480487,195.489628
Femme,155.807492,159.759884,158.269271,158.433251,150.791784,144.482553,158.113546,169.336094,176.822786,182.652496,...,145.942061,146.576781,137.243465,121.364819,121.848155,119.164725,119.277469,112.998868,114.132016,114.017821


#### B.1.3. Age only

In [329]:
# Collapse the (sexe, age) table onto age class and rank by the 2017 column.
# NOTE(review): this adds the rates of both sexes together, so the totals are
# sums of rates, not a rate over the whole population of each age class.
display(
    df_conducteur
    .groupby(['age'])
    .sum()
    .sort_values(['2017'], ascending=False)
)

Unnamed: 0_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20 ans,169.175823,165.070103,165.742644,158.063932,129.763693,145.337682,156.102496,172.354736,165.640514,180.463201,...,110.344213,105.939412,93.153658,85.350827,82.769763,71.601801,78.63643,66.540899,68.901914,65.268914
18-19 ans,118.734644,110.401491,105.0367,101.916682,110.876743,99.097579,112.813,115.999601,123.647997,129.953734,...,88.450343,87.723851,76.853005,78.306997,71.356608,68.629788,60.799182,54.831518,56.345653,54.325578
21-29 ans,110.318358,105.070017,105.673692,105.005076,95.110129,96.397309,102.762975,110.150468,106.550017,105.702915,...,79.556707,74.902028,66.80234,61.984712,55.321889,54.301182,51.660851,48.558544,48.46589,48.290494
30-39 ans,60.870922,59.570028,62.495261,60.304638,56.918102,58.022168,58.840376,64.887338,64.165233,63.343158,...,47.015815,44.953023,42.970642,38.941965,37.982282,36.682755,34.304946,31.851688,33.45487,33.082054
40-49 ans,47.902619,49.357171,49.300135,49.566926,46.548065,45.277973,49.482859,48.971764,51.216346,51.918942,...,40.864717,38.876569,38.355462,33.981593,33.931181,31.569275,30.783862,30.298006,28.926313,28.16332
70 ans et plus,30.850933,32.166982,32.770495,33.746316,32.46261,34.263381,31.30328,32.010362,36.55774,35.770656,...,29.158526,30.126845,28.681755,29.111991,27.407709,26.992079,26.636781,27.248966,28.715803,27.649526
50-59 ans,43.533557,44.751647,44.326456,44.851796,40.83001,42.650705,42.042554,43.01827,45.018277,42.012526,...,36.03467,35.14815,33.960527,30.868903,29.832916,28.020276,28.60376,28.242134,27.854799,27.124386
60-69 ans,37.451596,37.405979,38.351241,38.253963,36.311814,38.419658,36.383173,37.071092,38.978375,36.780571,...,30.772544,31.221792,30.229972,28.658326,25.773608,26.941746,26.231409,25.038363,26.346367,25.020737
15-17 ans,5.888159,5.351004,4.688106,6.024098,5.10669,4.185377,4.802986,4.434801,5.908898,6.225329,...,1.933272,1.659177,1.263071,1.18711,1.565302,0.749901,0.844184,0.752243,0.600894,0.53487
10-14 ans,3.169508,2.655073,2.939557,2.728322,2.515621,2.753784,2.628577,3.005084,3.661072,3.361585,...,0.238434,0.046351,0.093282,0.191722,0.0,0.048191,0.096581,0.0,0.0,0.047569


> Number of **drivers** of a **passenger car** who are **responsible** for an accident with victims. Figures are expressed per 10,000 inhabitants of the corresponding category.

### B.2. Licence term of drivers

In [381]:
# Width, in years, of every licence-seniority class: the distance to the next
# class boundary. Boundaries are sorted first because unique() only returns
# them in order of appearance.
index_permis = np.sort(df_Accident_objets['annee_permis_index'].unique())
mapping = {}
for i, boundary in enumerate(index_permis):
    # 16 ('16 ans et plus') and 999 ('Inconnu') have no meaningful upper bound,
    # nor does the last boundary: they are left undivided (width 1).
    open_ended = boundary in (16, 999) or i == len(index_permis) - 1
    mapping[boundary] = 1 if open_ended else index_permis[i + 1] - boundary

# Rows with faute_index 1 and voiture_index 210 ('Voiture de tourisme').
df_permis = df_Accident_objets_normalized[(df_Accident_objets_normalized.faute_index==1)\
                            & (df_Accident_objets_normalized.voiture_index==210)]
dates = [col for col in df_permis.columns if IS_DATE.match(col)]
# Aggregate the year columns only; the other columns were discarded afterwards.
df_permis = df_permis.groupby(['annee_permis','annee_permis_index'])[dates].sum()

# Divide every class by its width so that multi-year classes ('6-10 ans',
# '11-15 ans') become per-year averages comparable with single-year classes.
widths = pd.Series(
    [mapping[boundary] for boundary in df_permis.index.get_level_values('annee_permis_index')],
    index=df_permis.index,
)
df_permis = df_permis.div(widths, axis=0).sort_values(['2017'], ascending=False)
display(df_permis)

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
annee_permis,annee_permis_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
16 ans et plus,16.0,151.830988,155.492622,157.363385,159.403876,150.194756,156.738364,155.433811,160.281814,173.458723,170.03131,...,139.520963,138.460565,132.426681,124.91239,119.172723,115.470413,114.408562,112.456571,114.792466,110.276788
0 an,0.0,142.498262,123.374769,123.556499,128.246876,134.726054,128.560318,138.59049,148.358898,156.150508,158.154178,...,103.349804,107.369984,91.846562,90.65062,82.193101,73.99197,73.751087,66.297358,66.41355,65.903547
1 an,1.0,115.056016,116.98797,105.379789,103.902797,91.447105,92.487221,105.406004,114.48353,107.449539,121.968624,...,75.841267,69.190132,60.503807,59.967035,59.746205,53.247396,52.533768,43.266116,47.139497,45.323683
2 ans,2.0,60.729267,61.781925,67.446199,53.471419,37.81339,45.859647,53.980696,55.626946,53.101017,57.694045,...,42.361811,39.063671,36.455055,30.937408,28.064216,28.548895,27.96997,26.389337,26.583125,24.397839
3 ans,3.0,21.661587,19.040298,19.630905,18.491277,15.035516,15.244564,15.719121,18.18127,17.416233,17.423118,...,12.650185,12.532685,9.571983,9.519724,9.023592,8.480907,8.390982,7.787504,6.92832,7.404767
4 ans,4.0,16.808797,16.995324,17.814826,16.588059,14.531135,15.446187,13.960276,15.545337,15.618946,15.944966,...,11.496095,9.942998,9.450254,8.815974,7.130637,8.006289,6.830234,6.806711,6.956656,6.384212
5 ans,5.0,14.839632,14.184733,15.103948,15.069014,14.938232,13.351162,12.61099,13.472996,12.903029,13.612848,...,9.823983,8.726848,8.875188,7.341964,6.291182,6.063323,6.058009,5.565988,6.087784,5.788939
6-10 ans,6.0,11.842954,11.87844,11.847788,12.109018,11.073046,11.179562,11.864094,12.247334,11.829183,11.006537,...,7.497953,6.917013,6.878061,6.050665,5.474215,5.459698,5.25762,4.839923,4.767883,4.88331
11-15 ans,11.0,7.140738,7.221523,7.627887,7.237492,6.932951,6.924922,7.010503,7.381686,7.136687,7.308612,...,4.576432,4.696056,4.416956,4.137954,4.084221,3.905022,3.713235,3.44783,3.455713,3.446991
Inconnu,999.0,9.615391,8.442038,7.696294,8.555882,7.820147,8.242843,7.138405,7.807623,10.655958,9.127786,...,9.005324,7.244969,6.759102,5.495936,6.575014,4.904202,3.801098,3.354011,3.593128,2.376171


> Note that for multi-year periods such as "6-10 ans" and "11-15 ans", results are divided by the number of years in the period. Since we have no information about driving licences in the population, we cannot normalize by the number of citizens who hold a driving licence.

### B.3. Sexe and Age of drivers excluding young drivers (having driver licence for 5 years or more)

In [336]:
# Rows with faute_index 1, voiture_index 210 ('Voiture de tourisme') and a
# licence seniority index above 4, excluding the 999 ('Inconnu') code.
# NOTE(review): the stored output of this cell is identical to the one of the
# B.4 (annee_permis_index < 5) cell - re-run to refresh it.
df_conducteur_confirmed = df_Accident_objets_normalized[(df_Accident_objets_normalized.faute_index==1)\
                                           & (df_Accident_objets_normalized.voiture_index==210)\
                                            &(df_Accident_objets_normalized.annee_permis_index>4)\
                                             &(df_Accident_objets_normalized.annee_permis_index!=999)]
# Year columns are taken from this frame itself (no dependency on another
# cell's variable) and selected before summing, so only they are aggregated.
year_columns = [col for col in df_conducteur_confirmed.columns if IS_DATE.match(col)]
df_conducteur_confirmed = df_conducteur_confirmed.groupby(['sexe','age'])[year_columns].sum().sort_values(['2017'], ascending=False)
display(df_conducteur_confirmed.head(5))


Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Homme,20 ans,123.94593,113.20086,120.586887,111.194697,89.596213,106.085985,113.196594,127.28146,111.513735,126.854243,...,74.935784,67.243634,58.860037,59.317158,53.885762,44.372648,46.859014,39.03248,43.152761,39.736442
Homme,18-19 ans,86.830022,81.095017,75.234055,75.854428,80.036059,75.322463,82.696359,84.615294,93.176688,95.369426,...,65.759517,63.028279,54.045739,59.131013,48.573933,45.988975,42.696748,37.698823,36.461752,34.914941
Femme,20 ans,40.550263,46.989051,39.646953,41.184166,36.045823,33.162947,37.554189,39.472369,47.09538,47.158633,...,33.497884,37.035442,32.483758,24.248202,27.080663,25.789126,30.723616,26.294974,24.92172,25.314246
Femme,18-19 ans,26.874978,24.238099,24.733518,21.767192,25.926588,19.3555,25.117066,26.732342,25.583274,29.917636,...,20.038072,22.608543,21.058965,18.344617,21.186913,21.355098,17.147693,16.81477,18.416622,18.651166
Homme,21-29 ans,40.718091,37.982672,38.267575,37.628218,32.465106,33.678133,37.736919,40.237612,39.439721,38.06534,...,26.740257,26.234701,22.062705,19.590199,19.1743,17.928773,16.632623,15.226247,15.647355,14.3626


> Clearly **today**, old men are dangerous!

### B.4. Sexe and Age of young drivers (having a driver licence for 4 years or less)

In [338]:
# Rows with faute_index 1, voiture_index 210 ('Voiture de tourisme') and a
# licence seniority index below 5 (the != 999 test is kept for symmetry with
# the "confirmed drivers" selection; 999 is already excluded by < 5).
df_conducteur_young = df_Accident_objets_normalized[(df_Accident_objets_normalized.faute_index==1)\
                                           & (df_Accident_objets_normalized.voiture_index==210)\
                                            &(df_Accident_objets_normalized.annee_permis_index<5)\
                                             &(df_Accident_objets_normalized.annee_permis_index!=999)]
# Year columns are taken from this frame itself (no dependency on another
# cell's variable) and selected before summing, so only they are aggregated.
year_columns = [col for col in df_conducteur_young.columns if IS_DATE.match(col)]
df_conducteur_young = df_conducteur_young.groupby(['sexe','age'])[year_columns].sum().sort_values(['2017'], ascending=False)
display(df_conducteur_young.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Homme,20 ans,123.94593,113.20086,120.586887,111.194697,89.596213,106.085985,113.196594,127.28146,111.513735,126.854243,...,74.935784,67.243634,58.860037,59.317158,53.885762,44.372648,46.859014,39.03248,43.152761,39.736442
Homme,18-19 ans,86.830022,81.095017,75.234055,75.854428,80.036059,75.322463,82.696359,84.615294,93.176688,95.369426,...,65.759517,63.028279,54.045739,59.131013,48.573933,45.988975,42.696748,37.698823,36.461752,34.914941
Femme,20 ans,40.550263,46.989051,39.646953,41.184166,36.045823,33.162947,37.554189,39.472369,47.09538,47.158633,...,33.497884,37.035442,32.483758,24.248202,27.080663,25.789126,30.723616,26.294974,24.92172,25.314246
Femme,18-19 ans,26.874978,24.238099,24.733518,21.767192,25.926588,19.3555,25.117066,26.732342,25.583274,29.917636,...,20.038072,22.608543,21.058965,18.344617,21.186913,21.355098,17.147693,16.81477,18.416622,18.651166
Homme,21-29 ans,40.718091,37.982672,38.267575,37.628218,32.465106,33.678133,37.736919,40.237612,39.439721,38.06534,...,26.740257,26.234701,22.062705,19.590199,19.1743,17.928773,16.632623,15.226247,15.647355,14.3626


### B.5. Drivers whose licence seniority is unknown (possibly driving without a licence)

In [383]:
# Rows with faute_index 1, voiture_index 210 ('Voiture de tourisme') and
# annee_permis_index 999, the code displayed as 'Inconnu' in the licence table.
# NOTE(review): 'Inconnu' means the seniority is unknown; confirm that it can
# be read as "no driving licence".
df_conducteur_illegal = df_Accident_objets_normalized[(df_Accident_objets_normalized.faute_index==1)\
                                           & (df_Accident_objets_normalized.voiture_index==210)\
                                             &(df_Accident_objets_normalized.annee_permis_index==999)]
# Year columns are taken from this frame itself (no dependency on another
# cell's variable) and selected before summing, so only they are aggregated.
year_columns = [col for col in df_conducteur_illegal.columns if IS_DATE.match(col)]
df_conducteur_illegal = df_conducteur_illegal.groupby(['sexe','age'])[year_columns].sum().sort_values(['2017'], ascending=False)
display(df_conducteur_illegal.head(5))

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Homme,15-17 ans,1.824409,1.498339,1.408392,1.471093,1.528806,1.111588,1.419111,1.567521,1.867966,1.484514,...,1.41993,1.148493,1.033928,1.109287,1.487144,0.671296,0.451169,0.593331,0.525782,0.53487
Homme,21-29 ans,1.641859,1.309747,1.103872,1.056379,0.901198,1.030963,1.086975,1.399989,1.370114,1.296046,...,1.036269,1.238738,0.757157,0.806777,0.694277,0.766359,0.69885,0.393952,0.335019,0.355119
Homme,18-19 ans,2.045734,1.640651,2.043097,1.582837,1.461846,1.703578,1.685224,1.292734,1.851959,1.854084,...,2.541431,2.087029,1.640857,0.831367,1.267146,1.062101,0.632544,0.206569,1.354294,0.318373
Homme,30-39 ans,0.50808,0.447786,0.556733,0.49751,0.542556,0.362207,0.296653,0.56123,0.612855,0.425086,...,0.51763,0.28621,0.481973,0.335386,0.417869,0.256913,0.218783,0.247978,0.228742,0.258539
Femme,18-19 ans,0.238889,0.0,0.0,0.632767,0.127717,0.128182,0.252433,0.247522,0.0,0.363372,...,0.111323,0.0,0.107444,0.0,0.22302,0.223614,0.0,0.111356,0.0,0.228849


## C. Analyse on the actor of car accident relative to the number of accident

> Our previous results may be misleading, since they are relative to the whole population while we do not know which part of the population holds a driver licence. So let us now study the raw data: we convert the dataset to percentages relative to the number of car accidents during each year.

In [370]:
# Rows with faute_index 1 and voiture_index 210 ('Voiture de tourisme').
# .copy() detaches the selection from df_Accident_objets so the in-place
# rewrite below does not assign into a slice (SettingWithCopyWarning).
df_Accident = df_Accident_objets[(df_Accident_objets.faute_index==1) & (df_Accident_objets.voiture_index==210)].copy()

# Turn every year column into percentages of that year's total. The totals are
# computed once per column instead of once per row.
year_columns = [column for column in df_Accident.columns if IS_DATE.match(column)]
df_Accident[year_columns] = df_Accident[year_columns] * 100 / df_Accident[year_columns].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,voiture_index,age_index,sexe_index,annee_permis_index,faute_index,gravite_index,1992,1993,1994,1995,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Femme,88200.0,40320.0,840.0,43974.0,420.0,132720.0,25.71641,26.76493,27.86012,27.880954,...,34.028062,33.778706,33.991527,34.056317,34.282059,34.641596,35.040881,35.785408,35.80727,36.309575
Homme,88200.0,40320.0,420.0,43974.0,420.0,132720.0,72.550478,71.591507,70.455072,70.507938,...,63.908187,64.488518,63.897465,63.744292,63.759194,63.543805,62.981812,61.982833,62.231615,61.8116
Inconnu,88200.0,40320.0,3780.0,43974.0,420.0,132720.0,1.733112,1.643563,1.684808,1.611108,...,2.063752,1.732777,2.111007,2.199391,1.958746,1.814599,1.977307,2.23176,1.961116,1.878824


### C.1. Sexe and Age of drivers

In [375]:
# Yearly share of accidents by sex, ranked on 2017. Year columns are selected
# before summing so that text labels and *_index codes are not aggregated.
year_columns = [col for col in df_Accident.columns if IS_DATE.match(col)]
display(df_Accident.groupby(['sexe'])[year_columns].sum().sort_values(['2017'], ascending=False))

Unnamed: 0_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Homme,72.550478,71.591507,70.455072,70.507938,70.061164,69.991251,69.749787,68.984757,68.61927,68.311121,...,63.908187,64.488518,63.897465,63.744292,63.759194,63.543805,62.981812,61.982833,62.231615,61.8116
Femme,25.71641,26.76493,27.86012,27.880954,28.21111,28.215223,28.719786,29.209088,29.723178,30.081756,...,34.028062,33.778706,33.991527,34.056317,34.282059,34.641596,35.040881,35.785408,35.80727,36.309575
Inconnu,1.733112,1.643563,1.684808,1.611108,1.727726,1.793526,1.530426,1.806155,1.657552,1.607123,...,2.063752,1.732777,2.111007,2.199391,1.958746,1.814599,1.977307,2.23176,1.961116,1.878824


In [378]:
# Yearly share of accidents by age class, top 5 on 2017. Year columns are
# selected before summing so that text labels and *_index codes are not aggregated.
year_columns = [col for col in df_Accident.columns if IS_DATE.match(col)]
display(df_Accident.groupby(['age'])[year_columns].sum().sort_values(['2017'], ascending=False).head(5))

Unnamed: 0_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21-29 ans,32.362867,30.806574,29.845994,29.105396,27.340942,26.390451,26.497024,26.304285,24.575746,24.599619,...,23.934069,23.458594,22.574855,22.549467,21.410297,21.873717,21.341565,20.849785,20.481826,20.744635
30-39 ans,19.56758,19.933322,21.120889,21.020564,21.59657,21.853518,21.559577,22.530917,21.790607,21.581364,...,17.872224,17.418232,17.175271,16.651446,17.276943,17.415223,16.827966,16.334764,17.100592,17.452383
40-49 ans,13.933535,14.704334,14.578509,14.880778,15.070307,14.554431,15.547188,14.713834,15.29571,15.785642,...,17.647459,17.390397,17.893301,16.849315,17.628718,16.725511,16.385783,16.377682,15.198648,14.935792
50-59 ans,9.655094,10.352693,10.398333,10.914523,10.946466,11.604799,11.387101,11.285591,11.80583,11.132266,...,12.409753,12.588727,12.938896,12.81583,13.383435,13.334428,14.300017,14.909871,14.784446,14.909937
70 ans et plus,5.06206,5.457098,5.679713,5.95817,6.343401,6.749156,6.17029,6.016681,6.833174,6.921268,...,7.982564,8.608212,8.738422,9.634703,9.881676,10.18146,10.737527,11.527897,12.375317,12.548479


In [379]:
# Five (sexe, age) combinations with the largest 2017 value; only the yearly columns are displayed.
display(
    df_Accident
    .groupby(['sexe', 'age'])
    .sum()
    .sort_values(by='2017', ascending=False)
    .loc[:, [year for year in df_Accident.columns if IS_DATE.match(year)]]
    .head(5)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
sexe,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Homme,21-29 ans,23.56003,22.272913,21.271422,20.692483,19.307649,18.416448,18.899551,18.349152,17.139313,16.933587,...,15.175044,15.414057,14.669347,14.436834,13.751199,13.810658,13.340564,12.918455,12.907861,12.436439
Homme,30-39 ans,14.316765,14.218869,14.404817,14.623001,15.246863,15.223097,14.696951,15.478861,14.083554,14.497704,...,10.999864,11.113431,10.86379,10.471842,10.833067,11.060021,10.946104,10.017167,10.431107,11.247091
Homme,40-49 ans,9.77521,9.97251,9.981473,10.176343,10.259159,9.980002,10.737277,9.882082,10.385071,10.549894,...,11.176951,10.946416,11.208444,10.304414,11.096898,10.534527,9.944936,9.682403,9.3153,9.221753
Homme,50-59 ans,7.058285,7.498392,7.596109,7.686449,7.66757,8.248969,7.864691,7.794075,8.276484,7.576436,...,8.234573,8.030619,8.573275,8.287671,8.786377,8.522867,9.227432,9.665236,9.323753,9.109713
Femme,21-29 ans,8.802837,8.533661,8.574572,8.412912,8.033293,7.974003,7.597474,7.955134,7.436432,7.666032,...,8.759025,8.044537,7.905507,8.112633,7.659098,8.063059,8.001001,7.93133,7.573964,8.308196


### C.2. Licence term of drivers

In [380]:
# Width (in years) of every licence-seniority bin, keyed by its `annee_permis_index` code.
# The codes are the lower bounds of the bins (6 -> "6-10 ans", 11 -> "11-15 ans"), so the
# width of a bin is the distance to the next code. The codes are sorted first because
# unique() returns them in order of appearance, not in ascending order.
OPEN_ENDED_PERMIS_CODES = (16, 999)  # "16 ans et plus" and "Inconnu": no finite width, left unscaled

index_permis = sorted(df_Accident_objets['annee_permis_index'].dropna().unique())
mapping = {}
for pos, code in enumerate(index_permis):
    is_last = pos == len(index_permis) - 1
    if code in OPEN_ENDED_PERMIS_CODES or is_last:
        # No upper bound can be derived for these bins: keep their values as they are.
        mapping[code] = 1
    else:
        mapping[code] = index_permis[pos + 1] - code

# Year columns are selected before aggregating so that only numeric data is summed and scaled.
dates = [col for col in df_Accident.columns if IS_DATE.match(col)]

# NOTE(review): faute_index == 1 and voiture_index == 210 presumably select at-fault drivers
# of passenger cars -- confirm against the index/label columns of the dataset.
df_permis = df_Accident[(df_Accident.faute_index == 1) & (df_Accident.voiture_index == 210)]
df_permis = df_permis.groupby(['annee_permis', 'annee_permis_index'])[dates].sum()

# Divide each bin by its width to obtain a per-year-of-seniority value.
# A missing code raises KeyError instead of silently producing NaN.
bin_widths = pd.Series(
    [mapping[code] for code in df_permis.index.get_level_values('annee_permis_index')],
    index=df_permis.index,
)
df_permis = df_permis.div(bin_widths, axis=0).sort_values(['2017'], ascending=False)
display(df_permis)

Unnamed: 0_level_0,Unnamed: 1_level_0,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
annee_permis,annee_permis_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
16 ans et plus,16.0,34.101699,36.129145,36.903659,38.180327,39.239549,40.826147,40.428762,40.103538,42.769352,42.826744,...,47.37093,48.517745,49.019889,49.398782,50.623601,50.677396,52.069081,53.236052,53.964497,53.520641
0 an,0.0,8.940113,7.527636,7.399259,7.375945,8.506211,7.892763,8.36876,8.231234,8.597846,8.74678,...,7.553467,7.787056,7.331084,7.480974,7.331308,6.798588,6.891373,6.377682,6.424345,6.463846
1 an,1.0,7.029686,6.720477,5.969199,5.79413,5.511066,5.593051,6.145998,6.045441,5.677398,6.081308,...,5.373927,5.170494,4.724636,5.167428,5.132715,4.885459,4.680461,4.214592,4.43787,4.490218
2 ans,2.0,5.81136,5.878224,5.7434,5.184838,4.281481,4.468191,5.198591,5.050331,4.617466,4.810169,...,4.502111,4.370216,4.143032,3.873668,3.837544,3.998686,3.504088,3.699571,3.778529,3.740412
3 ans,3.0,5.685523,5.10031,5.1239,4.845041,4.022952,3.91826,3.977894,4.377337,3.957828,3.936611,...,3.889116,4.050104,3.324478,3.48554,3.445795,3.350029,3.370599,3.416309,3.000845,3.13712
Inconnu,999.0,3.008637,2.702228,2.703798,2.659793,2.818589,2.730909,2.459614,2.876043,2.892259,2.743868,...,3.698406,3.215031,3.740935,4.041096,3.725616,3.522457,3.529117,3.716738,3.212172,3.07679
4 ans,4.0,4.690271,4.749371,4.753358,4.405648,4.098619,3.993251,3.516337,3.606557,3.574449,3.572629,...,3.555374,3.208072,3.288576,3.302892,2.774224,3.276131,2.895044,2.987124,3.034658,2.818237
5 ans,5.0,4.055368,4.000702,4.047013,3.972113,4.05448,3.712036,3.273412,3.111878,2.914811,3.018255,...,3.119466,2.901879,3.123429,2.777778,2.518388,2.561787,2.594694,2.472103,2.671175,2.57692
6-10 ans,6.0,3.187096,3.199392,3.105604,3.191751,3.097295,2.977128,2.991619,2.958873,2.7468,2.571397,...,2.456069,2.336813,2.487255,2.316591,2.251359,2.330241,2.282663,2.218026,2.143702,2.245971
11-15 ans,11.0,2.148373,2.238989,2.365679,2.324682,2.396116,2.395951,2.334507,2.360656,2.252918,2.28133,...,1.731372,1.819068,1.773533,1.777778,1.870803,1.855653,1.810446,1.75794,1.751479,1.789192


### C) Test of global population consistency across the different datasets

In this section, unknown values were not excluded, to avoid asymmetric operations on the different datasets. 

In [11]:
# Consistency check on the cantons dataset.
# Columns matched by `reg` (defined elsewhere in the notebook) are removed; per the
# displayed result this leaves the years 2010 to 2017.
toDrop = [elem for elem in df_Accident_cantons.columns if reg.match(elem)]
df_acc_cantons = df_Accident_cantons.drop(columns=toDrop)

# Sum per severity ('gravite'), order by the 2010 value, keep only the year columns.
acc_cantons = (
    df_acc_cantons
    .groupby(['gravite'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_cantons.columns if IS_DATE.match(col)]]
)
display(acc_cantons)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
gravite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accidents avec blessés légers,15214.0,14568.0,13980.0,13357.0,13756.0,13886.0,13821.0,14153.0
Accidents avec blessés graves,4082.0,4110.0,3867.0,3859.0,3818.0,3612.0,3548.0,3427.0
Accidents mortels,313.0,312.0,301.0,257.0,229.0,238.0,208.0,219.0


In [12]:
# Consistency check on the circonstances dataset.
# Columns matched by `reg` (defined elsewhere in the notebook) are removed; per the
# displayed result this leaves the years 2010 to 2017.
toDrop = [elem for elem in df_Accident_circonstances.columns if reg.match(elem)]
df_acc_circo = df_Accident_circonstances.drop(columns=toDrop)

# Sum per severity ('gravite'), order by the 2010 value, keep only the year columns.
acc_circo = (
    df_acc_circo
    .groupby(['gravite'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_circo.columns if IS_DATE.match(col)]]
)
display(acc_circo)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
gravite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accidents avec blessés légers,10605.0,10383.0,9700.0,9262.0,9681.0,9681.0,9708.0,9696.0
Accidents avec blessés graves,2460.0,2485.0,2291.0,2326.0,2245.0,2128.0,2079.0,1934.0
Accidents mortels,174.0,185.0,170.0,146.0,139.0,129.0,128.0,124.0


In [13]:
# Consistency check on the route type dataset.
# Columns matched by `reg` (defined elsewhere in the notebook) are removed; per the
# displayed result this leaves the years 2010 to 2017.
toDrop = [elem for elem in df_Accident_type_route.columns if reg.match(elem)]
df_acc_routeType = df_Accident_type_route.drop(columns=toDrop)

# Sum per severity ('gravite'), order by the 2010 value, keep only the year columns.
acc_routeType = (
    df_acc_routeType
    .groupby(['gravite'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_routeType.columns if IS_DATE.match(col)]]
)
display(acc_routeType)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
gravite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accidents avec blessés légers,15214.0,14568.0,13980.0,13357.0,13756.0,13886.0,13821.0,14153.0
Accidents avec blessés graves,4082.0,4110.0,3867.0,3859.0,3818.0,3612.0,3548.0,3427.0
Accidents mortels,313.0,312.0,301.0,257.0,229.0,238.0,208.0,219.0


In [14]:
# Consistency check on the objects dataset.
# Columns matched by `reg` (defined elsewhere in the notebook) are removed; per the
# displayed result this leaves the years 2010 to 2017.
toDrop = [elem for elem in df_Accident_objets.columns if reg.match(elem)]
df_acc_objet = df_Accident_objets.drop(columns=toDrop)

# Sum per severity ('gravite'), order by the 2010 value, keep only the year columns.
acc_objet = (
    df_acc_objet
    .groupby(['gravite'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_objet.columns if IS_DATE.match(col)]]
)
display(acc_objet)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
gravite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accidents avec blessés légers,29117.0,28122.0,26720.0,25586.0,26380.0,26428.0,26392.0,26802.0
Accidents avec blessés graves,7165.0,7146.0,6774.0,6854.0,6635.0,6242.0,6158.0,5884.0
Accidents mortels,564.0,563.0,555.0,449.0,411.0,414.0,375.0,392.0


> We know that in 2017, there were 17'799 accidents causing bodily injuries in Switzerland. This means that the cantons and routeType datasets are correct! 
However, the circonstances dataset is missing values (about 1/3 of the accidents are absent: 11'754 out of 17'799 in 2017) and the objects dataset contains too many (roughly 2 times more accidents). **This might be explained by the fact that multiple objects are involved in one accident.**

In [15]:
# Objects dataset summed per vehicle type ('voiture'), ordered by the 2010 value,
# restricted to the year columns.
acc_objet = (
    df_acc_objet
    .groupby(['voiture'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_objet.columns if IS_DATE.match(col)]]
)
display(acc_objet)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
voiture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Voiture de tourisme,22889.0,21624.0,20601.0,19894.0,19717.0,19238.0,19371.0,18981.0
Cycle,3373.0,3681.0,3520.0,3545.0,4015.0,4230.0,4296.0,4481.0
Piéton,2648.0,2577.0,2490.0,2478.0,2441.0,2435.0,2478.0,2255.0
Motocycle de plus de 125 cm3,2182.0,2441.0,2246.0,2035.0,2260.0,2302.0,2123.0,2225.0
Véhicule de transport de choses,2049.0,2021.0,1940.0,1945.0,1931.0,2045.0,1946.0,2058.0
Motocycle jusqu'à 125 cm3,1553.0,1077.0,1026.0,981.0,1019.0,928.0,835.0,946.0
Motocycle léger,622.0,932.0,793.0,696.0,726.0,625.0,567.0,652.0
Cyclomoteur,616.0,579.0,470.0,444.0,413.0,405.0,409.0,393.0
Autre et inconnu,454.0,374.0,442.0,437.0,397.0,365.0,384.0,387.0
Véhicule de transport de personnes,346.0,352.0,360.0,272.0,326.0,327.0,311.0,303.0


In the previous table, we can see that in 2017 more cars were involved in accidents than there were accidents in total (18981 > 17799). This shows that a single accident often involves several vehicles. 

In [16]:
# Average number of involved objects per accident in 2017.
# Series.sum() is used instead of the bare `sum`, which `%pylab inline` rebinds to
# numpy.sum and which would otherwise iterate over the Series in Python.
# NOTE: `acc_objet` is the frame as last assigned in the notebook (grouped by 'voiture').
totCrash = acc_cantons['2017'].sum()
totObjects = acc_objet['2017'].sum()
objetPerCrash = totObjects / totCrash

print('In average, a crash was involving %.2f objects in 2017.' % objetPerCrash)

In average, a crash was involving 1.86 objects in 2017.


Now let's study the case of the circonstances dataset:

In [17]:
# Circonstances dataset summed per circumstance ('circonstance'), ordered by the 2010 value,
# restricted to the year columns.
acc_circo = (
    df_acc_circo
    .groupby(['circonstance'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_circo.columns if IS_DATE.match(col)]]
)
display(acc_circo)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
circonstance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Autre facteur d'influence,12446.0,12182.0,11400.0,11038.0,11430.0,11293.0,11346.0,11195.0
Alcool,658.0,709.0,603.0,555.0,491.0,491.0,439.0,411.0
Drogues,63.0,71.0,59.0,46.0,51.0,61.0,53.0,71.0
Alcool + droques,28.0,30.0,43.0,29.0,38.0,31.0,35.0,31.0
Médicaments,23.0,38.0,29.0,34.0,34.0,41.0,31.0,29.0
Alcool + médicaments,17.0,15.0,23.0,25.0,13.0,14.0,9.0,11.0
Drogues et médicaments,3.0,5.0,3.0,6.0,7.0,5.0,1.0,5.0
Alcool + drogues + médicaments,1.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0


In [18]:
# Total of the 2017 column over all rows of `acc_circo` (as last assigned in the notebook,
# i.e. grouped by 'circonstance'). Series.sum() replaces the bare `sum`, which
# `%pylab inline` rebinds to numpy.sum.
totCirco = acc_circo['2017'].sum()
totCirco

11754.0

Hmmm... it looks like this dataset is incomplete! It only accounts for 11754 of the 17799 accidents... Another possibility is that "Autre facteur d'influence" does not include accidents with no influence factor at all, for example. 

In [19]:
# Circonstances dataset summed per context ('contexte'), ordered by the 2010 value,
# restricted to the year columns.
acc_circo = (
    df_acc_circo
    .groupby(['contexte'])
    .sum()
    .sort_values(by='2010', ascending=False)
    .loc[:, [col for col in df_acc_circo.columns if IS_DATE.match(col)]]
)
display(acc_circo)

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017
contexte,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Tamponnement,4061.0,4062.0,3822.0,3659.0,3882.0,3701.0,3862.0,3651.0
Changement de direction (en obliquant),3887.0,3855.0,3457.0,3383.0,3535.0,3481.0,3438.0,3464.0
Accident concernant des piétons,2251.0,2187.0,2113.0,2100.0,2081.0,2110.0,2073.0,2097.0
À une bifurcation sans changement de direction,1058.0,1027.0,979.0,922.0,916.0,910.0,881.0,865.0
Lors de croisement (longitudinal),833.0,805.0,762.0,675.0,649.0,665.0,655.0,703.0
Autre,602.0,546.0,522.0,489.0,497.0,537.0,530.0,479.0
Dépassement,465.0,497.0,415.0,434.0,416.0,435.0,402.0,417.0
Avec un animal,82.0,74.0,91.0,72.0,89.0,99.0,74.0,78.0


Still the same issue, even though 'Autre' is included... It seems that values are missing from this dataset.