### .Imports

In [104]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import re
import math
import numpy as np
import codecs
import folium
from scipy.stats.stats import pearsonr

%pylab inline
pylab.rcParams['figure.figsize'] = (20,12)

Populating the interactive namespace from numpy and matplotlib


### .Constants

In [89]:
# Folder from which the CSV datasets are read
DATA_FOLDER = 'data_clean'
IS_DATE = re.compile("^[1-2]{1}[0-9]{3}$") # matches four-digit strings starting with 1 or 2; used to detect column names that are years

# Output locations: PATH is the base folder, the three sub-folders are appended to it
PATH = 'docs/_stories/'
PLOT_FOLDER = 'plots/'
DF_FOLDER = 'dataframes/'
MAP_FOLDER = 'maps/'

### .Useful functions

#### Function to quickly describe a dataframe

In [4]:
def describeDf(df,name="DESCRIPTION",level=2):
    """
    Print a description of a dataframe. Different levels are possible.
    Level 0: Print the title of the dataset
    Level 1: Name the different columns
    Level 2: Explore the values of each column

    Columns whose label looks like a year (IS_DATE) or like an age
    ("<digits> an...") are considered value columns: level 1 leaves them out,
    level 2 lists them together at the end instead of printing their content.

    INPUT:  df: The dataframe to describe (not read at level 0).
            name: Title printed in the header.
            level: 0, 1 or 2, see above.
    RAISES: NotImplementedError if the level is not 0, 1 or 2.
    """
    # Simply print the name of the dataset
    if level == 0:
        print("   >  " + name+"\n")
        return

    # Sanity check: the given level does exist
    if level not in (1, 2):
        raise NotImplementedError("unknown description level: {!r}".format(level))

    # Compiled once here rather than once per column
    age_column = re.compile("[0-9]+ an")

    # Split the columns between descriptive attributes and year/age values.
    # Labels are matched through str() so that non-string labels do not crash.
    attributes = []
    values = []
    for col in df.columns:
        label = str(col)
        if IS_DATE.match(label) or age_column.match(label):
            values.append(col)
        else:
            attributes.append(col)

    # Print a header
    print("______________________________ " + name+" ______________________________\n")

    if level == 1:
        # Print the different columns
        print(attributes)
    else:
        print("\n")
        # Print the distinct values of each attribute
        for col in attributes:
            print("         ATTRIBUTE: "+str(col))
            print("   "+str(df[col].unique()))
            print("\n")
        # Print the years/ages we have information on
        print("         VALUES: "+str(values))
    print("\n\n\n")

#### Function to restrict the dataframe to a fixed period in terms of years

We need this function as our different datasets are often based on different yearly periods.

In [5]:
def get_period(df, start=None,end=None):
    """
    Keep the columns of interest of a dataset, between a starting and an ending date.

    Every column whose label is not a year (according to IS_DATE) is kept,
    followed by the year columns lying in [start, end] (both bounds included).

    INPUT:  df: Dataframe whose column labels are strings.
            start: First year to keep; defaults to the earliest year column.
            end: Last year to keep; defaults to the latest year column.
    OUTPUT: df restricted to the selected columns.
    """
    other_columns = []
    year_columns = []   # (year as int, original column label)

    # Go through the columns
    for col in df.columns:
        if IS_DATE.match(col):
            year_columns.append((int(col), col))
        else:
            other_columns.append(col)

    # Without any year column there is nothing to restrict (and no min/max
    # to compute for the default bounds)
    if not year_columns:
        return df[other_columns]

    # Define the starting and ending dates in the case they are not
    years = [year for year, _ in year_columns]
    if start is None:
        start = min(years)
    if end is None:
        end = max(years)

    # Only keep the years of interest; the original labels are reused so the
    # selection always refers to columns that really exist
    selected = [col for year, col in year_columns if start <= year <= end]

    return df[other_columns + selected]

#### Function to get the age corresponding to the intervals as input

Using these functions allows us to treat people between 40 and 50 years old as one group, for example. Again, this is needed because of the disparity among the datasets: some of them treat each age as its own category, while others group people into different intervals. These functions allow us to compare and work with those different datasets.

In [6]:
def map_int_to_age(k,age_max):
    """
    Build the age label of the integer k, e.g. 20 gives '20 ans' ('20 years old').

    1 gives the singular '1 an'. When k equals age_max the label is open-ended:
    '100 ans ou plus' for an age_max of 100, '<age_max> ans et plus' otherwise.
    Nothing (None) is returned for a k above age_max.
    """
    # The singular form wins over every other rule
    if k == 1:
        return '{} an'.format(k)

    # Regular age below the maximum
    if k < age_max:
        return '{} ans'.format(k)

    # Last, open-ended category
    if k == age_max:
        if age_max == 100:
            label = '{} ans ou plus'
        else:
            label = '{} ans et plus'
        return label.format(age_max)

    return None

In [7]:
def build_age(intervals,age_max):
    """
    Build the lists of age labels covered by consecutive intervals.

    INPUT:  intervals: Array corresponding to the delimitation of the wanted intervals.
                       Each interval goes from one bound (included) to the next (excluded).
            age_max: The maximal age to consider.
    OUTPUT: One list per interval. Its first element is the lower bound of the
            interval (as given), followed by the labels built by map_int_to_age.
    """
    ages_total = []

    # Walk through the pairs of consecutive bounds
    for lower, upper in zip(intervals, intervals[1:]):
        # We keep the first limit of the interval as its first element: this will
        # allow us to identify the different intervals later
        ages = [lower]

        # Put the corresponding ages in the interval
        for k in range(int(lower), int(upper)):
            # There is no label above age_max (map_int_to_age would give None),
            # so we stop instead of storing empty entries
            if k > age_max:
                break
            ages.append(map_int_to_age(k,age_max))
            # Ages are never listed past 100
            if k == 100:
                break

        # Store the interval we just created
        ages_total.append(ages)

    return ages_total

#### Mapping for the cantons

One recurrent problem we encountered, as presented in the introduction, is the fact that different datasets have information in different languages. In general, we can treat them case by case, but it becomes a major issue when it comes to the Swiss cantons. As we wanted to do an in-depth analysis and comparison of them, we needed to be able to refer to them across different datasets.

We thus decided to build a dictionary of every different spelling of the cantons we found, with a little function to get the corresponding key, allowing us to compare cantons in different datasets.

In [146]:
# Canton key -> every spelling of the canton name met in the datasets.
# NOTE(review): 'AA' and 'TE' are not the usual abbreviations of these
# cantons (AR, TI) - confirm that everything matched on these keys agrees.
dict_cantons = {
    'AA': ['Appenzell Rh.-Ext.','Appenzell Ausserrhoden'],
    'AG': ['Argovie','Aargau'],
    'AI': ['Appenzell Rh.-Int.','Appenzell Innerrhoden'],
    'BE': ['Berne', 'Bern', 'Bern / Berne'],
    'BL': ['Bâle-Campagne','Basel-Landschaft'],
    'BS': ['Bâle-Ville','Basel-Stadt'],
    'FR': ['Fribourg', 'Freiburg', 'Fribourg / Freiburg'],
    'GE': ['Genève', 'Genf'],
    'GL': ['Glaris', 'Glarus'],
    'GR': ['Grisons', 'Graubünden', 'Grigioni', 'Grischun', 'Graubünden / Grigioni / Grischun'],
    'JU': ['Jura'],
    'LU': ['Lucerne', 'Luzern'],
    'NE': ['Neuchâtel'],
    'NW': ['Nidwald', 'Nidwalden'],
    'OW': ['Obwald', 'Obwalden'],
    'SG': ['St. Gall', 'St. Gallen', 'Saint-Gall'],
    'SH': ['Schaffhouse', 'Schaffhausen'],
    'SO': ['Soleure', 'Solothurn'],
    'SZ': ['Schwytz', 'Schwyz'],
    'TE': ['Tessin', 'Ticino'],
    'TG': ['Thurgovie', 'Thurgau'],
    'UR': ['Uri'],
    'VS': ['Valais', 'Wallis', 'Valais / Wallis'],
    'VD': ['Vaud'],
    'ZG': ['Zoug', 'Zug'],
    'ZH': ['Zurich', 'Zürich'],
}


def getKeysByValue(dictOfElements, valueToFind):
    '''
    Find the key of a canton.

    The entries are scanned in order; the first one whose key equals
    valueToFind, or whose list of names contains it, gives the result.

    INPUT:  dictOfElements: Dictionary mapping each key to its known names.
            valueToFind: A key or one of the names.
    OUTPUT: The corresponding key.
    RAISES: NotImplementedError if no entry corresponds to valueToFind.
    '''
    for key, names in dictOfElements.items():
        # The value already is a key
        if valueToFind == key:
            return valueToFind
        # The value is one of the names recorded for this key
        if valueToFind in names:
            return key
    raise NotImplementedError("no key found for {!r}".format(valueToFind))

We also implemented the following function to replace all the canton names with their keys.

In [9]:
def clean_cantons_names(x):
    """
    Return a copy of x restricted to real cantons, named by their key.

    INPUT:  x: Dataframe with a 'canton' column.
    OUTPUT: A new dataframe in which the rows whose 'canton' is not in our
            dictionary are dropped (this doesn't keep the Swiss totals for
            example) and the remaining names are replaced by their keys.
            x itself is left untouched.
    """
    # Look up each distinct name only once
    key_by_name = {}
    for cant in x['canton'].unique():
        try:
            key_by_name[cant] = getKeysByValue(dict_cantons, cant)
        # getKeysByValue raises this when the name is unknown: such rows are not kept
        except NotImplementedError:
            continue

    # Keep the rows of the known cantons only. Selecting the rows to keep
    # (rather than removing the unknown names one by one) also drops missing values.
    clean = x[x['canton'].isin(list(key_by_name))].copy()

    # Replace the cantons name by their keys to compare them below
    clean['canton'] = clean['canton'].map(key_by_name)

    return clean

And the list of the cantons keys to be able to put every dataframe in the same order

In [10]:
# Keys of the cantons, in the order of the dictionary
list_cantons_keys = list(dict_cantons)

#### Function to plot a set of data

In [161]:
def plot_dataframe(x, y=None, title=None, xlabel=None, ylabel=None, labels=None,\
                   rotation=0, number_to_plot=None, bars=None,name_save=None):
    """
    Plot a set of data, either as lines or through the pandas plotting of x.

    INPUT:  x: Abscissa of the lines, or the pandas object to plot when bars is given.
            y: Data to draw as lines, one line per row (read with y.iloc).
               Required when bars is None.
            title, xlabel, ylabel: Texts of the figure.
            labels: Legend label of each line; no legend is drawn for lines without labels.
            rotation: Rotation of the x ticks.
            number_to_plot: Number of rows of y to draw; defaults to all of them.
            bars: Kind of plot handed to x.plot; None to draw lines.
            name_save: If given, the figure is saved as PATH+PLOT_FOLDER+name_save+'.png'.
    RAISES: ValueError if neither y nor bars is given.
    """
    if bars is None:
        if y is None:
            raise ValueError("y is required when bars is None")
        if number_to_plot is None:
            number_to_plot = y.shape[0]
        plt.figure(figsize=(12,7))
        for j in range(number_to_plot):
            label = None if labels is None else labels[j]
            plt.plot(x, y.iloc[j], label = label)
    else:
        x.plot(kind=bars, figsize=(12,7))
    plt.title(title, fontsize=18)
    # Lines without labels have nothing to show in a legend
    if bars is not None or labels is not None:
        plt.legend(fontsize=10, loc=0, bbox_to_anchor=(1,1))
    plt.xticks(rotation=rotation, fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    # Save before showing the figure
    if name_save is not None:
        plt.savefig(PATH+PLOT_FOLDER+name_save+'.png')
    plt.show()
    

#### Function to scatter plot correlations

In [125]:
def plot_correlation(x, y, title=None, xlabel=None, ylabel=None, labels=None,\
                   rotation=0, number_to_plot=None,name_save=None):
    """
    Scatter plot the rows of x against the rows of y, one series per row.

    INPUT:  x, y: Data to compare, read row by row (x.iloc[j] against y.iloc[j]).
            title, xlabel, ylabel: Texts of the figure.
            labels: Legend label of each row; no legend is drawn without labels.
            rotation: Rotation of the x ticks.
            number_to_plot: Number of rows to draw; defaults to the number of rows of y.
            name_save: If given, the figure is saved as PATH+PLOT_FOLDER+name_save+'.png'.
    """
    if number_to_plot is None:
        number_to_plot = y.shape[0]
    plt.figure(figsize=(12,7))
    for j in range(number_to_plot):
        label = None if labels is None else labels[j]
        plt.scatter(x.iloc[j], y.iloc[j], label = label)
    plt.title(title, fontsize=18)
    # Without labels there is nothing to show in a legend
    if labels is not None:
        plt.legend(fontsize=10, loc="best", bbox_to_anchor=(1,1))
    plt.xticks(rotation=rotation, fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    # Save before showing the figure
    if name_save is not None:
        plt.savefig(PATH+PLOT_FOLDER+name_save+'.png')
    plt.show()
    

#### Function to save results in order to put in the datastory

In [129]:
def save_it(data, name):
    """
    Save a result as an html file, in order to put it in the datastory.

    INPUT:  data: A pandas DataFrame (written as an html table in
                  PATH+DF_FOLDER) or a folium Map (saved in PATH+MAP_FOLDER).
                  Any other kind of data is ignored.
            name: Name of the file, without extension.
    """
    # isinstance avoids building an empty DataFrame and an empty Map at each
    # call only to compare their types
    if isinstance(data, pd.DataFrame):
        # newline='' writes the html exactly as produced, on every platform
        with open(PATH+DF_FOLDER+name+'.html', 'w', encoding="utf-8", newline='') as f:
            f.write(data.to_html())
    elif isinstance(data, folium.Map):
        data.save(PATH+MAP_FOLDER+name+".html")

### .Datasets Loading

As mentioned, to perform our analysis, we took many datasets from the Swiss OpenData website. All these datasets are Excel spreadsheets in different forms; we hence needed to do a pre-treatment to transform them into clean `csv` files. They are stored in the folder `data_clean`.

The pre-treatment was substantial since the datasets were sometimes split into multiple spreadsheets (one for each year, for example). Moreover, the Excel files were organized to be visually good-looking, with many merged cells and various hierarchical levels in the data, thus requiring more work to clean them and get them ready for further processing with pandas.

#### Snapshot of our different datasets

In [13]:
# Every dataset is loaded in a DataFrame, available both as a global variable
# "df_<name>" and as an entry of the dictionary `dataframes`
dataframes = {}
print("  DATAFRAMES AVAILABLES:\n")

for file in os.listdir(DATA_FOLDER):
    # Load the filename
    filename = os.fsdecode(file)
    # Skip what is not a file (a sub-folder cannot be read as a csv)
    if not os.path.isfile(os.path.join(DATA_FOLDER, filename)):
        continue
    # Name the corresponding DataFrame "df_....." (up to two extensions are removed)
    tablename = "df_"+os.path.splitext(os.path.splitext(filename)[0])[0]
    # Load the values in the DataFrame, without the 'Unnamed: 0' column when
    # there is one (errors='ignore' leaves the other datasets as they are)
    dataframes[tablename] = pd.read_csv(DATA_FOLDER + '/' + filename)\
                              .drop(columns=['Unnamed: 0'], errors='ignore')
    globals()[tablename] = dataframes[tablename]
    # Print the title of the datasets
    describeDf(dataframes[tablename], name=tablename,level=0)

  DATAFRAMES AVAILABLES:

   >  df_Accident_cantons

   >  df_Accident_circonstances

   >  df_Accident_objets

   >  df_Accident_type_route

   >  df_Accident_victimes

   >  df_Besoin_sante

   >  df_Depense_menages_canton

   >  df_Frais_routes_cantonales

   >  df_Frais_routes_communales

   >  df_Frais_routes_nationales

   >  df_hacked_accident

   >  df_Longueur_routes

   >  df_Population_2010

   >  df_Population_age_1992

   >  df_Population_canton_1992

   >  df_Proportion_permis

   >  df_Qualite_vie_agglo

   >  df_Recettes_routes

   >  df_Voitures

