In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show

%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500) # show more columns
nan = np.nan # store numpy.nan in 'nan'

In [2]:
# Load the raw export: semicolon-separated, column names taken from row index 1.
raw_data_path = 'raw_data/politique_data_utf_d.csv'
raw_data = pd.read_csv(raw_data_path, sep=';', header=1)
raw_data.head(10)

Unnamed: 0,Bezirk (>>) / Gemeinde (......),Bezirk (>>) / Gemeinde (......).1,Jahr,Jahr.1,Partei,Partei.1,Ergebnis,Ergebnis.1,Unnamed: 8
0,101,>> Bezirk Affoltern,2015,2015,1,FDP/PLR (PRD),1,Parteistimmen,95805.0000000000
1,101,>> Bezirk Affoltern,2015,2015,1,FDP/PLR (PRD),2,Parteistärke in %,15.7782920000
2,101,>> Bezirk Affoltern,2015,2015,2,CVP/PDC,1,Parteistimmen,24001.0000000000
3,101,>> Bezirk Affoltern,2015,2015,2,CVP/PDC,2,Parteistärke in %,3.9527664000
4,101,>> Bezirk Affoltern,2015,2015,3,SP/PS,1,Parteistimmen,112705.0000000000
5,101,>> Bezirk Affoltern,2015,2015,3,SP/PS,2,Parteistärke in %,18.5615824000
6,101,>> Bezirk Affoltern,2015,2015,4,SVP/UDC,1,Parteistimmen,206032.0000000000
7,101,>> Bezirk Affoltern,2015,2015,4,SVP/UDC,2,Parteistärke in %,33.9317682000
8,101,>> Bezirk Affoltern,2015,2015,5,LPS/PLS,1,Parteistimmen,...
9,101,>> Bezirk Affoltern,2015,2015,5,LPS/PLS,2,Parteistärke in %,...


In [3]:
# Number of columns that contain at least one missing value (0 means no nulls at all)
raw_data.isnull().any().sum()

0

In [4]:
# Keep only the 2015 election (the years present are listed by the last line;
# 2013 and 2014 are not in the dataset).
# .copy() makes `data` an independent frame, so the in-place clean-up steps
# in the following cells do not operate on a slice of raw_data.
data = raw_data[raw_data['Jahr'] == 2015].copy()
raw_data['Jahr'].unique()

array([2015, 2011, 2007, 2003, 1999, 1995, 1991, 1987, 1983, 1979, 1975])

In [5]:
# Remove the year columns. The result is reassigned instead of dropping in
# place, because `data` was produced by filtering raw_data and in-place
# mutation of such a slice is what triggers SettingWithCopyWarning.
data = data.drop(['Jahr', 'Jahr.1'], axis=1)

In [6]:
# Show the label of the column at position 1; the next cell filters rows on this column.
data.columns[1]

'Bezirk (>>) / Gemeinde (......).1'

In [7]:
# Discard the district rows ('>> Bezirk ...') and keep only rows whose name contains '...' (communes)
name_column = data.columns[1]
is_commune = data[name_column].str.find('...') >= 0
data = data[is_commune]

In [8]:
# Replace the original column labels with short English names
column_names = {
    'Bezirk (>>) / Gemeinde (......)': 'commune_id',
    'Bezirk (>>) / Gemeinde (......).1': 'commune_name',
    'Partei': 'party_id',
    'Partei.1': 'party_name',
    'Ergebnis': 'result_id',
    'Ergebnis.1': 'result_name',
    'Unnamed: 8': 'value',
}
data.rename(columns=column_names, inplace=True)

In [9]:
#remove '....' from commune_name
def _remove_dots(row):
    """Return the row's 'commune_name' with every '...... ' marker stripped out.

    `row` is anything indexable by the key 'commune_name' whose value is a str
    (here: one DataFrame row passed in by ``DataFrame.apply(..., axis=1)``).
    """
    commune_name = row['commune_name']
    return commune_name.replace('...... ', '')
# Clean every commune name row by row, then preview the result
data['commune_name'] = data.apply(_remove_dots, axis=1)
data.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value
528,1,Aeugst am Albis,1,FDP/PLR (PRD),1,Parteistimmen,4906.0
529,1,Aeugst am Albis,1,FDP/PLR (PRD),2,Parteistärke in %,18.69166
530,1,Aeugst am Albis,2,CVP/PDC,1,Parteistimmen,545.0
531,1,Aeugst am Albis,2,CVP/PDC,2,Parteistärke in %,2.0764278
532,1,Aeugst am Albis,3,SP/PS,1,Parteistimmen,4894.0


In [10]:
# A '...' entry stands for 0: substitute 0.0 for it, then convert 'value' to a numeric dtype
data.replace(to_replace='...', value=0.0, inplace=True)
data['value'] = pd.to_numeric(data['value'])

In [11]:
# Keep two decimal places in 'value'
data['value'] = data['value'].round(2)

In [12]:
# Distinct party names found in the data
parties = data['party_name'].unique()
parties

array(['FDP/PLR (PRD)', 'CVP/PDC', 'SP/PS', 'SVP/UDC', 'LPS/PLS',
       'LdU/AdI', 'EVP/PEV', 'CSP/PCS', 'GLP/PVL', 'BDP/PBD', 'PdA/PST',
       'PSA', 'POCH', 'GPS/PES', 'FGA/AVF', 'Sol.', 'SD/DS', 'Rep./Rép.',
       'EDU/UDF', 'FPS/PSL', 'Lega', 'MCR', 'Sep./Sép.', 'Übrige/Autres'], dtype=object)

In [13]:
# Add one column per party: the row's 'value' where the row belongs to that
# party, 0.0 everywhere else.
# Series.where does this for the whole column at once, which gives the same
# numbers as the former row-by-row DataFrame.apply but without the long wait.
party_of_row = data['party_name']
for party in parties:
    data[party] = data['value'].where(party_of_row == party, 0.0)

In [14]:
# Split into two frames: the 'Parteistimmen' rows and all remaining rows ('Parteistärke in %')
is_vote_count = data['result_name'] == 'Parteistimmen'
data_voices = data[is_vote_count]
data_percent = data[~is_vote_count]
data_voices.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
528,1,Aeugst am Albis,1,FDP/PLR (PRD),1,Parteistimmen,4906.0,4906.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
530,1,Aeugst am Albis,2,CVP/PDC,1,Parteistimmen,545.0,0.0,545.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
532,1,Aeugst am Albis,3,SP/PS,1,Parteistimmen,4894.0,0.0,0.0,4894.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
534,1,Aeugst am Albis,4,SVP/UDC,1,Parteistimmen,8118.0,0.0,0.0,0.0,8118.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536,1,Aeugst am Albis,5,LPS/PLS,1,Parteistimmen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Preview the first rows of the percentage frame.
data_percent.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
529,1,Aeugst am Albis,1,FDP/PLR (PRD),2,Parteistärke in %,18.69,18.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
531,1,Aeugst am Albis,2,CVP/PDC,2,Parteistärke in %,2.08,0.0,2.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
533,1,Aeugst am Albis,3,SP/PS,2,Parteistärke in %,18.65,0.0,0.0,18.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,1,Aeugst am Albis,4,SVP/UDC,2,Parteistärke in %,30.93,0.0,0.0,0.0,30.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,1,Aeugst am Albis,5,LPS/PLS,2,Parteistärke in %,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# The result type is now encoded by which frame a row lives in, so drop its columns from both frames
result_columns = ['result_name', 'result_id']
for frame in (data_voices, data_percent):
    frame.drop(result_columns, axis=1, inplace=True)
data_percent.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
529,1,Aeugst am Albis,1,FDP/PLR (PRD),18.69,18.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
531,1,Aeugst am Albis,2,CVP/PDC,2.08,0.0,2.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
533,1,Aeugst am Albis,3,SP/PS,18.65,0.0,0.0,18.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,1,Aeugst am Albis,4,SVP/UDC,30.93,0.0,0.0,0.0,30.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,1,Aeugst am Albis,5,LPS/PLS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Drop the superfluous columns, then sum the party columns per commune
# (commune_id becomes the index).
# groupby's `axis` argument is deprecated in recent pandas; axis=0 and
# as_index=True are the defaults, so leaving them out gives the same result.
superfluous_columns = ['commune_name', 'party_id', 'party_name', 'value']
data_percent = data_percent.drop(superfluous_columns, axis=1).groupby(by='commune_id').sum()
data_voices = data_voices.drop(superfluous_columns, axis=1).groupby(by='commune_id').sum()

In [18]:
# Optional prefixing of the column labels, currently disabled:
#data_percent.columns = ['percentages {}'.format(c) for c in data_percent.columns]
# Preview the per-commune percentage table.
data_percent.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,18.69,2.08,18.65,30.93,0.0,0.0,3.47,0.0,8.44,2.62,0.17,0.0,0.0,7.08,0.0,0.0,0.31,0.0,4.58,0.0,0.0,0.0,0.0,3.01
2,14.23,4.59,19.08,33.79,0.0,0.0,5.46,0.0,7.36,4.16,0.19,0.0,0.0,6.21,0.0,0.0,0.19,0.0,1.58,0.0,0.0,0.0,0.0,3.17
3,16.47,3.38,20.4,29.1,0.0,0.0,3.14,0.0,11.86,3.8,0.11,0.0,0.0,6.66,0.0,0.0,0.17,0.0,1.75,0.0,0.0,0.0,0.0,3.15
4,12.79,2.88,19.39,34.94,0.0,0.0,2.57,0.0,8.75,4.66,0.19,0.0,0.0,8.02,0.0,0.0,0.18,0.0,1.64,0.0,0.0,0.0,0.0,3.98
5,15.81,3.92,22.48,30.11,0.0,0.0,3.59,0.0,9.63,3.77,0.23,0.0,0.0,6.47,0.0,0.0,0.02,0.0,1.82,0.0,0.0,0.0,0.0,2.16


In [19]:
#data_voices.columns = ['total voices {}'.format(c) for c in data_voices.columns]

In [20]:
# Preparation for merging the small parties into 'others'.
# smaller_voices is another name for data_voices (same object, no copy is made).
smaller_voices = data_voices
# Total-vote threshold used to decide which parties count as small.
cutoff = 1995000

In [21]:
# Column positions of the parties whose votes, summed over all communes, are
# below `cutoff` (defined in the previous cell, instead of repeating the number here).
small_parties_idx = [position
                     for position, total_votes in enumerate(smaller_voices.sum(axis=0))
                     if total_votes < cutoff]

In [22]:
# Translate the column positions into party names
voice_columns = data_voices.columns
small_parties = voice_columns[small_parties_idx]
small_parties

Index(['LPS/PLS', 'LdU/AdI', 'EVP/PEV', 'CSP/PCS', 'PdA/PST', 'PSA', 'POCH',
       'FGA/AVF', 'Sol.', 'SD/DS', 'Rep./Rép.', 'EDU/UDF', 'FPS/PSL', 'Lega',
       'MCR', 'Sep./Sép.', 'Übrige/Autres'],
      dtype='object')

In [23]:
# Per commune: votes of all small parties added together
small_parties_total_voices = data_voices.loc[:, small_parties].sum(axis=1)
small_parties_total_voices

commune_id
1        3026.0
2       11366.0
3        5545.0
4        3844.0
5        3877.0
6        1452.0
7        2837.0
8         727.0
9        6029.0
10       5706.0
11       2696.0
12       2131.0
13       1972.0
14       3982.0
21       1165.0
22        854.0
23        843.0
24        865.0
25       2156.0
26       1436.0
27       2958.0
28       2492.0
29       1526.0
30       4024.0
31       4014.0
32        618.0
33       2273.0
34       1834.0
35       2518.0
36       1904.0
         ...   
6800      386.0
6803       12.0
6806       11.0
6807       31.0
6808       67.0
6809       30.0
6810       36.0
9012        0.0
9022        0.0
9030      206.0
9040       18.0
9052        0.0
9100      296.0
9112        0.0
9120      513.0
9160        2.0
9161        0.0
9162        0.0
9170     1155.0
9182        0.0
9190     1914.0
9200      447.0
9211        0.0
9212        0.0
9220     7499.0
9222        0.0
9230      184.0
9232        0.0
9250     7894.0
9252        0.0
dtype: float64

In [32]:
# Store the combined small-party votes in 'Other/Autres', then build the reduced
# frame without the individual small-party columns, rounded to 2 decimals
data_voices['Other/Autres'] = small_parties_total_voices
data_voices_smaller = data_voices.drop(small_parties, axis=1).round(2)
data_voices_smaller.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,GLP/PVL,BDP/PBD,GPS/PES,Other/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4906.0,545.0,4894.0,8118.0,2214.0,687.0,1857.0,3026.0
2,15271.0,4922.0,20481.0,36266.0,7898.0,4470.0,6667.0,11366.0
3,10980.0,2252.0,13600.0,19397.0,7907.0,2535.0,4440.0,5545.0
4,5740.0,1293.0,8701.0,15675.0,3925.0,2089.0,3599.0,3844.0
5,7834.0,1942.0,11141.0,14926.0,4771.0,1868.0,3205.0,3877.0


In [33]:
# Same for the percentages: sum the small parties into 'Other/Autres', then
# drop their individual columns and round to 2 decimals
small_parties_total_percent = data_percent[small_parties].sum(axis=1)
data_percent['Other/Autres'] = small_parties_total_percent
data_percent_smaller = data_percent.drop(small_parties, axis=1).round(2)
data_percent_smaller.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,GLP/PVL,BDP/PBD,GPS/PES,Other/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,18.69,2.08,18.65,30.93,8.44,2.62,7.08,11.54
2,14.23,4.59,19.08,33.79,7.36,4.16,6.21,10.59
3,16.47,3.38,20.4,29.1,11.86,3.8,6.66,8.32
4,12.79,2.88,19.39,34.94,8.75,4.66,8.02,8.56
5,15.81,3.92,22.48,30.11,9.63,3.77,6.47,7.82


In [34]:
# Write the reduced vote totals (indexed by commune_id) to CSV
voices_csv_path = 'cleaned_data/2015/cleaned_politique_party_total_voices2015.csv'
data_voices_smaller.to_csv(voices_csv_path)

In [35]:
# Write the reduced percentages (indexed by commune_id) to CSV
percent_csv_path = 'cleaned_data/2015/cleaned_politique_party_percentages2015.csv'
data_percent_smaller.to_csv(percent_csv_path)