Data from: https://www.pxweb.bfs.admin.ch/default.aspx

In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context.
sns.set_context('notebook')
# Silence pandas' chained-assignment (SettingWithCopy) warning; the pandas default is 'warn'.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500) # show up to 500 columns when displaying a frame
nan = np.nan # shorthand alias for numpy.nan

In [2]:
# Load the semicolon-separated export; header=1 takes the column names from
# the second line of the file.
raw_data = pd.read_csv(
    'raw_data/politique_data_utf_d.csv',
    sep=';',
    header=1,
)
raw_data.head(10)

Unnamed: 0,Bezirk (>>) / Gemeinde (......),Bezirk (>>) / Gemeinde (......).1,Jahr,Jahr.1,Partei,Partei.1,Ergebnis,Ergebnis.1,Unnamed: 8
0,101,>> Bezirk Affoltern,2015,2015,1,FDP/PLR (PRD),1,Parteistimmen,95805.0000000000
1,101,>> Bezirk Affoltern,2015,2015,1,FDP/PLR (PRD),2,Parteistärke in %,15.7782920000
2,101,>> Bezirk Affoltern,2015,2015,2,CVP/PDC,1,Parteistimmen,24001.0000000000
3,101,>> Bezirk Affoltern,2015,2015,2,CVP/PDC,2,Parteistärke in %,3.9527664000
4,101,>> Bezirk Affoltern,2015,2015,3,SP/PS,1,Parteistimmen,112705.0000000000
5,101,>> Bezirk Affoltern,2015,2015,3,SP/PS,2,Parteistärke in %,18.5615824000
6,101,>> Bezirk Affoltern,2015,2015,4,SVP/UDC,1,Parteistimmen,206032.0000000000
7,101,>> Bezirk Affoltern,2015,2015,4,SVP/UDC,2,Parteistärke in %,33.9317682000
8,101,>> Bezirk Affoltern,2015,2015,5,LPS/PLS,1,Parteistimmen,...
9,101,>> Bezirk Affoltern,2015,2015,5,LPS/PLS,2,Parteistärke in %,...


In [3]:
# Count the columns that contain at least one missing value (expected: 0).
columns_with_missing = raw_data.isnull().any()
sum(columns_with_missing)

0

In [4]:
# Keep only the 2015 election. The filter is an exact match rather than
# `>= 2013`: every later cell aggregates per commune and the results are saved
# under 2015 file names, so rows from any other election year must never be
# mixed in, even if the export is refreshed with newer years.
data = raw_data[raw_data['Jahr'] == 2015]
# Show which election years the raw export contains.
raw_data['Jahr'].unique()

array([2015, 2011, 2007, 2003, 1999, 1995, 1991, 1987, 1983, 1979, 1975])

In [5]:
# Only one election year is left, so both year columns are redundant.
year_columns = ['Jahr', 'Jahr.1']
data = data.drop(year_columns, axis=1)

In [6]:
# Show the name of the second column, which the next cell filters on.
data.columns[1]

'Bezirk (>>) / Gemeinde (......).1'

In [7]:
# Keep only the commune rows: their label contains '...', whereas district
# rows are labelled with '>>'.
name_column = data.columns[1]
is_commune_row = data[name_column].str.find('...') >= 0
data = data[is_commune_row]

In [8]:
# Replace the original German headers with short English column names.
column_names = {
    'Bezirk (>>) / Gemeinde (......)': 'commune_id',
    'Bezirk (>>) / Gemeinde (......).1': 'commune_name',
    'Partei': 'party_id',
    'Partei.1': 'party_name',
    'Ergebnis': 'result_id',
    'Ergebnis.1': 'result_name',
    'Unnamed: 8': 'value',
}
data = data.rename(columns=column_names)

In [9]:
def _remove_dots(row):
    """Return the row's commune name with every '...... ' marker removed.

    `row` is any mapping-like object that exposes a string under the key
    'commune_name' (for example a DataFrame row).
    """
    commune_name = row['commune_name']
    return commune_name.replace('...... ', '')
# Strip the '...... ' marker from every commune name. The vectorised string
# accessor gives the same result as calling _remove_dots on each row, without
# building a Series per row; regex=False makes the dots literal characters.
data.commune_name = data['commune_name'].str.replace('...... ', '', regex=False)
data.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value
528,1,Aeugst am Albis,1,FDP/PLR (PRD),1,Parteistimmen,4906.0
529,1,Aeugst am Albis,1,FDP/PLR (PRD),2,Parteistärke in %,18.69166
530,1,Aeugst am Albis,2,CVP/PDC,1,Parteistimmen,545.0
531,1,Aeugst am Albis,2,CVP/PDC,2,Parteistärke in %,2.0764278
532,1,Aeugst am Albis,3,SP/PS,1,Parteistimmen,4894.0


In [10]:
# A '...' placeholder in the data is treated as 0; once it is replaced, the
# value column can be converted from text to numbers.
data = data.replace('...', 0.0)
data['value'] = pd.to_numeric(data['value'])

In [11]:
# Two decimals are enough for both the vote counts and the percentages.
data['value'] = data['value'].round(2)

In [12]:
# Ids in the 9xxx range listed here do not identify communes; drop their rows.
non_commune_ids = [
    9012, 9022, 9030, 9040, 9052, 9100, 9112,
    9120, 9160, 9161, 9162, 9170, 9182, 9190,
    9200, 9211, 9212, 9220, 9222, 9230, 9232, 9250, 9252,
]
is_real_commune = ~data['commune_id'].isin(non_commune_ids)
data = data[is_real_commune]


In [13]:
# Distinct party names, in order of first appearance in the data.
parties = data['party_name'].unique()
parties

array(['FDP/PLR (PRD)', 'CVP/PDC', 'SP/PS', 'SVP/UDC', 'LPS/PLS',
       'LdU/AdI', 'EVP/PEV', 'CSP/PCS', 'GLP/PVL', 'BDP/PBD', 'PdA/PST',
       'PSA', 'POCH', 'GPS/PES', 'FGA/AVF', 'Sol.', 'SD/DS', 'Rep./Rép.',
       'EDU/UDF', 'FPS/PSL', 'Lega', 'MCR', 'Sep./Sép.', 'Übrige/Autres'], dtype=object)

In [14]:
# Add one column per party: a row keeps its `value` in the column of its own
# party and gets 0.0 in every other party column, so that summing per commune
# later yields one figure per party.
# Series.where is vectorised; it replaces a row-wise `apply` that ran a Python
# lambda once per row and per party and made this cell very slow.
for party in parties:
    data[party] = data['value'].where(data['party_name'] == party, 0.0)

In [15]:
# Split by result type: absolute party votes ('Parteistimmen') go to
# data_voices, all remaining rows (the party strength in %) to data_percent.
is_vote_count = data['result_name'] == 'Parteistimmen'
data_voices = data[is_vote_count]
data_percent = data[~is_vote_count]
data_voices.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
528,1,Aeugst am Albis,1,FDP/PLR (PRD),1,Parteistimmen,4906.0,4906.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
530,1,Aeugst am Albis,2,CVP/PDC,1,Parteistimmen,545.0,0.0,545.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
532,1,Aeugst am Albis,3,SP/PS,1,Parteistimmen,4894.0,0.0,0.0,4894.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
534,1,Aeugst am Albis,4,SVP/UDC,1,Parteistimmen,8118.0,0.0,0.0,0.0,8118.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536,1,Aeugst am Albis,5,LPS/PLS,1,Parteistimmen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Preview the percentage rows for comparison with the vote-count frame.
data_percent.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,result_id,result_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
529,1,Aeugst am Albis,1,FDP/PLR (PRD),2,Parteistärke in %,18.69,18.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
531,1,Aeugst am Albis,2,CVP/PDC,2,Parteistärke in %,2.08,0.0,2.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
533,1,Aeugst am Albis,3,SP/PS,2,Parteistärke in %,18.65,0.0,0.0,18.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,1,Aeugst am Albis,4,SVP/UDC,2,Parteistärke in %,30.93,0.0,0.0,0.0,30.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,1,Aeugst am Albis,5,LPS/PLS,2,Parteistärke in %,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Each frame now holds a single result type, so the result columns can go.
result_columns = ['result_name', 'result_id']
data_voices = data_voices.drop(result_columns, axis=1)
data_percent = data_percent.drop(result_columns, axis=1)
data_percent.head()

Unnamed: 0,commune_id,commune_name,party_id,party_name,value,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
529,1,Aeugst am Albis,1,FDP/PLR (PRD),18.69,18.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
531,1,Aeugst am Albis,2,CVP/PDC,2.08,0.0,2.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
533,1,Aeugst am Albis,3,SP/PS,18.65,0.0,0.0,18.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,1,Aeugst am Albis,4,SVP/UDC,30.93,0.0,0.0,0.0,30.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,1,Aeugst am Albis,5,LPS/PLS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Collapse each frame to one row per commune: drop the superfluous descriptive
# columns, then sum the per-party columns within each commune_id.
# groupby's `axis` keyword is deprecated (grouping rows is the default) and
# as_index=True is the default as well, so neither is passed.
superfluous_columns = ['commune_name', 'party_id', 'party_name', 'value']
data_percent = data_percent.drop(superfluous_columns, axis=1).groupby(by='commune_id').sum()
data_voices = data_voices.drop(superfluous_columns, axis=1).groupby(by='commune_id').sum()

In [19]:
# Disabled: would prefix every party column name with 'percentages '.
#data_percent.columns = ['percentages {}'.format(c) for c in data_percent.columns]
# Preview the per-commune party percentages.
data_percent.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,LPS/PLS,LdU/AdI,EVP/PEV,CSP/PCS,GLP/PVL,BDP/PBD,PdA/PST,PSA,POCH,GPS/PES,FGA/AVF,Sol.,SD/DS,Rep./Rép.,EDU/UDF,FPS/PSL,Lega,MCR,Sep./Sép.,Übrige/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,18.69,2.08,18.65,30.93,0.0,0.0,3.47,0.0,8.44,2.62,0.17,0.0,0.0,7.08,0.0,0.0,0.31,0.0,4.58,0.0,0.0,0.0,0.0,3.01
2,14.23,4.59,19.08,33.79,0.0,0.0,5.46,0.0,7.36,4.16,0.19,0.0,0.0,6.21,0.0,0.0,0.19,0.0,1.58,0.0,0.0,0.0,0.0,3.17
3,16.47,3.38,20.4,29.1,0.0,0.0,3.14,0.0,11.86,3.8,0.11,0.0,0.0,6.66,0.0,0.0,0.17,0.0,1.75,0.0,0.0,0.0,0.0,3.15
4,12.79,2.88,19.39,34.94,0.0,0.0,2.57,0.0,8.75,4.66,0.19,0.0,0.0,8.02,0.0,0.0,0.18,0.0,1.64,0.0,0.0,0.0,0.0,3.98
5,15.81,3.92,22.48,30.11,0.0,0.0,3.59,0.0,9.63,3.77,0.23,0.0,0.0,6.47,0.0,0.0,0.02,0.0,1.82,0.0,0.0,0.0,0.0,2.16


In [20]:
#data_voices.columns = ['total voices {}'.format(c) for c in data_voices.columns]

In [21]:
# put small parties into 'others'.
# Vote threshold for that grouping; the next cell compares each party's total
# votes against this same value (1995000).
cutoff = 1995000
# Alias, not a copy: both names refer to the same DataFrame object.
smaller_voices = data_voices

In [22]:
# Column positions of the parties whose votes, summed over all communes, stay
# below `cutoff`. The threshold comes from the `cutoff` constant defined in the
# previous cell instead of a second copy of the literal, so it only has to be
# changed in one place.
party_totals = smaller_voices.sum(axis=0)
small_parties_idx = [position for position, total in enumerate(party_totals) if total < cutoff]

In [23]:
# Translate the column positions into the corresponding party names.
voice_columns = data_voices.columns
small_parties = voice_columns[small_parties_idx]
small_parties

Index(['LPS/PLS', 'LdU/AdI', 'EVP/PEV', 'CSP/PCS', 'BDP/PBD', 'PdA/PST', 'PSA',
       'POCH', 'FGA/AVF', 'Sol.', 'SD/DS', 'Rep./Rép.', 'EDU/UDF', 'FPS/PSL',
       'Lega', 'MCR', 'Sep./Sép.', 'Übrige/Autres'],
      dtype='object')

In [24]:
# Per commune, the combined votes of all small parties.
small_party_voices = data_voices[small_parties]
small_parties_total_voices = small_party_voices.sum(axis=1)
small_parties_total_voices

commune_id
1        3713.0
2       15836.0
3        8080.0
4        5933.0
5        5745.0
6        2060.0
7        4250.0
8         936.0
9        8745.0
10       8063.0
11       4440.0
12       2676.0
13       3593.0
14       6582.0
21       1796.0
22       1779.0
23       1673.0
24       1792.0
25       3482.0
26       1908.0
27       4649.0
28       3499.0
29       2464.0
30       5708.0
31       5508.0
32        908.0
33       4109.0
34       3640.0
35       4480.0
36       2689.0
         ...   
6748       48.0
6750      100.0
6751       49.0
6753       36.0
6754      177.0
6757      354.0
6758       42.0
6759        1.0
6771      101.0
6773       14.0
6774      113.0
6775       38.0
6778       17.0
6781       68.0
6782      209.0
6783       13.0
6784      114.0
6785       36.0
6787       15.0
6789       12.0
6790       58.0
6792       17.0
6793        9.0
6800      386.0
6803       12.0
6806       11.0
6807       31.0
6808       67.0
6809       30.0
6810       36.0
dtype: float64

In [25]:
# Store the combined small-party votes as 'Other/Autres' (added to data_voices
# itself), then build the reduced frame without the individual small parties.
data_voices['Other/Autres'] = small_parties_total_voices
data_voices_smaller = data_voices.drop(small_parties, axis=1).round(2)
data_voices_smaller.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,GLP/PVL,GPS/PES,Other/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4906.0,545.0,4894.0,8118.0,2214.0,1857.0,3713.0
2,15271.0,4922.0,20481.0,36266.0,7898.0,6667.0,15836.0
3,10980.0,2252.0,13600.0,19397.0,7907.0,4440.0,8080.0
4,5740.0,1293.0,8701.0,15675.0,3925.0,3599.0,5933.0
5,7834.0,1942.0,11141.0,14926.0,4771.0,3205.0,5745.0


In [26]:
# Same grouping for the percentages: sum the small parties into
# 'Other/Autres' (added to data_percent itself) and drop their own columns.
small_parties_total_percent = data_percent[small_parties].sum(axis=1)
data_percent['Other/Autres'] = small_parties_total_percent
data_percent_smaller = data_percent.drop(small_parties, axis=1).round(2)
data_percent_smaller.head()

Unnamed: 0_level_0,FDP/PLR (PRD),CVP/PDC,SP/PS,SVP/UDC,GLP/PVL,GPS/PES,Other/Autres
commune_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,18.69,2.08,18.65,30.93,8.44,7.08,14.16
2,14.23,4.59,19.08,33.79,7.36,6.21,14.75
3,16.47,3.38,20.4,29.1,11.86,6.66,12.12
4,12.79,2.88,19.39,34.94,8.75,8.02,13.22
5,15.81,3.92,22.48,30.11,9.63,6.47,11.59


In [35]:
# check integrity of the data: every commune id of the 2015 reference list must
# be present in both cleaned frames, and each frame must have exactly as many
# rows as the reference list. Both frames go through the same checks, and a
# failing assertion reports which frame is affected and why.
id_commune = pd.read_csv("../municipalities/2015/id_commune_2015.csv")
for frame_name, frame in (('data_percent_smaller', data_percent_smaller),
                          ('data_voices_smaller', data_voices_smaller)):
    missing_ids = [i for i in id_commune['id'] if i not in frame.index]
    assert not missing_ids, '{}: {} commune ids missing, e.g. {}'.format(
        frame_name, len(missing_ids), missing_ids[:10])
    assert len(id_commune) == len(frame), '{}: expected {} communes, found {}'.format(
        frame_name, len(id_commune), len(frame))

In [28]:
# Write the per-commune vote totals (kept parties plus 'Other/Autres') to CSV.
data_voices_smaller.to_csv('cleaned_data/2015/cleaned_politique_party_total_voices2015.csv')

In [29]:
# Write the per-commune party percentages (kept parties plus 'Other/Autres') to CSV.
data_percent_smaller.to_csv('cleaned_data/2015/cleaned_politique_party_percentages2015.csv')