In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import re

In [2]:
# Load the population table from population_cleaned.csv.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('data_to_be_processed/population_cleaned.csv')

In [3]:
# Preview the first rows of the table that was just loaded.
df.head()

Unnamed: 0,age,nationality,sex,commune,year,value,id
0,0,S,M,Aeugst am Albis,2010,8,1
1,1,S,M,Aeugst am Albis,2010,9,1
2,2,S,M,Aeugst am Albis,2010,7,1
3,3,S,M,Aeugst am Albis,2010,13,1
4,4,S,M,Aeugst am Albis,2010,5,1


In [4]:
# Group the ages into brackets: {[0, 18), [18, 40), [40, 65), [65, 100)}.
# Every bracket is labelled with its exclusive upper bound, so after this cell
# `age` only takes the values 18, 40, 65 and 100.
# NOTE(review): rows with age >= 100 match no bracket and are dropped - confirm
# that this is intended.
# NOTE: this cell overwrites `df`; re-run the loading cell before re-running it,
# otherwise the already binned ages would be binned a second time.

age_brackets = [(0, 18), (18, 40), (40, 65), (65, 100)]
group_keys = ['id', 'commune', 'sex', 'year', 'nationality']

# Collect one aggregated frame per bracket and concatenate once at the end,
# instead of growing a frame that starts out empty (which turned the dtypes
# into object and made the column order depend on the pandas version).
bracket_frames = []
for lower, upper in age_brackets:
    in_bracket = df[(df['age'] >= lower) & (df['age'] < upper)]
    # Only `value` is summed; summing the single ages would be meaningless.
    bracket_sum = in_bracket.groupby(group_keys, as_index=False)['value'].sum()
    bracket_sum['age'] = upper
    bracket_frames.append(bracket_sum)

binned = pd.concat(bracket_frames, ignore_index=True)
# Fix the column order explicitly so the result looks the same on every pandas version.
binned = binned[['age', 'commune', 'id', 'nationality', 'sex', 'value', 'year']]
for s in ['id', 'year', 'age', 'value']:
    binned[s] = binned[s].astype(int)
df = binned

In [5]:
# Preview the binned table: `age` now holds the upper bound of each age bracket.
df.head()

Unnamed: 0,age,commune,id,nationality,sex,value,year
0,18,Aeugst am Albis,1,A,F,26,2010
1,18,Aeugst am Albis,1,S,F,144,2010
2,18,Aeugst am Albis,1,A,F,29,2011
3,18,Aeugst am Albis,1,S,F,150,2011
4,18,Aeugst am Albis,1,A,F,33,2012


No data is available for 2016. We have therefore decided to reuse the data from 2015 and to sum the data of the municipalities that merged at the beginning of 2016.

In [6]:
def create_2016_data(data, mapping_path="data_to_be_processed/fusion_mappings.csv"):
    """Re-aggregate `data` onto the municipalities that result from the 2016 mergers.

    The mapping file is read from `mapping_path` and must provide the columns
    `pre_name`, `post_id`, `post_name` and `date`. Only mappings whose `date`
    contains '2016' are applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns `id`, `commune`, `sex`, `nationality` and `age`;
        every other column is summed per group.
    mapping_path : str, optional
        CSV file with the merger mappings. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per (`id`, `commune`, `sex`, `nationality`, `age`), where rows
        of merged municipalities carry the id and name of the new municipality.
    """
    mappings = pd.read_csv(mapping_path)
    # na=False: a mapping without a date is never treated as a 2016 merger.
    is_2016 = mappings['date'].str.contains('2016', na=False)
    mergers = mappings.loc[is_2016, ['pre_name', 'post_id', 'post_name']]

    # Left join from `data`: every data row is kept. The former outer join only
    # added mapping rows without any data, whose grouping keys were NaN and were
    # therefore discarded by the groupby below anyway.
    joined = data.merge(mergers, how='left', left_on='commune', right_on='pre_name')

    # Rows of merged municipalities take over the new name and id, the rest stay as is.
    joined['commune'] = joined['post_name'].fillna(joined['commune'])
    joined['id'] = joined['post_id'].fillna(joined['id']).astype(int)
    joined = joined.drop(['pre_name', 'post_id', 'post_name'], axis=1)

    return joined.groupby(['id', 'commune', 'sex', 'nationality', 'age'], as_index=False).sum()

In [7]:
def share_of_total(subset, totals):
    """Return, per municipality, the fraction of `totals` that is found in `subset`.

    `subset` is a slice of the population table and `totals` the Series of
    inhabitants indexed by (`id`, `commune`). Municipalities without any row in
    `subset` get a share of 0 instead of silently disappearing from the result.
    """
    counts = subset.groupby(['id', 'commune'])['value'].sum()
    return counts.reindex(totals.index, fill_value=0) / totals


for year in range(2013, 2017):
    # No figures exist for 2016, so the rows of 2015 are reused for that year.
    # TODO(review): the merger aggregation (create_2016_data) is currently not
    # applied, so the 2016 file is a plain copy of the 2015 figures.
    source_year = 2015 if year == 2016 else year
    population = df[df['year'] == source_year].drop('year', axis=1)

    # Total population by municipality. `value` is selected explicitly so the
    # result does not depend on pandas dropping non-numeric columns on its own.
    totals = population.groupby(['id', 'commune'])['value'].sum()
    total = totals.to_frame('total_inhabitants')

    # Share of inhabitants per age bracket; `age` holds the bracket's upper bound.
    # The columns are named percentage_* but contain fractions between 0 and 1.
    for i in [18, 40, 65, 100]:
        total['percentage_%d' % i] = share_of_total(population[population['age'] == i], totals)

    # Share of men and share of Swiss nationals in each municipality.
    total['percentage_men'] = share_of_total(population[population.sex == 'M'], totals)
    total['percentage_swiss'] = share_of_total(population[population.nationality == 'S'], totals)
    total = total.reset_index()

    # Check that we have no empty values (e.g. from a municipality with 0 inhabitants).
    assert total.notnull().all().all(), 'missing values in the data for %d' % year

    # Round the shares to 2 decimals and write one file per year.
    total = total.round(2)
    total.to_csv('municipalities/%d/data_commune.csv' % year, index=False)

In [8]:
# `total` still holds the frame of the last loop iteration, i.e. the 2016 data
# (which the loop builds from the 2015 rows).
total.head()

Unnamed: 0,id,commune,total_inhabitants,percentage_18,percentage_40,percentage_65,percentage_100,percentage_men,percentage_swiss
0,1,Aeugst am Albis,1981,0.19,0.23,0.42,0.16,0.5,0.87
1,2,Affoltern am Albis,11707,0.18,0.3,0.35,0.17,0.5,0.73
2,3,Bonstetten,5326,0.22,0.26,0.37,0.15,0.49,0.86
3,4,Hausen am Albis,3477,0.2,0.23,0.4,0.17,0.49,0.86
4,5,Hedingen,3659,0.21,0.25,0.39,0.15,0.5,0.85
