In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import re

In [2]:
# Load the cleaned population table. The preview in the next cell shows the
# columns age, nationality, sex, commune, year and value.
df = pd.read_csv('data/population_cleaned.csv')

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,age,nationality,sex,commune,year,value
0,0,S,M,Aeugst am Albis,2010,8
1,1,S,M,Aeugst am Albis,2010,9
2,2,S,M,Aeugst am Albis,2010,7
3,3,S,M,Aeugst am Albis,2010,13
4,4,S,M,Aeugst am Albis,2010,5


In [4]:
# Collapse single-year ages into four half-open buckets:
#   [0, 18), [18, 40), [40, 65), [65, 100)
# Each bucket is labelled with its exclusive upper bound, so after this cell
# the `age` column only takes the values 18, 40, 65 and 100.
# NOTE(review): rows with age >= 100 fall outside every bucket and are
# dropped -- confirm the source has no such ages, or widen the last bucket.
# NOTE: this cell rebinds `df`. Re-running it without re-running the load
# cell would bucket the already bucketed labels a second time.
age_buckets = [(0, 18), (18, 40), (40, 65), (65, 100)]
group_keys = ['commune', 'sex', 'year', 'nationality']

bucket_frames = []
for lower, upper in age_buckets:
    in_bucket = df[(df['age'] >= lower) & (df['age'] < upper)]
    # Only the head-count is summed; the age column is replaced by the label.
    bucket = in_bucket.groupby(group_keys, as_index=False)['value'].sum()
    bucket['age'] = upper
    bucket_frames.append(bucket)

# Concatenate once instead of growing a frame inside the loop: this avoids the
# quadratic copying and the object-dtype columns an empty seed frame causes.
population_by_bucket = pd.concat(bucket_frames)[group_keys + ['age', 'value']]
population_by_bucket = population_by_bucket.astype({'year': int, 'age': int, 'value': int})
df = population_by_bucket

In [5]:
# Preview the bucketed table; `age` now holds the bucket's upper bound.
df.head()

Unnamed: 0,commune,sex,year,nationality,age,value
0,Aadorf,F,2010,A,18,122
1,Aadorf,F,2010,S,18,650
2,Aadorf,F,2011,A,18,120
3,Aadorf,F,2011,S,18,656
4,Aadorf,F,2012,A,18,108


In [6]:
# Municipality id table: expose the `name` column as `commune` (the name used
# by the population table) and order the rows alphabetically by commune.
ids = (
    pd.read_csv('data/data_to_be_processed/ids')
    .rename(columns={'name': 'commune'})
    .sort_values('commune', ascending=True)
)

No data is available for 2016. We therefore reuse the 2015 data and sum the figures of the municipalities that merged at the beginning of 2016.

In [7]:
def create_2016_data(data, mapping_path="data/fusion_mappings.csv", fusion_year="2016"):
    """Apply one year's municipality mergers to a population table.

    Every commune listed as `pre_name` in a merger dated `fusion_year` is
    renamed to the merger's `post_name`; rows that end up with the same
    (commune, sex, nationality, age) are then summed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns commune, sex, nationality and age. All
        remaining columns are summed per group.
    mapping_path : str
        CSV of mergers; the columns pre_name, post_name and date are used.
    fusion_year : str
        Mergers are kept when their `date` contains this text.

    Returns
    -------
    pandas.DataFrame
        One row per (commune, sex, nationality, age), sorted by those keys.
    """
    fusions = pd.read_csv(mapping_path)
    # na=False: a missing date simply means "not a merger of that year".
    in_year = fusions['date'].str.contains(str(fusion_year), na=False, regex=False)
    renames = fusions.loc[in_year, ['pre_name', 'post_name']]

    # Left merge from the population side: mergers whose pre_name has no rows
    # in `data` must not add empty rows (an outer merge would insert all-NaN
    # rows and turn the integer counts into floats).
    merged = data.merge(renames, how='left', left_on='commune', right_on='pre_name')
    merged['commune'] = merged['post_name'].fillna(merged['commune'])
    merged = merged.drop(columns=['pre_name', 'post_name'])

    return merged.groupby(['commune', 'sex', 'nationality', 'age'], as_index=False).sum()

In [8]:
def _summarise_communes(population):
    """Build the per-commune summary for one year.

    `population` needs the columns commune, sex, nationality, age and value,
    with `age` holding the bucket labels 18, 40, 65 and 100.

    Returns a frame with the columns commune, total_inhabitants,
    percentage_18, percentage_40, percentage_65, percentage_100,
    percentage_men and percentage_swiss. Despite their names, the
    `percentage_*` columns hold fractions of total_inhabitants.
    """
    # Sum `value` explicitly: a plain groupby(...).sum() would also try to
    # aggregate the string columns sex/nationality on recent pandas versions.
    summary = population.groupby('commune')['value'].sum().to_frame('total_inhabitants')

    subsets = [('percentage_%d' % bound, population['age'] == bound)
               for bound in [18, 40, 65, 100]]
    subsets.append(('percentage_men', population['sex'] == 'M'))
    subsets.append(('percentage_swiss', population['nationality'] == 'S'))

    for column, mask in subsets:
        subtotal = population[mask].groupby('commune')['value'].sum().rename(column)
        # Inner join: a commune with no rows at all in a subset is dropped
        # from the summary rather than reported with a share of 0.
        summary = summary.join(subtotal, how='inner')
        summary[column] = summary[column] / summary['total_inhabitants']

    return summary.reset_index()


for year in range(2013, 2017):
    if year == 2016:
        # No 2016 figures exist: reuse 2015 and apply the 2016 mergers.
        year_population = create_2016_data(df[df['year'] == 2015].drop('year', axis=1))
    else:
        year_population = df[df['year'] == year].drop('year', axis=1)

    # `total` is kept as a notebook-level name: after the loop it holds the
    # 2016 summary, which the following cells inspect.
    total = _summarise_communes(year_population)

    #merge dataframe with municipality ids
#     TODO we need the correct ID per year
#     total = pd.merge(ids, total, left_on='commune', right_on='commune')
#     total = total.sort_values('id')
#     total['id'] = total['id'].astype(int)
    #round the shares to 2 decimals
    total = total.round(2)
    total.to_csv('data/%d/data_commune.csv' % year, index=False)

In [9]:
# Reconcile the id table with the last summary written by the loop (2016).
# The outer join keeps communes that appear on only one side.
foo = pd.merge(ids, total, on='commune', how='outer')
# Rows with at least one missing value, e.g. a commune without an id.
# `axis` must be passed by keyword: pandas 2.0 removed the positional form.
inds = foo[foo.isnull().any(axis=1)]

In [10]:
# Show the merged rows that have at least one missing value.
inds

Unnamed: 0,id,commune,total_inhabitants,percentage_18,percentage_40,percentage_65,percentage_100,percentage_men,percentage_swiss
2278,,Clavaleyres,49,0.16,0.24,0.35,0.24,0.53,0.96
2279,,Deisswil bei Münchenbuchsee,85,0.12,0.29,0.4,0.19,0.48,1.0
2280,,Hellsau,201,0.22,0.25,0.38,0.15,0.49,0.95
2281,,Jaberg,252,0.16,0.25,0.46,0.13,0.47,0.97
2282,,La Baroche,1184,0.2,0.22,0.35,0.23,0.5,0.93
2283,,Meienried,52,0.06,0.27,0.5,0.17,0.56,0.96
2284,,Niedermuhlern,463,0.17,0.27,0.38,0.18,0.51,0.95
2285,,Noflen,304,0.21,0.34,0.31,0.15,0.53,0.95
2286,,Rüti bei Lyssach,171,0.19,0.3,0.36,0.15,0.51,0.95


In [11]:
# (rows, columns) of the summary left over from the last loop iteration (2016).
total.shape

(2287, 8)