Data source: PX-Web portal of the Swiss Federal Statistical Office (BFS), https://www.pxweb.bfs.admin.ch/default.aspx

In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show

%matplotlib inline
# Notebook-wide display / pandas configuration.
sns.set_context('notebook')
# NOTE(review): this switches off pandas' SettingWithCopyWarning for the whole
# notebook, so assignments into filtered slices will pass silently.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500) # show more columns
# NOTE(review): 'nan' is not referenced by any cell shown in this notebook.
nan = np.nan # store numpy.nan in 'nan'

In [2]:
# Load the raw area table: ';'-separated, with the column names taken from the
# second line of the file (header=1), so the first line is skipped.
raw_data = pd.read_csv('area_utf8_raw.csv', sep=';', header=1)
raw_data.shape

(2560, 2)

In [3]:
# rename the German source headers to short English column names:
#   'Fläche - Total 2004/09' (area, total 2004/09)                 -> 'area'
#   'Bezirk (>>) / Gemeinde (......)' (district / municipality)    -> 'commune_name'
# rename() ignores keys that are not present, so re-running this cell is a no-op.
d = {'Fläche - Total 2004/09':'area', 'Bezirk (>>) / Gemeinde (......)': 'commune_name'}
raw_data.rename(columns=d, inplace=True)

In [4]:
raw_data.head()

Unnamed: 0,commune_name,area
0,>> Affoltern,11308
1,......Aeugst am Albis,787
2,......Affoltern am Albis,1060
3,......Bonstetten,745
4,......Hausen am Albis,1368


In [5]:
# remove cantons and bezirk and remove '......' from names
def _remove_dots(row):
    return row['commune_name'].replace('......', '')
def _keep_only_communes(df):
    tmp = df[df['commune_name'].str.find('......') >= 0]
    tmp['commune_name'] = tmp.apply(_remove_dots, axis=1)
    return tmp


In [6]:
# Keep only the commune rows of the raw table, with the '......' marker
# stripped from their names.
data = _keep_only_communes(raw_data)
data.shape

(2412, 2)

In [7]:
# Reference list of communes (2015, going by the file name); its 'id' and
# 'commune' columns are used further down for the merge and integrity check.
ids_commune = pd.read_csv("../../municipalities/2015/id_commune_2015.csv")
ids_commune.shape

(2324, 2)

In [8]:
# make fusions
# Load the fusion mappings; the column at position 4 is parsed as dates.
# NOTE(review): the helpers below filter on f['date'] -- presumably that is the
# column at position 4; confirm against the CSV header.
fusions = pd.read_csv('../fusion_mappings.csv', parse_dates=[4])
def _get_fusions(f, from_date, to_date):
    """
    returns a dataframe containig only the rows where f['date'] 
    is between the two given dates (including from_date but EXCLUDING to_date)
    
    dates must be either parsable strings or numpy.datetime64
    Example: _get_fusions(fusions, '2014-01-01', '2016-01-01')
    """
    f1 = f[f['date'] >= np.datetime64(from_date)]
    f2 = f1[f1['date'] < np.datetime64(to_date)]
    return f2

def _apply_fusion_year(df, year):
    """
    Return a copy of `df` with the fusions dated within `year` applied.

    The fusions come from the module-level `fusions` table, restricted to
    [year-01-01, (year+1)-01-01). Every value in the frame equal to a fusion's
    'pre_name' is replaced by its 'post_name', one fusion after the other in
    table order. `df` itself is left unchanged.

    Raises AssertionError if a 'pre_name' occurs more than once in that year.
    """
    year_start = '{}-01-01'.format(year)
    next_year_start = '{}-01-01'.format(year + 1)
    year_fusions = _get_fusions(fusions, year_start, next_year_start)
    assert year_fusions['pre_name'].is_unique

    renamed = df.copy()
    name_pairs = zip(year_fusions['pre_name'], year_fusions['post_name'])
    for old_name, new_name in name_pairs:
        renamed.replace(to_replace=old_name, value=new_name, inplace=True)
    return renamed

In [9]:
# Apply the fusions cumulatively, one year at a time: each year starts from the
# previous year's result, and 2013 starts from the un-fused `data`.
res = {}
for y in range(2013, 2016):
    res[y] = _apply_fusion_year(res.get(y - 1, data), y)
fusioned_data = res[2015]
fusioned_data.shape

(2412, 2)

In [10]:
# Rows that share a commune name after the fusions are collapsed into a single
# row; the remaining columns are summed per name.
grouped = fusioned_data.groupby(by='commune_name', as_index=False).sum()

In [11]:
# Attach the commune ids by exact name match. merge() defaults to an inner
# join, so names present in only one of the two tables are dropped here; the
# integrity check further down lists the reference ids lost that way.
merged = grouped.merge(ids_commune, left_on='commune_name', right_on='commune')
merged.shape

(2322, 4)

In [12]:
merged.head()

Unnamed: 0,commune_name,area,id,commune
0,Aadorf,1990,4551,Aadorf
1,Aarau,1233,4001,Aarau
2,Aarberg,792,301,Aarberg
3,Aarburg,442,4271,Aarburg
4,Aarwangen,987,321,Aarwangen


In [13]:
# check integrity: every id of the reference list should appear in `merged`,
# and `merged` should not contain ids unknown to the reference list.
correct_ids = np.array(ids_commune['id'].values)

# Vectorised membership tests (Series.isin) instead of a Python loop doing a
# linear `in` scan over an array for every id. Order and duplicates follow the
# source column; tolist() yields plain Python ints, so the lists print as bare
# numbers regardless of the numpy version.
missing_in_merged = ids_commune.loc[~ids_commune['id'].isin(merged['id']), 'id'].tolist()
missing_in_id = merged.loc[~merged['id'].isin(correct_ids), 'id'].tolist()

print("correct_ids_comune", ids_commune.shape)
print("fusioned_data", merged.shape)
print()

print("len(missing_in_merged)", len(missing_in_merged))
print("missing_in_merged", missing_in_merged)

print("len(missing_in_id)", len(missing_in_id))
print("missing_in_id", missing_in_id)

correct_ids_comune (2324, 2)
fusioned_data (2322, 4)

len(missing_in_merged) 2
missing_in_merged [5624, 5702]
len(missing_in_id) 0
missing_in_id []


In [14]:
ids_commune[ids_commune['id'].isin([5624, 5702])]

Unnamed: 0,id,commune
1862,5624,Bussigny
1912,5702,Arzier-Le Muids


In [15]:
# 'commune' holds the same names as 'commune_name' (the inner merge matched
# them on equality), so it is dropped.
# NOTE(review): in-place and not re-runnable -- a second run raises KeyError
# because 'commune' is already gone.
merged.drop('commune', axis=1, inplace=True)
merged.head()

Unnamed: 0,commune_name,area,id
0,Aadorf,1990,4551
1,Aarau,1233,4001
2,Aarberg,792,301
3,Aarburg,442,4271
4,Aarwangen,987,321


In [16]:
# add the missing communes by hand
# Ids 5624 and 5702 are the two reference ids the integrity check reported as
# absent from `merged`. Values follow the current column order of `merged`
# (commune_name, area, id).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; both rows are added in one concatenation.
# NOTE(review): re-running this cell adds the two rows again.
missing_communes = pd.DataFrame(
    [
        ['Bussigny', 482, 5624],
        ['Arzier-Le Muids', 5191, 5702],
    ],
    columns=merged.columns,
)
merged = pd.concat([merged, missing_communes], ignore_index=True)
merged[merged['id'].isin([5624, 5702])]

Unnamed: 0,commune_name,area,id
2322,Bussigny,482,5624
2323,Arzier-Le Muids,5191,5702


In [17]:
# Only 'area' and 'id' remain for the export; the name column is dropped.
# NOTE(review): in-place and not re-runnable -- a second run raises KeyError.
merged.drop('commune_name', axis=1, inplace=True)

In [19]:
# Export the table once per target year (a loop instead of four copy-pasted
# calls); the files written are the same as before.
# NOTE(review): every year receives identical data -- the 2015 fusion state
# matched against the 2015 id list. Confirm this is intended for 2013, 2014
# and 2016.
for year in (2013, 2014, 2015, 2016):
    merged.to_csv('../cleaned_data/{}/cleaned_area.csv'.format(year), index=False)