In [573]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import json

%matplotlib inline
# Use seaborn's 'notebook' plotting context.
sns.set_context('notebook')
# Turn off pandas' chained-assignment check for the whole notebook; assignments
# into filtered slices further down will therefore not emit SettingWithCopyWarning.
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500) # display up to 500 columns of a frame
nan = np.nan # store numpy.nan in 'nan'

In [574]:
# Load the raw language table: semicolon-separated, with the column names taken
# from the file's second line (header=1); the line before it is skipped.
raw_data = pd.read_csv('languages_utf8_raw.csv', sep=';', header=1)

In [575]:
# Mapping from the source column headers to the short names used in this notebook.
d = {
    'Canton (-) / District (>>) / Commune (......)': 'commune_name',
    'Allemand Suisses': 'German',
    'Français Suisses': 'French',
    'Italien Suisses': 'Italian',
    'Romanche Suisses': 'Romansh',
}
# Discard the year column, then apply the renaming (both in place on raw_data).
raw_data.drop(columns=['Année'], inplace=True)
raw_data.rename(columns=d, inplace=True)

In [576]:
# Preview the first rows after dropping the year column and renaming.
raw_data.head()

Unnamed: 0,commune_name,German,French,Italian,Romansh
0,Suisse,4201237,1216304,248980,33868
1,- Kanton Zürich,915926,12971,10953,2419
2,>> Bezirk Affoltern,34433,299,190,67
3,......0001 Aeugst am Albis,1412,12,5,1
4,......0002 Affoltern am Albis,7215,58,41,13


In [577]:
# remove cantons and bezirk and remove '......' from names
def _remove_dots(row):
    return row['commune_name'].replace('......', '')
def _keep_only_communes(df):
    tmp = df[df['commune_name'].str.find('......') >= 0]
    tmp['commune_name'] = tmp.apply(_remove_dots, axis=1)
    return tmp

In [578]:
# NOTE: raw_data is overwritten with the commune-only frame. The '......' markers
# are stripped by this step, so running the cell a second time matches no rows.
raw_data = _keep_only_communes(raw_data)

In [579]:
# Preview the filtered frame: commune rows only, names without the '......' marker.
raw_data.head()

Unnamed: 0,commune_name,German,French,Italian,Romansh
3,0001 Aeugst am Albis,1412,12,5,1
4,0002 Affoltern am Albis,7215,58,41,13
5,0003 Bonstetten,3282,29,27,11
6,0004 Hausen am Albis,2706,16,16,1
7,0005 Hedingen,2420,21,17,9


In [580]:
# parse commune_id
def _parse_commune_id(row):
    return int(row['commune_name'].split()[0])

def _remove_id_from_name(row):
    return ' '.join(row['commune_name'].split()[1:])

In [581]:
# Work on an explicit copy. A plain `data = raw_data` would only alias the frame:
# the column assignments below would then rewrite raw_data as well, and re-running
# this cell would fail because commune_name would no longer start with the id.
data = raw_data.copy()
data['id'] = data.apply(_parse_commune_id, axis=1)
data['commune_name'] = data.apply(_remove_id_from_name, axis=1)
data.head()

Unnamed: 0,commune_name,German,French,Italian,Romansh,id
3,Aeugst am Albis,1412,12,5,1,1
4,Affoltern am Albis,7215,58,41,13,2
5,Bonstetten,3282,29,27,11,3
6,Hausen am Albis,2706,16,16,1,4
7,Hedingen,2420,21,17,9,5


In [582]:
# Row and column count of the commune table after id parsing.
data.shape

(2896, 6)

In [583]:
# Load the fusion mappings (pre_id -> post_id); the column at position 4 is parsed as dates.
fusions = pd.read_csv('../fusion_mappings.csv', parse_dates=[4])
# Show column dtypes and table size, then preview the first rows.
print(fusions.dtypes)
print(fusions.shape)
fusions.head()

pre_id                int64
pre_name             object
post_id               int64
post_name            object
date         datetime64[ns]
dtype: object
(804, 5)


Unnamed: 0,pre_id,pre_name,post_id,post_name,date
0,2028,Montagny-la-Ville,2029,Montagny (FR),2000-01-01
1,2029,Montagny-les-Monts,2029,Montagny (FR),2000-01-01
2,2249,Corsalettes,2200,Grolley,2000-01-01
3,2267,Kleingurmels,2262,Gurmels,2000-01-01
4,4542,Willisdorf,4545,Diessenhofen,2000-01-01


In [584]:
# helper to select the fusions that fall within a date range (used below for the years 2000 through 2015)
def _get_fusions(f, from_date, to_date):
    """
    returns a dataframe containig only the rows where f['date'] 
    is between the two given dates (including from_date but EXCLUDING to_date)
    
    dates must be either parsable strings or numpy.datetime64
    Example: _get_fusions(fusions, '2014-01-01', '2016-01-01')
    """
    f1 = f[f['date'] >= np.datetime64(from_date)]
    f2 = f1[f1['date'] < np.datetime64(to_date)]
    return f2

In [585]:
def _apply_fusion_year(df, year):
    """
    Return a copy of df in which the commune ids affected by the fusions of
    the given calendar year are replaced by their post-fusion ids.

    Only the 'id' column is rewritten. Replacing across the whole frame would
    also overwrite any speaker count that happens to equal a pre-fusion id.

    The fusions are read from the notebook-level `fusions` table and applied
    one after the other in table order. df itself is not modified.

    Raises AssertionError if a pre_id occurs more than once within the year.
    """
    fs = _get_fusions(fusions, '{}-01-01'.format(year), '{}-01-01'.format(year + 1))
    assert fs['pre_id'].is_unique
    df_cpy = df.copy()
    ids = df_cpy['id']
    for pre_id, post_id in zip(fs['pre_id'], fs['post_id']):
        ids = ids.replace(to_replace=pre_id, value=post_id)
    df_cpy['id'] = ids
    return df_cpy
        

In [586]:
# Apply the fusions one year at a time (2000 through 2015). Each year starts from
# the previous year's result; the first year starts from `data`.
res = {}
for y in range(2000, 2016):
    res[y] = _apply_fusion_year(res.get(y - 1, data), y)
fusioned_data = res[2015]
fusioned_data.head()

Unnamed: 0,commune_name,German,French,Italian,Romansh,id
3,Aeugst am Albis,1412,12,5,1,1
4,Affoltern am Albis,7215,58,41,13,2
5,Bonstetten,3282,29,27,11,3
6,Hausen am Albis,2706,16,16,1,4
7,Hedingen,2420,21,17,9,5


In [587]:
# Add up the speaker counts of all rows that share an id after the fusions.
# The language columns are selected explicitly so that the text column
# commune_name is never summed (recent pandas versions concatenate the strings
# instead of silently dropping the column).
grouped_fusioned_data = (
    fusioned_data
    .groupby(by=['id'], as_index=False)[['German', 'French', 'Italian', 'Romansh']]
    .sum()
)
print('nbr comunes in fusioned', len(grouped_fusioned_data))
grouped_fusioned_data.head()

nbr comunes in fusioned 2324


Unnamed: 0,id,German,French,Italian,Romansh
0,1,1412,12,5,1
1,2,7215,58,41,13
2,3,3282,29,27,11
3,4,2706,16,16,1
4,5,2420,21,17,9


In [588]:
# every id must occur exactly once after the aggregation
assert not grouped_fusioned_data.duplicated(subset='id').any()

In [589]:
# load the commune id list from the 2015 municipalities folder and keep the
# ids as a standalone NumPy array
correct_ids_comune = pd.read_csv('../../municipalities/2015/id_commune_2015.csv')
correct_ids = correct_ids_comune['id'].to_numpy(copy=True)
correct_ids_comune.head()

Unnamed: 0,id,commune
0,1,Aeugst am Albis
1,2,Affoltern am Albis
2,3,Bonstetten
3,4,Hausen am Albis
4,5,Hedingen


In [590]:
# check integrity: collect the ids that appear in one table but not in the other.
# np.isin builds each membership mask in a single vectorised pass instead of
# rescanning a whole array for every id.
fused_ids = grouped_fusioned_data['id'].values
# ids in the reference list that are absent from the aggregated data
missing_in_fusioned = list(correct_ids[~np.isin(correct_ids, fused_ids)])
# ids in the aggregated data that are absent from the reference list
missing_in_id = list(fused_ids[~np.isin(fused_ids, correct_ids)])


In [591]:
# Compare the sizes of the reference id table and the aggregated data.
print("correct_ids_comune", correct_ids_comune.shape)
print("grouped_fusioned_data", grouped_fusioned_data.shape)


correct_ids_comune (2324, 2)
grouped_fusioned_data (2324, 5)


In [592]:
# Reference ids that have no row in the aggregated data.
print("len(missing_in_fusioned)", len(missing_in_fusioned))
print("missing_in_fusioned", missing_in_fusioned)

len(missing_in_fusioned) 0
missing_in_fusioned []


In [593]:
# Aggregated ids that are absent from the reference list.
# Note: the printed label says "missing_in_id_2015" but the variable is missing_in_id.
print("len(missing_in_id_2015)", len(missing_in_id))
print("missing_in_id_2015", missing_in_id)

len(missing_in_id_2015) 0
missing_in_id_2015 []


In [594]:
# Spot check: look up the reference row of the commune named 'Scuol'.
correct_ids_comune[correct_ids_comune['commune'] == 'Scuol']

Unnamed: 0,id,commune
1243,3762,Scuol


In [595]:
# Show the aggregated speaker counts once more before they are normalised.
grouped_fusioned_data.head()

Unnamed: 0,id,German,French,Italian,Romansh
0,1,1412,12,5,1
1,2,7215,58,41,13
2,3,3282,29,27,11
3,4,2706,16,16,1
4,5,2420,21,17,9


Compute the percentage share of each language per commune

In [596]:
# Convert the absolute speaker counts into percentage shares per commune.
# The language columns are named explicitly rather than taken by position
# (columns[1:]), so the numerator and denominator always use the same columns
# and any extra column in the frame is left untouched.
language_cols = ['German', 'French', 'Italian', 'Romansh']
denom = grouped_fusioned_data[language_cols].sum(axis=1)
# Row-wise division: each commune's counts are divided by that commune's total.
grouped_fusioned_data[language_cols] = grouped_fusioned_data[language_cols].div(denom, axis=0) * 100
grouped_fusioned_data = grouped_fusioned_data.round(4)
grouped_fusioned_data.head()

Unnamed: 0,id,German,French,Italian,Romansh
0,1,98.7413,0.8392,0.3497,0.0699
1,2,98.4714,0.7916,0.5596,0.1774
2,3,97.9994,0.8659,0.8062,0.3285
3,4,98.7952,0.5842,0.5842,0.0365
4,5,98.0949,0.8512,0.6891,0.3648


In [597]:
# Label every commune with the language that has the largest share.
# idxmax(axis=1) returns the column *name* of the row maximum; np.argmax on a
# row returns an integer position in current pandas/NumPy, not the label.
grouped_fusioned_data['Main Language'] = (
    grouped_fusioned_data[['German', 'French', 'Italian', 'Romansh']].idxmax(axis=1)
)

In [598]:
# Preview the shares together with the new 'Main Language' column.
grouped_fusioned_data.head()

Unnamed: 0,id,German,French,Italian,Romansh,Main Language
0,1,98.7413,0.8392,0.3497,0.0699,German
1,2,98.4714,0.7916,0.5596,0.1774,German
2,3,97.9994,0.8659,0.8062,0.3285,German
3,4,98.7952,0.5842,0.5842,0.0365,German
4,5,98.0949,0.8512,0.6891,0.3648,German


In [599]:
# Make the commune id the index (in place).
# NOTE: not re-runnable as-is -- after the first run 'id' is no longer a column.
grouped_fusioned_data.set_index('id', inplace=True)
grouped_fusioned_data.head()

Unnamed: 0_level_0,German,French,Italian,Romansh,Main Language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,98.7413,0.8392,0.3497,0.0699,German
2,98.4714,0.7916,0.5596,0.1774,German
3,97.9994,0.8659,0.8062,0.3285,German
4,98.7952,0.5842,0.5842,0.0365,German
5,98.0949,0.8512,0.6891,0.3648,German


In [600]:
# Write the same cleaned table into each yearly municipalities folder.
# `index` is a boolean flag, so pass True rather than a column name; the index
# is named 'id', and that name is written as the header of the index column.
for year in (2013, 2014, 2015, 2016):
    grouped_fusioned_data.to_csv(
        "../../municipalities/{}/cleaned_language.csv".format(year), index=True
    )