In [2]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import json
import copy
import re
import numpy as np
import scipy.optimize as opt

# Apply seaborn's "darkgrid" style to all plots drawn in this notebook.
sns.set(style="darkgrid")

# The lines below are the stock seaborn "fmri" line-plot example. They are
# kept commented out and are not used by the rest of this notebook.

# Load an example dataset with long-form data
#fmri = sns.load_dataset("fmri")

# Plot the responses for different events and regions
#plt = sns.lineplot(x="timepoint", y="signal",
#             hue="region", style="event",
#             data=fmri)


#plt.show()

## Load geolocalized points for communes

In [3]:
geo_communes_columns = [
    "place", "zipcode", "zusatzziffer", "commune", "bfsnr",
    "canton", "X", "Y", "language",
]

# header=0 makes pandas replace the file's own header row with the names above.
# The file used to be Windows-1252 (pandas encoding="cp1252"); its current
# encoding is unconfirmed, so "utf-8" here is an assumption.
geo_communes = pd.read_csv(
    "PLZO_CSV_WGS84.csv",
    sep=";",
    encoding="utf-8",
    names=geo_communes_columns,
    header=0,
)
print(geo_communes.shape)

# Optional filter, currently disabled: keep only rows whose place name equals
# the commune name.
#geo_communes = geo_communes[geo_communes.place==geo_communes.commune]
#print(geo_communes.shape)

# Thurgau (TG) is left out for now.
geo_communes = geo_communes.loc[lambda frame: frame["canton"] != "TG"]
print(geo_communes.shape)

geo_communes.head()

(4138, 9)
(3937, 9)


Unnamed: 0,place,zipcode,zusatzziffer,commune,bfsnr,canton,X,Y,language
0,Aeugst am Albis,8914,0,Aeugst am Albis,1,ZH,8.488,47.267,de
1,Aeugstertal,8914,2,Aeugst am Albis,1,ZH,8.494,47.283,de
2,Zwillikon,8909,0,Affoltern am Albis,2,ZH,8.431,47.288,de
3,Affoltern am Albis,8910,0,Affoltern am Albis,2,ZH,8.449,47.279,de
4,Bonstetten,8906,0,Bonstetten,3,ZH,8.468,47.316,de


## Load cleaned communes population data

In [9]:
columns_communes = [
    "name", "canton", "url", "firstmention",
    "hab_year", "raw_hab_year", "notes",
]

# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used -- confirm that it matches how the JSON file was written.
with open('../2_pop_extrapolation/communes_units_converted.json', 'r') as cf:
    communes = json.load(cf)

# One row per commune record, restricted to the columns listed above.
dfcommunes = pd.DataFrame(communes).loc[:, columns_communes]
print(dfcommunes.shape)

# Thurgau (TG) is left out for now.
dfcommunes = dfcommunes.loc[lambda frame: frame["canton"] != "TG"]
print(dfcommunes.shape)

(2242, 7)
(2162, 7)


In [10]:
# Put the pandas row-display limit back to its default, then export the
# per-commune table as a semicolon-separated CSV.
pd.reset_option('display.max_rows')
dfcommunes.to_csv(path_or_buf="communes.csv", sep=";")

## Reshape DataFrame to 1 line per datapoint

In [11]:
columns_communes_datapoints = ["year","pop","unit","name","canton","url","firstmention","hab_year","notes"]
communes_datapoints = []

# Emit one record per (commune, hab_year entry): the commune's own fields plus
# the entry's year / pop / unit.
for commune in communes:
    for hy in commune["hab_year"]:
        # A shallow copy is sufficient: only top-level keys are set below and
        # the nested lists are never modified here, so deep-copying them for
        # every single datapoint was wasted work.
        hy_dict = dict(commune)
        hy_dict["year"] = hy["year"]
        hy_dict["pop"] = hy["pop"]
        # Entries without a unit are labelled "undefined".
        hy_dict["unit"] = hy.get("unit", "undefined")
        communes_datapoints.append(hy_dict)

# Fix the column order, then drop the nested hab_year list, which is now
# redundant with the flattened year / pop / unit columns.
dfcommunes_datapoints = (
    pd.DataFrame(communes_datapoints)[columns_communes_datapoints]
    .drop(columns=["hab_year"])
)
print(dfcommunes_datapoints.shape)

# Drop thurgau for now. The explicit .copy() yields an independent frame, so
# adding columns to it later does not trigger SettingWithCopyWarning.
dfcommunes_datapoints = dfcommunes_datapoints[dfcommunes_datapoints.canton != "TG"].copy()
print(dfcommunes_datapoints.shape)

dfcommunes_datapoints

(13417, 8)
(13135, 8)


Unnamed: 0,year,pop,unit,name,canton,url,firstmention,notes
74,1416,600,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
75,1764,1751,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
76,1803,2001,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
77,1850,2054,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
78,1900,3025,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
79,1930,3840,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
80,1950,3381,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
81,1980,2872,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
82,1990,3110,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
83,2000,2949,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."


In [12]:
# Export the one-row-per-datapoint table as a semicolon-separated CSV.
dfcommunes_datapoints.to_csv(path_or_buf="communes_datapoints.csv", sep=";")

## TODO: merge communes and geo_communes using Hungarian algorithm
https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.optimize.linear_sum_assignment.html

##### Create the fuzzy merge_keys:

In [13]:
try:
    from jellyfish import jaro_distance
except ImportError:
    # Fallback for jellyfish releases that only provide this function under
    # the name jaro_similarity.
    from jellyfish import jaro_similarity as jaro_distance

# Unique list of cantons to match; "MA" is excluded from the matching.
cantons = [c for c in dfcommunes.canton.unique() if c != "MA"]
test_canton = "ZH"

communes_per_canton = {}
geo_communes_per_canton = {}
distance_matrix_per_canton = {}
merge_keys_per_canton = {}
for canton in cantons:
    commune_names = dfcommunes.name[dfcommunes.canton == canton].unique()
    geo_names = geo_communes.commune[geo_communes.canton == canton].unique()
    communes_per_canton[canton] = commune_names
    geo_communes_per_canton[canton] = geo_names

    # Cost matrix: entry [i, j] is 1 - Jaro similarity between commune name i
    # and geo commune name j, so identical names cost 0.
    # Built with plain loops instead of np.fromfunction + np.vectorize: that
    # route needed np.int (removed from NumPy) to undo float indices and
    # failed on empty inputs. The reshape keeps the matrix 2-D even when one
    # of the name lists is empty.
    distance_matrix = np.array(
        [
            [1 - jaro_distance(commune_name, geo_name) for geo_name in geo_names]
            for commune_name in commune_names
        ],
        dtype=float,
    ).reshape(len(commune_names), len(geo_names))
    distance_matrix_per_canton[canton] = distance_matrix

    # Minimum-total-cost one-to-one assignment between the two name lists.
    # NOTE(review): every name on the shorter side gets a partner however
    # dissimilar it is -- no similarity threshold is applied.
    row_ind, col_ind = opt.linear_sum_assignment(distance_matrix)
    merge_keys_per_canton[canton] = {
        commune_names[row]: geo_names[col]
        for row, col in zip(row_ind, col_ind)
    }

print(communes_per_canton[test_canton])
print(geo_communes_per_canton[test_canton])
merge_keys_per_canton[test_canton]

['Adliswil' 'Aeugst am Albis' 'Affoltern am Albis' 'Altikon' 'Andelfingen'
 'Bachenbülach' 'Bachs' 'Bäretswil' 'Bassersdorf' 'Bauma' 'Benken (ZH)'
 'Berg am Irchel' 'Birmensdorf (ZH,' 'Bonstetten' 'Boppelsen' 'Brütten'
 'Bubikon' 'Buch am Irchel' 'Buchs (ZH)' 'Bülach' 'Dachsen' 'Dägerlen'
 'Dällikon' 'Dänikon' 'Dättlikon' 'Dielsdorf' 'Dietikon' 'Dietlikon'
 'Dinhard' 'Dorf' 'Dübendorf' 'Dürnten' 'Egg (ZH)' 'Eglisau' 'Elgg'
 'Ellikon an der Thur' 'Elsau' 'Embrach' 'Erlenbach (ZH)' 'Fällanden'
 'Fehraltorf' 'Feuerthalen' 'Fischenthal' 'Flaach' 'Flurlingen'
 'Freienstein-Teufen' 'Geroldswil' 'Glattfelden' 'Gossau (ZH)'
 'Greifensee' 'Aesch bei Birmensdorf' 'Adlikon' 'Grüningen' 'Hagenbuch'
 'Hausen am Albis' 'Hedingen' 'Henggart' 'Herrliberg' 'Hettlingen'
 'Hinwil' 'Hirzel' 'Hittnau' 'Hochfelden' 'Hofstetten (ZH)'
 'Hombrechtikon' 'Horgen' 'Höri' 'Humlikon' 'Hüntwangen' 'Hütten'
 'Hüttikon' 'Illnau-Effretikon' 'Kappel am Albis' 'Kilchberg (ZH)'
 'Kleinandelfingen' 'Kloten' 'Knonau' 'Küsna

{'Adliswil': 'Adliswil',
 'Aeugst am Albis': 'Aeugst am Albis',
 'Affoltern am Albis': 'Affoltern am Albis',
 'Altikon': 'Altikon',
 'Andelfingen': 'Andelfingen',
 'Bachenbülach': 'Bachenbülach',
 'Bachs': 'Bachs',
 'Bäretswil': 'Bäretswil',
 'Bassersdorf': 'Bassersdorf',
 'Bauma': 'Bauma',
 'Benken (ZH)': 'Benken (ZH)',
 'Berg am Irchel': 'Berg am Irchel',
 'Birmensdorf (ZH,': 'Birmensdorf (ZH)',
 'Bonstetten': 'Bonstetten',
 'Boppelsen': 'Boppelsen',
 'Brütten': 'Brütten',
 'Bubikon': 'Bubikon',
 'Buch am Irchel': 'Buch am Irchel',
 'Buchs (ZH)': 'Buchs (ZH)',
 'Bülach': 'Bülach',
 'Dachsen': 'Dachsen',
 'Dägerlen': 'Dägerlen',
 'Dällikon': 'Dällikon',
 'Dänikon': 'Dänikon',
 'Dättlikon': 'Dättlikon',
 'Dielsdorf': 'Dielsdorf',
 'Dietikon': 'Dietikon',
 'Dietlikon': 'Dietlikon',
 'Dinhard': 'Dinhard',
 'Dorf': 'Dorf',
 'Dübendorf': 'Dübendorf',
 'Dürnten': 'Dürnten',
 'Egg (ZH)': 'Egg',
 'Eglisau': 'Eglisau',
 'Elgg': 'Elgg',
 'Ellikon an der Thur': 'Ellikon an der Thur',
 'Elsau': '

#### Do the fuzzy merge:
...on dfcommunes:

In [14]:
# Look up, for every commune, the geo commune name it was matched to.
# "MA" rows, cantons without a key table and names without a match all get "".
# Built in one pass and attached with assign(), which returns a new frame:
# no per-row .loc writes and no mutation of a filtered slice.
dfcommunes = dfcommunes.assign(
    place=[
        "" if canton == "MA"
        else merge_keys_per_canton.get(canton, {}).get(name, "")
        for canton, name in zip(dfcommunes["canton"], dfcommunes["name"])
    ]
)

# NOTE(review): the key holds geo *commune* names but is joined against the
# geo `place` column, and the join is not restricted by canton. A left row is
# repeated once per geo row sharing that place name (e.g. one per zipcode),
# so the result has more rows than dfcommunes.
result_communes = pd.merge(dfcommunes, geo_communes, on='place', how='left')
result_communes.to_csv("communes_geo.csv", sep=";")

result_communes

Unnamed: 0,name,canton_x,url,firstmention,hab_year,raw_hab_year,notes,place,zipcode,zusatzziffer,commune,bfsnr,canton_y,X,Y,language
0,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"[{'year': 1416, 'pop': 600, 'unit': 'Einw', 'o...","[{'year': 1416, 'pop': 120, 'unit': 'Haushalte...","hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
1,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1003.0,0.0,Lausanne,5586.0,VD,6.630,46.520,fr
2,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1004.0,0.0,Lausanne,5586.0,VD,6.619,46.528,fr
3,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1005.0,0.0,Lausanne,5586.0,VD,6.643,46.520,fr
4,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1006.0,0.0,Lausanne,5586.0,VD,6.637,46.511,fr
5,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1007.0,0.0,Lausanne,5586.0,VD,6.609,46.518,fr
6,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1010.0,0.0,Lausanne,5586.0,VD,6.659,46.536,fr
7,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1011.0,0.0,Lausanne,5586.0,VD,6.643,46.526,fr
8,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1012.0,0.0,Lausanne,5586.0,VD,6.657,46.526,fr
9,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","[{'year': 1219, 'pop': 8500, 'unit': 'Einw'}, ...","hab+foyer data!, data in html tabelle\nweird p...",Lausanne,1015.0,0.0,Lausanne,5586.0,VD,6.574,46.521,fr


...on dfcommunes_datapoints:

In [15]:
# Look up, for every datapoint, the geo commune name its commune was matched to.
# "MA" rows, cantons without a key table and names without a match all get "".
# Built in one pass and attached with assign(), which returns a new frame:
# no per-row .loc writes and no mutation of a filtered slice.
dfcommunes_datapoints = dfcommunes_datapoints.assign(
    place=[
        "" if canton == "MA"
        else merge_keys_per_canton.get(canton, {}).get(name, "")
        for canton, name in zip(
            dfcommunes_datapoints["canton"], dfcommunes_datapoints["name"]
        )
    ]
)

# NOTE(review): the key holds geo *commune* names but is joined against the
# geo `place` column, and the join is not restricted by canton. A left row is
# repeated once per geo row sharing that place name (e.g. one per zipcode).
result_communes_datapoints = pd.merge(dfcommunes_datapoints, geo_communes, on='place', how='left')
result_communes_datapoints.to_csv("communes_datapoints_geo.csv", sep=";")

result_communes_datapoints

Unnamed: 0,year,pop,unit,name,canton_x,url,firstmention,notes,place,zipcode,zusatzziffer,commune,bfsnr,canton_y,X,Y,language
0,1416,600,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
1,1764,1751,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
2,1803,2001,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
3,1850,2054,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
4,1900,3025,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
5,1930,3840,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
6,1950,3381,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
7,1980,2872,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
8,1990,3110,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr
9,2000,2949,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit...",Château-d'Oex,1660.0,0.0,Château-d'Oex,5841.0,VD,7.129,46.473,fr


In [16]:
# Print the merged table's shape, then the shape of the table it was built from.
for frame in (result_communes, dfcommunes):
    print(frame.shape)

(2271, 16)
(2162, 8)
