In [26]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import json
import copy
import re
import numpy as np

sns.set(style="darkgrid")

# Load an example dataset with long-form data
#fmri = sns.load_dataset("fmri")

# Plot the responses for different events and regions
#plt = sns.lineplot(x="timepoint", y="signal",
#             hue="region", style="event",
#             data=fmri)


#plt.show()

## Load geolocalized points for communes

In [78]:
points_communes_columns = ["place","zipcode","zusatzziffer","commune","bfsnr","canton","east","north","language"]
points_communes = pd.read_csv("PLZO_CSV_WGS84.csv",sep=";",encoding="cp1252", names=points_communes_columns, header=0)
print(points_communes.shape)

# Only keep gemeinde:
#points_communes = points_communes[points_communes.place==points_communes.commune]
#print(points_communes.shape)

# Drop thurgau for now
points_communes = points_communes[points_communes.canton!="TG"]
print(points_communes.shape)

points_communes.head()

(4138, 9)
(3937, 9)


Unnamed: 0,place,zipcode,zusatzziffer,commune,bfsnr,canton,east,north,language
0,Aeugst am Albis,8914,0,Aeugst am Albis,1,ZH,8.488,47.267,de
1,Aeugstertal,8914,2,Aeugst am Albis,1,ZH,8.494,47.283,de
2,Zwillikon,8909,0,Affoltern am Albis,2,ZH,8.431,47.288,de
3,Affoltern am Albis,8910,0,Affoltern am Albis,2,ZH,8.449,47.279,de
4,Bonstetten,8906,0,Bonstetten,3,ZH,8.468,47.316,de


## Load cleaned communes population data

In [5]:

with open('../1_pop_cleaning/communes_V2_checkpoint_1553875811028.json', 'r') as cf:
    data = json.load(cf)


In [6]:
#cdata = pd.DataFrame(communes)
#cdata

reviewedCommunes = data["reviewedCommunes"]
communesToReview = data["communesToReview"]

# remember that some are reviewed and some not
for commune in reviewedCommunes:
    commune["hand_reviewed"] = True
for commune in communesToReview:
    commune["hand_reviewed"] = False

print("communes[0].keys()")
print(reviewedCommunes[0].keys())
print("communesToReview[0].keys()")
print(communesToReview[0].keys())


communes[0].keys()
communesToReview[0].keys()


In [24]:
columns_communes = ["name","canton","url","firstmention","hab_year","notes"]
communes_name_regex = re.compile(r"\W\(?Gemeinde\)?")

communes = reviewedCommunes+communesToReview
communes = [
    {
        "name": communes_name_regex.sub("",commune["name"]),
        "canton": commune["canton"],
        "url": commune["url"],
        "firstmention": commune["firstmention"] if "firstmention" in commune else -1,
        "hab_year": [
            {
                "year": hy["year"],
                "pop": hy["pop"],
                "unit": hy["unit"] if "unit" in hy else "undefined"
            } for hy in commune["hab_year"]
        ],
        "notes": commune["notes"] if "notes" in commune else ""
    }
    for commune in communes
]
dfcommuneKp thurgau for now
dfcommunes = dfcommunes[dfcommunes.canton!="TG"]
print(dfcommunes.shape)

pd.set_option('display.max_rows', None) 
dfcommunes

(2242, 6)
(2162, 6)


Unnamed: 0,name,canton,url,firstmention,hab_year,notes
27,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"[{'year': '1416', 'pop': '120', 'unit': 'Haush...","hab+foyer data!, unclear population count (wit..."
28,Lausanne,VD,/Articles/002408/?language=de,280,"[{'year': '1219', 'pop': '8500', 'unit': 'Einw...","hab+foyer data!, data in html tabelle\nweird p..."
29,Lutry,VD,/Articles/002420/?language=de,908,"[{'year': '1850', 'pop': '2011', 'unit': 'Einw...","hab+foyer data!, with or without Savigny,"
30,Ballaigues,VD,/Articles/002528/?language=de,1228,"[{'year': '1416', 'pop': '10', 'unit': 'Hausha...","hab+foyer data!,"
31,Belmont-sur-Yverdon,VD,/Articles/002623/?language=de,1154,"[{'year': '1409', 'pop': '31', 'unit': 'Hausha...","hab+foyer data!,"
32,Borex,VD,/Articles/002498/?language=de,1135,"[{'year': '1429', 'pop': '14', 'unit': 'Feuers...","hab+foyer data!,"
33,Ependes (VD),VD,/Articles/002635/?language=de,1154,"[{'year': '1404', 'pop': '17', 'unit': 'Hausha...","hab+foyer data!,"
34,Lignerolle,VD,/Articles/002538/?language=de,1160,"[{'year': '1416', 'pop': '60', 'unit': 'Feuers...","hab+foyer data!,"
35,Premier,VD,/Articles/002542/?language=de,1316,"[{'year': '1396', 'pop': '15', 'unit': 'Feuers...","hab+foyer data!,"
36,Pully,VD,/Articles/002412/?language=de,994,"[{'year': '1764', 'pop': '626', 'unit': 'Einw'...","hab+foyer data!,"


In [23]:
pd.reset_option('display.max_rows')
dfcommunes.to_csv("communes.csv", sep=";")

## Reshape DataFrame to 1 line per datapoint

In [77]:
columns_communes_datapoints = ["year","pop","unit","name","canton","url","firstmention","hab_year","notes"]
communes_datapoints = []

for commune in communes:
    for hy in commune["hab_year"]:
        hy_dict = copy.deepcopy(commune)
        hy_dict["year"] =  hy["year"]
        hy_dict["pop"] =  hy["pop"]
        hy_dict["unit"] = hy["unit"] if "unit" in hy else "undefined"
        communes_datapoints.append(hy_dict)

dfcommunes_datapoints = pd.DataFrame(communes_datapoints)[columns_communes_datapoints]
dfcommunes_datapoints = dfcommunes_datapoints.drop(columns=["hab_year"])
print(dfcommunes_datapoints.shape)

# Drop thurgau for now
dfcommunes_datapoints = dfcommunes_datapoints[dfcommunes_datapoints.canton!="TG"]
print(dfcommunes_datapoints.shape)

dfcommunes_datapoints

(13648, 8)
(13157, 8)


Unnamed: 0,year,pop,unit,name,canton,url,firstmention,notes
192,1416,120,Haushalte,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
193,1764,1751,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
194,1803,2001,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
195,1850,2054,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
196,1900,3025,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
197,1930,3840,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
198,1950,3381,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
199,1980,2872,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
200,1990,3110,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."
201,2000,2949,Einw,Château-d'Œx,VD,/Articles/002593/?language=de,1080,"hab+foyer data!, unclear population count (wit..."


In [78]:
dfcommunes_datapoints.to_csv("communes_datapoints.csv", sep=";")

'doXXdu'

## TODO: merge communes and points_communes using Hungarian algorithm
https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.optimize.linear_sum_assignment.html

In [115]:
from jellyfish import jaro_distance

#create unique list of names
cantons = dfcommunes.canton.unique() # ["AI"]#
test_canton = "NE"

communes_per_canton = {}
points_communes_per_canton = {}
distance_matrix_per_canton = {}
for canton in cantons:
    communes_per_canton[canton] = dfcommunes.name[dfcommunes.canton == canton].unique()
    points_communes_per_canton[canton] = points_communes.commune[points_communes.canton == canton].unique()
    
    print("canton")
    print(canton)
    print("communes_per_canton[canton].shape")
    print(communes_per_canton[canton].shape)
    
    if canton==test_canton:
        def distance(i, j):
            return 1-jaro_distance(communes_per_canton[canton][np.int(i)], points_communes_per_canton[canton][np.int(j)])

        jaroDistanceProxy = np.vectorize(distance)
        distance_matrix_per_canton[canton] = np.fromfunction(
            jaroDistanceProxy,
            shape=(len(communes_per_canton[canton]),
                   len(points_communes_per_canton[canton])))
    
    

print(communes_per_canton[test_canton])
print(points_communes_per_canton[test_canton])
distance_matrix_per_canton[test_canton]

canton
VD
communes_per_canton[canton].shape
(309,)
canton
LU
communes_per_canton[canton].shape
(83,)
canton
ZH
communes_per_canton[canton].shape
(168,)
canton
SO
communes_per_canton[canton].shape
(109,)
canton
BE
communes_per_canton[canton].shape
(351,)
canton
BL
communes_per_canton[canton].shape
(86,)
canton
VS
communes_per_canton[canton].shape
(127,)
canton
TI
communes_per_canton[canton].shape
(115,)
canton
GE
communes_per_canton[canton].shape
(45,)
canton
GR
communes_per_canton[canton].shape
(112,)
canton
JU
communes_per_canton[canton].shape
(57,)
canton
OW
communes_per_canton[canton].shape
(7,)
canton
SZ
communes_per_canton[canton].shape
(30,)
canton
UR
communes_per_canton[canton].shape
(20,)
canton
FR
communes_per_canton[canton].shape
(137,)
canton
SG
communes_per_canton[canton].shape
(77,)
canton
AG
communes_per_canton[canton].shape
(213,)
canton
AI
communes_per_canton[canton].shape
(6,)
canton
ZG
communes_per_canton[canton].shape
(11,)
canton
SH
communes_per_canton[canton].shape

array([[0.        , 0.52525253, 0.48888889, ..., 0.59027778, 0.58547009,
        0.50793651],
       [0.5       , 0.49494949, 0.54444444, ..., 0.49027778, 0.38797314,
        0.51190476],
       [0.49494949, 0.64141414, 0.53939394, ..., 0.62436869, 0.60955711,
        0.55844156],
       ...,
       [1.        , 0.5530303 , 0.55277778, ..., 0.54166667, 0.53205128,
        0.4047619 ],
       [0.50793651, 0.53679654, 0.6047619 , ..., 0.44940476, 0.48107448,
        0.        ],
       [0.48148148, 0.62457912, 0.69259259, ..., 0.51851852, 0.47863248,
        0.42910053]])

In [116]:
import scipy.optimize as opt

opt = opt.linear_sum_assignment(distance_matrix_per_canton[test_canton])
opt[0][0]

0

In [117]:
for i in range(0,len(communes_per_canton[test_canton])):
    print(communes_per_canton[test_canton][opt[0][i]])
    print(points_communes_per_canton[test_canton][opt[1][i]])

Boudry
Boudry
Brenets, Les
Les Brenets
Brévine, La
La Brévine
Brot-Plamboz
Brot-Plamboz
Cerneux-Péquignot, Le
Le Cerneux-Péquignot
Chaux-du-Milieu, La
La Chaux-du-Milieu
Corcelles-Cormondrèche
Corcelles-Cormondrèche
Cornaux
Cornaux
Cortaillod
Cortaillod
Côte-aux-Fées, La
La Côte-aux-Fées
Cressier (NE)
Cressier (NE)
Enges
Enges
Gorgier
La Grande-Béroche
Hauterive (NE)
Hauterive (NE)
Landeron, Le
Le Landeron
Lignières
Lignières
Locle, Le
Le Locle
Chaux-de-Fonds, La
La Chaux-de-Fonds
Verrières, Les
Les Verrières
Neuenburg
Neuchâtel
Val-de-Ruz
Val-de-Ruz
Milvignes
Milvignes
Peseux
Peseux
Planchettes, Les
Les Planchettes
Ponts-de-Martel, Les
Les Ponts-de-Martel
Rochefort
Rochefort
Sagne, La
La Sagne
Saint-Blaise
Saint-Blaise
Tène, La
La Tène
Valangin
Valangin
Val-de-Travers
Val-de-Travers


IndexError: index 31 is out of bounds for axis 0 with size 31