In [61]:
import json

DATE_RANGE = MIN_YEAR, MAX_YEAR = 1600, 1699
print(DATE_RANGE)

(1600, 1699)


In [None]:
from urllib.parse import quote
import os
from sqlalchemy import create_engine

sql_engine = create_engine(f'mysql+pymysql://folgert:{quote(os.environ["SQLPW"])}@localhost/DSD2018')

In [63]:
import pandas as pd

# First, create a table with song books plus their year of publication
sources = pd.read_sql_query(
    f"""SELECT bronid, sorteerjaar FROM bron
         WHERE sorteerjaar >= {MIN_YEAR} AND sorteerjaar <= {MAX_YEAR}""",
    sql_engine,
)

sources.sample(10)

Unnamed: 0,bronid,sorteerjaar
1749,1017536,1662
696,1004290,1620
1816,1019576,1634
1381,1016190,1613
1202,1011321,1655
2021,1020103,1618
1071,1010882,1614
1312,1015982,1673
749,1004625,1660
146,1000700,1649


In [64]:
sources.shape  # number of unique song books (excluding reprints)

(2044, 2)

In [65]:
locations = pd.read_sql_query("SELECT bronid, plaats FROM bron_drukker", sql_engine)
df = sources.merge(locations, on="bronid")
df.sample(10)

Unnamed: 0,bronid,sorteerjaar,plaats
1490,1016715,1650,Amsterdam
944,1010135,1646,Amsterdam
1238,1015514,1662,Kalken
110,1000662,1680,Amsterdam
911,1009989,1619,Antwerpen
628,1001614,1683,Utrecht
1531,1016867,1664,Den Haag
1330,1016105,1626,Amsterdam
856,1004982,1658,Amsterdam
1161,1011235,1649,Amsterdam


In [66]:
with open("../data/placenames.json") as f:
    places = pd.DataFrame(json.load(f))

df = df.merge(places, on="plaats", how="left")
df.sample(10)

Unnamed: 0,bronid,sorteerjaar,plaats,standardized place
404,1000937,1648,Amsterdam,Amsterdam
1028,1010768,1662,Brugge,Brugge
610,1001564,1614,Antwerpen,Antwerpen
1868,1019681,1695,'s-Gravenhage,Den Haag
91,1000648,1645,Amsterdam,Amsterdam
1980,1020083,1614,,
135,1000683,1617,Amsterdam,Amsterdam
279,1000823,1625,Zwolle (uitgave),Zwolle
246,1000787,1627,Amsterdam,Amsterdam
1823,1019548,1688,Amsterdam,Amsterdam


In [67]:
songs = pd.read_sql_query("SELECT recordid, incnormid, bronid FROM lied", sql_engine)

melodies = pd.read_sql_query(
    "SELECT recordid, melodieid FROM wijs WHERE melodieid != 0", sql_engine
)

In [68]:
df = df.merge(songs, on="bronid", how="left")
df = df.merge(melodies, on="recordid", how="left")
df.sample(10)

Unnamed: 0,bronid,sorteerjaar,plaats,standardized place,recordid,incnormid,melodieid
5096,1000738,1659,Amsterdam,Amsterdam,6298.0,47934.0,1280.0
35680,1015506,1663,Brussel,Brussel,36167.0,53723.0,4981.0
2370,1000671,1645,Haarlem,Haarlem,3215.0,10788.0,2849.0
27936,1001870,1677,Brussel,Brussel,36981.0,36109.0,
16376,1001030,1612,Franeker,Franeker,20313.0,37508.0,5068.0
32780,1011046,1654,Antwerpen,Antwerpen,33837.0,52084.0,4041.0
21862,1001561,1659,Hoorn,Hoorn,35772.0,59213.0,4615.0
36987,1015991,1624,Leiden,Leiden,187292.0,69140.0,7247.0
7124,1000805,1643,Haarlem,Haarlem,8694.0,2479.0,928.0
18321,1001065,1611,Parijs,Parijs,366.0,44731.0,7582.0


In [69]:
df = df[df["standardized place"] == "Amsterdam"]

In [74]:
df["sorteerjaar"] = df['sorteerjaar'].astype(int)

In [75]:
from copia.data import to_copia_dataset

def coverage(group):
    ds = to_copia_dataset(group)
    return 1 - (ds.f1 / ds.n) * (
        ((ds.n - 1) * ds.f1) / ((ds.n - 1) * ds.f1 + 2 * ds.f2)
    )

def coverage_interval(group):
    ds = to_copia_dataset(group)
    return 1.96 * ((((ds.f1 + 2 * ds.f2) / ds.n - (ds.f1 / ds.n)**2)/ds.n)**(1/2))


df["decade"] = df["sorteerjaar"] // 10 * 10

size_table = pd.DataFrame(index=df["decade"].unique())
size_table["source"] = df.groupby("decade")["bronid"].nunique()
size_table["melody"] = df.groupby("decade")["melodieid"].nunique()
size_table["songs"] = df.groupby("decade")["recordid"].nunique()
size_table["coverage"] = df.groupby("decade")["melodieid"].apply(coverage)
size_table["coverage_interval"] = df.groupby("decade")["melodieid"].apply(coverage_interval)
size_table = size_table.reset_index(names="decade")
size_table.sort_values("decade")

Unnamed: 0,decade,source,melody,songs,coverage,coverage_interval
0,1600,38,329,877,0.824299,0.033453
1,1610,99,291,935,0.878375,0.03025
6,1620,73,348,1036,0.903102,0.026436
2,1630,101,321,992,0.773,0.041096
3,1640,122,572,2042,0.919174,0.015849
4,1650,85,581,1726,0.90226,0.019754
5,1660,80,378,882,0.674783,0.048147
9,1670,69,420,991,0.882643,0.03273
8,1680,92,692,1220,0.860195,0.022937
7,1690,50,409,724,0.745398,0.045255


In [55]:
from copia.data import to_copia_dataset

def coverage(group):
    ds = to_copia_dataset(group)
    return 1 - (ds.f1 / ds.n) * (
        ((ds.n - 1) * ds.f1) / ((ds.n - 1) * ds.f1 + 2 * ds.f2)
    )

def coverage_interval(group):
    ds = to_copia_dataset(group)
    return 1.96 * ((((ds.f1 + 2 * ds.f2) / ds.n - (ds.f1 / ds.n)**2)/ds.n)**(1/2))


df["decade"] = df["jaar_begin"] // 10 * 10

size_table = pd.DataFrame(index=df["decade"].unique())
size_table["source"] = df.groupby("decade")["bronid"].nunique()
size_table["melody"] = df.groupby("decade")["melodieid"].nunique()
size_table["songs"] = df.groupby("decade")["recordid"].nunique()
size_table["coverage"] = df.groupby("decade")["melodieid"].apply(coverage)
size_table["coverage_interval"] = df.groupby("decade")["melodieid"].apply(coverage_interval)
size_table = size_table.reset_index(names="decade")
size_table.sort_values("decade")

Unnamed: 0,decade,source,melody,songs,coverage,coverage_interval
0,1600,40,342,925,0.816764,0.034667
1,1610,100,293,946,0.878469,0.030161
6,1620,72,348,1036,0.903102,0.026436
2,1630,100,317,981,0.772943,0.041298
3,1640,122,572,2042,0.919174,0.015849
4,1650,84,579,1723,0.903027,0.019748
5,1660,80,378,882,0.674783,0.048147
9,1670,70,420,991,0.882643,0.03273
8,1680,93,692,1220,0.860195,0.022937
7,1690,49,409,724,0.745398,0.045255


In [76]:
popsize_df = pd.read_csv("../data/population-estimates.csv")
popsize_df = popsize_df[popsize_df["location"] == "Amsterdam"].drop("location", axis=1)
size_table = size_table.merge(popsize_df, left_on="decade", right_on="decade")
size_table

Unnamed: 0,decade,source,melody,songs,coverage,coverage_interval,pop_estimate,error
0,1600,38,329,877,0.824299,0.033453,56752.541667,0.046078
1,1610,99,291,935,0.878375,0.03025,78808.285714,0.043182
2,1630,101,321,992,0.773,0.041096,127555.315789,0.044154
3,1640,122,572,2042,0.919174,0.015849,142153.555556,0.045614
4,1650,85,581,1726,0.90226,0.019754,172644.95,0.050008
5,1660,80,378,882,0.674783,0.048147,187175.043478,0.052763
6,1620,73,348,1036,0.903102,0.026436,92036.0,0.043115
7,1690,50,409,724,0.745398,0.045255,199624.078947,0.056107
8,1680,92,692,1220,0.860195,0.022937,198540.833333,0.055018
9,1670,69,420,991,0.882643,0.03273,190929.807692,0.054167


In [77]:
size_table['pop_estimate'] = size_table['pop_estimate'].astype(int)
print(size_table.sort_values('decade').round(2).to_latex(index=False, float_format="%.2f"))

\begin{tabular}{rrrrrrrr}
\toprule
decade & source & melody & songs & coverage & coverage_interval & pop_estimate & error \\
\midrule
1600 & 38 & 329 & 877 & 0.82 & 0.03 & 56752 & 0.05 \\
1610 & 99 & 291 & 935 & 0.88 & 0.03 & 78808 & 0.04 \\
1620 & 73 & 348 & 1036 & 0.90 & 0.03 & 92036 & 0.04 \\
1630 & 101 & 321 & 992 & 0.77 & 0.04 & 127555 & 0.04 \\
1640 & 122 & 572 & 2042 & 0.92 & 0.02 & 142153 & 0.05 \\
1650 & 85 & 581 & 1726 & 0.90 & 0.02 & 172644 & 0.05 \\
1660 & 80 & 378 & 882 & 0.67 & 0.05 & 187175 & 0.05 \\
1670 & 69 & 420 & 991 & 0.88 & 0.03 & 190929 & 0.05 \\
1680 & 92 & 692 & 1220 & 0.86 & 0.02 & 198540 & 0.06 \\
1690 & 50 & 409 & 724 & 0.75 & 0.05 & 199624 & 0.06 \\
\bottomrule
\end{tabular}



In [78]:
dfs = []

decades = set(size_table.loc[size_table["pop_estimate"].notnull(), "decade"].values)
for decade, rows in df[df["melodieid"].notnull()].groupby("decade"):
    if decade in decades:
        counts = rows["melodieid"].value_counts().reset_index()
        counts["melodieid"] = counts["melodieid"].astype(int)
        counts["decade"] = decade
        dfs.append(counts)

df = pd.concat(dfs)
df.to_csv("../data/dsd-amsterdam-melody-decade-counts.csv", index=False)

In [17]:
df

Unnamed: 0,melodieid,count,decade
0,2048,24,1600
1,1856,16,1600
2,3654,15,1600
3,2663,15,1600
4,2815,15,1600
...,...,...,...
404,4332,1,1690
405,2501,1,1690
406,1886,1,1690
407,529,1,1690
