In [None]:
# Put the parent directory on the module search path.
# NOTE(review): presumably the notebook lives one level below the project
# root, so that the `src` package can be imported — confirm.
import sys
sys.path.append("..")

In [None]:
import pandas as pd
import numpy as np
# Route every `DataFrame.plot` call in this notebook through plotly.
pd.options.plotting.backend = "plotly"

from src.changelog_reader import ChangelogReader

In [None]:
# Count, per (object id, changelog date), the changelog entries that contain
# at least one "replace", "add" or "remove" change.
changes_per_date_and_id = {}
for changelog_path, dates_path in ChangelogReader.get_changelog_files("stations"):
    station_reader = ChangelogReader(changelog_path, dates_path)
    for station_id, entries in station_reader.data.items():
        for entry in entries:
            # Entries without a "changes" section have nothing to count.
            if "changes" not in entry:
                continue
            entry_changes = entry["changes"]
            if not any(kind in entry_changes for kind in ("replace", "add", "remove")):
                continue
            id_and_date = (station_id, entry["date"])
            seen_so_far = changes_per_date_and_id.get(id_and_date, 0)
            changes_per_date_and_id[id_and_date] = seen_so_far + 1

In [None]:
# One row per (object id, date) key: the object id is dropped, the date
# becomes a datetime index and the count stays as the only column.
change_rows = [
    {"date": date, "changes": count}
    for (_, date), count in changes_per_date_and_id.items()
]
df = pd.DataFrame(change_rows)
df = df.assign(date=pd.to_datetime(df["date"])).set_index("date")
df

In [None]:
# Sum the counts into one value per calendar day; days without any entry
# end up with a sum of 0.
# "1D" is the canonical day alias — the lowercase "d" spelling is deprecated
# in recent pandas releases.
df_changes_per_day = df.resample("1D").sum()
df_changes_per_day.plot.bar(
    title="Number of changes to station data per day",
    labels={
        "value": "Number of changes (logarithmic)",
    },
    log_y=True,
)

In [None]:
# Collect the distinct stations that were edited on each calendar day.
# The changelog date is cut down to its first 10 characters ("YYYY-MM-DD")
# so that a station edited several times on one day, under different
# timestamps, is counted once for that day instead of once per timestamp.
# Assumes the dates are ISO-formatted strings — TODO confirm.
stations_per_date = dict()
for obj_id, date in changes_per_date_and_id:
    stations_per_date.setdefault(date[:10], set()).add(obj_id)

In [None]:
# One row per date key, holding the number of distinct stations collected
# for that key; the date becomes a datetime index.
station_rows = [
    {"date": date, "stations": len(station_ids)}
    for date, station_ids in stations_per_date.items()
]
df = pd.DataFrame(station_rows)
df = df.assign(date=pd.to_datetime(df["date"])).set_index("date")
df

In [None]:
# Sum the station counts into one value per calendar day; days without any
# entry end up with a sum of 0.
# "1D" is the canonical day alias — the lowercase "d" spelling is deprecated
# in recent pandas releases.
df_stations_per_day = df.resample("1D").sum()
df_stations_per_day.plot.bar(
    title="Number of stations that are edited per day",
    labels={
        "value": "Number of stations (logarithmic)",
    },
    log_y=True,
    barmode="group",
)

In [None]:
# Do both daily aggregations contain the same numbers?
# The two frames carry differently named columns ("changes" vs. "stations"),
# so a frame-level `==` refuses to compare them, and the builtin `all()` over
# a DataFrame would only iterate its column labels anyway. Compare the two
# columns as Series instead: `equals` is True only when index and values match.
df_changes_per_day["changes"].equals(df_stations_per_day["stations"])

In [None]:
import re

# Days whose changelog entries are broken down by changed path.
INTERESTING_DATES = (
    "2020-06-03",
    "2021-06-03",
    "2021-06-04",
    "2021-06-08",
    "2021-06-17",
    "2021-06-26",
    "2021-07-02",
)
# How many of the most frequently changed paths are listed per date.
TOP_PATHS_PER_DATE = 5
# A purely numeric path segment followed by a dot, i.e. a list index such as
# the "0." in "a.0.b" or the "12." in "a.12.b". Matching whole segments keeps
# digits that are merely part of a name ("abc0.x") untouched.
LIST_INDEX_SEGMENT = re.compile(r"(?:^|(?<=\.))\d+\.")

# date -> "<change type> `<path without list indices>`" -> list of new values
# (None for removals).
paths_per_date = dict()
for changelog_file, dates_file in ChangelogReader.get_changelog_files("stations"):
    reader = ChangelogReader(changelog_file, dates_file)
    for obj_id, changelogs in reader.data.items():
        for cl in changelogs:
            key = cl["date"][:10]
            if key not in INTERESTING_DATES:
                continue
            paths_for_date = paths_per_date.setdefault(key, {})
            # Not every changelog entry has a "changes" section; treat a
            # missing one as "no changes" instead of failing with a KeyError.
            entry_changes = cl["changes"] if "changes" in cl else {}
            for change_type, changes in entry_changes.items():
                for c in changes:
                    # Drop list indices so that the same field of different
                    # list elements is grouped under one path.
                    generic_path = LIST_INDEX_SEGMENT.sub("", str(c["path"]))
                    path_key = f"{change_type} `{generic_path}`"
                    new_value = None if change_type == "remove" else str(c["value"])
                    paths_for_date.setdefault(path_key, []).append(new_value)

# Print a markdown list: per date, the most frequently changed paths.
# Dates are visited in sorted order so the output does not depend on the
# order in which the changelog files were read.
for date in sorted(paths_per_date):
    paths = paths_per_date[date]
    print(f"- **`{date}`**")
    most_frequent = sorted(paths, key=lambda k: -len(paths[k]))[:TOP_PATHS_PER_DATE]
    for path in most_frequent:
        print(f"  - {len(paths[path])} x {path}")

In [None]:
from itertools import islice

# The days whose object snapshots are put side by side.
COMPARE_DATES = ["2021-06-02", "2021-06-03"]

# object id -> day ("YYYY-MM-DD") -> snapshot of the object on that day
objects_compare = dict()
for changelog_file, dates_file in ChangelogReader.get_changelog_files("stations"):
    reader = ChangelogReader(changelog_file, dates_file)
    for obj_id in reader.object_ids():
        for dt, obj in reader.iter_object_snapshots(obj_id):
            day = dt[:10]
            if day not in COMPARE_DATES:
                continue
            found = objects_compare.setdefault(obj_id, {})
            # A later snapshot of the same day replaces the earlier one.
            found[day] = obj
            # Stop once every compare date has a snapshot. Counting distinct
            # days (not matching snapshots) keeps two snapshots on the same
            # day from ending the search before the other day was seen.
            if len(found) == len(COMPARE_DATES):
                break

# Show the federal state on each compare date for those of the first 50
# collected objects that have a snapshot on every compare date.
for obj_id, versions in islice(objects_compare.items(), 0, 50):
    if len(versions) == len(COMPARE_DATES):
        print(obj_id)
        for key in sorted(versions):
            print(" ", key, versions[key]["federalState"])