In [None]:
import altair as alt
import datetime
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re


In [None]:
# Root of the local ogd-weather checkout, located below the user's home directory.
# HOME is honoured when it is set (original behaviour); when it is missing or
# empty (e.g. on Windows) fall back to Path.home() instead of failing with a
# TypeError from Path(None).
BASE_DIR = Path(os.environ.get("HOME") or Path.home()) / "git" / "github.com" / "dnswlt" / "ogd-weather"

In [None]:
# Directory holding the downloaded CSV files.
DATA_DIR = BASE_DIR.joinpath("data")

# Peek at the directory: print the names of at most the first 11 CSV files.
for _, csv_path in zip(range(11), DATA_DIR.glob("*.csv")):
    print(csv_path.name)

In [None]:
# Hourly measurements for station abbreviation "ber", 2010-2019 (per the file name).
hourly_ber = DATA_DIR.joinpath("ogd-smn_ber_h_historical_2010-2019.csv")

# The file is semicolon-separated, cp1252-encoded, and stores timestamps
# as "day.month.year hour:minute".
df = pd.read_csv(
    hourly_ber,
    sep=";",
    encoding="cp1252",
    parse_dates=["reference_timestamp"],
    date_format={"reference_timestamp": "%d.%m.%Y %H:%M"},
)
df.head()

In [None]:
# Every column except the station id and the timestamp, de-duplicated and sorted by name.
mcols = sorted({col for col in df.columns if col not in ("station_abbr", "reference_timestamp")})

In [None]:
# Number of missing values per measurement column, keeping only columns with
# more than 10 gaps (a handful of missing data points is ignored).
nancols = (
    df[mcols]
    .isna()
    .astype(int)
    .sum()
    .loc[lambda missing: missing > 10]
)
nancols

In [None]:
# Parameter metadata table, indexed by the parameter short name so it can be
# looked up with the column names of the measurement files.
smn_params_csv = DATA_DIR.joinpath("ogd-smn_meta_parameters.csv")
ps = (
    pd.read_csv(smn_params_csv, sep=";", encoding="cp1252")
    .set_index("parameter_shortname")
)

In [None]:
# German and English descriptions of the parameters that have many missing values.
ps.loc[nancols.index, ["parameter_description_de", "parameter_description_en"]]

In [None]:
# Descriptions and units of all measurement columns present in the hourly file.
ps.loc[mcols, ["parameter_description_de", "parameter_description_en", "parameter_unit"]]

In [None]:

# Restrict the hourly frame to the rows of one calendar day.
day = datetime.date(year=2015, month=8, day=1)
dfd = df.loc[df["reference_timestamp"].dt.date.eq(day)]
dfd.head()

In [None]:
def plot_day(df, date, variable, type_="bar", y_domain=None, title="Untitled chart"):
    """Chart one column of ``df`` against the hour of day for a single date.

    Parameters
    ----------
    df : DataFrame with a datetime column ``reference_timestamp``.
    date : ``datetime.date``; only rows whose timestamp falls on it are plotted.
    variable : name of the column encoded (as quantitative) on the y axis.
    type_ : ``"line"`` draws a line mark; any other value draws bars.
    y_domain : optional domain applied to the y scale; ignored when falsy.
    title : chart title.

    Returns
    -------
    The configured Altair chart.
    """
    on_date = df["reference_timestamp"].dt.date == date
    base = alt.Chart(df[on_date])

    if type_ == "line":
        marked = base.mark_line()
    else:
        marked = base.mark_bar()

    y_enc = alt.Y(f"{variable}:Q")
    if y_domain:
        y_enc = y_enc.scale(domain=y_domain)
    x_enc = alt.X("reference_timestamp:T", timeUnit="dayhours", title="Hour of day")

    return marked.encode(x=x_enc, y=y_enc).properties(title=title)

In [None]:
# Day used for the single-day charts.
day1 = datetime.date(year=2015, month=8, day=2)
# "Globalstrahlung" is German for global radiation; drawn with the default bar mark.
plot_day(df, date=day1, variable="gre000h0", title="Globalstrahlung")

In [None]:
# "Dampfdruck" is German for vapour pressure.
plot_day(df, date=day1, variable="pva200h0", title="Dampfdruck")

In [None]:
# "Taupunkt" is German for dew point.
plot_day(df, date=day1, variable="tde200h0", title="Taupunkt")

In [None]:
# "Rel. Luftfeuchtigkeit" is German for relative humidity.
plot_day(df, date=day1, variable="ure200h0", title="Rel. Luftfeuchtigkeit")

In [None]:
# "Sonnenscheindauer" is German for sunshine duration.
plot_day(df, date=day1, variable="sre000h0", title="Sonnenscheindauer")

In [None]:
# "Luftdruck" is German for air pressure; line chart with the y axis limited to 940-970.
plot_day(
    df,
    date=day1,
    variable="prestah0",
    type_="line",
    y_domain=(940, 970),
    title="Luftdruck",
)

In [None]:
# "Temperatur" is German for temperature; drawn as a line chart.
plot_day(df, date=day1, variable="tre200h0", type_="line", title="Temperatur")

In [None]:
# Calculate the vapour pressure from the dew point
# using a Magnus-Tetens type formula.
# NOTE(review): the coefficients used here (6.112, 17.67, 243.5) look like the
# Bolton (1980) set; the WMO guide (WMO-No. 8) appears to list 17.62 and 243.12
# instead. Confirm which variant is intended before calling this "WMO-prescribed".
def dew_to_vapour(df, column="tde200h0"):
    """Return the vapour pressure computed from a dew-point column of ``df``.

    Parameters
    ----------
    df : DataFrame (or any mapping of column name to numeric values).
    column : name of the dew-point column to read; defaults to ``"tde200h0"``,
        the column the original implementation was hard-wired to.

    Returns
    -------
    ``a * exp(b * T / (c + T))`` evaluated element-wise on ``df[column]``.
    The notebook compares this result against ``pva200h0`` in hPa.
    """
    # For dew points ≥ 0°C (used everywhere yields closest approximation to MeteoSwiss data)
    a = 6.112
    b = 17.67
    c = 243.5

    T = df[column]
    return a * np.exp((b * T) / (c + T))

# Store the vapour pressure derived from the dew point as a new column.
# This mutates the hourly frame `df` in place; re-running the cell simply
# overwrites the column.
df["vapour_from_dew"] = dew_to_vapour(df)

In [None]:
# Error of the derived vapour pressure relative to the measured column pva200h0.
df["vfd_err"] = df["vapour_from_dew"].sub(df["pva200h0"])
# Title is German for "vapour pressure (from dew point), error (in hPa)".
plot_day(
    df,
    date=day1,
    variable="vfd_err",
    type_="line",
    y_domain=(-1, 1),
    title="Dampfdruck (aus Taupunkt), Fehler (in hPa)",
)

In [None]:
# Summary statistics of the error over the whole hourly frame.
df["vfd_err"].describe()

In [None]:
# Check data availability for all stations.
#
# Each per-station file is read into `station_df` rather than `df`, so the
# hourly frame loaded earlier in the notebook (and the columns derived from
# it) is no longer overwritten when this cell runs.
res = []
for f in DATA_DIR.glob("*h_historical_2020-2029.csv"):
    # Take the station abbreviation from the bare file name, so directory
    # components of the path cannot influence the match.
    m = re.search(r'ogd-smn_(\w+)_h_hist', f.name)
    if m is None:
        # The glob also accepts files that do not follow the
        # "ogd-smn_<station>_h_historical" naming; skip them instead of
        # failing with AttributeError on a missing match.
        continue
    station_df = pd.read_csv(
        f,
        encoding="cp1252",
        sep=";",
        date_format={
            "reference_timestamp": "%d.%m.%Y %H:%M",
        },
        parse_dates=["reference_timestamp"],
    )
    # Count strictly positive values per column.
    # NOTE(review): zero or negative readings (e.g. a sub-zero dew point in
    # tde200h0) are not counted; confirm that "> 0" rather than ".notna()"
    # is the intended availability criterion.
    nz = (station_df[["pva200h0", "tde200h0", "gre000h0"]] > 0).astype(int).sum()
    res.append((m.group(1).upper(), nz))

# One row per station, ordered by station abbreviation. Sorting on the name
# only avoids ever comparing the Series payloads with each other.
avail = pd.DataFrame({stn: nz for (stn, nz) in sorted(res, key=lambda item: item[0])}).T
avail.head(10)

In [None]:
def dew_to_vapour1(T):
    """Return the vapour pressure for dew point ``T`` via ``a * exp(b*T / (c + T))``.

    ``T`` may be a scalar or a NumPy array; arrays are evaluated element-wise.
    """
    # Coefficients for dew points ≥ 0°C; applied to every input because that
    # gives the closest approximation to the MeteoSwiss data.
    scale, slope, offset = 6.112, 17.67, 243.5

    exponent = (slope * T) / (offset + T)
    return scale * np.exp(exponent)

# Spot check for a single dew-point value.
dew_to_vapour1(15.853)

In [None]:
# Daily file for station abbreviation "ber" (per the file name), reduced to
# the single day 2015-06-01.
daily_ber = DATA_DIR.joinpath("ogd-smn_ber_d_historical.csv")
dfd = pd.read_csv(
    daily_ber,
    sep=";",
    encoding="cp1252",
    parse_dates=["reference_timestamp"],
    date_format={"reference_timestamp": "%d.%m.%Y %H:%M"},
).loc[lambda frame: frame["reference_timestamp"].dt.date == datetime.date(2015, 6, 1)]
dfd.loc[:, ["station_abbr", "reference_timestamp", "gre000d0"]]

In [None]:
# Hourly file for the same station, reduced to the 2015-06-01 rows so the
# hourly gre000h0 values can be inspected for that day.
hourly_ber = DATA_DIR.joinpath("ogd-smn_ber_h_historical_2010-2019.csv")
dfh = pd.read_csv(
    hourly_ber,
    sep=";",
    encoding="cp1252",
    parse_dates=["reference_timestamp"],
    date_format={"reference_timestamp": "%d.%m.%Y %H:%M"},
).loc[lambda frame: frame["reference_timestamp"].dt.date == datetime.date(2015, 6, 1)]
dfh.loc[:, ["station_abbr", "reference_timestamp", "gre000h0"]]

In [None]:
# Mean of the hourly gre000h0 values for the selected day.
# NOTE(review): presumably meant for comparison with the daily gre000d0 value
# displayed earlier; confirm.
dfh.loc[:, "gre000h0"].mean()