In [None]:
import json
import polars as pl
import polars.selectors as cs
import pandas as pd
from requests import Session
from tqdm.notebook import tqdm, trange

from pathlib import Path

In [None]:
# Local folder that holds the downloaded monthly archives.
base = Path("/Users/davidenicoli/Local_Workspace/Datasets/ARPA/EMILIA-ROMAGNA")
# Single month used as the sample file by the exploratory cells.
test = base.joinpath("2016-01.json.gz")

In [None]:
def dwn_url(year, month):
    """Return the download URL of the monthly archive for ``year``/``month``.

    The month is zero-padded to two digits, so ``(2016, 1)`` maps to a URL
    ending in ``2016-01.json.gz``.
    """
    archive_url = f"https://dati-simc.arpae.it/opendata/osservati/meteo/storico/{year}-{month:02d}.json.gz"
    return archive_url


def dwn_gen(session: Session, years=range(2006, 2024), timeout=60):
    """Download the monthly archives and yield the local path of each one.

    For every month (1-12) of every year in ``years`` the archive at
    ``dwn_url(year, month)`` is fetched through ``session`` and saved under
    ``base`` as ``{year}-{month:02d}.json.gz``. A path is yielded only after
    its file has been written.

    Args:
        session: ``requests`` session used for all downloads.
        years: Iterable of years to fetch; defaults to 2006 through 2023.
        timeout: Timeout in seconds forwarded to ``session.get``.

    Yields:
        Path of each downloaded archive, in iteration order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Make sure the target folder exists before the first file is written.
    base.mkdir(parents=True, exist_ok=True)
    for year in years:
        for month in range(1, 13):
            response = session.get(dwn_url(year, month), timeout=timeout)
            # Fail loudly on an error status instead of saving the error page
            # as a ".json.gz" file that only breaks later when it is parsed.
            response.raise_for_status()
            archive_path = base / f"{year}-{month:02d}.json.gz"
            archive_path.write_bytes(response.content)
            yield archive_path

In [None]:
# Fetch every monthly archive over one shared HTTP session. Draining the
# generator is what actually triggers the downloads.
with Session() as session:
    dwned = [*dwn_gen(session)]

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
import pyarrow.json as pjson

# Parse the sample file with pyarrow's JSON reader.
d = pjson.read_json(test)

In [None]:
# Convert the Arrow result into a polars object for further exploration.
tab = pl.from_arrow(d)

In [None]:
# Explode the "data" column so each of its elements gets its own row.
exp = tab.explode("data")

In [None]:
# Preview of the fully flattened sample file: explode "data", then unnest
# the "data" and "vars" structs into plain columns.
(
    pl.from_arrow(pjson.read_json(test))
    .explode("data")
    .unnest("data")
    .unnest("vars")
)

In [None]:
def decent_table(year, month):
    """Build a long-format table of daily temperature extremes for one month.

    Reads ``base / "{year}-{month:02d}.json.gz"``, flattens the nested
    ``data``/``vars`` structure and aggregates ``T`` per
    ``(network, name, calendar day)`` into ``T_MIN`` and ``T_MAX``. The result
    is melted so that each row holds one ``variable`` (``"T_MIN"`` or
    ``"T_MAX"``) together with its ``value``.

    Args:
        year: Year of the archive to read.
        month: Month (1-12) of the archive to read.

    Returns:
        A polars DataFrame with the ``WMO_*``, ``lon``, ``lat``, ``elevation``,
        ``name``, ``network`` and ``date`` identifier columns plus
        ``variable`` and ``value``. ``lon``, ``lat`` and ``elevation`` carry
        the output of ``mode()`` and still need ``.list.first()`` to become
        scalars.
    """
    archive_path = base / f"{year}-{month:02d}.json.gz"
    records = pl.from_arrow(pjson.read_json(archive_path))
    return (
        records.explode("data")
        .unnest("data")
        .unnest("vars")
        .with_columns(
            # Each "Bxxxxx" column is a struct; keep only its "v" field under
            # a readable name. lon/lat are taken from the existing columns of
            # the same name rather than from B05001/B06001.
            pl.col("B01019").struct.field("v").alias("name"),
            pl.col("B12101").struct.field("v").alias("T"),
            pl.col("B01001").struct.field("v").alias("WMO_block"),
            pl.col("B01002").struct.field("v").alias("WMO_num"),
            pl.col("B07030").struct.field("v").alias("elevation"),
        )
        # This projection already discards every raw "Bxxxxx" struct column,
        # so no separate drop of B01019/B12101 is needed afterwards (a strict
        # drop of columns that are no longer present raises).
        .select(
            "network",
            "lon",
            "lat",
            "elevation",
            "date",
            "name",
            "T",
            cs.starts_with("WMO"),
            "timerange",
            "level",
        )
        .filter(pl.col("name").is_not_null() | pl.col("T").is_not_null())
        .with_columns(
            # NOTE(review): "T" is forward-filled together with "name", so a
            # row whose own T is null inherits the previous row's value and is
            # not removed by drop_nulls below - confirm this is intended.
            cs.by_name("name", "T").fill_null(strategy="forward"),
        )
        .drop_nulls("T")
        # NOTE(review): the five columns derived here are not referenced by
        # the aggregation below, so they do not reach the output.
        .with_columns(
            pl.col("level").list.first().alias("level_code"),
            pl.col("level").list.get(1).alias("level_value"),
            pl.col("timerange").list.first().alias("variable"),
            pl.col("timerange").list.get(1).alias("delValidity"),
            pl.col("timerange").list.get(2).alias("statDuration"),
        )
        .drop("level", "timerange")
        # One group per network, station name and calendar day.
        .group_by("network", "name", pl.col("date").dt.date())
        .agg(
            pl.col("T").min().alias("T_MIN"),
            pl.col("T").max().alias("T_MAX"),
            pl.col("lon").drop_nulls().mode(),
            pl.col("lat").drop_nulls().mode(),
            pl.col("elevation").drop_nulls().mode(),
            pl.col("WMO_block").drop_nulls().first(),
            pl.col("WMO_num").drop_nulls().first(),
        )
        .sort("name", "date")
        # Wide -> long: T_MIN/T_MAX become rows in "variable"/"value".
        .melt(
            id_vars=[
                cs.starts_with("WMO"),
                "lon",
                "lat",
                "elevation",
                "name",
                "network",
                "date",
            ],
            value_vars=["T_MIN", "T_MAX"],
        )
    )

In [None]:
# Convert every monthly archive to an Arrow IPC file, skipping months whose
# output already exists so the cell can be re-run cheaply.
out_dir = base / "dataset"
# Make sure the output folder exists before anything is written into it.
out_dir.mkdir(parents=True, exist_ok=True)

for year in trange(2006, 2024):
    for month in trange(1, 13, leave=False):
        out_path = out_dir / f"{year}-{month:02d}.arrow"
        if out_path.exists():
            continue
        (
            decent_table(year, month)
            .with_columns(
                # These columns come out of mode() as lists; keep one value.
                pl.col("lon").list.first(),
                pl.col("lat").list.first(),
                pl.col("elevation").list.first(),
            )
            # Pin the dtypes so every monthly file shares the same schema.
            .cast(
                {
                    "lon": pl.Int32(),
                    "lat": pl.Int32(),
                    "elevation": pl.Float32(),
                    "name": pl.Utf8(),
                    "network": pl.Utf8(),
                    "variable": pl.Utf8(),
                    "value": pl.Float32(),
                }
            )
            .write_ipc(out_path)
        )

In [None]:
# Spot-check a single month: build the table for January 2006 and order the
# melted rows by station name and date.
dt = decent_table(2006, 1).sort("name", "date")