In [None]:
import json
import polars as pl
import polars.selectors as cs
import pandas as pd
from requests import Session
from tqdm.notebook import tqdm, trange
import pyarrow.json as pjson
import seaborn as sns
from datetime import date, timedelta
import pyarrow.feather as pf

sns.set_theme("notebook")

from pathlib import Path

In [None]:
# Root folder of the local ARPA Emilia-Romagna dataset; every other path is derived from it.
base = Path("/Users/davidenicoli/Local_Workspace/Datasets/ARPA/EMILIA-ROMAGNA")
# Monthly `YYYY-MM.json.gz` archives (written by `dwn_gen`).
fragments = base.joinpath("fragments")
# Monthly `YYYY-MM.arrow` files (read by `load_data`).
sub_fragments = base.joinpath("fragments.subhour/")
# Not referenced by the code visible in this notebook section.
arr_fragments = base.joinpath("fragments.arrow/")
# A single archive kept at hand for quick experiments.
test = fragments.joinpath("2016-01.json.gz")

In [None]:
def dwn_url(year, month):
    """Return the download URL of the monthly archive for ``year``/``month``.

    The month is zero-padded to two digits, e.g. ``(2016, 1)`` maps to
    ``.../storico/2016-01.json.gz``.
    """
    archive_name = f"{year}-{month:02d}.json.gz"
    return f"https://dati-simc.arpae.it/opendata/osservati/meteo/storico/{archive_name}"


def dwn_gen(session: Session):
    """Download every monthly archive from 2006-01 to 2023-12 into ``fragments``.

    This is a generator: each archive is fetched and written only when the
    next item is requested.

    Args:
        session: the ``requests`` session used for all the HTTP requests.

    Yields:
        The path of each ``YYYY-MM.json.gz`` file, right after it was written.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status. The
            error body is never written to disk, so a failed download cannot
            be mistaken for a valid archive later on.
    """
    # Make sure the destination exists before the first write.
    fragments.mkdir(parents=True, exist_ok=True)
    for year in range(2006, 2024):
        for month in range(1, 13):
            response = session.get(dwn_url(year, month))
            # Fail loudly instead of saving an error page as `.json.gz`.
            response.raise_for_status()
            archive_path = fragments / f"{year}-{month:02d}.json.gz"
            archive_path.write_bytes(response.content)
            yield archive_path

In [None]:
# Fetch all the monthly archives over a single shared HTTP session and keep
# the list of written paths; the session is closed when the block exits.
with Session() as session:
    dwned = [*dwn_gen(session)]

In [None]:
def extract_value(var_code):
    """Build an expression reading ``vars.<var_code>.v`` from the first ``data`` entry.

    Only the first element of the ``data`` list column is inspected; the
    callers in this notebook use it to read station properties.

    Args:
        var_code: name of the field to pick inside the ``vars`` struct.

    Returns:
        A polars expression (not yet aliased) yielding the ``v`` field.
    """
    first_entry = pl.col("data").list.first()
    variable = first_entry.struct.field("vars").struct.field(var_code)
    return variable.struct.field("v")


def extract_station_props(data):
    """Add the station property columns read from the first ``data`` entry.

    Adds, in this order, the columns ``name``, ``elevation``, ``WMO_block``
    and ``WMO_num``, each taken from the variable code listed below.

    Args:
        data: frame with a ``data`` list-of-struct column.

    Returns:
        ``data`` with the four extra columns.
    """
    # Output column -> variable code it is read from.
    station_fields = {
        "name": "B01019",
        "elevation": "B07030",
        "WMO_block": "B01001",
        "WMO_num": "B01002",
    }
    return data.with_columns(
        *(extract_value(code).alias(column) for column, code in station_fields.items())
    )


def timerange_level_to_struct(data):
    """Spread the ``timerange`` and ``level`` list columns into flat columns.

    ``timerange`` provides ``variable``, ``P1`` and ``agg_period`` (items 0,
    1 and 2); ``level`` provides ``level_code`` and ``level_value`` (items 0
    and 1). The two list columns are dropped afterwards.

    Args:
        data: frame holding the ``timerange`` and ``level`` list columns.

    Returns:
        The frame with the five new columns and without the two lists.
    """
    timerange = pl.col("timerange").list
    level = pl.col("level").list
    unpacked = data.with_columns(
        timerange.first().alias("variable"),
        timerange.get(1).alias("P1"),
        timerange.get(2).alias("agg_period"),
        level.first().alias("level_code"),
        level.get(1).alias("level_value"),
    )
    return unpacked.drop("timerange", "level")


def load_data(year, month):
    """Load one monthly Arrow file and return its temperature observations.

    Reads ``sub_fragments / "YYYY-MM.arrow"``, attaches the station
    properties, turns every ``data`` entry into its own row and keeps only
    the rows with a non-null ``B12101`` value, exposed as column ``T``.

    Args:
        year: year of the file to load.
        month: month of the file to load (1-12).

    Returns:
        A frame with the station columns, ``date``, ``T`` and the flattened
        timerange/level columns.
    """
    arrow_path = sub_fragments / f"{year}-{month:02d}.arrow"
    kept_columns = [
        "lon",
        "lat",
        "elevation",
        "WMO_block",
        "WMO_num",
        "name",
        "network",
        "date",
        "B12101",
        "timerange",
        "level",
    ]
    temperature = pl.col("B12101").struct.field("v").alias("T")

    raw = pl.read_ipc(arrow_path, memory_map=False)
    observations = (
        # Station properties must be read before `data` is exploded, since
        # they come from its first entry.
        extract_station_props(raw)
        .explode("data")
        .unnest("data")
        # Entries without a timerange are discarded.
        .drop_nulls("timerange")
        .unnest("vars")
        .select(kept_columns)
    )
    temperatures = (
        observations.with_columns(temperature).drop_nulls("T").drop("B12101")
    )
    return timerange_level_to_struct(temperatures)

def collect_all_data(year, month):
    """Read the archives around ``year``/``month`` and keep the rows of that window.

    Three ``YYYY-MM.json.gz`` archives are read from ``fragments``: the
    previous month, the requested month and the following one. Each is
    filtered to the rows whose ``date`` lies between the last day of the
    previous month and the first day of the next month.

    Args:
        year: year of the requested month.
        month: requested month (1-12).

    Returns:
        The concatenation of the three filtered frames.
    """
    month_start = date(year, month, 1)
    window_start = month_start - timedelta(days=1)
    # Day 28 exists in every month and 28 + 7 days always lands in the next
    # one, which makes the year rollover automatic.
    in_next_month = date(year, month, 28) + timedelta(weeks=1)
    window_end = date(in_next_month.year, in_next_month.month, 1)

    frames = []
    for anchor in (window_start, month_start, window_end):
        archive = fragments / f"{anchor.year}-{anchor.month:02d}.json.gz"
        table = pl.from_arrow(pjson.read_json(archive))
        frames.append(
            table.filter(pl.col("date").is_between(window_start, window_end))
        )
    return pl.concat(frames)

def process_temp(data):
    """Aggregate the temperatures to daily extremes in long format.

    Rows are grouped by station ``name``, ``network`` and calendar day. For
    each group the minimum and maximum of ``T`` are computed, together with
    the mode of the non-null ``lon``, ``lat`` and ``elevation`` values. The
    result is melted so that ``T_MIN`` and ``T_MAX`` become rows, told apart
    by the ``variable`` column.

    Args:
        data: frame with ``name``, ``network``, ``date``, ``T``, ``lon``,
            ``lat`` and ``elevation`` columns.

    Returns:
        The long-format frame sorted by variable, network, name and date.
    """
    group_keys = ("name", "network", pl.col("date").dt.date())
    daily = data.group_by(*group_keys).agg(
        pl.col("T").min().alias("T_MIN"),
        pl.col("T").max().alias("T_MAX"),
        pl.col("lon").drop_nulls().mode(),
        pl.col("lat").drop_nulls().mode(),
        pl.col("elevation").drop_nulls().mode(),
    )
    long_format = daily.melt(
        id_vars=["network", "name", "date", "lon", "lat", "elevation"],
        value_vars=["T_MIN", "T_MAX"],
    )
    return long_format.sort("variable", "network", "name", "date")


def decent_table(year, month):
    """Return the daily min/max temperature table for one month.

    Chains ``load_data`` and ``process_temp``, then replaces the list-valued
    ``lon``, ``lat`` and ``elevation`` columns with their first element.

    Args:
        year: year of the month to process.
        month: month to process (1-12).

    Returns:
        The long-format daily table with scalar coordinates.
    """
    daily_extremes = process_temp(load_data(year, month))
    # NOTE(review): the division suggests coordinates are stored as 1e-5
    # degrees — confirm against the data source.
    scale = 100000
    return daily_extremes.with_columns(
        pl.col("lon").list.first() / scale,
        pl.col("lat").list.first() / scale,
        pl.col("elevation").list.first(),
    )

In [None]:
# Export one Arrow table per month (2016-01 .. 2023-12) under `base / "tables"`.
# NOTE(review): the output of `load_data` is written here, while `decent_table`
# is not called in this section — confirm that this is intended.
tables_dir = base / "tables"
# Create the output folder up front so the writes below cannot fail on a
# missing directory when run against a fresh dataset folder.
tables_dir.mkdir(parents=True, exist_ok=True)
for year in trange(2016, 2024):
    for month in trange(1, 13, leave=False):
        load_data(year, month).write_ipc(tables_dir / f"{year}-{month:02d}.arrow")