In [1]:
import os
from datetime import date, timedelta
from pathlib import Path

import polars as pl
import pyarrow.json as pjson
from requests import Session
from tqdm.notebook import tqdm, trange

In [2]:
# Root of the local copy of the ARPAE Emilia-Romagna open-data archive.
# Set the ARPAE_ER_BASE environment variable to point the notebook at a
# different location; when it is unset the original hard-coded path is used.
base = Path(
    os.environ.get(
        "ARPAE_ER_BASE",
        "/Users/davidenicoli/Local_Workspace/Datasets/ARPA/EMILIA-ROMAGNA/OpenData/",
    )
)
# Monthly ``YYYY-MM.json.gz`` archives are downloaded to and read from here.
fragments = base / "fragments"
# Sibling directories; not referenced by the cells visible in this notebook.
sub_fragments = base / "fragments.subhour/"
arr_fragments = base / "fragments.arrow/"

In [4]:
def dwn_url(year, month):
    """Build the download URL of the monthly archive for ``year``/``month``.

    The month is zero-padded to two digits, so ``dwn_url(2011, 2)`` ends in
    ``2011-02.json.gz``.
    """
    archive_name = f"{year}-{month:02d}.json.gz"
    storage_root = "https://dati-simc.arpae.it/opendata/osservati/meteo/storico/"
    return storage_root + archive_name


def dwn_gen(session: Session):
    """Download every missing monthly archive and yield the path of each new file.

    Iterates over all months of the years 2006 through 2024. A month is skipped
    when its archive is already present in ``fragments`` or when its first day
    lies in the future.

    Args:
        session: ``requests`` session used to perform the HTTP GET requests.

    Yields:
        The local path of each archive written during this call.

    Raises:
        requests.HTTPError: if the server answers with an error status. Nothing
            is written for that month, so it is retried on the next run.
    """
    # Make sure the target directory exists before the first write.
    fragments.mkdir(parents=True, exist_ok=True)
    today = date.today()
    for year in range(2006, 2025):
        for month in range(1, 13):
            tarball_path = fragments / f"{year}-{month:02d}.json.gz"
            if tarball_path.exists() or date(year, month, 1) > today:
                continue
            response = session.get(dwn_url(year, month))
            # Without this check an error page would be stored as if it were
            # the archive and then skipped forever by the ``exists()`` guard.
            response.raise_for_status()
            # Write under a temporary name and rename afterwards, so an
            # interrupted write never leaves a truncated file at the final path.
            partial_path = tarball_path.with_name(tarball_path.name + ".part")
            partial_path.write_bytes(response.content)
            partial_path.replace(tarball_path)
            yield tarball_path

In [5]:
# Fetch every archive that is not on disk yet; ``dwned`` keeps the paths of
# the files written by this run. The generator is drained inside the ``with``
# block so the session is still open while the requests are made.
with Session() as session:
    dwned = [*dwn_gen(session)]
    

In [3]:
def extract_value(var_code):
    """Return an expression reading ``vars.<var_code>.v`` from the first ``data`` entry.

    The expression takes the first element of the ``data`` list column, then
    descends through the ``vars`` struct into the field named ``var_code`` and
    finally picks that field's ``v`` member.
    """
    first_entry = pl.col("data").list.first()
    variable = first_entry.struct.field("vars").struct.field(var_code)
    return variable.struct.field("v")


def extract_station_props(data):
    """Add ``name``, ``elevation``, ``WMO_block`` and ``WMO_num`` columns to ``data``.

    Each new column is read, via ``extract_value``, from the first entry of the
    ``data`` list column using the variable code mapped to it below.
    """
    # Output column -> variable code looked up in the first ``data`` entry.
    code_by_column = {
        "name": "B01019",
        "elevation": "B07030",
        "WMO_block": "B01001",
        "WMO_num": "B01002",
    }
    station_columns = [
        extract_value(code).alias(column) for column, code in code_by_column.items()
    ]
    return data.with_columns(station_columns)


def timerange_level_to_struct(data):
    """Spread the ``timerange`` and ``level`` list columns into flat columns.

    ``timerange`` elements 0, 1 and 2 become ``variable``, ``P1`` and
    ``agg_period``; ``level`` elements 0 and 1 become ``level_code`` and
    ``level_value``. The two list columns are dropped from the result.
    Despite the function name, the new columns are plain columns, not a struct.
    """
    timerange = pl.col("timerange")
    level = pl.col("level")
    flattened = data.with_columns(
        timerange.list.first().alias("variable"),
        timerange.list.get(1).alias("P1"),
        timerange.list.get(2).alias("agg_period"),
        level.list.first().alias("level_code"),
        level.list.get(1).alias("level_value"),
    )
    return flattened.drop("timerange", "level")


def load_data(year, month):
    """Load one monthly archive and return its ``B12101`` observations as a flat table.

    Reads ``fragments/<year>-<month>.json.gz``, attaches the station properties,
    turns every entry of the ``data`` list into its own row and keeps only rows
    that have a ``timerange`` and a non-null ``B12101`` value, exposed as ``T``.
    The ``timerange`` and ``level`` lists are expanded into flat columns.

    NOTE(review): ``T`` values in the notebook output look like Kelvin - confirm.
    """
    archive = fragments / f"{year}-{month:02d}.json.gz"
    stations = pl.from_arrow(pjson.read_json(archive))

    # Station properties must be extracted before ``data`` is exploded, because
    # they are read from the first entry of each ``data`` list.
    observations = (
        extract_station_props(stations)
        .explode("data")
        .unnest("data")
        .drop_nulls("timerange")
        .unnest("vars")
    )

    kept_columns = [
        "lon",
        "lat",
        "elevation",
        "WMO_block",
        "WMO_num",
        "name",
        "network",
        "date",
        "B12101",
        "timerange",
        "level",
    ]
    temperatures = (
        observations.select(kept_columns)
        .with_columns(pl.col("B12101").struct.field("v").alias("T"))
        .drop_nulls("T")
        .drop("B12101")
    )
    return timerange_level_to_struct(temperatures)

# def collect_all_data(year, month):
#     cdate = date(year, month, 1)
#     pdate = cdate + timedelta(days=-1)
#     ndate = date(year, month, 28) + timedelta(weeks=1)
#     ndate = date(ndate.year, ndate.month, 1)
#     data = pl.concat([
#         pl.from_arrow(
#             pjson.read_json(fragments / f"{date.year}-{date.month:02d}.json.gz")
#         ).filter(pl.col("date").is_between(pdate, ndate))
#         for date in [pdate, cdate, ndate]
#     ])
#     return data

# def process_temp(data):
#     return (
#         data.group_by("name", "network", pl.col("date").dt.date())
#         .agg(
#             pl.col("T").min().alias("T_MIN"),
#             pl.col("T").max().alias("T_MAX"),
#             pl.col("lon").drop_nulls().mode(),
#             pl.col("lat").drop_nulls().mode(),
#             pl.col("elevation").drop_nulls().mode(),
#         )
#         .melt(
#             id_vars=["network", "name", "date", "lon", "lat", "elevation"],
#             value_vars=["T_MIN", "T_MAX"],
#         )
#         .sort("variable", "network", "name", "date")
#     )


# def decent_table(year, month):
#     return (
#         load_data(year, month)
#         .pipe(process_temp)
#         .with_columns(
#             pl.col("lon").list.first() / 100000,
#             pl.col("lat").list.first() / 100000,
#             pl.col("elevation").list.first(),
#         )
#     )

In [7]:
# Read a single month (February 2011) without any processing, to inspect the
# nested schema of the source files.
sample_year, sample_month = 2011, 2
tarball = fragments / f"{sample_year}-{sample_month:02d}.json.gz"
raw_table = pjson.read_json(tarball)
raw = pl.from_arrow(raw_table)

In [13]:
# One row per ``data`` entry, with the entry's struct and its ``vars`` struct
# expanded into top-level columns.
(
    raw
    .explode("data")
    .unnest("data")
    .unnest("vars")
)

version,network,ident,lon,lat,date,B01019,B01194,B04001,B04002,B04003,B04004,B04005,B04006,B05001,B06001,B07030,B07031,B13011,B12101,B13003,B11002,B11041,B11001,B14198,B10004,B13013,B13215,B11043,B13082,B22001,B22070,B22074,B22071,B22043,B01001,B01002,B13080,B13083,B13231,B22062,timerange,level
str,str,null,i64,i64,datetime[ms],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[1],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[2],struct[1],struct[1],struct[1],struct[1],struct[1],struct[2],struct[1],struct[1],struct[2],struct[2],struct[1],struct[2],list[i64],list[i64]
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,"{""Carpineti""}","{""agrmet""}",{2011},{2},{1},{0},{0},{0},{44.47562},{10.50865},{580.0},{580.0},"{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}",,
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{0.0,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[1, 0, 900]","[1, null, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{0.2,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[1, 0, 3600]","[1, null, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{13.2,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[1, 0, 86400]","[1, null, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{273.85,{null}}","{97,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[0, 0, 3600]","[103, 2000, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{274.27,{null}}","{95,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[0, 0, 86400]","[103, 2000, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{274.35,{null}}","{98,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[2, 0, 3600]","[103, 2000, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{276.95,{null}}","{99,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[2, 0, 86400]","[103, 2000, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{273.55,{null}}","{97,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[3, 0, 3600]","[103, 2000, … null]"
"""0.1""","""agrmet""",,1050865,4447562,2011-02-01 00:00:00,{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},{null},"{null,{null}}","{271.75,{null}}","{84,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}","{null,{null}}",{null},{null},{null},{null},{null},"{null,{null}}",{null},{null},"{null,{null}}","{null,{null}}",{null},"{null,{null}}","[3, 0, 86400]","[103, 2000, … null]"


In [None]:
# Convert every monthly archive into a flat parquet table under ``tables``.
# Months already converted, or whose first day is in the future, are skipped.
tables_dir = base / "tables"
# The output directory has to exist before anything is written into it.
tables_dir.mkdir(parents=True, exist_ok=True)
today = date.today()
for year in trange(2006, 2025):
    for month in trange(1, 13, leave=False):
        archive_path = tables_dir / f"{year}-{month:02d}.parquet"
        if archive_path.exists() or date(year, month, 1) > today:
            continue
        # Write under a temporary name and rename afterwards: an interrupted
        # write must not leave a partial file that the ``exists()`` guard
        # would then skip on every later run.
        partial_path = archive_path.with_name(archive_path.name + ".part")
        load_data(year, month).write_parquet(partial_path)
        partial_path.replace(archive_path)

In [4]:
# Preview the processed table for November 2022, ordered by observation time.
load_data(year=2022, month=11).sort("date")

lon,lat,elevation,WMO_block,WMO_num,name,network,date,T,variable,P1,agg_period,level_code,level_value
i64,i64,f64,i64,i64,str,str,datetime[ms],f64,i64,i64,i64,i64,i64
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,283.95,0,0,3600,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,287.81,0,0,86400,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,284.65,2,0,3600,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,298.45,2,0,86400,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,282.65,3,0,3600,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,280.75,3,0,86400,103,2000
958959,4504139,68.0,,,"""S. Nicolo'""","""agrmet""",2022-11-01 00:00:00,284.05,254,0,0,103,2000
1000494,4500253,41.0,,,"""Castellazzo Vi…","""agrmet""",2022-11-01 00:00:00,282.95,0,0,3600,103,2000
1000494,4500253,41.0,,,"""Castellazzo Vi…","""agrmet""",2022-11-01 00:00:00,287.99,0,0,86400,103,2000
1000494,4500253,41.0,,,"""Castellazzo Vi…","""agrmet""",2022-11-01 00:00:00,283.95,2,0,3600,103,2000
