In [1]:
import json
import os
from datetime import date
from datetime import datetime
from io import StringIO
from pathlib import Path
from time import sleep

import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.json as pjson
import requests
from tqdm.notebook import tqdm

# Root folder of the local ARPA Piemonte dataset. Set the ARPA_PIEMONTE_DIR
# environment variable to point the notebook at another location; when it is
# unset (or empty) the original hard-coded folder is used, so existing setups
# keep working unchanged.
content_path = Path(
    os.environ.get("ARPA_PIEMONTE_DIR")
    or "/Users/davidenicoli/Local_Workspace/Datasets/ARPA/PIEMONTE/"
)
# Per-section sub-folders of the dataset root.
meteo_path = content_path / "meteo"
centenaria_path = content_path / "centenaria"


def db_path(section):
    """Return the folder that stores the downloaded fragments of ``section``.

    The result is ``content_path / section / "fragments"``; the function only
    builds the path and does not touch the file system.
    """
    section_dir = content_path / section
    return section_dir / "fragments"


def meta_path(section):
    """Return the location of the ``meta.csv`` file of ``section``.

    The file sits directly inside the section folder under ``content_path``;
    the function only builds the path and does not touch the file system.
    """
    return content_path.joinpath(section, "meta.csv")


def dati_giornalieri(
    id_punto_misura, section, session: requests.Session, timeout=60
):
    """Request the daily data of one measuring point and return the decoded JSON.

    Args:
        id_punto_misura: identifier sent as the
            ``fk_id_punto_misura_<section>`` query parameter.
        section: dataset name, interpolated both in the endpoint
            (``dati_giornalieri_<section>``) and in the query parameter name.
        session: ``requests`` session used to issue the GET request.
        timeout: seconds to wait for the server before giving up, forwarded
            to ``session.get``. Without it an unresponsive server would block
            the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not answer within ``timeout``.
    """
    response = session.get(
        f"https://utility.arpa.piemonte.it/meteoidro/dati_giornalieri_{section}/",
        params={f"fk_id_punto_misura_{section}": id_punto_misura},
        timeout=timeout,
    )
    # Surface HTTP errors here instead of letting an error page reach .json().
    response.raise_for_status()
    return response.json()

In [2]:
# Download the two station listings. HTTP errors are raised immediately rather
# than being handed to the JSON parser, and a timeout keeps an unresponsive
# server from blocking the notebook forever.
# NOTE(review): only one response per listing is read; if these endpoints are
# paginated (a "next" key), later pages are ignored here — confirm.
response = requests.get(
    "https://utility.arpa.piemonte.it/meteoidro/stazione_meteorologica/",
    timeout=60,
)
response.raise_for_status()
content = response.content
meteo_info = json.loads(content)

response = requests.get(
    "https://utility.arpa.piemonte.it/meteoidro/stazione_centenaria/",
    timeout=60,
)
response.raise_for_status()
content = response.content
centenaria_info = json.loads(content)

In [3]:
# Meteo metadata
# Build the station table from the API payload. Explicit dtypes are given for
# the columns listed below; every other column is inferred by polars.
meteo_interm = pl.DataFrame(
    meteo_info["results"],
    schema_overrides={
        "url": pl.Utf8(),
        "codice_istat_comune": pl.Utf8(),
        "sensori_meteo": pl.List(
            pl.Struct(
                {
                    "url": pl.Utf8(),
                    "id_parametro": pl.Utf8(),
                    "data_inizio": pl.Utf8(),
                    "data_fine": pl.Utf8(),
                    "quota_da_pc": pl.Float32(),
                    "altezza_supporto": pl.Float32(),
                    "note": pl.Utf8(),
                    "fk_id_stazione_meteorologica": pl.Utf8(),
                }
            )
        ),
    },
)
meteo_meta = (
    meteo_interm.rename({"url": "url_stazione"})
    # One row per (station, sensor) pair.
    .explode("sensori_meteo")
    .with_columns(
        # Positional rename, in the order of the struct declared above. The
        # sensor fields would otherwise clash with station columns of the same
        # name (data_inizio, data_fine, note) when the struct is unnested.
        pl.col("sensori_meteo").struct.rename_fields(
            [
                "url_sensore",
                "id_parametro",
                "data_inizio_sensore",
                "data_fine_sensore",
                "quota_da_pc",
                "altezza_supporto",
                "note_sensore",
                "copy_url_stazione",
            ]
        )
    )
    .unnest("sensori_meteo")
    # Keep only the sensors whose parameter id is "TERMA"
    # (presumably the temperature sensors — confirm against the API docs).
    .filter(pl.col("id_parametro") == "TERMA")
    .with_columns(
        # Second-to-last "/"-separated piece of the station URL, truncated to
        # its first 14 characters.
        pl.col("url_stazione")
        .str.split("/")
        .list.get(-2)
        .str.slice(0, 14)
        .alias("codice_punto_misura"),
    )
    # Fix the column order; "copy_url_stazione" is left out on purpose.
    .select(
        [
            "url_stazione",
            "url_sensore",
            "id_parametro",
            "data_inizio_sensore",
            "data_fine_sensore",
            "quota_da_pc",
            "altezza_supporto",
            "note_sensore",
            "codice_istat_comune",
            "codice_stazione",
            "denominazione",
            "indirizzo_localita",
            "nazione",
            "longitudine_e_wgs84_d",
            "latitudine_n_wgs84_d",
            "quota_stazione",
            "esposizione",
            "note",
            "tipo_staz",
            "data_inizio",
            "data_fine",
            "sigla_prov",
            "comune",
            "fk_id_punto_misura_meteo",
            "codice_punto_misura",
        ]
    )
)
# The section folder does not exist on a fresh dataset root, and write_csv
# does not create missing directories.
meteo_meta_file = meta_path("meteo")
meteo_meta_file.parent.mkdir(parents=True, exist_ok=True)
meteo_meta.write_csv(meteo_meta_file, quote_style="always")

# Centenaria metadata
# Build the station table from the API payload, derive the measuring-point
# code and store the coordinates as 64-bit floats.
centenarie_meta = pl.DataFrame(
    centenaria_info["results"],
).with_columns(
    # Second-to-last "/"-separated piece of the measuring-point reference.
    pl.col("fk_id_punto_misura_centenaria")
    .str.split("/")
    .list.get(-2)
    .alias("codice_punto_misura"),
    pl.col("longitudine_e_wgs84_d").cast(pl.Float64()),
    pl.col("latitudine_n_wgs84_d").cast(pl.Float64()),
)
# The section folder does not exist on a fresh dataset root, and write_csv
# does not create missing directories.
centenarie_meta_file = meta_path("centenaria")
centenarie_meta_file.parent.mkdir(parents=True, exist_ok=True)
centenarie_meta.write_csv(centenarie_meta_file, quote_style="always")

In [4]:
# Reload both metadata tables from the CSV files on disk, so the download
# cells can run from the saved files alone. No schema is passed, so the column
# dtypes are whatever the CSV reader infers.
# NOTE(review): code columns stored as text may come back as numbers — confirm.
meteo_meta = pl.read_csv(source=meta_path("meteo"))
centenarie_meta = pl.read_csv(source=meta_path("centenaria"))

In [5]:
def punto_path(id_punto, section) -> Path:
    """Return the folder holding every fragment of one measuring point.

    The folder is named after ``id_punto`` and lives inside the fragments
    folder of ``section``; nothing is created on disk.
    """
    fragments_dir = db_path(section)
    return fragments_dir.joinpath(f"{id_punto}/")

def fragment_path(id_punto, first_date, last_date, section) -> Path:
    """Return the CSV path of the fragment covering ``first_date``..``last_date``.

    Args:
        id_punto: measuring-point identifier (selects the folder).
        first_date: first day covered by the fragment (``date``/``datetime``).
        last_date: last day covered by the fragment (``date``/``datetime``).
        section: dataset name.

    Returns:
        ``<punto folder>/<first>_<last>.csv`` with both dates formatted as
        ``YYYY-MM-DD``.
    """
    # Format specs give the same text as strftime, without nesting the same
    # quote type inside the f-string (a SyntaxError before Python 3.12).
    file_name = f"{first_date:%Y-%m-%d}_{last_date:%Y-%m-%d}.csv"
    return punto_path(id_punto, section) / file_name

def date_extremes(results):
    """Return the earliest and latest ``"data"`` value found in ``results``.

    Each record must carry a ``"data"`` entry formatted as ``YYYY-MM-DD``.
    The two extremes are returned as ``datetime`` objects, earliest first.
    An empty ``results`` raises ``ValueError``.
    """
    parse = datetime.strptime
    days = [parse(record["data"], r"%Y-%m-%d") for record in results]
    earliest = min(days)
    latest = max(days)
    return earliest, latest

def _save_page(page, id_punto, section):
    """Write the records of one API page to their own CSV fragment."""
    records = page["results"]
    first_date, last_date = date_extremes(records)
    # The records are re-serialised to JSON text and parsed by polars; the
    # "url" column is dropped before the fragment is written.
    pl.read_json(StringIO(json.dumps(records))).drop("url").write_csv(
        fragment_path(id_punto, first_date, last_date, section)
    )


def _discard_partial_download(punto_dir):
    """Delete a measuring-point folder together with the files inside it."""
    for file in punto_dir.glob("*"):
        file.unlink()
    punto_dir.rmdir()


def dwn_punto_misura(
    id_punto, session: requests.Session, section, pause=0.2, timeout=60
):
    """Download every page of daily data of one measuring point as CSV fragments.

    An already existing folder for the point is taken as "downloaded" and the
    function returns immediately. Otherwise the folder is created and one CSV
    per API page is written into it, following the ``"next"`` link of each
    page until it is ``None``.

    Any failure, including one on the very first page, removes the folder
    again, so a later run retries the point instead of mistaking a partial or
    empty folder for a complete download.

    Args:
        id_punto: measuring-point identifier.
        session: ``requests`` session used for all the requests.
        section: dataset name.
        pause: seconds slept after each follow-up page.
        timeout: seconds to wait for each follow-up page request issued here
            (the first page is requested through ``dati_giornalieri``).

    Returns:
        ``True`` when the folder already existed or the download completed,
        ``False`` when the download failed and the folder was removed.
    """
    punto_dir = punto_path(id_punto, section)
    if punto_dir.exists():
        return True
    punto_dir.mkdir(parents=True, exist_ok=True)

    try:
        page = dati_giornalieri(id_punto, section, session)
        _save_page(page, id_punto, section)
        while page["next"] is not None:
            response = session.get(page["next"], timeout=timeout)
            response.raise_for_status()
            page = response.json()
            _save_page(page, id_punto, section)
            sleep(pause)
    except Exception as error:
        _discard_partial_download(punto_dir)
        # Report the cause: the caller only sees the boolean result.
        print(f"Download of {section} point {id_punto} failed: {error!r}")
        return False
    except BaseException:
        # KeyboardInterrupt & co.: clean up, but let the interruption through.
        _discard_partial_download(punto_dir)
        raise
    return True



In [8]:
# Download the daily data of every meteo measuring point, one HTTP session per
# point. A failed point is reported and followed by a short pause.
ids = meteo_meta["codice_punto_misura"].unique().to_list()
for id_punto in tqdm(ids):  # not `id`: that name would shadow the builtin
    with requests.Session() as session:
        if not dwn_punto_misura(id_punto, session, "meteo"):
            print("Could not download meteo data for ", id_punto)
            sleep(5)

  0%|          | 0/311 [00:00<?, ?it/s]

In [6]:
# Download the daily data of every centenaria measuring point, one HTTP
# session per point. A failed point is reported and followed by a short pause.
ids = centenarie_meta["codice_punto_misura"].unique().to_list()
for id_punto in tqdm(ids):  # not `id`: that name would shadow the builtin
    with requests.Session() as session:
        if not dwn_punto_misura(id_punto, session, "centenaria"):
            # The message names the section actually being downloaded.
            print("Could not download centenaria data for ", id_punto)
            sleep(5)

  0%|          | 0/2 [00:00<?, ?it/s]