In [3]:
import pandas as pd
import polars as pl
import polars.selectors as cs
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import re
from tqdm.notebook import tqdm
from time import sleep
from random import uniform
from zipfile import is_zipfile

# Root folder of the local Trentino dataset; every other path in this notebook is built from it.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "TRENTINO", "trento")

In [40]:
# Station metadata, with the station code normalised (upper-case, no surrounding whitespace)
# in the same expression that loads it, so re-running the cell always starts from the file.
stat2 = pd.read_xml(base / "meteostations2.xml").assign(
    codice=lambda stations: stations["codice"].str.upper().str.strip()
)

In [42]:
# Sanity check: display the station rows whose normalised `codice` occurs more than once.
pl.from_pandas(stat2).filter(pl.col("codice").is_duplicated())

codice,nome,nomebreve,quota,latitudine,longitudine,est,north,inizio,fine
str,str,str,i64,f64,f64,f64,f64,str,str


In [3]:
# Persist the station metadata (with normalised codes) next to the data, without the pandas index.
stat2.to_csv(base / "meta.csv", index=False)

In [4]:
# Lower-cased station codes: these are the ids sent to the web service (`co` parameter)
# and used as archive file names by the daily download loop.
stat_ids = stat2["codice"].str.lower().to_list()

In [26]:
# Per-variable request settings, keyed by the short variable name used throughout the notebook:
#   "vn"   -> value sent as the `vn` query parameter (variable label, in Italian)
#   "v"    -> value sent as the `v` query parameter (variable code)
#   "conv" -> name written to the `variable` column by pl_read_annali (absent for "t")
tvar_params = dict(
    tmin=dict(
        vn="Temperatura aria (gradi Celsius) Min da Annale Idrologico",
        v="400.55_400.55",
        conv="T_MIN",
    ),
    tmax=dict(
        vn="Temperatura aria (gradi Celsius) Max da Annale Idrologico",
        v="400.56_400.56",
        conv="T_MAX",
    ),
    # The trailing space in this label is part of the value and is kept as-is.
    t=dict(
        vn="Temperatura aria (gradi Celsius) ",
        v="400.00_400.00",
    ),
)

def payload(cod_stazione, tvar, data_inizio="01/01/2000", data_fine="01/01/2023"):
    """Build the query parameters for a daily-aggregated download request.

    Args:
        cod_stazione: Station code, sent as ``co``.
        tvar: Key into ``tvar_params`` selecting the variable code/label.
        data_inizio: Accepted for signature compatibility; not used in the query.
        data_fine: Accepted for signature compatibility; not used in the query.

    Returns:
        dict: Parameters in the order ``co, v, vn, p, o, i, cat``.

    Raises:
        KeyError: If ``tvar`` is not a key of ``tvar_params``.
    """
    variable = tvar_params[tvar]
    query = dict(co=cod_stazione, v=variable["v"], vn=variable["vn"])
    # Fixed part of the request: "Tutti i dati" (all data) period, download output,
    # "Giornaliera" (daily) interval.
    query.update(
        p="Tutti i dati,01/01/1800,01/01/1800,period,1",
        o="Download,download",
        i="Giornaliera,Day,1",
        cat="rs",
    )
    return query


def payload_subhour(cod_stazione, tvar, data_inizio="01/01/2000", data_fine="01/01/2023"):
    """Build the query parameters for an all-measurements download request.

    Same shape as ``payload`` but with the "Tutte le misure" (all measurements)
    interval. The variable is always the ``"t"`` entry of ``tvar_params``.

    Args:
        cod_stazione: Station code, sent as ``co``.
        tvar: Ignored; the ``"t"`` variable is always requested.
        data_inizio: Not used in the query (a ``d1`` parameter was tried and disabled).
        data_fine: Not used in the query (a ``d2`` parameter was tried and disabled).

    Returns:
        dict: Parameters in the order ``co, v, vn, p, o, i, cat``.
    """
    air_temperature = tvar_params["t"]
    query = dict(co=cod_stazione, v=air_temperature["v"], vn=air_temperature["vn"])
    query.update(
        p="Tutti i dati,01/01/1800,01/01/1800,period,1",
        o="Download,download",
        i="Tutte le misure,Point,1",
        cat="rs",
    )
    return query


# Endpoint queried by download_data; the response page is scanned for the link to the zip archive.
base_url = "http://storico.meteotrentino.it/cgi/webhyd.pl"


def download_data(
    cod_stazione,
    tvar,
    path,
    data_inizio="01/01/2000",
    data_fine="01/01/2023",
    payload_fn=payload,
    tout=120,
):
    """Request an export for one station/variable and save the resulting zip to ``path``.

    Two requests are made in one session: the first asks ``base_url`` for the
    export and returns a page containing a link to a zip archive; the second
    streams that archive to disk.

    Args:
        cod_stazione: Station code forwarded to ``payload_fn``.
        tvar: Variable key forwarded to ``payload_fn``.
        path: Destination file; opened in binary write mode (overwritten if present).
        data_inizio: Forwarded to ``payload_fn`` as third positional argument.
        data_fine: Forwarded to ``payload_fn`` as fourth positional argument.
        payload_fn: Callable building the query parameters for the first request.
        tout: Timeout in seconds applied to both requests.

    Raises:
        requests.HTTPError: If either request returns an error status.
        ValueError: If the response page contains no archive link.
    """
    with requests.Session() as s:
        r = s.get(
            base_url,
            params=payload_fn(cod_stazione, tvar, data_inizio, data_fine),
            timeout=tout,
        )
        r.raise_for_status()
        # Escaped dot and a non-greedy, whitespace-free match so that only a single
        # URL ending in ".zip?<digits>" is captured.
        link = re.search(r"http://\S+?\.zip\?\d+", r.text)
        if link is None:
            raise ValueError(
                f"no archive link found in the response for station {cod_stazione!r}, variable {tvar!r}"
            )
        # The timeout also covers the archive request so a stalled transfer cannot hang forever.
        with s.get(link.group(0), stream=True, timeout=tout) as archive:
            # Check the status before opening the file, so an error page is never saved as a zip.
            archive.raise_for_status()
            with open(path, "wb") as f:
                for chunk in archive.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)


def fragment_path(cod_stazione, tvar):
    """Return the archive path for one station and variable, creating its folder.

    The path is ``base / "fragments" / tvar / "<cod_stazione>.zip"``.

    Args:
        cod_stazione: Station code, used as the file stem.
        tvar: Variable key, used as the sub-folder name.

    Returns:
        pathlib.Path: Location of the archive (the file itself is not created).
    """
    path = base / "fragments" / tvar / f"{cod_stazione}.zip"
    # exist_ok replaces the exists()-then-mkdir() pair, which could fail if the
    # folder appeared between the check and the creation.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path


def subhour_fragment_path(cod_stazione):
    """Return the sub-hourly archive path for one station, creating its folder.

    The path is ``base / "fragments_subhour" / "<cod_stazione>.zip"``.

    Args:
        cod_stazione: Station code, used as the file stem.

    Returns:
        pathlib.Path: Location of the archive (the file itself is not created).
    """
    path = base / "fragments_subhour" / f"{cod_stazione}.zip"
    # exist_ok replaces the exists()-then-mkdir() pair, which could fail if the
    # folder appeared between the check and the creation.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
# Download the daily archives for every station and variable, skipping those already on disk.
for sid in tqdm(stat_ids):
    for tvar in ["tmin", "tmax", "t"]:
        path = fragment_path(sid, tvar)
        if path.exists() and is_zipfile(path):
            continue
        # Anything left here is a broken archive from an earlier attempt.
        path.unlink(missing_ok=True)
        try:
            download_data(sid, tvar, path)
            if not is_zipfile(path):
                raise ValueError("downloaded file is not a valid zip archive")
        except Exception as e:
            # Do not leave a partial/invalid file behind: later cells glob "*.zip"
            # in this folder and try to parse every match.
            path.unlink(missing_ok=True)
            print(f"Error with {sid} {tvar}: {e}. Continuing...")
        # Random pause between requests to avoid hammering the server.
        sleep(uniform(0.2, 2))

In [18]:
def pl_read_t(path):
    """Load one daily "t" fragment into a polars DataFrame.

    The file is parsed with ``pandas.read_csv`` (ISO-8859-1, first three lines
    skipped, the following header line replaced by fixed column names, first
    seven columns only) and then converted to polars.

    Args:
        path: ``pathlib.Path`` of the fragment; its stem is the station id.

    Returns:
        polars.DataFrame: Columns ``time`` (naive datetime), ``value_*`` / ``valid_*``
        for TAVG, TMIN and TMAX, ``original_id`` (upper-cased, stripped stem) and
        ``date``. Rows where all three values are null are dropped.
    """
    # Column name -> polars dtype; the insertion order is also the CSV column order.
    schema = {"time": pl.Utf8()}
    for stat in ("TAVG", "TMIN", "TMAX"):
        schema[f"value_{stat}"] = pl.Float64()
        schema[f"valid_{stat}"] = pl.Int32()

    raw = pd.read_csv(
        path,
        encoding="iso-8859-1",
        skiprows=3,
        usecols=range(7),
        header=0,
        names=list(schema),
    )

    station = pl.lit(path.stem).str.to_uppercase().str.strip_chars()
    timestamp = pl.col("time").str.to_datetime(format=r"%H:%M:%S %d/%m/%Y")
    any_reading = (
        pl.col("value_TAVG").is_not_null()
        | pl.col("value_TMIN").is_not_null()
        | pl.col("value_TMAX").is_not_null()
    )

    frame = pl.from_pandas(raw, schema_overrides=schema)
    frame = frame.with_columns(
        station.alias("original_id"),
        timestamp.alias("time"),
    )
    # Second step because `date` is derived from the already-parsed `time`.
    frame = frame.with_columns(pl.col("time").dt.date().alias("date"))
    return frame.filter(any_reading)

def pl_read_annali(path):
    """Load one "annali" (tmin/tmax) fragment into a polars DataFrame.

    The station id is the file stem and the variable key is the name of the
    containing folder, which must be a ``tvar_params`` key with a ``"conv"`` entry.

    Args:
        path: ``pathlib.Path`` of the fragment.

    Returns:
        polars.DataFrame: Columns ``time``, ``value``, ``valid``, ``original_id``
        and ``variable``; rows with a null ``value`` are dropped.

    Raises:
        KeyError: If the folder name is not in ``tvar_params`` or has no ``"conv"``.
    """
    station_code = path.stem
    variable_key = path.parent.stem
    column_types = {
        "time": pl.Utf8(),
        "value": pl.Float64(),
        "valid": pl.Int32(),
    }

    raw = pd.read_csv(
        path,
        encoding="iso-8859-1",
        skiprows=3,
        usecols=range(3),
        header=0,
        names=list(column_types),
    )
    frame = pl.from_pandas(raw, schema_overrides=column_types)

    original_id = pl.lit(station_code).str.to_uppercase().str.strip_chars()
    variable = pl.lit(tvar_params[variable_key]["conv"]).str.to_uppercase().str.strip_chars()
    # Timestamps are tagged with a fixed +01:00 offset before parsing and then
    # expressed in the "CET" time zone.
    local_time = (
        (pl.col("time") + "+0100")
        .str.to_datetime(format=r"%H:%M:%S %d/%m/%Y%z", time_zone="UTC")
        .dt.convert_time_zone("CET")
    )

    labelled = frame.with_columns(
        original_id.alias("original_id"),
        variable.alias("variable"),
        local_time.alias("time"),
    )
    return labelled.filter(pl.col("value").is_not_null())

def pl_read_subhour(path):
    """Load one all-measurements fragment into a time-sorted polars DataFrame.

    Args:
        path: ``pathlib.Path`` of the fragment; its stem is the station id.

    Returns:
        polars.DataFrame: Columns ``time``, ``value``, ``valid`` and ``original_id``,
        keeping only rows with a non-null ``value`` and ``valid`` below 150,
        sorted by ``time``.
    """
    column_types = {
        "time": pl.Utf8(),
        "value": pl.Float64(),
        "valid": pl.Int32(),
    }
    raw = pd.read_csv(
        path,
        encoding="iso-8859-1",
        skiprows=3,
        usecols=[0, 1, 2],
        header=0,
        names=["time", "value", "valid"],
    )

    station = pl.lit(path.stem).str.to_uppercase().str.strip_chars()
    # Timestamps are tagged with a fixed +01:00 offset before parsing and then
    # expressed in the "CET" time zone.
    local_time = (
        (pl.col("time") + "+0100")
        .str.to_datetime(format=r"%H:%M:%S %d/%m/%Y%z")
        .dt.convert_time_zone("CET")
    )
    # NOTE(review): `valid` is presumably a quality code and 150 its cut-off — confirm.
    usable = pl.col("value").is_not_null() & pl.col("valid").lt(150)

    frame = pl.from_pandas(raw, schema_overrides=column_types)
    frame = frame.with_columns(
        station.alias("original_id"),
        local_time.alias("time"),
    )
    return frame.filter(usable).sort("time")

In [43]:
# One-off build step, kept for reference: report fragments that are not valid zip files,
# then parse every daily "t" fragment with pl_read_t and cache the result as parquet.
# Uncomment to rebuild the cache.
# for file in list((base / "fragments" / "t").glob("*.zip")):
#     if not is_zipfile(file):
#         print(f"Error in {file}")
# pl.concat(
#     [pl_read_t(file) for file in (base / "fragments" / "t").glob("*.zip")], how="vertical"
# ).write_parquet(base / "fragments" / "t" / "data.parquet")
# Load the cached daily "t" table.
data = pl.read_parquet(base / "fragments" / "t" / "data.parquet")

In [38]:
# Build the combined tmax/tmin ("annali") table and cache it as parquet.
# The fragments are listed once (tmax first, then tmin) and validated before parsing,
# so a broken archive is reported and skipped instead of crashing the reader.
annali_files = [
    file
    for tvar in ("tmax", "tmin")
    for file in (base / "fragments" / tvar).glob("*.zip")
]
annali_valid = []
for file in annali_files:
    if is_zipfile(file):
        annali_valid.append(file)
    else:
        print(f"Error in {file}")
pl.concat(
    [pl_read_annali(file) for file in annali_valid], how="vertical"
).write_parquet(base / "fragments" / "annali.parquet")
# NOTE(review): this rebinds `data`, which the cell loading fragments/t/data.parquet also assigns.
data = pl.read_parquet(base / "fragments" / "annali.parquet")

In [10]:
# NOTE(review): fragment_path stores archives in per-variable sub-folders (fragments/<tvar>/),
# so this non-recursive glob may match nothing — confirm whether `frags` is still needed.
frags = list((base / "fragments").glob("*.zip"))

In [57]:
# Convert every sub-hourly archive to its own parquet file under fragments_subhour/dataset.
subhour_dataset_dir = base / "fragments_subhour" / "dataset"
# write_parquet does not create missing folders, so make sure the target exists.
subhour_dataset_dir.mkdir(parents=True, exist_ok=True)
for file in list((base / "fragments_subhour").glob("*.zip")):
    if not is_zipfile(file):
        print(f"Error in {file}")
        # A broken archive cannot be parsed; report it and move on to the next one.
        continue
    pl_read_subhour(file).write_parquet(
        subhour_dataset_dir / f"{file.stem}.parquet"
    )

# Earlier single-file variant, kept for reference:
# pl.concat(
#     [pl_read_subhour(file) for file in (base / "fragments").glob("*.zip")], how="vertical"
# ).write_parquet(base / "fragments" / "data.parquet")

In [27]:
# Stations whose daily "t" table needs the sub-hourly data: either only the average is
# available (min and max both missing) or the minimum is not below the maximum.
# The daily table is loaded explicitly here because the name `data` is also rebound to the
# annali table by another cell, and that table has no value_TMIN/value_TMAX/value_TAVG columns.
t_daily = pl.read_parquet(base / "fragments" / "t" / "data.parquet")
to_redwn = (
    t_daily.filter(
        ((pl.col("value_TMIN").is_null() & pl.col("value_TMAX").is_null())
        & pl.col("value_TAVG").is_not_null()) | (pl.col("value_TMIN") >= pl.col("value_TMAX"))
    )["original_id"]
    .unique()
    .to_list()
)

In [31]:
# Download the sub-hourly archives for the flagged stations, skipping those already on disk.
# NOTE(review): these ids come from `original_id` (upper-case) whereas the daily loop sends
# lower-case codes — confirm the service accepts both.
for sid in tqdm(to_redwn):
    path = subhour_fragment_path(sid)
    if path.exists() and is_zipfile(path):
        continue
    try:
        download_data(
            sid, "t", path, "01/01/2000", "01/01/2023", payload_fn=payload_subhour, tout=300
        )
        if not is_zipfile(path):
            raise ValueError("downloaded file is not a valid zip archive")
    except Exception as e:
        # Do not leave a partial/invalid file behind: the conversion cell globs "*.zip"
        # in this folder and tries to parse every match.
        path.unlink(missing_ok=True)
        print(f"Error with {sid}: {e}. Continuing...")
    # Random pause between requests to avoid hammering the server.
    sleep(uniform(0.2, 1))

  0%|          | 0/44 [00:00<?, ?it/s]

KeyboardInterrupt: 