In [1]:
import pandas as pd
import polars as pl
import polars.selectors as cs
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import re
from tqdm.notebook import tqdm
from time import sleep
from random import uniform
from zipfile import is_zipfile

# Root folder (under the user's home directory) for every file this notebook reads or writes.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "TRENTINO", "trento")

In [4]:
# Station metadata, with the station codes normalised to upper case and
# stripped of surrounding whitespace.
stat2 = pd.read_xml(base / "meteostations2.xml").assign(
    codice=lambda stations: stations["codice"].str.upper().str.strip()
)


In [5]:
# Save the normalised station metadata as CSV, without the index column.
stat2.to_csv(base.joinpath("meta.csv"), index=False)

In [6]:
# Lower-cased station codes; these are the ids the daily download loop iterates over.
stat_ids = stat2["codice"].str.lower().tolist()

In [13]:
def payload(cod_stazione, data_inizio="01/01/2000", data_fine="01/01/2023"):
    """Build the query parameters for a daily ("Giornaliera") temperature export.

    Three series are requested at once; their codes go into ``v`` and their
    labels into ``vn``, both as comma-separated strings.

    Args:
        cod_stazione: Station code, sent as the ``co`` parameter.
        data_inizio: Start of the requested period, sent as ``d1``.
        data_fine: End of the requested period, sent as ``d2``.

    Returns:
        dict: Parameter name -> value, in the order the keys are listed below.
    """
    series_codes = ["400.00_400.00", "400.55_400.55", "400.56_400.56"]
    series_labels = [
        "Temperatura aria (gradi Celsius)",
        "Temperatura aria (gradi Celsius) Min da Annale Idrologico",
        "Temperatura aria (gradi Celsius) Max da Annale Idrologico",
    ]
    query = {"co": cod_stazione}
    query["v"] = ",".join(series_codes)
    query["vn"] = ",".join(series_labels)
    query.update(
        p="Altro,1,1,custom,1",
        o="Download,download",
        i="Giornaliera,Day,1",
        cat="rs",
        d1=data_inizio,
        d2=data_fine,
    )
    return query


def payload_every(cod_stazione, data_inizio="01/01/2000", data_fine="01/01/2023"):
    """Build the query parameters for an "all measurements" temperature export.

    Only the single "Temperatura aria (gradi Celsius)" series is requested,
    with the interval set to "Tutte le misure,Point,1".

    Args:
        cod_stazione: Station code, sent as the ``co`` parameter.
        data_inizio: Start of the requested period, sent as ``d1``.
        data_fine: End of the requested period, sent as ``d2``.

    Returns:
        dict: Parameter name -> value, in the order the keys are listed below.
    """
    return dict(
        co=cod_stazione,
        v="400.00_400.00",
        vn="Temperatura aria (gradi Celsius)",
        p="Altro,1,1,custom,1",
        o="Download,download",
        i="Tutte le misure,Point,1",
        cat="rs",
        d1=data_inizio,
        d2=data_fine,
    )


# CGI endpoint that download_data sends its export requests to.
base_url = "http://storico.meteotrentino.it/cgi/webhyd.pl"


def download_data(
    cod_stazione,
    path,
    data_inizio="01/01/2000",
    data_fine="01/01/2023",
    payload_fn=payload,
    tout=120,
):
    """Request an export for one station and save the resulting zip archive.

    The export is a two-step exchange: the first request returns a page that
    contains a link to a generated ``.zip`` file, and the second request
    streams that file to ``path``.

    Args:
        cod_stazione: Station code passed to ``payload_fn``.
        path: Destination file; opened in binary write mode and overwritten.
        data_inizio: Start of the requested period, passed to ``payload_fn``.
        data_fine: End of the requested period, passed to ``payload_fn``.
        payload_fn: Callable ``(cod_stazione, data_inizio, data_fine)`` that
            returns the query parameters for the first request.
        tout: Timeout in seconds, applied to both HTTP requests.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status from either request.
        ValueError: If the first response contains no archive link.
    """
    with requests.Session() as s:
        r = s.get(
            base_url,
            params=payload_fn(cod_stazione, data_inizio, data_fine),
            timeout=tout,
        )
        # Fail here rather than searching an error page for a link.
        r.raise_for_status()
        # The dot before "zip" is escaped and the match is non-greedy and limited
        # to URL characters, so it cannot run across unrelated text on the same
        # line of the page.
        match = re.search(r"http://[^\s\"'<>]+?\.zip\?\d+", r.text)
        if match is None:
            raise ValueError(
                f"No archive link found in the response for station {cod_stazione!r}"
            )
        archive_url = match.group(0)
        # Without a timeout a stalled transfer would block the caller forever.
        with s.get(archive_url, stream=True, timeout=tout) as archive:
            # Do not write an HTTP error body to disk as if it were the archive.
            archive.raise_for_status()
            with open(path, "wb") as f:
                for chunk in archive.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)


def fragment_path(cod_stazione):
    """Return the archive path for a station under ``base / "fragments"``.

    The ``fragments`` folder is created if it does not exist yet.

    Args:
        cod_stazione: Station code; used verbatim as the file stem.

    Returns:
        pathlib.Path: ``base / "fragments" / "<cod_stazione>.zip"``.
    """
    path = base / "fragments" / f"{cod_stazione}.zip"
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path


def subhour_fragment_path(cod_stazione):
    """Return the archive path for a station under ``base / "fragments_subhour"``.

    The ``fragments_subhour`` folder is created if it does not exist yet.

    Args:
        cod_stazione: Station code; used verbatim as the file stem.

    Returns:
        pathlib.Path: ``base / "fragments_subhour" / "<cod_stazione>.zip"``.
    """
    path = base / "fragments_subhour" / f"{cod_stazione}.zip"
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
# Download the daily archive of every station. Stations whose archive is
# already on disk and is a valid zip file are skipped, so the cell can be
# re-run to retry only the failed ones.
for sid in tqdm(stat_ids):
    path = fragment_path(sid)
    if path.exists() and is_zipfile(path):
        continue
    try:
        download_data(sid, path, "01/01/2000", "01/01/2023")
    except Exception as e:
        # Catch Exception, not everything: a bare `except:` would also swallow
        # KeyboardInterrupt and make the loop impossible to stop. The cause is
        # reported so failures can be diagnosed.
        print(f"Error with {sid}: {e}. Continuing...")
    # Random pause between requests.
    sleep(uniform(1, 5))

In [7]:
def pl_read(path):
    """Read one daily-temperature file into a polars DataFrame.

    The file is parsed with pandas (ISO-8859-1 text, first three rows skipped,
    the next row treated as a header and replaced by the column names below,
    first seven columns only) and then converted to polars with explicit
    dtypes.

    Args:
        path: ``pathlib.Path`` of the file to read. Its stem, upper-cased and
            stripped, becomes the ``original_id`` column.

    Returns:
        polars.DataFrame: Columns ``time`` (parsed from
        ``"%H:%M:%S %d/%m/%Y"`` strings), ``value_*`` / ``valid_*`` for TAVG,
        TMIN and TMAX, and ``original_id``. Rows where all three ``value_*``
        columns are null are dropped.
    """
    column_names = [
        "time",
        "value_TAVG",
        "valid_TAVG",
        "value_TMIN",
        "valid_TMIN",
        "value_TMAX",
        "valid_TMAX",
    ]
    raw = pd.read_csv(
        path,
        encoding="iso-8859-1",
        skiprows=3,
        header=0,
        usecols=list(range(7)),
        names=column_names,
    )

    # "time" stays a string here; it is parsed by polars below.
    dtypes = {"time": pl.Utf8()}
    for kind in ("TAVG", "TMIN", "TMAX"):
        dtypes[f"value_{kind}"] = pl.Float64()
        dtypes[f"valid_{kind}"] = pl.Int32()

    station = pl.lit(path.stem).str.to_uppercase().str.strip_chars().alias("original_id")
    # No time zone is attached to the parsed timestamps in this function.
    timestamp = pl.col("time").str.to_datetime(format=r"%H:%M:%S %d/%m/%Y").alias("time")
    has_any_value = (
        pl.col("value_TAVG").is_not_null()
        | pl.col("value_TMIN").is_not_null()
        | pl.col("value_TMAX").is_not_null()
    )

    return (
        pl.from_pandas(raw, schema_overrides=dtypes)
        .with_columns(station, timestamp)
        .filter(has_any_value)
    )

def pl_read_subhour(path):
    """Read one "all measurements" temperature file into a polars DataFrame.

    The file is parsed with pandas (ISO-8859-1 text, first three rows skipped,
    the next row treated as a header and replaced by the column names below,
    first three columns only) and then converted to polars with explicit
    dtypes.

    Args:
        path: ``pathlib.Path`` of the file to read. Its stem, upper-cased and
            stripped, becomes the ``original_id`` column.

    Returns:
        polars.DataFrame: Columns ``time`` (parsed from
        ``"%H:%M:%S %d/%m/%Y"`` strings), ``value``, ``valid`` and
        ``original_id``. Only rows with a non-null ``value`` and
        ``valid`` below 150 are kept.
    """
    raw = pd.read_csv(
        path,
        encoding="iso-8859-1",
        skiprows=3,
        header=0,
        usecols=list(range(3)),
        names=["time", "value", "valid"],
    )

    # "time" stays a string here; it is parsed by polars below.
    dtypes = {
        "time": pl.Utf8(),
        "value": pl.Float64(),
        "valid": pl.Int32(),
    }

    station = pl.lit(path.stem).str.to_uppercase().str.strip_chars().alias("original_id")
    # No time zone is attached to the parsed timestamps in this function.
    timestamp = pl.col("time").str.to_datetime(format=r"%H:%M:%S %d/%m/%Y").alias("time")
    # NOTE(review): 150 is taken as-is from the original code; the meaning of
    # the "valid" codes is not visible here.
    keep_row = pl.col("value").is_not_null() & (pl.col("valid") < 150)

    return (
        pl.from_pandas(raw, schema_overrides=dtypes)
        .with_columns(station, timestamp)
        .filter(keep_row)
    )


In [9]:
# Combined daily dataset. Building it parses every downloaded archive, so the
# result is cached as parquet and only rebuilt when the cache file is missing.
# Delete the parquet file to force a rebuild after new downloads.
daily_parquet = base / "fragments" / "data.parquet"
if not daily_parquet.exists():
    daily_frames = []
    for file in sorted((base / "fragments").glob("*.zip")):
        if not is_zipfile(file):
            # A failed download can leave a non-zip file behind; report and skip it.
            print(f"Error in {file}")
            continue
        daily_frames.append(pl_read(file))
    if not daily_frames:
        raise FileNotFoundError(
            f"No valid archives found in {base / 'fragments'}; run the download cell first."
        )
    pl.concat(daily_frames, how="vertical").write_parquet(daily_parquet)
data = pl.read_parquet(daily_parquet)

In [10]:
# Every .zip file currently present in the daily fragments folder.
frags = list(base.joinpath("fragments").glob("*.zip"))

In [11]:
# Convert every "all measurements" archive into its own parquet file under
# fragments_subhour/dataset.
subhour_dataset_dir = base / "fragments_subhour" / "dataset"
# write_parquet needs the target folder to exist already.
subhour_dataset_dir.mkdir(parents=True, exist_ok=True)
for file in sorted((base / "fragments_subhour").glob("*.zip")):
    if not is_zipfile(file):
        print(f"Error in {file}")
        # Skip the broken archive instead of trying to parse it anyway.
        continue
    pl_read_subhour(file).write_parquet(subhour_dataset_dir / f"{file.stem}.parquet")

# pl.concat(
#     [pl_read_subhour(file) for file in (base / "fragments").glob("*.zip")], how="vertical"
# ).write_parquet(base / "fragments" / "data.parquet")

In [9]:
# Stations to download again with payload_every: those with at least one row
# that has a TAVG value but neither a TMIN nor a TMAX value.
to_redwn = (
    data.filter(
        pl.col("value_TAVG").is_not_null()
        & pl.col("value_TMIN").is_null()
        & pl.col("value_TMAX").is_null()
    )["original_id"]
    .unique()
    .to_list()
)

In [52]:
# Download the "all measurements" archive for each selected station. Archives
# already on disk as valid zip files are skipped without any pause.
# NOTE(review): these ids come from `original_id`, which pl_read upper-cases,
# while the daily download loop sends lower-cased codes. Confirm the service
# accepts both forms.
for sid in tqdm(to_redwn):
    path = subhour_fragment_path(sid)
    if not (path.exists() and is_zipfile(path)):
        try:
            download_data(
                sid,
                path,
                "01/01/2000",
                "01/01/2023",
                payload_fn=payload_every,
                tout=300,
            )
        except Exception as e:
            print(f"Error with {sid}: {e}. Continuing...")
        # Random pause after every attempted download, successful or not.
        sleep(uniform(5, 10))

  0%|          | 0/169 [00:00<?, ?it/s]