In [1]:
import requests
from requests import Session
import polars as pl
from pathlib import Path
from time import sleep
from random import random
from tqdm.notebook import tqdm
from pathlib import Path
import os

In [2]:
# Endpoint that serves the archive downloads.
base_url = "http://www.sir.toscana.it/archivio/download.php"


def download_payload(id, variable):
    """Build the query parameters for one archive download request.

    Known values for ``variable``:
      - ``termo_csv``: temperature
      - ``pluvio0_24``: precipitation 0-24
      - ``pluvio``: precipitation 9-9

    Args:
        id: station identifier, sent as ``IDS``.
        variable: dataset selector, sent as ``IDST``.

    Returns:
        A dict with the keys ``IDST`` and ``IDS``, in that order.
    """
    payload = {}
    payload["IDST"] = variable
    payload["IDS"] = id
    return payload

In [3]:
def read_metas(base: Path):
    """Merge the three station metadata files and return every station id.

    Reads ``automatiche_sir.csv``, ``automatiche.csv`` and ``tradizionali.csv``
    from ``base`` (semicolon separated, ISO-8859-1), tags each row with a
    ``network`` label, stacks them and writes the result to
    ``base / "stazioni.csv"``.

    Every station of ``automatiche_sir.csv`` is also listed in
    ``automatiche.csv``, so those rows are removed from the latter with an
    anti-join on ``IDStazione`` before stacking.

    Args:
        base: directory holding the metadata CSV files.

    Returns:
        The ``IDStazione`` column of the merged table as a Python list.
    """

    def _load(filename, network_label, **read_opts):
        # Shared reader: same encoding/separator for all files, plus the network tag.
        frame = pl.read_csv(
            base / filename, encoding="iso-8859-1", separator=";", **read_opts
        )
        return frame.with_columns(pl.lit(network_label).alias("network"))

    sensor_id_dtype = {"IDSensoreRete": pl.Int64()}

    sir_stations = _load("automatiche_sir.csv", "Automatiche SIR", dtypes=sensor_id_dtype)
    # Keep only the automatic stations that are not already in the SIR list.
    other_automatic = _load("automatiche.csv", "Automatiche").join(
        sir_stations, on="IDStazione", how="anti"
    )
    traditional = _load("tradizionali.csv", "Tradizionali", dtypes=sensor_id_dtype)

    stations = pl.concat([other_automatic, sir_stations, traditional], how="vertical")
    stations.write_csv(base / "stazioni.csv")
    return stations["IDStazione"].to_list()

In [4]:
# Browser-like request headers, kept as a list of name/value records.
headers_l = [
    {"name": "Accept", "value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"},
    {"name": "Accept-Encoding", "value": "gzip, deflate"},
    {"name": "Accept-Language", "value": "it-IT,it;q=0.8,en-US;q=0.5,en;q=0.3"},
    {"name": "Cache-Control", "value": "no-cache"},
    {"name": "Connection", "value": "keep-alive"},
    {"name": "Host", "value": "www.sir.toscana.it"},
    {"name": "Pragma", "value": "no-cache"},
    {"name": "Referer", "value": "http://www.sir.toscana.it/consistenza-rete"},
    {"name": "Upgrade-Insecure-Requests", "value": "1"},
    {"name": "User-Agent", "value": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0"},
]

# Flatten the records into a plain {header name: header value} mapping.
headers = {entry["name"]: entry["value"] for entry in headers_l}

In [8]:
# Values for 'variable':
# - termo_csv: temperature
# - pluvio0_24: precipitation 0-24
# - pluvio: precipitation 9-9
def _download_fragment(dwn_path, stat_id, session, variable, skip_existing):
    """Fetch one variable of one station into ``dwn_path`` unless it is already there.

    Returns True when a usable file was already on disk before the call.
    """
    # Resume mechanism: a file larger than 100 bytes counts as already downloaded.
    present = dwn_path.exists() and (dwn_path.stat().st_size > 100)
    if not present or not skip_existing:
        csv_content = session.get(base_url, params=download_payload(stat_id, variable), headers=headers, cookies=session.cookies).text
        dwn_path.write_text(
            csv_content, encoding="utf-8"
        )
    return present


def download_station(base_path, stat_id, session, variable, skip_existing = True, verbose = False):
    """Download the CSV fragment(s) of one station into ``base_path / "fragments"``.

    Args:
        base_path: dataset directory; its ``fragments`` sub-directory must exist.
        stat_id: station identifier.
        session: object exposing ``get(url, params=..., headers=..., cookies=...)``
            and a ``cookies`` attribute (e.g. a ``requests.Session``).
        variable: a single variable name, or a list/tuple of variable names.
        skip_existing: when True, a fragment already on disk (> 100 bytes) is
            not requested again.
        verbose: when True, failures in list mode are printed.

    Returns:
        For a single variable: whether the fragment was already present before
        the call. For a list/tuple: True only if *every* fragment was already
        present, so the caller can tell whether any request was attempted.

    Storage layout:
        - single variable: ``fragments/{stat_id}.csv``
        - list/tuple:      ``fragments/{stat_id}_{variable}.csv`` for each entry,
          so that the variables of one station do not overwrite each other.

    In list mode a failing variable does not stop the remaining ones
    (best effort); in single-variable mode exceptions propagate to the caller.
    """
    fragments_dir = base_path / "fragments"

    # isinstance(), not `variable is list`: the latter compares against the
    # type object itself and is never true for an actual list.
    if isinstance(variable, (list, tuple)):
        all_present = True
        for v in variable:
            try:
                present = _download_fragment(
                    fragments_dir / f"{stat_id}_{v}.csv", stat_id, session, v, skip_existing
                )
            except Exception:
                # Exception (not a bare except) so Ctrl-C still interrupts the run.
                present = False
                if verbose:
                    print(f"Error downloading {v} for station {stat_id}")
            all_present = all_present and present
        return all_present

    return _download_fragment(
        fragments_dir / f"{stat_id}.csv", stat_id, session, variable, skip_existing
    )

def download_dataset(base_path: Path, variable: str|list[str], max_pause = 3):
    station_ids = read_metas(base_path)
    os.makedirs(base_path / "fragments", exist_ok=True)
    with Session() as session:
        for stat_id in tqdm(station_ids):
            if not download_station(base_path, stat_id, session, variable):
                sleep(max_pause * random())

In [None]:
# Dataset directory under the user's home; it must contain the station metadata CSVs.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "TOSCANA", "pluvio")
# Fetch both precipitation variables (0-24 and 9-9) for every station.
download_dataset(base, ["pluvio0_24", "pluvio"])