In [1]:
import requests
from requests import Session
import polars as pl
from pathlib import Path
from time import sleep
from random import random
from tqdm.notebook import tqdm

# Root folder (under the user's home directory) that holds the station
# registry CSVs read below and the downloaded per-station fragments.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "TOSCANA")

In [2]:
def dwn_url(id):
    """Build the SIR Toscana archive download URL for one station.

    Args:
        id: Station identifier; it is formatted verbatim into the ``IDS``
            query parameter (no URL-escaping is applied).

    Returns:
        The ``download.php`` URL as a string, with ``IDST=termo_csv`` and
        ``IDS=<id>`` in the query string.
    """
    endpoint = "http://www.sir.toscana.it/archivio/download.php"
    query = "IDST=termo_csv&IDS={}".format(id)
    return endpoint + "?" + query

In [7]:
# Station registries, one CSV per network. Per the original note, every
# "automatiche_sir" station also appears in automatiche.csv, so those rows are
# removed from `auto` with an anti-join before the three tables are stacked.
# The combined registry is written to stazioni.csv.
def _read_registry(file_name, network, **read_options):
    """Read one registry CSV from `base` and tag each row with its network.

    The file is parsed as ';'-separated ISO-8859-1 text; any extra keyword
    arguments are forwarded to `pl.read_csv`. A literal "network" column
    holding `network` is added to the returned frame.
    """
    registry = pl.read_csv(
        base / file_name, encoding="iso-8859-1", separator=";", **read_options
    )
    return registry.with_columns(pl.lit(network).alias("network"))


auto_sir = _read_registry(
    "automatiche_sir.csv",
    "Automatiche SIR",
    dtypes={"IDSensoreRete": pl.Int64()},
)

# automatiche.csv is read without a dtype override, exactly as before.
auto_unfiltered = _read_registry("automatiche.csv", "Automatiche")
# Keep only the stations whose IDStazione is NOT already listed in auto_sir.
auto = auto_unfiltered.join(auto_sir, on="IDStazione", how="anti")

trad = _read_registry(
    "tradizionali.csv",
    "Tradizionali",
    dtypes={"IDSensoreRete": pl.Int64()},
)

# Stack the three networks row-wise and persist the combined registry.
all_meta = pl.concat([auto, auto_sir, trad], how="vertical")
all_meta.write_csv(base / "stazioni.csv")

# Plain Python list of station ids, in registry order, for the download loop.
station_ids = all_meta.get_column("IDStazione").to_list()

In [8]:
# Browser-like request headers, kept as name/value records
# (NOTE(review): the record layout looks like a browser devtools export - confirm).
headers_l = [
    {
        "name": "Accept",
        "value": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    },
    {"name": "Accept-Encoding", "value": "gzip, deflate"},
    {"name": "Accept-Language", "value": "it-IT,it;q=0.8,en-US;q=0.5,en;q=0.3"},
    {"name": "Cache-Control", "value": "no-cache"},
    {"name": "Connection", "value": "keep-alive"},
    {"name": "Host", "value": "www.sir.toscana.it"},
    {"name": "Pragma", "value": "no-cache"},
    {"name": "Referer", "value": "http://www.sir.toscana.it/consistenza-rete"},
    {"name": "Upgrade-Insecure-Requests", "value": "1"},
    {
        "name": "User-Agent",
        "value": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    },
]

# Flatten the records into the {header-name: header-value} mapping that is
# passed as `headers=` on each request.
headers = {record["name"]: record["value"] for record in headers_l}

In [10]:
# headers = {
#     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"
# }
def dwn(stat_id, session, timeout=30):
    """Download one station's CSV into ``base / "fragments"`` unless it is cached.

    A file counts as cached when ``<stat_id>.csv`` already exists and is
    larger than 100 bytes; smaller files are treated as failed or empty
    downloads and are fetched again.

    Args:
        stat_id: Station identifier, used for both the URL and the file name.
        session: A ``requests.Session`` (or compatible object) used for the GET.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the whole download loop.

    Returns:
        True if the file was already cached and no request was made;
        False if a request was made (whether or not it produced a file).

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    import warnings

    dwn_path = base / "fragments" / f"{stat_id}.csv"
    present = dwn_path.exists() and (dwn_path.stat().st_size > 100)
    if present:
        return True

    # The session already sends its own cookie jar, so it is not passed again.
    response = session.get(dwn_url(stat_id), headers=headers, timeout=timeout)
    if not response.ok:
        # Do not store an HTTP error page as if it were station data: a large
        # error body would otherwise pass the size check on every later run.
        warnings.warn(
            f"Station {stat_id}: HTTP {response.status_code}, nothing written"
        )
        return False

    # The fragments folder may not exist yet on a fresh workspace.
    dwn_path.parent.mkdir(parents=True, exist_ok=True)
    dwn_path.write_text(response.text, encoding="utf-8")
    return False

# Fetch every station's CSV through a single shared HTTP session.
with Session() as session:
    for stat_id in tqdm(station_ids):
        already_cached = dwn(stat_id, session)
        if already_cached:
            # Nothing was requested for this station, so no pause is needed.
            continue
        # A request was just made: wait a random 0-3 seconds before the next one.
        sleep(random() * 3)

  0%|          | 0/513 [00:00<?, ?it/s]