In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from pathlib import Path
import pandas as pd
import polars as pl
from time import sleep
from random import uniform
from io import StringIO
from datetime import datetime, date

# Root folder for everything this notebook reads or writes: the saved form
# page, the downloaded fragments, the station locations and the parquet output.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "MARCHE")

In [2]:
# Maps the short variable keys used in this notebook to the "TipoTabella"
# value sent in the query form.
variables = {"T": "TminMdmx", "X": "LonLat_Z", "rawT": "Temp_Val"}


def arpam_payload(
    station_codes,
    from_date: date,
    to_date: date,
    variable,
    validated=True,
    sessid="9oqbigf87uqb9fdco9bii0fkmd",
):
    """Build the form payload for one query to the data service.

    Args:
        station_codes: A single station code (``str`` or ``int``) or an
            iterable of station codes.
        from_date: First day of the requested window; only its
            ``strftime`` method is used, so ``date`` and ``datetime`` both work.
        to_date: End of the requested window, same requirements as
            ``from_date``.
        variable: Key of ``variables`` ("T", "X" or "rawT").
        validated: When true, "TipoDato" is "validato", otherwise "originale".
        sessid: Session identifier sent in the form. Defaults to the value
            that was previously hard-coded here.

    Returns:
        dict: Form fields ready to be passed as ``data=`` to ``requests.post``.

    Raises:
        KeyError: If ``variable`` is not a key of ``variables``.
    """
    # A lone code must be wrapped; iterating a bare string would otherwise
    # split it into single characters, and a bare int is not iterable at all.
    if isinstance(station_codes, (str, int)):
        station_codes = [station_codes]
    # NOTE(review): requests form-encodes a literal "+" as "%2B", so the
    # server receives "+00:00" verbatim rather than a space followed by a
    # time; confirm this is what the endpoint expects.
    payload = {
        "sessid": sessid,
        "outputType": "file",
        "SelezionaStazione[]": [f"{code}" for code in station_codes],
        "TipoDato": "validato" if validated else "originale",
        "TipoTabella": f"{variables[variable]}",
        "BeginDate": from_date.strftime(r"%Y-%m-%d+00:00"),
        "EndDate": to_date.strftime(r"%Y-%m-%d+00:00"),
        "LineNumberPdf": "0",
    }
    # Only the "T" table is sent with an explicit time step.
    if variable == "T":
        payload["TimeStepType"] = "d"
        payload["TimeStep"] = "1"
    return payload


def arpam_cookies(sessid="9oqbigf87uqb9fdco9bii0fkmd"):
    """Return the cookies sent with each request.

    Args:
        sessid: Value stored under the "PHPSESSID" cookie.

    Returns:
        dict: The consent cookie plus the session cookie.
    """
    cookies = dict(displayCookieConsent="y")
    cookies["PHPSESSID"] = sessid
    return cookies


def arpam_headers():
    """Return the browser-like HTTP headers sent with each request.

    "Content-Length" is deliberately not set here: requests computes it from
    the encoded form body, so a fixed value could only ever be stale.

    Returns:
        dict: A fresh header mapping on every call.
    """
    return {
        "Host": "app.protezionecivile.marche.it",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "it-IT,it;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "http://app.protezionecivile.marche.it",
        "Connection": "keep-alive",
        "Referer": "http://app.protezionecivile.marche.it/sol/temperatura/menu.sol?lang=it",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }


def request_data(
    stat_number, from_date, to_date, variable, validated=True, timeout=60
):
    """POST one query to the data service and return the response body.

    Args:
        stat_number: Station code, or codes, forwarded to ``arpam_payload``.
        from_date: Start of the requested window.
        to_date: End of the requested window.
        variable: Key of ``variables`` selecting the table to query.
        validated: Forwarded to ``arpam_payload``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        str: The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never mistaken for data and saved to disk.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.post(
        "http://app.protezionecivile.marche.it/sol/temperatura/queryResultsFile.sol?lang=it",
        data=arpam_payload(stat_number, from_date, to_date, variable, validated),
        headers=arpam_headers(),
        cookies=arpam_cookies(),
        timeout=timeout,
    )
    r.raise_for_status()
    return r.text

In [3]:
# Parse the locally saved copy of the query form stored under `base`.
form_page = BeautifulSoup(
    base.joinpath("form.html").read_text(),
    features="html.parser",
)

In [4]:
def station_options(tag):
    """Tell whether ``tag`` is one of the station entries of the saved form.

    A tag qualifies when it carries a "data-multiselectid" attribute whose
    value starts with the id prefix of the station selector.
    """
    if not tag.has_attr("data-multiselectid"):
        return False
    multiselect_id = tag["data-multiselectid"]
    return multiselect_id.startswith("multiselect_7qo2j7oogv7")


# Every station entry of the form: its "value" attribute and its visible label.
opts = form_page.find_all(station_options)
station_numbers = [option["value"] for option in opts]
station_infos = [option.get_text() for option in opts]

In [5]:
import re

# Layout of a station label: name, "(kind-code)", then the availability
# window, e.g. "Montegranaro (RT-3471) Dati da 2023-02-02 a 2023-12-27".
infos_pattern = re.compile(
    r"(?P<name>[^\(]+) \((?P<kind>R.)-(?P<code>\d{4})\) Dati da (?P<start_date>\d{4}-\d{2}-\d{2}) a (?P<end_date>\d{4}-\d{2}-\d{2})"
)


def parse_station_infos(station_info):
    """Split a station label into its fields.

    Args:
        station_info: Label text that must match ``infos_pattern`` from its
            first character.

    Returns:
        dict: "name", "kind" and "code" as strings, "start_date" and
        "end_date" as ``datetime.date`` objects.

    Raises:
        ValueError: If the label does not follow the expected layout, or if
            one of the dates is not a valid calendar date.
    """
    match = infos_pattern.match(station_info)
    if match is None:
        # Fail with the offending label instead of an AttributeError on None.
        raise ValueError(f"Unrecognised station description: {station_info!r}")
    p = match.groupdict()
    for key in ("start_date", "end_date"):
        p[key] = datetime.strptime(p[key], r"%Y-%m-%d").date()
    return p


# One parsed record (name, kind, code, start_date, end_date) per station label.
stations = [parse_station_infos(info) for info in station_infos]

In [6]:
def temperature_fragment_path(station_number, year) -> Path:
    """Return the CSV path of the daily temperature fragment of one station-year."""
    station_dir = base.joinpath("fragments", f"{station_number}")
    return station_dir / f"{year}.csv"


def raw_temperature_fragment_path(station_number, year, part) -> Path:
    """Return the CSV path of one part of a station-year of raw temperatures."""
    station_dir = base.joinpath("fragments", f"{station_number}")
    return station_dir / f"{year}_{part}_raw.csv"


def location_path(station_number) -> Path:
    """Return the CSV path where the location table of a station is stored."""
    return base.joinpath("locations", f"{station_number}.csv")


def download_station_temperatures(station, sleep_seed=5, progbar=None):
    """Download the "T" table of one station, one CSV fragment per year.

    Years whose fragment file already exists are skipped, so the function can
    be re-run to resume an interrupted download.

    Args:
        station: Station record with "name", "code", "start_date" and
            "end_date" (the last two as ``date`` objects).
        sleep_seed: After each download the function sleeps for a random
            duration between ``sleep_seed`` and ``sleep_seed + 2`` seconds.
        progbar: tqdm bar to report progress on. When omitted, a temporary
            bar is created and closed before returning.

    Returns:
        bool: True if at least one fragment had to be requested.
    """
    start_date = max(date(2000, 1, 1), station["start_date"])
    # NOTE(review): because of this clamp a station whose data ends before
    # 2000 still triggers a request for the year 2000; confirm it is intended.
    end_date = max(date(2000, 1, 2), station["end_date"])
    years = range(start_date.year, end_date.year + 1)
    any_downloaded = False
    owns_progbar = progbar is None
    if owns_progbar:
        progbar = tqdm(leave=False, position=1)
    try:
        progbar.reset(total=len(years))
        progbar.set_description_str(station["name"])
        for year in years:
            path = temperature_fragment_path(station["code"], year)
            if not path.exists():
                any_downloaded = True
                # Each request covers [Jan 1 of year, Jan 1 of year + 1].
                data = request_data(
                    station["code"], date(year, 1, 1), date(year + 1, 1, 1), "T", True
                )
                path.parent.mkdir(parents=True, exist_ok=True)
                # Explicit encoding: the default depends on the locale.
                path.write_text(data, encoding="utf-8")
                sleep(uniform(sleep_seed, sleep_seed + 2))
            progbar.update()
        progbar.refresh()
    finally:
        # Only close a bar created here; a caller-supplied bar stays open.
        if owns_progbar:
            progbar.close()
    return any_downloaded


def download_station_raw_temperatures(station, sleep_seed=5, progbar=None):
    """Download the "rawT" table of one station, two CSV fragments per year.

    Each year is split into part 1 (Jan 1 to Jun 1) and part 2 (Jun 1 to
    Jan 1 of the following year). Parts whose file already exists are
    skipped, so the function can be re-run to resume an interrupted download.

    Args:
        station: Station record with "name", "code", "start_date" and
            "end_date" (the last two as ``date`` objects).
        sleep_seed: After each download the function sleeps for a random
            duration between ``sleep_seed`` and ``sleep_seed + 2`` seconds.
        progbar: tqdm bar to report progress on. When omitted, a temporary
            bar is created and closed before returning.

    Returns:
        bool: True if at least one fragment had to be requested.
    """
    years = range(station["start_date"].year, station["end_date"].year + 1)
    # Same request order as before: every part 1 first, then every part 2.
    windows = [(date(year, 1, 1), date(year, 6, 1), 1) for year in years] + [
        (date(year, 6, 1), date(year + 1, 1, 1), 2) for year in years
    ]
    any_downloaded = False
    owns_progbar = progbar is None
    if owns_progbar:
        progbar = tqdm(leave=False, position=1)
    try:
        progbar.reset(total=2 * len(years))
        progbar.set_description_str(station["name"])
        for start_date, end_date, part in windows:
            path = raw_temperature_fragment_path(
                station["code"], start_date.year, part
            )
            if not path.exists():
                any_downloaded = True
                data = request_data(
                    station["code"], start_date, end_date, "rawT", True
                )
                path.parent.mkdir(parents=True, exist_ok=True)
                # Explicit encoding: the default depends on the locale.
                path.write_text(data, encoding="utf-8")
                sleep(uniform(sleep_seed, sleep_seed + 2))
            progbar.update()
        progbar.refresh()
    finally:
        # Only close a bar created here; a caller-supplied bar stays open.
        if owns_progbar:
            progbar.close()
    return any_downloaded


def download_stations_location(station):
    """Download the "X" table of one station unless it is already on disk.

    Args:
        station: Station record with "code", "start_date" and "end_date".

    Returns:
        bool: False when the file already existed, True after a download.
    """
    path = location_path(station["code"])
    if path.exists():
        return False
    path.parent.mkdir(parents=True, exist_ok=True)
    data = request_data(
        station["code"], station["start_date"], station["end_date"], "X"
    )
    # Explicit encoding: the default depends on the locale.
    path.write_text(data, encoding="utf-8")
    return True

In [7]:
# Run the raw-temperature download for a single station (index 4 of
# `stations`); the cell displays whether anything had to be fetched.
download_station_raw_temperatures(stations[4])

0it [00:00, ?it/s]

True

In [None]:
# Fetch the yearly temperature fragments of every station. After a station
# that needed downloads, pause 10-15 s; after a failure, report it and back
# off for 60-90 s before moving on to the next station.
for station in tqdm(stations):
    with tqdm(leave=True, position=1) as internal_progbar:
        try:
            downloaded_any = download_station_temperatures(
                station, sleep_seed=2, progbar=internal_progbar
            )
        except Exception as exc:
            print(f"There was an error downloading {station['name']}: {exc}")
            sleep(uniform(60, 90))
        else:
            if downloaded_any:
                sleep(uniform(10, 15))

In [None]:
# Fetch the location table of every station. After an actual download pause
# 1-2 s; after a failure, report it and wait 4-5 s before the next station.
for station in tqdm(stations):
    try:
        fetched = download_stations_location(station)
    except Exception as exc:
        print(f"There was an error downloading {station['name']}: {exc}")
        sleep(uniform(4, 5))
    else:
        if fetched:
            sleep(uniform(1, 2))

In [35]:
# Stack every per-station location table into a single metadata.csv.
# The paths are sorted because glob order depends on the filesystem, which
# would otherwise make the row order of the output differ between runs.
pd.concat(
    [
        pd.read_csv(station_meta)
        for station_meta in sorted((base / "locations").glob("*.csv"))
    ],
    axis=0,
    ignore_index=True,
).to_csv(base / "metadata.csv", index=False)

In [8]:
# Remove downloaded fragments smaller than 360 bytes so that a later run of
# the download cells fetches them again.
# NOTE(review): presumably files this small hold no data rows - confirm.
for fragment in base.joinpath("fragments").rglob("*.csv"):
    if fragment.stat().st_size >= 360:
        continue
    fragment.unlink()

In [7]:
# Fragment folder of every station, in the same order as `stations`.
paths_list = [
    base.joinpath("fragments", f"{entry['code']}") for entry in stations
]

In [84]:
# Display the parsed record of one station (index 89) as a spot check.
stations[89]

{'name': 'Montegranaro',
 'kind': 'RT',
 'code': '3471',
 'start_date': datetime.date(2023, 2, 2),
 'end_date': datetime.date(2023, 12, 27)}

In [95]:
def read_possibly_malformed_csv(path: Path, **kwargs) -> pl.DataFrame:
    """Read one yearly fragment and keep only the rows of the file's own year.

    Lines with extra fields are truncated instead of failing the read. The
    "anno", "mese" and "giorno" columns are stripped of surrounding
    whitespace and cast to 32-bit integers, and the year to keep is taken
    from the file name (``path.stem``).

    Args:
        path: CSV fragment whose stem is the year it covers.
        **kwargs: Extra options forwarded to ``pl.read_csv``.

    Returns:
        The cleaned table restricted to rows whose "anno" equals the year
        in the file name.
    """
    date_parts = [
        pl.col(name).str.strip_chars().cast(pl.Int32)
        for name in ("anno", "mese", "giorno")
    ]
    table = pl.read_csv(
        path, truncate_ragged_lines=True, ignore_errors=False, **kwargs
    )
    cleaned = table.with_columns(*date_parts)
    return cleaned.filter(pl.col("anno").eq(int(path.stem)))


def read_agg_fragments(station_code):
    """Load and stack every non-raw yearly fragment of one station.

    Args:
        station_code: Station code, used as the name of the fragment folder.

    Returns:
        A table with the sensor code, tmin, tmax, number of values, quality,
        station code and a "date" column built from year/month/day,
        restricted to rows with ``num_valori > 0``; or None when the station
        has no non-raw fragment on disk.
    """
    data_folder = base / "fragments" / f"{station_code}"
    # Sorted because glob order depends on the filesystem; this keeps the
    # row order of the stacked table the same from one run to the next.
    available_tables = sorted(
        path for path in data_folder.glob("*.csv") if not path.stem.endswith("_raw")
    )
    if not available_tables:
        return None
    # The three lists below are positional: entry i of `dtypes` and
    # `new_columns` describes the column selected by entry i of `columns`.
    read_options = dict(
        columns=[0, 1, 2, 3, 11, 18, 24, 25, 26],
        dtypes=[
            pl.Int32(),
            pl.Utf8(),
            pl.Utf8(),
            pl.Utf8(),
            pl.Float64(),
            pl.Float64(),
            pl.Int32(),
            pl.Float64(),
            pl.Int32(),
        ],
        new_columns=[
            "codice_sensore",
            "anno",
            "mese",
            "giorno",
            "tmin",
            "tmax",
            "num_valori",
            "quality",
            "codice_stazione",
        ],
        null_values=["  Dato mancante", ""],
    )
    all_tables = [
        read_possibly_malformed_csv(fragment, **read_options)
        for fragment in available_tables
    ]
    return (
        pl.concat(all_tables, how="vertical")
        .with_columns(date=pl.date(pl.col("anno"), pl.col("mese"), pl.col("giorno")))
        .drop(["anno", "mese", "giorno"])
        .filter(pl.col("num_valori").gt(0))
    )


# Write one parquet file per station that has aggregated fragments on disk.
# The output folder is created first; without it every write would fail on a
# fresh workspace and only surface as a printed error per station.
dataset_dir = base / "dataset"
dataset_dir.mkdir(parents=True, exist_ok=True)
for station in tqdm(stations):
    try:
        station_data = read_agg_fragments(station["code"])
        if station_data is not None:
            station_data.write_parquet(dataset_dir / f"{station['code']}.parquet")
    except Exception as e:
        # Keep going: one unreadable station must not stop the whole export.
        print(f"Error reading {station['code']}: {e}")

  0%|          | 0/157 [00:00<?, ?it/s]