In [2]:
import requests
import bs4
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

# Root directory of the ARPA FVG dataset, located under the user's home folder.
base = Path.home().joinpath("Local_Workspace", "Datasets", "ARPA", "FVG")

In [3]:
# Collect the station identifiers from the saved archive page: every child tag
# of the element with id "stazione" carries one identifier in its "value"
# attribute.
with open(base / "page.html") as f:
    soup = bs4.BeautifulSoup(f, "html.parser")
# Keep only real tags. Text nodes between them (newlines, indentation,
# comments) have no attributes, so comparing against "\n" alone would let any
# other text node through and fail with AttributeError on `.attrs`.
station_ids = [
    node.attrs["value"]
    for node in soup.find(id="stazione").contents
    if isinstance(node, bs4.Tag)
][1:]  # the first tag is skipped (presumably a placeholder entry - TODO confirm)
station_ids = sorted(station_ids)

In [4]:
import json

# Load the recorded browser session (HAR file) and pull the POST parameters and
# the headers out of its first logged request; `headers` is reused by req_data.
# The file is opened in binary mode so that json.load detects the encoding
# (UTF-8/16/32, with or without BOM) instead of depending on the platform's
# default text encoding.
with open(base / "test.har", "rb") as har_file:
    test_req = json.load(har_file)
first_request = test_req["log"]["entries"][0]["request"]
data = first_request["postData"]["params"]
headers = first_request["headers"]

In [None]:
from time import sleep
import random

# Endpoint that req_data() sends its POST request to.
request_url = "https://www.osmer.fvg.it/ajax/getStationData.php"


def req_data(
    station_id: str,
    year: int,
    session: requests.Session,
    timeout: float = 60.0,
) -> str:
    """
    Sends a POST request to retrieve data for a specific station and year.

    The archive page is requested first so that ``session`` holds a
    ``PHPSESSID`` cookie (presumably set by the server on that request). The
    cookie is then sent explicitly, together with the headers captured in the
    HAR file (module-level ``headers``), in the POST to ``request_url``.

    Args:
        station_id (str): The ID of the station.
        year (int): The year for which data is requested. It is only
            interpolated into the form data, so a string works as well.
        session (requests.Session): The session object to use for the request.
        timeout (float): Seconds to wait on each HTTP request before giving up.

    Returns:
        str: The response text of the request.

    Raises:
        KeyError: If the session has no ``PHPSESSID`` cookie after the GET.
        requests.HTTPError: If the POST answers with an error status. Raising
            here keeps callers from storing an error page as if it were data.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    # Only the effect on the session's cookie jar matters; the body is ignored.
    session.get("https://www.osmer.fvg.it/archivio.php?ln=&p=dati", timeout=timeout)
    data = {
        "a": f"{year}",
        "m": "99",
        "g": "10",
        "s": station_id,
        "t": "H_3",
        "o": "visualizza",
        "ln": "",
    }
    # Random pause between the page visit and the data request.
    sleep(random.uniform(1, 3))
    cookie = f"SameSite=None; Secure=1; meteofvg_cookie=1; PHPSESSID={session.cookies['PHPSESSID']}"
    # Reuse the captured headers but replace the recorded cookie with the live
    # one. The name is matched case-insensitively so a HAR that stores it as
    # "cookie", or omits it, does not raise KeyError.
    headers1 = {
        header["name"]: header["value"]
        for header in headers
        if header["name"].lower() != "cookie"
    }
    headers1["Cookie"] = cookie
    r = session.post(request_url, data=data, headers=headers1, timeout=timeout)
    r.raise_for_status()
    return r.text


def stat_path(station_id):
    """Return the directory that holds the downloaded files of one station.

    The directory sits in ``base / "data"`` and is named after the part of
    ``station_id`` that precedes the first ``@``.
    """
    station_code, _, _ = station_id.partition("@")
    return base.joinpath("data", station_code)


def file_path(station_id, year):
    """Return the path of the HTML file for ``station_id`` and ``year``.

    The file is called ``<year>.html`` and lives in the station's directory
    as given by ``stat_path``.
    """
    station_dir = stat_path(station_id)
    return station_dir.joinpath(f"{year}.html")

# Downloading data
# One HTML file is stored per (station, year). Files already on disk are
# skipped, so an interrupted run can be restarted without re-downloading.
for station_id in tqdm(station_ids):
    skipped = True
    stat_path(station_id).mkdir(parents=True, exist_ok=True)
    with requests.Session() as s:
        for year in tqdm(range(1980, 2025), leave=False):
            target = file_path(station_id, year)
            if target.exists():
                continue
            # Deliberately not called `data`, so the HAR parameters loaded
            # earlier under that name are not overwritten.
            page_html = req_data(station_id, year, s)
            skipped = False
            # Explicit encoding: the platform default may be unable to encode
            # the response text and differs between machines.
            with open(target, "wt", encoding="utf-8") as f:
                f.write(page_html)
            sleep(random.uniform(1, 2))
    # Longer pause between stations, but only if something was downloaded.
    if not skipped:
        sleep(random.uniform(10, 20))

In [5]:
# Writing station metadata to CSV
# Each station id is an "@"-separated record. The mapping below lists, in field
# order, the column name and the dtype given to each of the first six fields.
station_schema = {
    "station_code": "string",
    "station_name": "string",
    "station_kind": "string",
    "lat": "float",
    "lon": "float",
    "elevation": "float",
}
station_info = (
    pd.Series(station_ids)
    .str.split("@", expand=True)
    # Positional column labels 0..5 become the schema's names, in order.
    .rename(columns=dict(enumerate(station_schema)))
    .astype(station_schema)
)
station_info.to_csv(base / "station_info.csv", index=False)