In [20]:
from pathlib import Path
import re
import pandas as pd
from io import StringIO
import polars as pl
import polars.selectors as cs
from tqdm.notebook import tqdm
from requests import Session
from datetime import datetime, date
import requests
from time import sleep
from bs4 import BeautifulSoup


# Root folder of the local ARPA Liguria data and the sub-folder where the raw
# per-station downloads are stored.
liguria_path = Path.home() / "Local_Workspace/Datasets/ARPA/LIGURIA"
fragments_path = liguria_path / "fragments"

In [21]:
def show_resp(resp):
    """Print the body of a response as indented HTML (debugging helper).

    Args:
        resp: Object exposing the page source through a ``text`` attribute.
    """
    # BeautifulSoup is already imported at module level.
    soup = BeautifulSoup(resp.text)
    print(soup.prettify())


def payload_11():
    """Return the base form fields: empty ``Azione`` and ``CodRete``, ``CodTema`` set to ``"STAZIONE"``.

    A new dict is built on every call.
    """
    return dict(Azione="", CodRete="", CodTema="STAZIONE")


def payload_12(stat_code):
    """Return the form fields that select one station at daily ("GG") frequency.

    Args:
        stat_code: Station code sent as ``CodUbic``.

    Returns:
        The station fields merged with ``payload_11()``; on a key clash the value
        from ``payload_11()`` is kept.
    """
    station_fields = {
        "CodBacino": "",
        "CodUbic": stat_code,
        "CodProv": "",
        "Azione": "",
        "Frequenza": "GG",
    }
    return {**station_fields, **payload_11()}


def payload_13(req_id, stat_code, year1, year2):
    """Return the form fields describing an extraction for one station and period.

    Args:
        req_id: Value sent as ``IdRichiesta``.
        stat_code: Station code, forwarded to ``payload_12``.
        year1: First year; the period starts on 01/01 of this year and it is also
            sent as ``Anno``.
        year2: Last year; the period ends on 31/12 of this year.

    Returns:
        The extraction fields merged with ``payload_12(stat_code)``; on a key clash
        the value from ``payload_12`` is kept.
    """
    extraction_fields = {
        "Frequenza": "GG",
        "IdRichiesta": req_id,
        "IdRichiestaCarto": "",
        "CodTema": "STAZIONE",
        # Single list element holding both parameter codes, comma separated.
        "Param": ["TEMPTRMWC4,TEMPTRMWC7"],
        "TipoOutput": "XLS",
        "Separatore": "TAB",
        "DataIniz": f"01/01/{year1}",
        "InizOra": "00:00",
        "DataFine": f"31/12/{year2}",
        "FineOra": "23:59",
        "Anno": f"{year1}",
        "IdTema": "",
        "CodParam": "",
        "CodProv": "",
        "CodCom": "",
    }
    return {**extraction_fields, **payload_12(stat_code)}


def final_payload(req_id, year1, year2, params=("TEMPTRMWC4", "TEMPTRMWC7")):
    """Return the fields sent with the final extraction request.

    Args:
        req_id: Value sent as ``IdRichiesta``.
        year1: First year; the period starts on 01/01 of this year.
        year2: Last year; the period ends on 31/12 of this year.
        params: Parameter codes to request; they are joined with commas into
            ``CodParam``. Defaults to the two codes that were previously hard-coded.

    Returns:
        A dict of form fields.
    """
    return {
        # "Pagina": "PubAccessoDatiMeteoPost.asp",
        "CodParam": ",".join(params),
        "CodTema": "STAZIONE",
        "IdEstraz": "DE",
        "Frequenza": "GG",
        "TipoOutput": "XLS",
        "Separatore": "TAB",
        "IdRichiesta": req_id,
        "IdRichiestaCarto": "",
        "DataIniz": f"01/01/{year1}",
        "InizOra": "00:00",
        "DataFine": f"31/12/{year2}",
        "FineOra": "23:59",
    }

In [27]:
def url_x(idx):
    """Return the URL of the ``PubAccessoDatiMeteo<idx>.asp`` page of the portal.

    Args:
        idx: Suffix inserted between the page name and the ``.asp`` extension.
    """
    page_prefix = (
        "https://ambientepub.regione.liguria.it/SiraQualMeteo/script/PubAccessoDatiMeteo"
    )
    return page_prefix + f"{idx}.asp"

def request_data(path: Path, year1, year2, stat_code):
    """Request a daily extraction for one station and save the report to ``path``.

    All requests share one session. The function opens the entry pages, reads the
    ``IdRichiesta`` request id from page 12, selects the "STAZIONE" theme and the
    station, submits the extraction for 01/01/``year1`` - 31/12/``year2`` and
    downloads the report whose link appears in the answer.

    Args:
        path: Destination file; missing parent directories are created.
        year1: First year of the requested period.
        year2: Last year of the requested period.
        stat_code: Station code sent as ``CodUbic`` / ``Ubic``.

    Returns:
        ``"OK"`` when the report was written;
        ``"NOSENSORS"`` when the station page offers neither ``TEMPTRMWC4`` nor
        ``TEMPTRMWC7``;
        ``"REQ_ERROR"`` when the request id or the report link is not found;
        ``"DWN_ERROR"`` when the report cannot be downloaded or written.
    """
    with requests.Session() as s:
        s.get(
            "https://ambientepub.regione.liguria.it/SiraQualMeteo/script/PubAccessoDatiMeteo.asp"
        )
        s.get(url_x(11))
        r2 = s.get(url_x(12))
        id_match = re.search(r"NAME=IdRichiesta VALUE=(\d+)", r2.text)
        if id_match is None:
            # Without a request id none of the following steps can succeed.
            print(f"Could not request {stat_code} for {year1}")
            return "REQ_ERROR"
        req_id = id_match.group(1)
        s.get(url_x(13))
        sleep(1)
        # Selecting STAZIONE
        s.post(url_x(12), data=payload_11())
        s.get(url_x(13), data={"CodTema": "STAZIONE"})
        s.post(
            url_x(12),
            data={
                "TipoTema": "STAZIONE",
                "Azione": "",
                "CodRete": "",
                "CodTema": "STAZIONE",
            },
        )
        sleep(1)
        # Selecting the station
        r3 = s.get(
            url_x(13),
            data={
                "Frequenza": "GG",
                "IdRichiesta": req_id,
                "IdRichiestaCarto": "",
                "CodTema": "STAZIONE",
                "CodUbic": stat_code,
            },
        )
        bs3 = BeautifulSoup(r3.text)
        # NOTE(review): the detected codes only decide whether to go on; the final
        # payload always asks for both parameters.
        sensors = [
            code
            for code in ("TEMPTRMWC4", "TEMPTRMWC7")
            if bs3.find("option", value=code)
        ]
        if not sensors:
            return "NOSENSORS"
        s.post(
            url_x(12),
            data={
                "CodBacino": "",
                "CodUbic": stat_code,
                "CodProv": "",
                "Ubic": stat_code,
                "LatMin": "",
                "LonMin": "",
                "LatMax": "",
                "LonMax": "",
                "Frequenza": "GG",
                "Azione": "INSERISCI_TEMA",
                "CodTema": "STAZIONE",
                "IdRichiesta": req_id,
                "IdRichiestaCarto": "",
            },
        )
        sleep(5)
        r = s.get(
            "https://ambientepub.regione.liguria.it/SiraQualMeteo/script/PubAccessoDatiMeteoPost.asp",
            data=final_payload(req_id, year1, year2),
        ).text
        link_match = re.search(
            r"(https\:\/\/ambientepub\.regione\.liguria\.it\/SiraQualMeteo\/report\/\d+\.csv)",
            r,
        )
        if link_match is None:
            print(f"Could not request {stat_code} for {year1}")
            return "REQ_ERROR"
        dwn_url = link_match.group(1)
        sleep(5)
        try:
            # Download before touching the disk: a failed request must not leave an
            # empty file behind, because callers use the file's existence as cache.
            report = s.get(dwn_url)
            report.raise_for_status()
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_bytes(report.content)
        except (requests.RequestException, OSError):
            print(f"Could not download {stat_code} for {year1}")
            return "DWN_ERROR"
    return "OK"

In [28]:
def parse_meta(table):
    """Extract the station coordinates from a parsed station page table.

    Args:
        table: pandas DataFrame; longitude is read from positional cell ``[3, 4]``
            and latitude from positional cell ``[4, 4]``.

    Returns:
        A polars DataFrame with the float columns ``lon`` and ``lat``.

    Raises:
        ValueError: If either cell cannot be converted to ``float``.
    """
    return pl.from_dict(
        {
            "lon": float(table.iloc[3, 4]),
            "lat": float(table.iloc[4, 4]),
        }
    )


def retr_data(user_codes):
    """Collect the coordinates of several stations from their saved HTML pages.

    For every code the file ``stazioni/<code>.html`` under ``liguria_path`` is read,
    and its first table is handed to ``parse_meta``.

    Args:
        user_codes: Station codes, in the order the output rows should follow.

    Returns:
        A polars DataFrame with a ``user_code`` column followed by the columns
        produced by ``parse_meta``.
    """
    station_frames = []
    for uc in user_codes:
        page_tables = pd.read_html(
            liguria_path / "stazioni" / f"{uc}.html", encoding="utf-8"
        )
        station_frames.append(parse_meta(page_tables[0]))
    coords = pl.concat(station_frames)

    codes = pl.DataFrame({"user_code": user_codes})
    return pl.concat([codes, coords], how="horizontal")

# Station list taken from the saved "Lista Stazioni.html" page.
# `extract_links="all"` makes pandas return every header and cell as a
# (text, href) pair, which is why the column names below look like tuples.
meta = (
    pl.from_pandas(
        pd.read_html(
            liguria_path / "Lista Stazioni.html", header=0, extract_links="all"
        )[0].iloc[::2]  # first table of the page, every other row
    )
    .with_columns(
        # `identifier`: the text that follows "stazione=" in the link of the station-code cell.
        pl.col("('CODICE STAZIONE', None)").list.get(1).str.extract(r"stazione=(.+)$").alias("identifier"),
        # Keep only the first element (the text) of every pair.
        cs.all().list.first(),
    )
    .rename(
        {
            "('CODICE STAZIONE', None)": "user_code",
            "('NOME STAZIONE', None)": "anagrafica",
            "('PROVINCIA', None)": "province",
            "('BACINO', None)": "BACINO",
            "('QUOTA (m)', None)": "elevation",
        }
    )
)

# (text, href) pairs of the first column of the station list, one row per station:
# the displayed code becomes `user_code`, the link becomes `url`.
stations_info = (
    pl.from_records(
        pd.read_html(
            liguria_path / "Lista Stazioni.html", header=0, extract_links="all"
        )[0].iloc[::2, 0]
    )
    .rename({"column_0": "user_code", "column_1": "url"})
    .with_columns(
        # `literal=True`: "../" has to match verbatim; read as a regular expression
        # each "." would match any character.
        pl.col("url").str.replace(
            "../",
            "https://ambientepub.regione.liguria.it/SiraQualMeteo/",
            literal=True,
        )
    )
)

# Attach the coordinates read from the per-station pages to the station list.
# `validate="1:1"` makes the join fail when `user_code` is not unique on both sides.
# NOTE(review): `meta` is overwritten in place, so running this cell twice joins
# the coordinates onto a frame that already has them.
meta = (
    meta.join(
        retr_data(stations_info["user_code"].to_list()),
        on="user_code",
        how="left",
        validate="1:1",
    )
    .sort("anagrafica")
)
# Save the merged station metadata as CSV.
meta.write_csv(liguria_path / "metadata.csv")

In [33]:
# Download the 1950-2024 series of every station; `results` keeps the status
# string returned by `request_data` for each station requested in this run.
results = {}
for codes in tqdm(meta.select("user_code", "identifier").to_dicts()):
    user_code, identifier = codes["user_code"], codes["identifier"]
    path = fragments_path / f"{user_code}.csv"
    if path.exists():
        # A file for this station is already on disk: nothing to request.
        continue
    results[user_code] = request_data(path, 1950, 2024, identifier)
    if results[user_code] == "NOSENSORS":
        continue
    # Pause between stations for which an extraction was attempted.
    sleep(5)


  0%|          | 0/345 [00:00<?, ?it/s]

Could not request ME00310 for 1950
Could not request ME00313 for 1950
Could not request ME00020 for 1950
Could not request ME00331 for 1950
Could not request ME00269 for 1950
Could not request ME00344 for 1950
Could not request ME00270 for 1950
Could not request ME00232 for 1950
Could not request ME00012 for 1950
Could not request ME00097 for 1950
Could not request ME00314 for 1950


In [34]:
# Column names and types of the CSV part of a downloaded section, as passed to
# `pl.read_csv` in `parse_table`.
input_schema = {
    "Inizio rilevazione": pl.Date(),
    "Fine rilevazione": pl.Date(),
    "Valore": pl.Float64(),
    "Dataset": pl.Utf8(),
    "Valido": pl.Utf8(),
}

# Schema of the empty frame `parse_table` returns for a section without data.
output_schema = {
    "date": pl.Date(),
    "value": pl.Float64(),
    "Dataset": pl.Utf8(),
    "valid": pl.Boolean(),
}


def parse_table(text: str):
    """Parse the data part of a station section.

    Args:
        text: Either a message starting with ``"Nessun dato"`` or a CSV block
            followed, after one blank line, by a summary block.

    Returns:
        A ``(table, dati_info)`` pair. ``table`` has the columns ``date``,
        ``value``, ``Dataset`` and ``valid`` (``Valido`` equal to ``"Sì"``).
        ``dati_info`` has the keys ``"read"`` and ``"valid"``: the integer 0 for
        both when there is no data, otherwise the second comma-separated field of
        the first and second summary line, kept as strings.

    Raises:
        ValueError: If the text does not split into exactly two blocks.
    """
    if text.startswith("Nessun dato"):
        return pl.DataFrame(schema=output_schema), {"read": 0, "valid": 0}

    csv_block, summary_block = text.strip().split("\n\n")
    raw = pl.read_csv(StringIO(csv_block), schema=input_schema)
    renamed = raw.rename({"Inizio rilevazione": "date", "Valore": "value"})
    table = renamed.select(
        "date", "value", "Dataset", pl.col("Valido").eq("Sì").alias("valid")
    )
    counts = [summary_line.split(",")[1] for summary_line in summary_block.split("\n")]
    return table, {"read": counts[0], "valid": counts[1]}


# Keyword searched in the second line of a section -> variable label stored in the data.
tvars = {"MASSIMA": "T_MAX", "MINIMA": "T_MIN"}


class StationData:
    """One station/variable section of a downloaded file.

    Attributes set by ``__init__``:
        anagrafica: First line of the section, stripped.
        variable: ``tvars`` label for the ``MASSIMA``/``MINIMA`` keyword found in
            the second line.
        table: Frame returned by ``parse_table`` with the constant columns
            ``anagrafica`` and ``variable`` added.
        dati_info: Summary dict returned by ``parse_table``.
    """

    def __init__(self, line: str):
        """Parse a section.

        Args:
            line: Text of the section: name line, variable line, one line that is
                skipped, then the data body handed to ``parse_table``.

        Raises:
            ValueError: If the section has fewer than four lines or its second
                line contains neither ``MASSIMA`` nor ``MINIMA``.
        """
        parts = line.split("\n", maxsplit=3)
        if len(parts) < 4:
            raise ValueError(
                f"Malformed station section: expected at least 4 lines, got {len(parts)}"
            )
        self.anagrafica = parts[0].strip()
        kind = re.search("MASSIMA|MINIMA", parts[1])
        if kind is None:
            raise ValueError(
                f"No MASSIMA/MINIMA keyword in the second line of section {self.anagrafica!r}"
            )
        self.variable = tvars[kind.group(0)]
        # parts[2] is not used; everything after it is the data body.
        self.table, self.dati_info = parse_table(parts[3])
        self.table = self.table.with_columns(
            pl.lit(self.anagrafica).alias("anagrafica"),
            pl.lit(self.variable).alias("variable"),
        )

In [35]:
# Parse every downloaded file into one parquet file per station.
dataset_path = liguria_path / "dataset"
# `write_parquet` does not create missing folders.
dataset_path.mkdir(parents=True, exist_ok=True)
for fragment_path in tqdm(list(fragments_path.glob("*.csv"))):
    raw_text = fragment_path.read_text(encoding="iso-8859-1")
    # Every section starts with the '"Stazione",' marker; blank pieces are dropped.
    fragments = [
        block.strip() for block in raw_text.split('"Stazione",') if block.strip()
    ]
    if not fragments:
        # `pl.concat` rejects an empty list, so files without sections are skipped.
        print(f"No station sections in {fragment_path.name}, skipped")
        continue
    var_tables = [StationData(fragment).table for fragment in fragments]
    (
        pl.concat(var_tables, how="vertical")
        .with_columns(pl.lit(fragment_path.stem).alias("user_code"))
        .write_parquet(dataset_path / f"{fragment_path.stem}.parquet")
    )

  0%|          | 0/190 [00:00<?, ?it/s]

In [37]:
# Load all per-station parquet files into one eager DataFrame. The cells that use
# `full_data` join it with eager frames and call `write_parquet` on the result,
# which a LazyFrame does not support, hence the `.collect()`.
# A single path with a glob pattern replaces the generator returned by `Path.glob`.
full_data = pl.scan_parquet(liguria_path / "dataset" / "*.parquet").collect()

In [51]:

    # .with_columns(
    #     pl.col("start_date").fill_null(date(2001, 1, 1)),
    #     pl.col("stop_date").fill_null(date(2023, 12, 31)),
    # )
    # .filter(
    #     pl.col("anagrafica").ne("BOLSINE")
    #     | pl.col("start_date").ne(date(2001, 1, 1))
    #     | pl.col("stop_date").ne(date(2023, 12, 31))
    # )

In [52]:
# Save the current `meta` frame to metadata.csv.
meta.write_csv(liguria_path / "metadata.csv")

In [44]:
# Attach `identifier` to every observation by station name and keep only the rows
# whose date lies inside the station's [start_date, stop_date] window; the window
# columns are dropped afterwards.
# NOTE(review): the `meta` built by the cells shown earlier has no `start_date` /
# `stop_date` columns — confirm which version of `meta` this cell expects.
fd1 = (
    full_data.join(
        meta[["identifier", "anagrafica", "start_date", "stop_date"]], on="anagrafica"
    )
    .filter(
        (pl.col("start_date") <= pl.col("date"))
        & (pl.col("date") <= pl.col("stop_date"))
    )
    .drop("start_date", "stop_date")
)

In [45]:
# Rows of `fd1` that share their (date, identifier, variable) key with another row.
fd1.filter(pl.struct("date", "identifier", "variable").is_duplicated()).sort(
    "identifier", "date"
)

date,value,valid,anagrafica,variable,identifier
date,f64,bool,str,str,str


In [43]:
# For each station name, first and last date on which `full_data` holds more than
# one row for the same (date, anagrafica, variable) key.
full_data.filter(pl.struct("date", "anagrafica", "variable").is_duplicated()).group_by(
    "anagrafica"
).agg(pl.col("date").min().alias("first_date"), pl.col("date").max().alias("last_date"))

anagrafica,first_date,last_date
str,date,date
"""ISOVERDE""",2014-06-09,2014-06-13
"""VERZI LOANO""",2014-03-10,2015-12-31
"""CAMOGLI""",2015-04-25,2017-12-22
"""AIROLE""",2015-07-14,2015-12-31
"""STATALE""",2012-02-27,2015-12-31
"""MONTE CAPPELLI…",2013-03-15,2013-03-20
"""DOLCEDO""",2005-12-31,2006-12-31


In [44]:
# Stations whose window is empty or inverted (start_date not before stop_date).
meta.filter(pl.col("start_date") >= pl.col("stop_date"))

anagrafica,identifier,province,BACINO,elevation,lon,lat,start_date,stop_date
str,str,str,str,str,f64,f64,date,date


### È impossibile risalire al periodo di riferimento dei metadati!

In [34]:
# Collapse `meta` to one row per station name (`anagrafica`): identifier, province
# and basin come from the last row of each group after sorting by `stop_date`
# (presumably the row with the latest `stop_date` — confirm how nulls are ordered),
# while elevation and coordinates are averaged over the group.
# NOTE(review): `meta` is overwritten and the result no longer has a `stop_date`
# column, so this cell cannot be run twice in a row. It also needs `stop_date` and
# a numeric `elevation`, which the cells shown earlier do not produce — confirm
# the intended input.
meta = (
    meta.sort("anagrafica", "stop_date")
    .group_by("anagrafica")
    .agg(
        pl.col("identifier").last(),
        pl.col("province").last(),
        pl.col("BACINO").last(),
        pl.col("elevation").mean(),
        pl.col("lon").mean(),
        pl.col("lat").mean(),
    )
)

In [27]:
# Display the current `meta` frame.
meta

anagrafica,identifier,province,BACINO,elevation,lon,lat
str,str,str,str,f64,f64,f64
"""CONNA""","""CONNA""","""SAVONA""","""FRA IMPERO E C…",350.0,8.10363,43.97923
"""COLLODARI""","""RCOL0""","""GENOVA""","""RECCO""",14.0,9.15162,44.37697
"""RESORDO""","""CICA0""","""GENOVA""","""ENTELLA""",171.0,9.34719,44.4
"""PITELLI""","""PITLL""","""LA SPEZIA""","""MAGRA""",132.0,9.88761,44.09562
"""OSIGLIA""","""OSIGL""","""SAVONA""","""BORMIDA DI MIL…",620.0,8.20254,44.28647
"""GENOVA - NERVI…","""NEVI0""","""GENOVA""","""FRA BISAGNO ED…",45.0,9.0438,44.38237
"""TESTICO""","""TESTI""","""SAVONA""","""FRA IMPERO E C…",439.0,8.03517,44.00653
"""PANESI""","""PANES""","""GENOVA""","""ENTELLA""",25.0,9.35624,44.34209
"""CASARZA LIGURE…","""CASR0""","""GENOVA""","""FRA ENTELLA E …",120.0,9.45258,44.29405
"""URBE - VARA SU…","""URVAS""","""SAVONA""","""ORBA""",810.0,8.62739,44.46953


In [57]:
# Replace the station name with its identifier: left join on `anagrafica`, then
# drop the name column. Rows without a match in `meta` keep a null identifier.
fd2 = full_data.join(
    meta[["anagrafica", "identifier"]], on="anagrafica", how="left"
).drop("anagrafica")

In [58]:
# Save the merged observations as a single parquet file.
fd2.write_parquet(liguria_path / "dataset.parquet")

In [43]:
# Save the current `meta` frame to metadata.csv.
meta.write_csv(liguria_path / "metadata.csv")

In [201]:
# Rows of `diffs` whose absolute longitude or latitude difference exceeds 1e-4, or
# whose absolute elevation difference exceeds 20 (presumably degrees and metres —
# confirm).
# NOTE(review): `diffs` is not defined in any cell shown here.
diffs.filter(
    (pl.col("londiff").abs() > 1e-4)
    | (pl.col("latdiff").abs() > 1e-4)
    | (pl.col("elevdiff").abs() > 20)
)

identifier,anagrafica,province,BACINO,elevation,lon,lat,start_date,end_date,londiff,latdiff,elevdiff
str,str,str,str,f64,f64,f64,date,date,f64,f64,f64
"""CAMIN""","""CAMINATA""","""GENOVA""","""ENTELLA""",90.0,9.41004,44.34004,,,0.000225,0.000025,-1.0
"""CMNT0""","""CAMINATA""","""GENOVA""","""ENTELLA""",92.0,9.40959,44.33999,,,-0.000225,-0.000025,1.0
"""CAMOG""","""CAMOGLI""","""GENOVA""","""FRA BISAGNO ED…",60.0,9.15931,44.34704,1969-12-01,2005-12-31,0.00017,0.0,2.0
"""CMGL0""","""CAMOGLI""","""GENOVA""","""FRA BISAGNO ED…",56.0,9.15897,44.34704,2006-01-01,,-0.00017,0.0,-2.0
"""CARPE""","""CARPE""","""SAVONA""","""FRA CENTA E QU…",610.0,8.16447,44.14819,2012-11-01,,-0.000375,0.002715,105.0
"""TOIR0""","""CARPE""","""SAVONA""","""FRA CENTA E QU…",400.0,8.16522,44.14276,,,0.000375,-0.002715,-105.0
"""DOLCD""","""DOLCEDO""","""IMPERIA""","""FRA ARGENTINA …",77.0,7.95115,43.90674,2003-10-01,2005-11-30,-0.000175,0.00042,0.0
"""DOLCE""","""DOLCEDO""","""IMPERIA""","""FRA ARGENTINA …",77.0,7.9515,43.9059,2005-12-01,,0.000175,-0.00042,0.0
"""GEMOL""","""GENOVA - MOLAS…","""GENOVA""","""BISAGNO""",62.0,8.98654,44.45433,,,0.00294,-0.00044,-34.5
"""MOLA0""","""GENOVA - MOLAS…","""GENOVA""","""BISAGNO""",131.0,8.98066,44.45521,,,-0.00294,0.00044,34.5


In [212]:
# Display the current `meta` frame.
meta

identifier,anagrafica,province,BACINO,elevation,lon,lat,start_date,end_date
str,str,str,str,f64,f64,f64,date,date
"""AGORR""","""ALPE GORRETO""","""GENOVA""","""TREBBIA""",915.0,9.2363,44.6047,2004-12-01,
"""AIRO0""","""AIROLE""","""IMPERIA""","""ROYA""",110.0,7.54478,43.87309,2007-01-01,2012-12-31
"""AIROL""","""AIROLE""","""IMPERIA""","""ROYA""",103.0,7.54484,43.87296,2013-01-01,
"""ALASS""","""ALASSIO""","""SAVONA""","""FRA IMPERO E C…",25.0,8.1693,44.00624,2003-10-01,
"""ALBG0""","""ALBENGA""","""SAVONA""","""CENTA""",5.0,8.21262,44.04899,,
"""ALBIS""","""ALBISOLA""","""SAVONA""","""SANSOBBIA""",3.0,8.50887,44.32931,,
"""ALPIC""","""ALPICELLA""","""SAVONA""","""FRA SANSOBBIA …",435.0,8.52604,44.40667,2001-03-01,
"""AMBOR""","""AMBORZASCO""","""GENOVA""","""AVETO""",908.0,9.45496,44.51447,2019-10-01,
"""AMEFM""","""AMEGLIA FOCE M…","""LA SPEZIA""","""MAGRA""",4.0,9.9722,44.07884,2011-01-01,
"""AMERE""","""MERELLI""","""IMPERIA""","""ARGENTINA""",70.0,7.84759,43.88137,,
