In [31]:
from pathlib import Path
import re
import pandas as pd
from io import StringIO
import polars as pl
from tqdm.notebook import tqdm
from requests import Session
from datetime import datetime, date


liguria_path = Path.home() / "Local_Workspace/Datasets/ARPA/LIGURIA"
# meta = pl.from_pandas(pd.read_html(liguria_path / "Lista Stazioni.html", header=0, extract_links="all")[0].iloc[::2])
meta = pl.read_ipc("../../../db/metadata/ARPA Liguria.arrow")

In [18]:
input_schema = {
    "Inizio rilevazione": pl.Date(),
    "Fine rilevazione": pl.Date(),
    "Valore": pl.Float32(),
    "Dataset": pl.Utf8(),
    "Valido": pl.Utf8(),
}

output_schema = {
    "date": pl.Date(),
    "value": pl.Float32(),
    "Dataset": pl.Utf8(),
    "valid": pl.Boolean(),
}


def parse_table(text: str):
    if text.startswith("Nessun dato"):
        table = pl.DataFrame(schema=output_schema)
        dati_info = {"read": 0, "valid": 0}
    else:
        table, dati_info = text.strip().split("\n\n")
        table = (
            pl.read_csv(StringIO(table), schema=input_schema)
            .rename({"Inizio rilevazione": "date", "Valore": "value"})
            .select(
                "date", "value", "Dataset", pl.col("Valido").eq("Sì").alias("valid")
            )
        )
        dati_info = list(map(lambda line: line.split(",")[1], dati_info.split("\n")))
        dati_info = {"read": dati_info[0], "valid": dati_info[1]}
    return table, dati_info


tvars = {"MASSIMA": "T_MAX", "MINIMA": "T_MIN"}


class StationData:
    def __init__(self, line: str):
        parts = line.split("\n", maxsplit=3)
        self.anagrafica = parts[0].strip()
        self.variable = tvars[re.search("MASSIMA|MINIMA", parts[1]).group(0)]
        self.table, self.dati_info = parse_table(parts[3])
        self.table = self.table.with_columns(
            pl.lit(self.anagrafica).alias("anagrafica"),
            pl.lit(self.variable).alias("variable"),
        )

In [19]:
data_tables = {}
for fragment_path in liguria_path.glob("*.csv"):
    with open(fragment_path, "rt", encoding="iso-8859-1") as fragment_handle:
        fragments = fragment_handle.read().split('"Stazione",')
    fragments = [block.strip() for block in fragments if block.strip()]
    for fragment in fragments:
        sd = StationData(fragment)
        data_tables[sd.anagrafica].appstop(sd.table)
full_data = pl.concat(data_tables).sort("variable", "anagrafica", "date")
full_data.drop("Dataset").write_ipc(liguria_path / "dataset.arrow")

In [3]:
full_data = pl.read_ipc(liguria_path / "dataset.arrow")

In [6]:
stations_info = (
    pl.from_records(
        pd.read_html(
            liguria_path / "Lista Stazioni.html", header=0, extract_links="all"
        )[0].iloc[::2, 0]
    )
    .rename({"column_0": "identifier", "column_1": "url"})
    .with_columns(
        pl.col("url").str.replace(
            r"../", "https://ambientepub.regione.liguria.it/SiraQualMeteo/"
        )
    )
)


# with Session() as session:
#     for identifier, url in tqdm(stations_info.iter_rows(), total = len(stations_info)):
#         with open(liguria_path / "stazioni" / f"{identifier}.html", "wt") as station_file:
#             station_file.write(session.get(url).text)

In [32]:
def parse_meta(table):
    start_date = re.search(r"\d{2}/\d{4}", table.iloc[6, 2])
    if start_date:
        start_date = datetime.strptime(start_date.group(0), "%m/%Y").date()
    else:
        start_date = None
    return pl.from_dict(
        {
            "lon": float(table.iloc[3, 4]),
            "lat": float(table.iloc[4, 4]),
            "start_date": start_date,
        },
        schema_overrides={"start_date": pl.Date()},
    )


def retr_data(identifiers):
    tables = pl.concat(
        [
            parse_meta(
                pd.read_html(
                    liguria_path / "stazioni" / f"{identifier}.html", encoding="utf-8"
                )[0]
            )
            for identifier in identifiers
        ]
    )

    return pl.concat(
        [pl.DataFrame({"identifier": identifiers}), tables], how="horizontal"
    )

In [33]:
meta = (
    meta.join(
        retr_data(stations_info["identifier"].to_list()),
        on="identifier",
        how="left",
        validate="1:1",
    )
    .sort("anagrafica", "start_date")
    .with_columns(
        pl.col("start_date")
        .shift(-1)
        .over("anagrafica")
        .alias("stop_date")
        .dt.offset_by("-1d")
    )
    .with_columns(
        pl.col("start_date").fill_null(date(2000, 1, 1)),
        pl.col("stop_date").fill_null(date(2023, 12, 31)),
    )
    .filter(
        pl.col("anagrafica").ne("BOLSINE")
        | pl.col("start_date").ne(date(2000, 1, 1))
        | pl.col("stop_date").ne(date(2023, 12, 31))
    )
)

In [8]:
fd1 = (
    full_data.join(
        meta[["identifier", "anagrafica", "start_date", "stop_date"]], on="anagrafica"
    )
    .filter(
        (pl.col("start_date") <= pl.col("date"))
        & (pl.col("date") <= pl.col("stop_date"))
    )
    .drop("start_date", "stop_date")
)

In [11]:
fd1.filter(pl.struct("date", "identifier", "variable").is_duplicated()).sort(
    "identifier", "date"
)

date,value,valid,anagrafica,variable,identifier
date,f32,bool,str,str,str
2015-07-14,34.0,true,"""AIROLE""","""T_MAX""","""AIROL"""
2015-07-14,32.400002,true,"""AIROLE""","""T_MAX""","""AIROL"""
2015-07-14,22.0,true,"""AIROLE""","""T_MIN""","""AIROL"""
2015-07-14,20.0,true,"""AIROLE""","""T_MIN""","""AIROL"""
2015-07-15,32.099998,true,"""AIROLE""","""T_MAX""","""AIROL"""
2015-07-15,31.0,true,"""AIROLE""","""T_MAX""","""AIROL"""
2015-07-15,22.0,true,"""AIROLE""","""T_MIN""","""AIROL"""
2015-07-15,20.299999,true,"""AIROLE""","""T_MIN""","""AIROL"""
2015-07-16,31.799999,true,"""AIROLE""","""T_MAX""","""AIROL"""
2015-07-16,31.9,true,"""AIROLE""","""T_MAX""","""AIROL"""


In [17]:
full_data.filter(pl.struct("date", "anagrafica", "variable").is_duplicated()).group_by(
    "anagrafica"
).agg(pl.col("date").min().alias("first_date"), pl.col("date").max().alias("last_date"))

anagrafica,first_date,last_date
str,date,date
"""STATALE""",2012-02-27,2015-12-31
"""MONTE CAPPELLI…",2013-03-15,2013-03-20
"""CAMOGLI""",2015-04-25,2017-12-22
"""VERZI LOANO""",2014-03-10,2015-12-31
"""AIROLE""",2015-07-14,2015-12-31
"""DOLCEDO""",2005-12-31,2006-12-31
"""ISOVERDE""",2014-06-09,2014-06-13


In [25]:
meta.filter(pl.col("start_date") >= pl.col("stop_date"))

identifier,anagrafica,province,BACINO,elevation,lon,lat,start_date,stop_date
str,str,str,str,f64,f64,f64,date,date


### è impossibile risalire al periodo di rifermiento dei metadati!

In [34]:
meta = meta.sort("anagrafica", "stop_date").group_by("anagrafica").agg(
    pl.col("identifier").last(),
    pl.col("province").last(),
    pl.col("BACINO").last(),
    pl.col("elevation").mean(),
    pl.col("lon").mean(),
    pl.col("lat").mean()
)

In [35]:
meta

anagrafica,identifier,province,BACINO,elevation,lon,lat
str,str,str,str,f64,f64,f64
"""CONNA""","""CONNA""","""SAVONA""","""FRA IMPERO E C…",350.0,8.10363,43.97923
"""COLLODARI""","""RCOL0""","""GENOVA""","""RECCO""",14.0,9.15162,44.37697
"""RESORDO""","""CICA0""","""GENOVA""","""ENTELLA""",171.0,9.34719,44.4
"""PITELLI""","""PITLL""","""LA SPEZIA""","""MAGRA""",132.0,9.88761,44.09562
"""OSIGLIA""","""OSIGL""","""SAVONA""","""BORMIDA DI MIL…",620.0,8.20254,44.28647
"""GENOVA - NERVI…","""NEVI0""","""GENOVA""","""FRA BISAGNO ED…",45.0,9.0438,44.38237
"""TESTICO""","""TESTI""","""SAVONA""","""FRA IMPERO E C…",439.0,8.03517,44.00653
"""PANESI""","""PANES""","""GENOVA""","""ENTELLA""",25.0,9.35624,44.34209
"""CASARZA LIGURE…","""CASR0""","""GENOVA""","""FRA ENTELLA E …",120.0,9.45258,44.29405
"""URBE - VARA SU…","""URVAS""","""SAVONA""","""ORBA""",810.0,8.62739,44.46953


In [40]:
fd2 = full_data.join(meta[["anagrafica", "identifier"]], on = "anagrafica", how = "left").drop("anagrafica")

In [42]:
fd2.write_ipc(liguria_path / "dataset.arrow")

In [44]:
meta.write_ipc("../../../db/metadata/ARPA Liguria.arrow")

In [201]:
diffs.filter(
    (pl.col("londiff").abs() > 1e-4)
    | (pl.col("latdiff").abs() > 1e-4)
    | (pl.col("elevdiff").abs() > 20)
)

identifier,anagrafica,province,BACINO,elevation,lon,lat,start_date,end_date,londiff,latdiff,elevdiff
str,str,str,str,f64,f64,f64,date,date,f64,f64,f64
"""CAMIN""","""CAMINATA""","""GENOVA""","""ENTELLA""",90.0,9.41004,44.34004,,,0.000225,0.000025,-1.0
"""CMNT0""","""CAMINATA""","""GENOVA""","""ENTELLA""",92.0,9.40959,44.33999,,,-0.000225,-0.000025,1.0
"""CAMOG""","""CAMOGLI""","""GENOVA""","""FRA BISAGNO ED…",60.0,9.15931,44.34704,1969-12-01,2005-12-31,0.00017,0.0,2.0
"""CMGL0""","""CAMOGLI""","""GENOVA""","""FRA BISAGNO ED…",56.0,9.15897,44.34704,2006-01-01,,-0.00017,0.0,-2.0
"""CARPE""","""CARPE""","""SAVONA""","""FRA CENTA E QU…",610.0,8.16447,44.14819,2012-11-01,,-0.000375,0.002715,105.0
"""TOIR0""","""CARPE""","""SAVONA""","""FRA CENTA E QU…",400.0,8.16522,44.14276,,,0.000375,-0.002715,-105.0
"""DOLCD""","""DOLCEDO""","""IMPERIA""","""FRA ARGENTINA …",77.0,7.95115,43.90674,2003-10-01,2005-11-30,-0.000175,0.00042,0.0
"""DOLCE""","""DOLCEDO""","""IMPERIA""","""FRA ARGENTINA …",77.0,7.9515,43.9059,2005-12-01,,0.000175,-0.00042,0.0
"""GEMOL""","""GENOVA - MOLAS…","""GENOVA""","""BISAGNO""",62.0,8.98654,44.45433,,,0.00294,-0.00044,-34.5
"""MOLA0""","""GENOVA - MOLAS…","""GENOVA""","""BISAGNO""",131.0,8.98066,44.45521,,,-0.00294,0.00044,34.5


In [212]:
meta

identifier,anagrafica,province,BACINO,elevation,lon,lat,start_date,end_date
str,str,str,str,f64,f64,f64,date,date
"""AGORR""","""ALPE GORRETO""","""GENOVA""","""TREBBIA""",915.0,9.2363,44.6047,2004-12-01,
"""AIRO0""","""AIROLE""","""IMPERIA""","""ROYA""",110.0,7.54478,43.87309,2007-01-01,2012-12-31
"""AIROL""","""AIROLE""","""IMPERIA""","""ROYA""",103.0,7.54484,43.87296,2013-01-01,
"""ALASS""","""ALASSIO""","""SAVONA""","""FRA IMPERO E C…",25.0,8.1693,44.00624,2003-10-01,
"""ALBG0""","""ALBENGA""","""SAVONA""","""CENTA""",5.0,8.21262,44.04899,,
"""ALBIS""","""ALBISOLA""","""SAVONA""","""SANSOBBIA""",3.0,8.50887,44.32931,,
"""ALPIC""","""ALPICELLA""","""SAVONA""","""FRA SANSOBBIA …",435.0,8.52604,44.40667,2001-03-01,
"""AMBOR""","""AMBORZASCO""","""GENOVA""","""AVETO""",908.0,9.45496,44.51447,2019-10-01,
"""AMEFM""","""AMEGLIA FOCE M…","""LA SPEZIA""","""MAGRA""",4.0,9.9722,44.07884,2011-01-01,
"""AMERE""","""MERELLI""","""IMPERIA""","""ARGENTINA""",70.0,7.84759,43.88137,,
