In [2]:
import bs4
import polars as pl
import pandas as pd
from pathlib import Path
from io import StringIO
from urllib.parse import unquote
from tqdm.notebook import tqdm


# Root folder holding the ARPA FVG files on this machine.
base = Path.home() / "Local_Workspace" / "Datasets" / "ARPA" / "FVG"
# One saved response (station 103400, year 2016) used for ad-hoc inspection below.
sample = base / "data" / "103400" / "2016.html"
# Output folder for the per-station parquet archives.
dataset = base / "dataset"
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
dataset.mkdir(parents=True, exist_ok=True)

In [3]:
def read_csv_table(path: Path) -> str:
    """
    Extract the CSV embedded in an HTML response saved from the ARPA website.

    The first and last characters of the file are dropped, the literal
    "backslash-n-backslash-t" and "backslash-n" sequences are removed, and every
    remaining backslash is stripped. The cleaned markup is then parsed and the
    ``href`` of the element with id ``salvaDati`` (a ``data:`` URI) is decoded.

    Args:
        path (Path): The file path of the saved HTML response.

    Returns:
        str: The CSV table as a string, with percent-escapes decoded.

    Raises:
        ValueError: If the ``salvaDati`` element or its ``href`` is missing,
            or the ``href`` is not a ``data:`` URI.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    raw = path.read_text(encoding="utf-8")
    markup = raw[1:-1].replace(r"\n\t", "").replace(r"\n", "").replace("\\", "")

    # Name the parser explicitly: otherwise bs4 picks whichever one is installed
    # and emits a warning, so behaviour could differ between environments.
    soup = bs4.BeautifulSoup(markup, "html.parser")

    link = soup.find(id="salvaDati")
    href = link.attrs.get("href") if link is not None else None
    if href is None:
        raise ValueError(f"No 'salvaDati' download link found in {path}")
    if not href.startswith("data:"):
        raise ValueError(f"'salvaDati' link in {path} is not a data URI")

    # A data URI is "data:<mediatype>,<payload>": everything after the first
    # comma is the payload, whatever the exact media-type prefix looks like.
    _, _, payload = href.partition(",")
    # Decode the percent-escapes of the CSV payload.
    return unquote(payload)


# Column dtypes passed to pl.read_csv, in the column order of the exported table.
# "mese" is read as text here; read_dataframe casts it to Int32 afterwards.
read_schema = {
    "mese": pl.Utf8(),
    "giorno*": pl.Int32(),
    # Every remaining column is a floating-point measurement.
    **{
        column: pl.Float64()
        for column in (
            "Pioggia mm",
            "Temp. min gradi C",
            "Temp. med gradi C",
            "Temp. max gradi C",
            "Umidita' min %",
            "Umidita' med %",
            "Umidita' max %",
            "Vento med km/h",
            "Vento max km/h",
            "Dir. V. max gradi N",
            "Radiaz. KJ/m2",
            "Press. med hPa",
        )
    },
}


def read_dataframe(path: Path) -> pl.DataFrame:
    """
    Load one saved ARPA response into a polars DataFrame with a ``date`` column.

    Args:
        path (Path): File passed to ``read_csv_table``; its stem must parse as
            the integer year of the data (e.g. ``2016.html``).

    Returns:
        pl.DataFrame: The columns of ``read_schema`` minus ``mese`` and
            ``giorno*``, plus a ``date`` column built from year, month and day.
    """
    csv_text = read_csv_table(path)
    year = int(path.stem)

    table = pl.read_csv(
        StringIO(csv_text),
        separator=";",
        has_header=True,
        null_values=["-"],  # "-" marks a missing measurement
        ignore_errors=False,
        schema=read_schema,
    )

    # Drop the last three rows before casting "mese".
    # NOTE(review): presumably the textual footer of the export - confirm the count.
    daily_rows = table.head(-3)

    month_as_int = pl.col("mese").cast(pl.Int32())
    date_column = pl.date(pl.lit(year), pl.col("mese"), pl.col("giorno*")).alias(
        "date"
    )

    with_date = daily_rows.with_columns_seq(month_as_int, date_column)
    return with_date.drop(["mese", "giorno*"])

In [8]:
# Build one parquet archive per station folder under base / "data".
# Stations whose archive already exists are skipped as-is.
# NOTE(review): delete an archive to force it to be rebuilt.
errors = []  # (station, year) pairs whose file could not be parsed

# iterdir + is_dir selects directories on every Python version (the meaning of a
# trailing slash in a glob pattern changed between releases); sorted for a stable order.
station_dirs = sorted(p for p in (base / "data").iterdir() if p.is_dir())

for station in tqdm(station_dirs):
    station_archive = dataset / f"{station.stem}.parquet"
    if station_archive.exists():
        continue

    yearly_frames = []
    for year_file in sorted(station.glob("*.html")):
        try:
            year_frame = read_dataframe(year_file)
        except Exception:
            # Record the failure and move on. `except Exception` lets
            # KeyboardInterrupt through so the loop can still be stopped.
            print(f"Error in {station.stem}: {year_file.stem}")
            errors.append((station.stem, year_file.stem))
        else:
            # Append only on success; otherwise the frame of the previous file
            # (possibly from another station) would be stored a second time.
            yearly_frames.append(year_frame)

    if not yearly_frames:
        # Nothing parsed for this station: pl.concat cannot take an empty list.
        continue

    station_data = (
        pl.concat(yearly_frames, how="vertical")
        .with_columns(pl.lit(station.stem).alias("stazione"))
        .sort("date")
    )
    station_data.write_parquet(station_archive)

  0%|          | 0/57 [00:00<?, ?it/s]

Error in SAP: 2021
Error in SAP: 2017
Error in SAP: 2001
Error in SAP: 2000
Error in SAP: 2016
Error in SAP: 2020
Error in SAP: 2011
Error in SAP: 2007
Error in SAP: 2006
Error in SAP: 2010
Error in SAP: 2005
Error in SAP: 2013
Error in SAP: 2009
Error in SAP: 2008
Error in SAP: 2012
Error in SAP: 2004
Error in SAP: 2019
Error in SAP: 2003
Error in SAP: 2015
Error in SAP: 2014
Error in SAP: 2002
Error in SAP: 2018
Error in SAP: 2022


In [28]:
# Inspect the archive of station 103400 at the location the station loop writes to
# (dataset / "<station>.parquet").
pl.read_parquet(dataset / "103400.parquet")

Pioggia mm,Temp. min gradi C,Temp. med gradi C,Temp. max gradi C,Umidita' min %,Umidita' med %,Umidita' max %,Vento med km/h,Vento max km/h,Dir. V. max gradi N,Radiaz. KJ/m2,Press. med hPa,date,stazione
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,date,str
,,,,,,,,,,,,2000-01-01,"""103400"""
,,,,,,,,,,,,2000-01-02,"""103400"""
,,,,,,,,,,,,2000-01-03,"""103400"""
,,,,,,,,,,,,2000-01-04,"""103400"""
,,,,,,,,,,,,2000-01-05,"""103400"""
,,,,,,,,,,,,2000-01-06,"""103400"""
,,,,,,,,,,,,2000-01-07,"""103400"""
,,,,,,,,,,,,2000-01-08,"""103400"""
,,,,,,,,,,,,2000-01-09,"""103400"""
,,,,,,,,,,,,2000-01-10,"""103400"""


In [15]:
# Cross-check with pandas: parse the CSV extracted from the sample file.
sample_csv = read_csv_table(sample)
pd.read_csv(StringIO(sample_csv), sep=";", na_values=["-"], engine="python")

Unnamed: 0,mese,giorno*,Pioggia mm,Temp. min gradi C,Temp. med gradi C,Temp. max gradi C,Umidita' min %,Umidita' med %,Umidita' max %,Vento med km/h,Vento max km/h,Dir. V. max gradi N,Radiaz. KJ/m2,Press. med hPa
0,1,1.0,0.0,-4.6,-0.6,7.0,43.0,81.0,97.0,2.0,8.0,360.0,6203.0,
1,1,2.0,0.0,-5.9,-2.0,1.2,71.0,87.0,95.0,1.0,6.0,360.0,2014.0,
2,1,3.0,5.6,-0.4,0.8,2.7,84.0,94.0,98.0,1.0,15.0,43.0,1510.0,
3,1,4.0,1.2,-1.0,0.2,5.1,75.0,94.0,98.0,2.0,9.0,219.0,2624.0,
4,1,5.0,0.2,-2.1,-0.6,2.1,84.0,95.0,98.0,2.0,10.0,360.0,1465.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,12,30.0,0.2,-4.7,-1.8,6.0,52.0,84.0,95.0,1.0,10.0,91.0,6368.0,
365,12,31.0,0.0,-5.9,-2.4,6.7,40.0,80.0,94.0,1.0,6.0,58.0,6714.0,
366,NOTA BENE: l'utilizzo dei dati e delle informa...,,,,,,,,,,,,,
367,* dati dalle 00 alle 24 UTC (ora di Greenwich)...,,,,,,,,,,,,,


In [6]:
# Parse the saved response directly with pandas and show the first table it finds.
sample_tables = pd.read_html(sample)
sample_tables[0]

Unnamed: 0_level_0,mese<\/th>,giorno*<\/th>,Pioggia mm<\/th>,Temp. min °C<\/th>,Temp. med °C<\/th>,Temp. max °C<\/th>,Umidità min %<\/th>,Umidità med %<\/th>,Umidità max %<\/th>,Vento med km\/h<\/th>,Vento max km\/h<\/th>,Dir. V. max °N<\/th>,Radiaz. KJ\/m2<\/th>,Press. med hPa<\/th><\/tr>\n\t\t\n\t\t<\/thead>\n\t \t
Unnamed: 0_level_1,<\/th>,totale<\/th>,minimo<\/th>,medio<\/th>,massimo<\/th>\n\t <\/tr>\n\t\t <\/thead>\n\t\t,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1<\/td>,1<\/td>,0.2<\/td>,-4.6<\/td>,-1.1<\/td>,5.3<\/td>,55<\/td>,87<\/td>,99<\/td>,0<\/td>,8<\/td>,196<\/td>,6093<\/td>,-<\/td><\/tr>
1,1<\/td>,2<\/td>,10.8<\/td>,-3.5<\/td>,-0.4<\/td>,1.7<\/td>,81<\/td>,95<\/td>,100<\/td>,0<\/td>,6<\/td>,53<\/td>,1402<\/td>,-<\/td><\/tr>
2,1<\/td>,3<\/td>,9.0<\/td>,0.7<\/td>,3.3<\/td>,7.0<\/td>,88<\/td>,98<\/td>,100<\/td>,0<\/td>,10<\/td>,90<\/td>,2885<\/td>,-<\/td><\/tr>
3,1<\/td>,4<\/td>,0.0<\/td>,-0.6<\/td>,1.1<\/td>,3.6<\/td>,-<\/td>,-<\/td>,-<\/td>,0<\/td>,9<\/td>,360<\/td>,1415<\/td>,-<\/td><\/tr>
4,1<\/td>,5<\/td>,1.4<\/td>,-1.4<\/td>,0.6<\/td>,3.4<\/td>,83<\/td>,96<\/td>,100<\/td>,1<\/td>,14<\/td>,66<\/td>,3803<\/td>,-<\/td><\/tr>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,precipitazione [mm]<\/td>,2470.8<\/td>,-<\/td>,-<\/td>,-<\/td><\/tr>\n\t,,,,,,,,,
368,temperatura [\u00b0C]<\/td>,-<\/td>,-98.0<\/td>,10.2<\/td>,32.5<\/td><\/tr>\n\t,,,,,,,,,
369,vento [km\/h]<\/td>,-<\/td>,-<\/td>,3<\/td>,54<\/td><\/tr>\n\t,,,,,,,,,
370,umidit\u00e0 [%]<\/td>,-<\/td>,0<\/td>,81<\/td>,100<\/td><\/tr>\n\t,,,,,,,,,
