In [1]:
import os
from pathlib import Path

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from pyarrow import csv

# Root directory holding the ARPA Lombardia input files and the outputs written
# by this notebook. It defaults to the original local location; set the
# ARPA_LOMBARDIA_DIR environment variable to run the notebook elsewhere
# without editing the code.
base = Path(
    os.environ.get(
        "ARPA_LOMBARDIA_DIR",
        "/Users/davidenicoli/Local_Workspace/Datasets/ARPA/LOMBARDIA/",
    )
)

In [2]:
# Load the tab-separated station/sensor registry with explicit column types,
# then keep only the rows whose "Tipologia" is "Temperatura".
metadata = csv.read_csv(
    base / "Stazioni_Meteorologiche.tsv",
    parse_options=csv.ParseOptions(delimiter="\t"),
    convert_options=csv.ConvertOptions(
        column_types=pa.schema(
            {
                "IdSensore": pa.uint32(),
                "IdStazione": pa.uint32(),
                "Tipologia": pa.dictionary(pa.int32(), pa.utf8()),
                "Provincia": pa.dictionary(pa.int32(), pa.utf8()),
                # NOTE(review): these two columns are explicitly typed as
                # strings, so the timestamp parser below is not applied to
                # them -- confirm whether they should be timestamps instead.
                "DataStart": pa.utf8(),
                "DataStop": pa.utf8(),
                "Quota": pa.float32(),
                "lng": pa.float32(),
                "lat": pa.float32(),
            }
        ),
        # Must be a list of formats: a bare string is iterated character by
        # character, registering "%", "d", "/", ... as separate strptime
        # patterns instead of the intended day/month/year format.
        timestamp_parsers=[r"%d/%m/%Y"],
    ),
).filter(pc.field("Tipologia") == "Temperatura")
# Single-column table holding the ids of the retained sensors.
ids = metadata.select(["IdSensore"])

In [4]:
# Column layout applied to every measurement CSV found directly under `base`.
measurement_schema = pa.schema(
    [
        pa.field("IdSensore", pa.uint32()),
        pa.field("Data", pa.timestamp("s")),
        pa.field("Valore", pa.float32()),
        pa.field("Stato", pa.utf8()),
        pa.field("idOperatore", pa.uint8()),
    ]
)

# Column names are supplied explicitly (in schema order), so the first row of
# each file is skipped -- presumably the header line; verify against the data.
measurement_format = ds.CsvFileFormat(
    read_options=csv.ReadOptions(
        skip_rows=1,
        column_names=measurement_schema.names,
    ),
    convert_options=csv.ConvertOptions(
        # Day-first timestamps, with or without a literal ".000" suffix.
        timestamp_parsers=[r"%d/%m/%Y %H:%M:%S", r"%d/%m/%Y %H:%M:%S.000"],
    ),
)

measurement_files = list(base.glob("*.csv"))

# Left semi join: keep only the measurements whose IdSensore appears in
# `metadata`, without adding any metadata column to the result.
dataset = ds.dataset(
    measurement_files,
    schema=measurement_schema,
    format=measurement_format,
).join(metadata, keys="IdSensore", join_type="left semi")

In [5]:
# Persist the joined measurements as a Parquet dataset under base / "dataset".
# No partitioning is applied here; hive partitioning on IdSensore (with
# max_partitions=1250) was tried and left disabled.
ds.write_dataset(
    data=dataset,
    base_dir=base / "dataset",
    format="parquet",
    max_open_files=1250,
)

In [4]:
# Re-open the dataset directory written under base / "dataset" as a dataset.
reopen = ds.dataset(source=base / "dataset")

In [4]:
# Rewrite the re-opened dataset in Feather format, hive-partitioned so that
# each IdSensore value gets its own "IdSensore=<value>" directory.
# NOTE(review): write_dataset caps the number of partitions (max_partitions,
# documented default 1024) -- confirm the number of distinct sensors fits.
ds.write_dataset(
    data=reopen,
    base_dir=base / "dataset2",
    format="feather",
    partitioning_flavor="hive",
    partitioning=["IdSensore"],
)

In [2]:
import polars as pl
# Lazily scan the single Parquet file "part-0.parquet" inside base / "dataset".
plds = pl.scan_parquet(base.joinpath("dataset", "part-0.parquet"))

In [4]:
# Order the rows by sensor, timestamp and operator, then write the result to
# base / "sorted.parquet" via the lazy sink.
(
    plds
    .sort(by=["IdSensore", "Data", "idOperatore"])
    .sink_parquet(base.joinpath("sorted.parquet"))
)

In [5]:
# Lazily scan the sorted Parquet file written to base / "sorted.parquet".
pldss = pl.scan_parquet(base.joinpath("sorted.parquet"))

In [None]:
# Group the sorted scan by sensor id. No aggregation is chained yet, so the
# cell only evaluates to the lazy group-by object.
pldss.group_by(["IdSensore"])