In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as pds
from pyarrow import parquet

# Root directory of the SCIA daily ("giornaliere") data. Set the SCIA_BASE
# environment variable to run the notebook against a copy stored elsewhere;
# when it is unset the original local path is used, so existing runs are
# unaffected.
scia_base = Path(
    os.environ.get(
        "SCIA_BASE",
        "/Users/davidenicoli/Local_Workspace/Datasets/SCIA/giornaliere/",
    )
)
# Parquet files read below for the "massime" (T_MAX) and "minime" (T_MIN) series.
tmax_path = scia_base / "massime" / "db" / "series.parquet"
tmin_path = scia_base / "minime" / "db" / "series.parquet"

In [4]:
def _load_series(path, variable, since=datetime(2000, 1, 1)):
    """Read one series file and tag every row with its variable name.

    Args:
        path: Parquet file to read.
        variable: Label written to the added ``variable`` column (e.g. "T_MIN").
        since: Only rows whose ``date`` is greater than or equal to this value
            are read (the filter is applied while reading the file).

    Returns:
        A ``pyarrow.Table`` with columns ``value``, ``date``, ``identifier``
        and ``variable``.
    """
    # rename_columns is positional: this assumes the file stores exactly three
    # columns in (value, date, identifier) order -- TODO confirm against the files.
    table = parquet.read_table(path, filters=[("date", ">=", since)]).rename_columns(
        ["value", "date", "identifier"]
    )
    # pa.repeat builds the constant column directly as an Arrow array instead
    # of materialising a Python list holding one string per row.
    label = pa.repeat(pa.scalar(variable, type=pa.string()), table.num_rows)
    return table.append_column("variable", label)


tmin = _load_series(tmin_path, "T_MIN")
tmax = _load_series(tmax_path, "T_MAX")

# Stack both variables, order rows by variable / station identifier / date,
# then cast every column to the compact storage types listed in the schema.
ds = (
    pa.concat_tables([tmin, tmax])
    .sort_by(
        [("variable", "ascending"), ("identifier", "ascending"), ("date", "ascending")]
    )
    .cast(
        pa.schema(
            [
                ("value", pa.float32()),
                ("date", pa.date32()),
                ("identifier", pa.uint32()),
                ("variable", pa.string()),
            ]
        )
    )
)
# Hive-flavoured partitioning keyed on the `variable` column, used when the
# table is written out as a dataset.
part = pds.partitioning(schema=pa.schema([("variable", pa.string())]), flavor="hive")

In [5]:
# Display the combined table: its schema followed by a truncated preview of each column.
ds

pyarrow.Table
value: float
date: date32[day]
identifier: uint32
variable: string
----
value: [[23.6,22,22,15.7,21,...,-1.4,-5.3,-2.6,-2.7,2.9]]
date: [[2001-05-18,2001-05-19,2001-05-20,2001-05-21,2001-05-22,...,2022-12-27,2022-12-28,2022-12-29,2022-12-30,2022-12-31]]
identifier: [[1,1,1,1,1,...,15181,15181,15181,15181,15181]]
variable: [["T_MAX","T_MAX","T_MAX","T_MAX","T_MAX",...,"T_MIN","T_MIN","T_MIN","T_MIN","T_MIN"]]

In [6]:
# Write the series table as a Feather dataset partitioned by `variable`.
# With the default existing_data_behavior="error" this cell raises as soon as
# the destination already contains data, i.e. on every re-run of the notebook.
# "delete_matching" clears each partition directory that is about to be
# written, so a re-run replaces the previous output instead of failing.
pds.write_dataset(
    ds,
    "../db/data/scia",
    format="feather",
    partitioning=part,
    existing_data_behavior="delete_matching",
)

In [8]:
def _load_metadata(path, variable, after=datetime(2000, 1, 1)):
    """Read one station-metadata file and tag every row with its variable name.

    Args:
        path: Parquet file to read.
        variable: Label written to the added ``variable`` column (e.g. "T_MIN").
        after: Only rows whose ``last_date`` is strictly greater than this
            value are read (the filter is applied while reading the file).

    Returns:
        The filtered ``pyarrow.Table`` with a string ``variable`` column appended.
    """
    # NOTE(review): this comparison is strict (">") whereas the series data is
    # filtered with ">=" on the same instant -- confirm the difference is intended.
    table = parquet.read_table(path, filters=[("last_date", ">", after)])
    # pa.repeat builds the constant column directly as an Arrow array instead
    # of materialising a Python list holding one string per row.
    label = pa.repeat(pa.scalar(variable, type=pa.string()), table.num_rows)
    return table.append_column("variable", label)


m_tmin = _load_metadata("../cache/metadata/SCIA/T_MIN_dated.parquet", "T_MIN")
m_tmax = _load_metadata("../cache/metadata/SCIA/T_MAX_dated.parquet", "T_MAX")

In [11]:
# Target column types for the station metadata. Table.cast expects the field
# names and their order to line up with the table being cast.
metadata_schema = pa.schema(
    [
        pa.field("valid_days", pa.uint32()),
        pa.field("identifier", pa.uint32()),
        pa.field("net_code", pa.uint8()),
        pa.field("user_code", pa.string()),
        pa.field("elevation", pa.float32()),
        pa.field("lat", pa.float32()),
        pa.field("lon", pa.float32()),
        pa.field("state", pa.string()),
        pa.field("province", pa.string()),
        pa.field("anagrafica", pa.string()),
        pa.field("rete", pa.string()),
        pa.field("first_date", pa.date32()),
        pa.field("last_date", pa.date32()),
        pa.field("variable", pa.string()),
    ]
)

# Stack the T_MIN and T_MAX metadata, order the rows by variable and then by
# the `anagrafica` column, and finally apply the schema above.
metadata_stacked = pa.concat_tables([m_tmin, m_tmax])
metadata_sorted = metadata_stacked.sort_by(
    [("variable", "ascending"), ("anagrafica", "ascending")]
)
metadata = metadata_sorted.cast(metadata_schema)

# Hive-flavoured partitioning keyed on the `variable` column, used when the
# metadata table is written out as a dataset.
partition_schema = pa.schema([pa.field("variable", pa.string())])
part = pds.partitioning(schema=partition_schema, flavor="hive")

In [12]:
# Display the metadata table: its schema followed by a truncated preview of each column.
metadata

pyarrow.Table
valid_days: uint32
identifier: uint32
net_code: uint8
user_code: string
elevation: float
lat: float
lon: float
state: string
province: string
anagrafica: string
rete: string
first_date: date32[day]
last_date: date32[day]
variable: string
----
valid_days: [[8307,6255,7300,1914,6238,...,7970,11257,6330,5243,5837]]
identifier: [[14716,14935,7332,7650,12490,...,8109,5770,7095,7115,11701]]
net_code: [[54,54,15,15,38,...,15,27,20,20,15]]
user_code: [["OR009B001","OR009B501","01133","01S4062","00078",...,"05184","00184","20034","20056","182780"]]
elevation: [[317,328,1645,2000,1171,...,11,12,31,65,451]]
lat: [[40.12547,40.13055,44.485153,44.520138,42.682728,...,45.59004,45.59004,45.006943,44.49614,38.653473]]
lon: [[8.818833,8.81389,6.981433,6.906438,13.209904,...,12.17524,12.17524,10.167858,11.200052,15.982944]]
state: [["Sardegna","Sardegna","Piemonte","Piemonte","Lazio",...,"Veneto","Veneto","Emilia-Romagna","Emilia-Romagna","Calabria"]]
province: [["Oristano","Oristano","Cun

In [14]:
# Write the metadata table as a Feather dataset partitioned by `variable`.
# With the default existing_data_behavior="error" this cell raises as soon as
# the destination already contains data, i.e. on every re-run of the notebook.
# "delete_matching" clears each partition directory that is about to be
# written, so a re-run replaces the previous output instead of failing.
pds.write_dataset(
    metadata,
    "../db/metadata/scia",
    format="feather",
    partitioning=part,
    existing_data_behavior="delete_matching",
)