In [1]:
import os
import re
from pathlib import Path
from zipfile import ZipFile

import polars as pl

from lib.metas import load_series_info, load_stations
from lib.read import read_dext3r_tables, read_payloads, read_existing_requests

# Root directory of the local Dext3r dataset tree; every path used below is
# built from it. Set the DEXT3R_BASE environment variable to point the notebook
# at another location; the default keeps the original path.
base = Path(
    os.environ.get(
        "DEXT3R_BASE",
        "/Users/davidenicoli/Local_Workspace/Datasets/ARPA/EMILIA-ROMAGNA/Dext3r",
    )
)

In [2]:
# Load the shared inputs consumed by the cells below.
# NOTE(review): `payloads` is later tested with `task not in payloads`, so it
# presumably supports membership lookup by task id — confirm against lib.read.
payloads = read_payloads()
# In the cells visible here, `requests` is only referenced by the disabled
# (commented-out) archive-extraction code.
requests = read_existing_requests()
# Passed to read_dext3r_tables together with `payloads` when parsing fragments.
stations_meta = load_stations()

In [3]:
# Disabled one-off step: unpack the archive of every task listed in `requests`
# into the fragments directory.
# for task in requests["task"].unique():
#     with ZipFile(base / "data" / "zip" / f"dexter-{task}.zip") as zip:
#         zip.extractall(base / "data" / "fragments")

# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
# makes the processing order of the fragments reproducible between runs.
data_fragments = sorted((base / "data" / "fragments").glob("*.csv"))

In [3]:
# Disabled alternative to the task-driven extraction: unpack every archive found
# under data/zip into data/fragments, then list the extracted CSV fragments.
# Kept for reference only; nothing in this cell is executed.
# for zip in (base / "data" / "zip").glob("*.zip"):
#     with ZipFile(zip, "r") as z:
#         z.extractall(base / "data" / "fragments")
# data_fragments = list((base / "data" / "fragments").glob("*.csv"))

In [None]:
# Parse every CSV fragment, write its data table next to it as a parquet file,
# and gather the per-fragment metadata frames into the single `metas` frame.
# The accumulator has its own name so `metas` is only ever bound to a DataFrame.
meta_frames = []
for d in data_fragments:
    meta, tab = read_dext3r_tables(d, payloads, stations_meta)
    # A fragment for which no metadata is returned is skipped entirely:
    # it contributes no metadata rows and no parquet file is written for it.
    if meta is not None:
        meta_frames.append(meta)
        tab.write_parquet(d.with_suffix(".parquet"))
if not meta_frames:
    # pl.concat rejects an empty list; fail with the same exception type but
    # with enough context to see that no fragment could be read.
    raise ValueError(
        f"none of the {len(data_fragments)} CSV fragments yielded metadata; "
        "nothing to concatenate"
    )
metas = pl.concat(meta_frames, how="vertical")

In [14]:
# Export one row per distinct (network, name, id, lon, lat, elevation) combination.
# keep="first" together with maintain_order=True makes both the retained row and
# the row order of the CSV deterministic instead of engine-dependent.
metas.unique(
    ["network", "name", "id", "lon", "lat", "elevation"],
    keep="first",
    maintain_order=True,
).write_csv(base / "email_metadata.csv")

In [22]:
# Task ids present in the parsed metadata but missing from `payloads`, each
# terminated by a newline so the list can be handed to writelines() as-is.
seen_task_ids = metas["task"].unique().to_list()
invalid_tasks = [
    task_id + "\n" for task_id in seen_task_ids if task_id not in payloads
]
# Disabled: persist the list of tasks that have no payload.
# with open(base / "data" / "invalid" / "no_pl.txt", "wt") as f:
#     f.writelines(invalid_tasks)

In [14]:
# Inspect the series available for the stations of one specific task:
# restrict the metadata to that task, attach the series information by
# station id, and show the identifying columns as a pandas frame.
series_info = load_series_info()
task_stations = metas.filter(
    pl.col("task") == "c104ab09-d626-4c91-98dc-adbca918f147"
)
task_series = task_stations.join(series_info, left_on="id", right_on="station")
task_series.select(
    "id", "name", "network", "begin", "end", "agg_period", "agg_code", "lon", "lat"
).to_pandas()

Unnamed: 0,id,name,network,begin,end,agg_period,agg_code,lon,lat
0,"-/1062619,4419469/idrost",S. Michele centrale,idrost,1959-08-25 00:00:00,2013-01-01 00:00:00,86400,2,1062619,4419469
1,"-/1062619,4419469/idrost",S. Michele centrale,idrost,1959-08-25 00:00:00,2013-01-01 00:00:00,86400,3,1062619,4419469
2,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,2021-10-06 12:45:00,2024-02-08 14:15:00,900,2,1162264,4465378
3,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-09 11:00:00,2024-02-08 14:15:00,0,4,1162264,4465378
4,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-09 11:00:00,2024-02-08 14:00:00,3600,0,1162264,4465378
5,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-09 11:00:00,2024-02-08 14:00:00,3600,3,1162264,4465378
6,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-09 11:00:00,2024-02-08 14:00:00,3600,2,1162264,4465378
7,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-10 00:00:00,2024-02-08 00:00:00,86400,3,1162264,4465378
8,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-10 00:00:00,2024-02-08 00:00:00,86400,2,1162264,4465378
9,"-/1162264,4465378/locali",S. Pietro Capofiume,locali,1985-12-10 00:00:00,2024-02-08 00:00:00,86400,0,1162264,4465378


In [24]:
# Lazily scan every parquet fragment. The glob is materialised into a sorted
# list: scan_parquet documents a path or a list of paths as its source, not a
# one-shot generator, and a fixed file order keeps the scan reproducible.
fragment_parquets = sorted((base / "data" / "fragments").glob("*.parquet"))
data = pl.scan_parquet(fragment_parquets)
# Show the rows that carry a value and whose (id, start, variable) key occurs
# more than once.
# NOTE(review): both predicates are presumably evaluated on the unfiltered
# frame, so a key may count as duplicated because of a null-valued twin —
# confirm this is the intended check.
data.filter(
    pl.col("value").is_not_null(), pl.struct("id", "start", "variable").is_duplicated()
).collect()

start,stop,value,variable,id,task
"datetime[μs, UTC]","datetime[μs, UTC]",f64,str,str,str
2005-01-01 00:00:00 UTC,2005-01-02 00:00:00 UTC,-5.0,"""T_MIN""","""-/1048650,4420…","""0978a3e3-2713-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-0.1,"""T_MIN""","""-/1196188,4383…","""0978a3e3-2713-…"
2005-01-01 00:00:00 UTC,2005-01-02 00:00:00 UTC,-5.0,"""T_MIN""","""-/1196188,4383…","""0978a3e3-2713-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-2.2,"""T_MIN""","""-/1103295,4408…","""5df772eb-0469-…"
2005-01-01 00:00:00 UTC,2005-01-02 00:00:00 UTC,-2.5,"""T_MIN""","""-/1103295,4408…","""5df772eb-0469-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-3.5,"""T_MIN""","""-/940532,44632…","""5df772eb-0469-…"
2005-01-01 00:00:00 UTC,2005-01-02 00:00:00 UTC,-4.0,"""T_MIN""","""-/940532,44632…","""5df772eb-0469-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-5.0,"""T_MIN""","""-/980336,44770…","""3d453769-3359-…"
2005-01-01 00:00:00 UTC,2005-01-02 00:00:00 UTC,-6.0,"""T_MIN""","""-/980336,44770…","""3d453769-3359-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,0.0,"""T_MIN""","""-/944335,45017…","""3d453769-3359-…"


In [25]:
# Show a single non-null observation for each distinct `id`.
valued_rows = data.filter(pl.col("value").is_not_null())
valued_rows.unique(subset="id").collect()

start,stop,value,variable,id,task
"datetime[μs, UTC]","datetime[μs, UTC]",f64,str,str,str
1989-01-01 00:00:00 UTC,1989-01-02 00:00:00 UTC,0.0,"""T_MIN""","""-/933556,44833…","""a729a0ac-d4e1-…"
2006-12-31 09:00:00 UTC,2007-01-01 09:00:00 UTC,6.2,"""T_MAX""","""-/1033049,4480…","""a3cb7edd-3054-…"
2014-06-29 00:00:00 UTC,2014-06-30 00:00:00 UTC,14.4,"""T_MIN""","""-/1023567,4443…","""ac9a7e7e-cc2a-…"
2006-12-31 09:00:00 UTC,2007-01-01 09:00:00 UTC,7.1,"""T_MAX""","""-/1153379,4457…","""ae91cdd2-c081-…"
2006-01-01 00:00:00 UTC,2006-01-02 00:00:00 UTC,-2.8,"""T_MIN""","""-/1193626,4454…","""d19caadc-9ae2-…"
2013-01-01 00:00:00 UTC,2013-01-02 00:00:00 UTC,8.6,"""T_MAX""","""-/1167357,4468…","""1a7f2fd5-03bf-…"
1991-01-01 00:00:00 UTC,1991-01-02 00:00:00 UTC,3.0,"""T_MIN""","""-/1016216,4442…","""3af1a5ba-128e-…"
2006-12-31 09:00:00 UTC,2007-01-01 09:00:00 UTC,6.9,"""T_MAX""","""-/967965,45054…","""1aab5728-c763-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-3.8,"""T_MIN""","""-/1086924,4476…","""7d633190-c2fa-…"
2004-12-31 00:00:00 UTC,2005-01-01 00:00:00 UTC,-2.1,"""T_MIN""","""-/1017834,4449…","""1f67eaab-1d4f-…"


name,network,Comune,Provincia,Regione,Nazione,elevation,lon,lat,Bacino,id,task
str,str,str,str,str,str,f64,f64,f64,str,str,str
"""Forli' Villagr…","""climat""","""FORLI'""","""FORLI-CESENA""","""EMILIA-ROMAGNA…","""ITALY""",41.0,11.969637,44.225098,,"""-/1196964,4422…","""1eb3ea6f-3b04-…"
"""Forlimpopoli""","""climat""","""FORLIMPOPOLI""","""FORLI-CESENA""","""EMILIA-ROMAGNA…","""ITALY""",23.0,12.148725,44.198116,,"""-/1214873,4419…","""1eb3ea6f-3b04-…"
"""Lizzano""","""climat""","""CESENA""","""FORLI-CESENA""","""EMILIA-ROMAGNA…","""ITALY""",110.0,12.170409,44.126011,,"""-/1217041,4412…","""1eb3ea6f-3b04-…"
"""Bulgaria""","""climat""","""CESENA""","""FORLI-CESENA""","""EMILIA-ROMAGNA…","""ITALY""",36.0,12.300218,44.120007,,"""-/1230022,4412…","""1eb3ea6f-3b04-…"
"""Mulazzano""","""idrost""","""CORIANO""","""RIMINI""","""EMILIA-ROMAGNA…","""ITALY""",190.0,12.536224,43.952493,"""MARANO""","""-/1253622,4395…","""ee40a5da-e409-…"
"""Volano""","""locali""","""CODIGORO""","""FERRARA""","""EMILIA-ROMAGNA…","""ITALY""",1.0,12.250367,44.812868,"""PIANURA FRA PO…","""-/1225037,4481…","""ee40a5da-e409-…"
"""Campogalliano""","""idrost""","""CAMPOGALLIANO""","""MODENA""","""EMILIA-ROMAGNA…","""ITALY""",40.0,10.853262,44.693446,"""PIANURA FRA CR…","""-/1085326,4469…","""90b320cb-c191-…"
"""Verago""","""rmap""","""ALTA VAL TIDON…","""PIACENZA""","""EMILIA-ROMAGNA…","""ITALY""",320.0,9.40037,44.96295,"""TIDONE""","""-/940037,44962…","""90b320cb-c191-…"
"""Monte Cimone a…","""idrost""","""RIOLUNATO""","""MODENA""","""EMILIA-ROMAGNA…","""ITALY""",2165.0,10.7,44.183334,"""PANARO""","""-/1070000,4418…","""90b320cb-c191-…"
"""Ravarino""","""idrost""","""RAVARINO""","""MODENA""","""EMILIA-ROMAGNA…","""ITALY""",19.0,11.119224,44.765548,"""PIANURA FRA PA…","""-/1111922,4476…","""a431616a-6d0d-…"
