In [1]:
setwd(fs::path_abs("~/Local_Workspace/TesiMag"))
Sys.setlocale("LC_ALL", "UTF-8")
library(dplyr, warn.conflicts = FALSE)
library(openxlsx, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(stringi, warn.conflicts = FALSE)
library(assertr, warn.conflicts = FALSE)

# Project helper scripts. `load_dbs()` and the `query_*` functions used below
# are not defined in this notebook -- presumably they come from these files.
source("src/database/startup.R")
source("src/database/query/data.R")

# Database connection handles; later cells read from `conns$data`.
conns <- load_dbs()

# Identifiers of the merged sets processed by this notebook.
sets <- c(
    "ER", "FVG", "LOM", "MAR", "TAA2", "TOS",
    "UMB", "VDA", "PIE", "LIG", "VEN"
)

# Names of the raw source datasets whose metadata is queried further down.
datasets <- c(
    "ARPAPiemonte", "ARPAV", "ARPAL", "ARPALombardia", "ARPAUmbria", "TAA",
    "SIRToscana", "Dext3r", "ARPAFVG", "ARPAM", "ISAC", "SCIA"
)


In [2]:
# Set whose manual corrections are inspected in the next cells.
rg <- "LOM"

# "merged" checkpoint metadata for that set, pulled into memory.
meta <- collect(query_checkpoint_meta(rg, "merged", conns$data))

# Hand-edited workbook for the same set: external/correzioni/<rg>_edit.xlsx
corrections <- fs::path("external", "correzioni", paste0(rg, "_edit.xlsx")) |>
    read.xlsx()
# Disabled step (kept for reference): turns "<U+XXXX>" escapes in `name` back into characters.
# mutate(name = stri_unescape_unicode(str_replace_all(name, regex("<U\\+(.{4})>"), paste0("\\\\u", "\\1"))))


In [None]:
# Match the manual corrections to the current metadata through the lists of
# source sensor keys / source datasets. In the workbook these are stored as
# "*"-separated strings, so they are split into list columns before joining.
# NOTE(review): this relies on `meta` holding the same two columns as
# comparable list columns -- confirm (the cell was never executed).
corrections |>
    select(name, from_sensor_keys, from_datasets) |>
    mutate(
        across(
            c(from_sensor_keys, from_datasets),
            \(col) str_split(col, fixed("*"), simplify = FALSE)
        )
    ) |>
    inner_join(meta, by = c("from_sensor_keys", "from_datasets"))


In [4]:
# List the rows where the current metadata (".now") and the manual workbook
# (".manual") disagree, or where a sensor_key is present on one side only.
# NOTE(review): `!=` yields NA when either operand is NA, so a row whose only
# difference is a missing name is dropped by filter() -- confirm this is fine.
meta |>
    # Flatten every `from_*` column to a single ";"-separated string per row
    # so it can be compared with the text stored in the workbook.
    rowwise() |>
    mutate(across(starts_with("from_"), \(x) paste0(x, collapse = ";"))) |>
    ungroup() |>
    full_join(corrections, by = "sensor_key", suffix = c(".now", ".manual")) |>
    filter(
        name.now != name.manual |
            from_datasets.now != from_datasets.manual |
            from_sensor_keys.now != from_sensor_keys.manual |
            is.na(from_sensor_keys.now) |
            is.na(from_sensor_keys.manual)
    ) |>
    select(name.now, name.manual, from_sensor_keys.now, from_sensor_keys.manual) |>
    as.data.frame()


name.now,name.manual,from_sensor_keys.now,from_sensor_keys.manual
<chr>,<chr>,<chr>,<chr>


In [9]:
# Read every manual-correction workbook ("<set>_edit.xlsx") found in
# external/correzioni and stack them into a single table.
#
# The file filter is anchored on the base name: it must contain no "~" and
# must end in "_edit.xlsx". An unanchored pattern also matches Excel lock
# files ("~$<set>_edit.xlsx") through the part of the name after the "~".
# The pattern is passed through `regexp`, the actual argument of fs::dir_ls().
corrections <- fs::path_abs("./external/correzioni") |>
    fs::dir_ls(regexp = "/[^~/]+_edit\\.xlsx$") |>
    purrr::map(
        # Keep only the join keys, the corrected values (*_ok), their
        # precision flags (*_precision) and the `keep` marker.
        .f = \(path) read.xlsx(path) |> select(sensor_key, dataset, from_sensor_keys, from_datasets, ends_with("_ok"), ends_with("_precision"), keep)
    ) |>
    bind_rows() |>
    mutate(
        sensor_key = as.integer(sensor_key),
        # Source lists are stored as text separated by ";" or "*".
        from_datasets = str_split(from_datasets, regex(";|\\*")),
        from_sensor_keys = str_split(from_sensor_keys, regex(";|\\*")) |> purrr::map(as.integer),
        # A blank `keep` cell counts as TRUE.
        keep = coalesce(keep, TRUE),
        loc_correction = !is.na(lon_ok) | !is.na(lat_ok),
        elev_correction = !is.na(ele_ok),
        # Missing precision defaults to -1 when a position correction is
        # present and to 0 otherwise.
        loc_precision = if_else(loc_correction, coalesce(loc_precision, -1), coalesce(loc_precision, 0)) |> as.integer(),
    ) |>
    mutate(
        # Same defaulting rule for elevation; a location precision of -1 also
        # triggers the -1 default.
        elev_precision = if_else(elev_correction | (loc_precision == -1L), coalesce(elev_precision, -1), coalesce(elev_precision, 0)) |> as.integer()
    )


In [10]:
# Apply the manual corrections to the "merged" metadata of every set.
corrected_meta <- query_checkpoint_meta(sets, "merged", conns$data) |>
    collect() |>
    as.data.frame() |>
    full_join(
        corrections,
        by = c("sensor_key", "dataset", "from_sensor_keys", "from_datasets")
    ) |>
    # With a full join, a row found on one side only has NA in the columns of
    # the other side; this check rejects NA in the listed columns.
    assert(not_na, c(network, sensor_key, dataset, from_sensor_keys, from_datasets)) |>
    # A corrected value (*_ok) replaces the original one whenever it is present.
    mutate(
        name = coalesce(name_ok, name),
        elevation = coalesce(ele_ok, elevation),
        lat = coalesce(lat_ok, lat),
        lon = coalesce(lon_ok, lon)
    )


In [2]:
# "raw" checkpoint metadata of every source dataset, pulled into memory.
metadata <- query_checkpoint_meta(datasets, "raw", conns$data) |> collect()

# Series groups, one parquet file per set under db/extra/series_groups.
# `filename = TRUE` (spelled out: `T` is an ordinary, reassignable variable)
# asks query_parquet() for the source file name, which is turned into the
# `set` column by parse_filename() and then dropped.
groups_table <- query_parquet(fs::path_wd("db", "extra", "series_groups", sets, ext = "parquet"), filename = TRUE) |>
    mutate(set = parse_filename(filename, TRUE)) |>
    select(-filename) |>
    collect()

# Rank assigned to each (dataset, network) pair.
network_rank_table <- bind_rows(
    # Pairs of the ISAC dataset are listed by hand:
    # ISAC series are always ranked first, DPC series always last.
    tribble(
        ~dataset, ~network, ~network_rank,
        "ISAC", "ISAC", 1L,
        "ISAC", "DPC", 4L
    ),
    # SCIA series are ranked second to last.
    metadata |>
        filter(dataset == "SCIA") |>
        distinct(dataset, network) |>
        mutate(network_rank = 3L),
    # Every other dataset (the ARPA series) is ranked second.
    metadata |>
        filter(!dataset %in% c("SCIA", "ISAC")) |>
        distinct(dataset, network) |>
        mutate(network_rank = 2L)
)

# TO BE DECIDED WHETHER TO USE: RANK ASSIGNMENT BY SERIES LENGTH, SO THAT A
# SERIES THAT IS TOO SHORT IS NOT TAKEN AS THE REFERENCE.
# Two length classes over [from, to]: up to 365 * 4 -> rank 2, longer -> rank 1.
length_rank_table <- tibble(
    from = c(0L, 365L * 4L + 1L),
    to = c(365L * 4L, Inf),
    length_rank = c(2L, 1L)
) # ???

# Attach network and `sensor_last` from the raw metadata to every group member.
gs <- left_join(
    groups_table,
    select(metadata, dataset, sensor_key, network, sensor_last),
    by = c("dataset", "sensor_key")
)

# Order the members of each (set, gkey, variable) group: lowest network rank
# first, ties broken by the greatest `sensor_last`. `rank` is the resulting
# position; `skip_correction` is TRUE for every member of a group that
# contains at least one series whose network is "ISAC".
ranked_groups <- gs |>
    left_join(network_rank_table, by = c("dataset", "network")) |>
    group_by(set, gkey, variable) |>
    arrange(network_rank, desc(sensor_last), .by_group = TRUE) |>
    mutate(
        rank = row_number(),
        skip_correction = "ISAC" %in% network
    ) |>
    ungroup() |>
    select(-c(sensor_last, network_rank, network, from))

# ignore_corrections <- make_exclusion_table(tagged_analysis, NULL, network_x == "ISAC" | network_y == "ISAC")


In [45]:
# (Re)load the merging helpers; the dynamic_merge.* functions called below are
# presumably defined in this script.
source("notebooks/merging/correzioni_manuali.R")

# Trial run on the first two rows of the ranked table only.
ds_path <- fs::path_wd("data")
rk_test <- slice_head(ranked_groups, n = 2L)
offs <- dynamic_merge.group(
    ds_path,
    rk_test,
    10,
    0L
)


In [48]:
# Reload the merging helpers, then run the merge over the whole ranked table.
# The second argument points at <working dir>/merged_db; `ds_path` must
# already be defined by an earlier cell.
source("notebooks/merging/correzioni_manuali.R")
dynamic_merge.full(
    ds_path,
    fs::path_wd("merged_db"),
    ranked_groups,
    10,
    0L
)


In [50]:
# Open the two datasets stored under <working dir>/merged_db:
# the merged series ("data") and the per-series merge metadata ("meta").
merged_data <- fs::path_wd("merged_db", "data") |> open_dataset()
merged_meta <- fs::path_wd("merged_db", "meta") |> open_dataset()

In [52]:
# Auto-print the dataset handle: shows the number of parquet files and the column schema.
merged_data

FileSystemDataset with 6038 Parquet files
date: date32[day]
from_dataset: string
from_sensor_key: int32
value: double
set: string
gkey: int32
variable: int32

In [53]:
# Auto-print the metadata dataset handle: shows the number of parquet files and the column schema.
merged_meta

FileSystemDataset with 6038 Parquet files
gkey: int32
variable: int32
sensor_key: int32
dataset: string
set: string
rank: int32
skip_correction: bool
k0: double
k1: double
k2: double
k3: double
merged: bool
offset: int32