classification end 1

franzbischoff · Jun 25, 2023 · 9c64159 · 9c64159
1 parent 17b43aa
commit 9c64159
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 66 deletions.
diff --git a/_contrast_profile_ex/meta/meta b/_contrast_profile_ex/meta/meta
@@ -1,26 +1,26 @@
 name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
-.Random.seed|object|fbe882969501c735|||||||||||||||
+.Random.seed|object|0bc6f3245e3a0b90|||||||||||||||
 "%|||%"|function|031bda8ec980931b|||||||||||||||
 "%||NA%"|function|ef771f7e0b2b61dc|||||||||||||||
 activity|function|514ba81c8efb42b6|||||||||||||||
 ampl|function|22bb8917cc3362d0|||||||||||||||
 analysis_split|stem|280806d638f6257a|3c9d74acfb239c7d|9fe8c079932d5cb0|1188165946||t19530.6896124422s|f1ea2081a4ca0bea|734755|rds|local|group||analysis_split_1210acf9*analysis_split_d7478d53|0.037||
 assessment_split|stem|52e78968a55834c2|1b41fd63223669d1|9fe8c079932d5cb0|-264140407||t19530.6896095486s|227a75896b1a57c9|734108|rds|local|group|||0.051||
-best_shapelets|pattern|9048e953c1526a3e|1c009d3ed0dd0292||-261715276||||43772|rds|local|list||best_shapelets_d9008992*best_shapelets_1b4be4f2|0.084||
-best_shapelets_1b4be4f2|branch|0ddbda491763cd62|1c009d3ed0dd0292|d30f89d8b3f2c411|-631355285||t19530.6961702823s|3381959778bde498|23065|rds|local|list|best_shapelets||0.039||
-best_shapelets_8af6b52c|branch||af821b2a2061fc7e|713037692fee6b1d|143699523||t19524.0978760455s||0|rds|local|list|best_shapelets||0.106||1m22mColumn 1 must be named.Use .name_repair to specify repair.1mCaused by error in repaired_names22m33m39m Names cant be empty.31m39m Empty name found at location 1.
-best_shapelets_d9008992|branch|173cd1d0df1498f6|1c009d3ed0dd0292|f314d453af08f106|7945773||t19530.69616681s|7cf218cce386f5bd|20707|rds|local|list|best_shapelets||0.045||
 clean_pred|function|2f000150c7903a2e|||||||||||||||
 clean_splits_data|function|62dff47d87f498a6|||||||||||||||
 clean_truth|function|566b7692f08c8733|||||||||||||||
+combine_metrics|function|f8956391cf32bf4e|||||||||||||||
+combine_shapelets|pattern|23bb5269a2a49376|89d4c03d7e16deef||-644834951||||8048633|rds|local|list||combine_shapelets_5ce81ed9*combine_shapelets_5a53051f|251.084||
+combine_shapelets_5a53051f|branch|fc054da0ff414b85|89d4c03d7e16deef|5d36991918f6a77c|-210299615||t19532.9144598266s|e61edd702441e4a1|4069990|rds|local|list|combine_shapelets||123.524||
+combine_shapelets_5ce81ed9|branch|5b647f138f8e53cc|89d4c03d7e16deef|e82d83129e96e6de|-1911163695||t19532.9130236457s|01d2592bc5ab26be|3978643|rds|local|list|combine_shapelets||127.56||
 compl|function|1cc0810c2c8fe26b|||||||||||||||
 complexity|function|5fe702a01cef2a6e|||||||||||||||
 compute_arcs|function|1a4a6dc48008b78a|||||||||||||||
 compute_companion_stats|function|76613610273412a9|||||||||||||||
 compute_filters|function|7329a063b58bdd91|||||||||||||||
 compute_floss|function|2b4de81bfe11e55d|||||||||||||||
 compute_metrics_topk|function|7be1c2170ed3e678|||||||||||||||
-compute_overall_metric|function|48f197ea760af3ee|||||||||||||||
+compute_overall_metric|function|b35f2808515fc23b|||||||||||||||
 compute_s_profile_with_stats|function|2d5c844d8e0e655f|||||||||||||||
 compute_score_regimes|function|d6926f3b87fe56cf|||||||||||||||
 compute_streaming_profile|function|358c55ef8991349d|||||||||||||||
@@ -30,14 +30,16 @@ const_signals|object|90e8be76eb583b77|||||||||||||||
 contrast_profiles|pattern|8ea810d80feb870c|334467772a0e266e||-1929489330||||72541974|rds|local|list||contrast_profiles_abdeb381*contrast_profiles_a82e39cb|119.409||
 contrast_profiles_a82e39cb|branch|2497215cdf26af32|334467772a0e266e|5bb714e3c20da8e1|-1652393871||t19530.6910479256s|e23278cc435f1cf6|37327513|rds|local|list|contrast_profiles||58.355|self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.|
 contrast_profiles_abdeb381|branch|1632ad0e3c814b87|334467772a0e266e|13039016b63cc2b6|881813762||t19530.6903450048s|149433ddef8a067d|35214461|rds|local|list|contrast_profiles||61.054|self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.. self_mp contains non finite values. This may happen for small windows.|
-contrast_profiles_e327ff9d|branch||99a97d46b20453c6|f8db41fc6e0c9dbc|1788068980||t19524.0587254531s||0|rds|local|list|contrast_profiles||0.09|Unknown or uninitialised column splits.|Assertion on split failed. Must be of class list, not NULL.
 contrastprofile_topk|function|07384bc0c436ef7a|||||||||||||||
 dataset|stem|50b98d6e099df34f|aba3c22efecde197|ebe8dca02f56cdbb|914614260||t19524.0559056349s|5f270017c99e1c09|490195|rds|local|vector|||36.41||
 deriv|function|e067b65c68e2da0b|||||||||||||||
 deriv2|function|b26f60bb778048b2|||||||||||||||
 deriv3|function|388b2d63c0e892a5|||||||||||||||
 dev_mode|object|df161ddba3dacc9e|||||||||||||||
 ecg_kurtosis|function|224ebe1546bc6262|||||||||||||||
+extract_metadata|pattern|369b92fb1d67c1cf|3ef92268c8a48d8a||-966511094||||360619|rds|local|list||extract_metadata_289470d8*extract_metadata_c993f884|14.571||
+extract_metadata_289470d8|branch|17b66f1c0164c7fb|3ef92268c8a48d8a|804b58607944dc99|-41936223||t19532.9114513049s|9155146f8f79cbb9|193475|rds|local|list|extract_metadata||7.362||
+extract_metadata_c993f884|branch|34ed29116d53284c|3ef92268c8a48d8a|849ec2ef7595555f|-689378196||t19532.9115406569s|f4f7b0e4e6f07057|167144|rds|local|list|extract_metadata||7.209||
 extract_regime_sample|function|8d677706b0893101|||||||||||||||
 extract_regimes|function|e70966b5d9abe62e|||||||||||||||
 f_score|function|38b2fb4e008f352a|||||||||||||||
@@ -388,11 +390,6 @@ file_paths_files|stem|a3579fc2766a451a|c44d358350220e73|8fb75deda0f036e8|1253326
 filter_best_solutions|function|32670694f6830d9f|||||||||||||||
 filter_data|function|96623b0c421b9dd1|||||||||||||||
 find_all_files|function|6463af0421144fad|||||||||||||||
-find_shapelets|pattern|c6707fe2c86c8c6f|c8f0fdc057954dd2||-1129503597||||8048633|rds|local|list||find_shapelets_21e88cf5*find_shapelets_6852b839|199.679||
-find_shapelets_21e88cf5|branch|5b647f138f8e53cc|c8f0fdc057954dd2|c3ff81c56b681833|-957965849||t19530.692233613s|01d2592bc5ab26be|3978643|rds|local|list|find_shapelets||87.44||
-find_shapelets_49cb1ee5|branch|18710425632c69e5|030d76be38eacacc|daa5417852e0dff3|-763398733||t19530.6038711848s|d87576600223a83f|4758660|rds|local|list|find_shapelets||310.141||
-find_shapelets_4b79072b|branch||61ebafe7fa302917|58f228c3171f3a08|1963156974||t19524.0711582264s||0|rds|local|list|find_shapelets||0.117||operator is invalid for atomic vectors
-find_shapelets_6852b839|branch|fc054da0ff414b85|c8f0fdc057954dd2|d1404198a41dd933|1164592160||t19530.6935397139s|e61edd702441e4a1|4069990|rds|local|list|find_shapelets||112.239||
 find_solutions|function|60abac63439ba648|||||||||||||||
 fit_model|function|f07b8760be6f5b09|||||||||||||||
 gamma_function|function|1429884f4efea59d|||||||||||||||
@@ -429,10 +426,7 @@ read_ecg_csv|function|37ca65c1625b5755|||||||||||||||
 read_ecg_with_atr|function|af525b0ff46cf3ab|||||||||||||||
 reshape_ds_by_truefalse|function|00122e47789973df|||||||||||||||
 rmssd_r|function|d94b973ed64cd2a9|||||||||||||||
-score_by_segment|pattern|39303a0493a5d007|f1553d3ff971c3bc||-1653800725||||360619|rds|local|list||score_by_segment_289470d8*score_by_segment_c993f884|13.114||
-score_by_segment_289470d8|branch|17b66f1c0164c7fb|f1553d3ff971c3bc|804b58607944dc99|1589646323||t19530.691142023s|9155146f8f79cbb9|193475|rds|local|list|score_by_segment||7.34||
-score_by_segment_67b13b4e|branch|eaf643727a8b980f|dfb32833461b67eb|c0f976b2f6781a51|2129449300||t19530.6002727353s|507fcb6fdc85a6bd|56559|rds|local|list|score_by_segment||7.947||
-score_by_segment_c993f884|branch|34ed29116d53284c|f1553d3ff971c3bc|849ec2ef7595555f|-588623498||t19530.6912151708s|f4f7b0e4e6f07057|167144|rds|local|list|score_by_segment||5.774||
+rrank|function|8417f6fe9f0b2347|||||||||||||||
 score_by_segment_window|function|1b390036c1b44577|||||||||||||||
 score_candidates|function|8d31d00234d38174|||||||||||||||
 score_existence|function|28f698c43ad369e2|||||||||||||||
@@ -443,11 +437,15 @@ score_regimes|function|638a62b145cde2b8|||||||||||||||
 score_regimes_precision|function|750133b7a87964d6|||||||||||||||
 score_solutions|function|7b4d0386def41d3b|||||||||||||||
 sd_r|function|edc210211e6db09c|||||||||||||||
+self_optimize_classifier|pattern|ab0c1ccdd10838ad|2bdf723f760ca290||828804313||||8087291|rds|local|list||self_optimize_classifier_cf2ff651*self_optimize_classifier_ef8e95c8|125.723||
+self_optimize_classifier_cf2ff651|branch|d909467041b5f246|2bdf723f760ca290|c47817381d3c3fc2|-286065733||t19532.9152184807s|4ea50290cfd2eace|4000092|rds|local|list|self_optimize_classifier||64.615||
+self_optimize_classifier_ef8e95c8|branch|a9570ef091275c7e|2bdf723f760ca290|aa6244bc22d2ba1e|807916067||t19532.9159355567s|b071c57439beb0ec|4087199|rds|local|list|self_optimize_classifier||61.108||
 skip_graphics|object|909013909b1f03d4|||||||||||||||
 sprintf_transformer|function|2199f7feeee38815|||||||||||||||
-test_classifiers_self|pattern|5816d884d48ce9af|aa1108f63f1f6791||1817600917||||33041|rds|local|list||test_classifiers_self_88343ce0*test_classifiers_self_0d402c0d|122.022||
-test_classifiers_self_0d402c0d|branch|abdcad7796047ba5|aa1108f63f1f6791|05b6cd534cd4b061|504432482||t19530.6949602859s|720a447f18066eb5|14914|rds|local|list|test_classifiers_self||58.524||
-test_classifiers_self_88343ce0|branch|89a27c0c4ec7ed4e|aa1108f63f1f6791|f8b49478c800b59e|1155594481||t19530.6942790026s|1948fe8b40f57e76|18127|rds|local|list|test_classifiers_self||63.498||
+test_classifier|pattern|5d3c2c15909d457c|f700c163f96b6ac4||-564191998||||17106|rds|local|list||test_classifier_6035a7f9*test_classifier_8d46cbb6|113.417||
+test_classifier_6035a7f9|branch|56707945b64bf703|f700c163f96b6ac4|19fca799f36f7de5|-1327951738||t19533.1796791558s|e2a03b78c31aecbe|10962|rds|local|list|test_classifier||62.566||
+test_classifier_8d46cbb6|branch|15f38f3aaa5485b8|f700c163f96b6ac4|8ff35da8e72eb203|-1455901532||t19533.1802714174s|e1befc0c148a1c47|6144|rds|local|list|test_classifier||50.851||
+test_holdout|stem|0b5151a5afd69904|f973ef569daaa912|649252e6e73a4ff1|1685724330||t19533.1918725688s|d5f301c4216c4553|13103|rds|local|vector|||5.113||
 testing_split|stem|d50d3b13a62f03d5|c15f1f750870070b|391ed944dfc56298|184895865||t19524.0978691601s|3db7e9e35ac20187|123790|rds|local|vector|||0.002||
 topk_distance_profiles|function|275ec9c8aa99e412|||||||||||||||
 training_split|stem|77ba96cd3fc7b90d|74d47449a574137b|391ed944dfc56298|708679313||t19524.0562479008s|35e72503cf1ccc8c|366898|rds|local|vector|||0.001||

diff --git a/scripts/_contrast_profile_ex.R b/scripts/_contrast_profile_ex.R
@@ -56,7 +56,7 @@ tar_option_set(
   ),
   format = "rds",
   memory = "transient",
-  # debug = "find_shapelets",
+  # debug = "combine_shapelets",
   garbage_collection = TRUE
 )
 
@@ -288,12 +288,12 @@ list(
   #   iteration = "list" # thus the objects keep their attributes
   # ),
   tar_target(
-    #### Pipeline: score_by_segment - Preparation of the data: the model's data is the shapelets with metadata ----
-    score_by_segment,
+    #### Pipeline: extract_metadata - Preparation of the data: the model's data is the shapelets with metadata ----
+    extract_metadata,
     {
       res <- list()
       for (i in seq_len(var_vfolds)) {
-        cli::cli_alert_info("Scores by segment, fold {i}.")
+        cli::cli_alert_info("Extracting metadata, fold {i}.")
         # These parameters can be tuned on `recipes`. These default values seem to be good enough
         tune1 <- 0.1
         tune2 <- 1 / 3
@@ -309,8 +309,8 @@ list(
     iteration = "list"
   ),
   tar_target(
-    #### Pipeline: find_shapelets - This is the model fit. ----
-    find_shapelets,
+    #### Pipeline: combine_shapelets - This is the model fit. ----
+    combine_shapelets,
     {
       # Here we can try: fitting all possible solutions and later score them and finally try
       # to find which metadata is the best to filter the solutions
@@ -322,7 +322,7 @@ list(
       for (i in seq_len(var_vfolds)) {
         cli::cli_alert_info("Finding solutions, fold {i}.")
         tune3 <- 10 # this could be tuned, but some trials show that limiting to smaller K's doesn't increase the performance
-        solutions <- find_solutions(score_by_segment[[i]],
+        solutions <- find_solutions(extract_metadata[[i]],
           min_cov = 10,
           max_shapelets = 20, # this can be more than topk
           rep = 5000,
@@ -339,12 +339,12 @@ list(
       }
       res
     },
-    pattern = map(score_by_segment),
+    pattern = map(extract_metadata),
     iteration = "list"
   ),
   tar_target(
-    #### Pipeline: test_classifiers_self - This is the current score function. ----
-    test_classifiers_self,
+    #### Pipeline: self_optimize_classifier - This is the current score function. ----
+    self_optimize_classifier,
     {
       # With the results of this step, plus the fitted solutions, we need to find which
       # metadata is the best to filter the solutions
@@ -353,21 +353,23 @@ list(
       res <- list()
       for (i in seq_len(var_vfolds)) {
         fold <- rsample::get_rsplit(analysis_split, i)
-        res[[i]] <- list()
-        shapelets <- find_shapelets[[i]]
+        shapelets <- combine_shapelets[[i]]
 
         # the `compute_metrics_topk` function may need testing on the `TRUE` criterion
         # currently, if `ANY` shapelet matches, it is considered a positive
         # as an alternative, we can try to use `ALL`, `HALF` or other criteria
-        res[[i]] <- compute_metrics_topk(fold, shapelets, 6, TRUE)
+
+        training_metrics <- compute_metrics_topk(fold, shapelets, 6, TRUE)
+
+        res[[i]] <- list(training_metrics = training_metrics, shapelets = shapelets)
       }
 
       res # list(fold = res, overall = overall)
 
 
-      # aa <- tibble::as_tibble(purrr::transpose(test_classifiers_self[[1]][[i]]))
+      # aa <- tibble::as_tibble(purrr::transpose(self_optimize_classifier[[1]][[i]]))
       # aa <- dplyr::mutate_all(aa, as.numeric)
-      # aa <- dplyr::bind_cols(find_shapelets[[1]][[i]], aa) |>
+      # aa <- dplyr::bind_cols(combine_shapelets[[1]][[i]], aa) |>
       #   dplyr::select(-data) |>
       #   dplyr::mutate(coverage = as.numeric(coverage), redundancy = as.numeric(redundancy))
       # bb <- dplyr::bind_rows(bb, aa)
@@ -381,41 +383,83 @@ list(
       # )
       # GGally::ggpairs(bb, aes(alpha = 0.05), lower = list(continuous = "smooth"))
     },
-    pattern = map(find_shapelets, analysis_split),
+    pattern = map(combine_shapelets, analysis_split),
     iteration = "list"
   ),
   tar_target(
-    best_shapelets,
+    test_classifier,
     {
       # Here we test the solutions we chose on the assessment split
+      # The final `model` we need is the shapelet
+      class(assessment_split) <- c("manual_rset", "rset", class(assessment_split))
 
       res <- list()
+
       for (i in seq_len(var_vfolds)) {
-        aa <- tibble::as_tibble(purrr::transpose(test_classifiers_self[[i]]))
-        aa <- dplyr::mutate_all(aa, as.numeric)
-        aa <- dplyr::bind_cols(find_shapelets[[i]], aa) |>
-          # dplyr::select(-data) |> ####### The final `model` we need is the shapelet
-          dplyr::mutate(across(!where(is.list), as.numeric))
-
-        sup_spec <- quantile(aa$specificity, 0.75, na.rm = TRUE)
-        sup_prec <- quantile(aa$precision, 0.75, na.rm = TRUE)
-        min_fp <- min(aa$fp, na.rm = TRUE)
-        min_fn <- min(aa$fn, na.rm = TRUE)
-
-        aa <- aa |>
-          dplyr::filter(
-            precision > sup_prec,
-            specificity > sup_spec
-          ) |>
-          dplyr::arrange(fp, fn) |>
-          dplyr::slice_head(n = 10)
-
-        res[[i]] <- aa
+        fold <- rsample::get_rsplit(assessment_split, i)
+
+        best_shapelets <- combine_metrics(
+          self_optimize_classifier[[i]]$training_metrics,
+          self_optimize_classifier[[i]]$shapelets
+        )
+
+        bb <- compute_metrics_topk(fold, best_shapelets, 6, TRUE)
+        bb <- list_dfr(bb)
+        aa <- best_shapelets |> dplyr::select(tp:kappa)
+        metadata <- best_shapelets |> dplyr::select(c_total:data)
+        namesmeta <- names(metadata)
+        cc <- tibble::as_tibble(aa - bb)
+        namecols <- names(aa)
+        namecolsa <- glue::glue("{namecols}_aa")
+        namecolsb <- glue::glue("{namecols}_bb")
+        colnames(aa) <- namecolsa
+        colnames(bb) <- namecolsb
+        cc <- dplyr::bind_cols(cc, aa, bb)
+        cc <- cc %>% dplyr::select(sort(names(.)))
+        cc <- cc %>% dplyr::relocate(tp, tp_aa, tp_bb, fp, fp_aa,
+          fp_bb, tn, tn_aa, tn_bb, fn, fn_aa, fn_bb,
+          .before = 1
+        )
+        cc <- dplyr::bind_cols(cc, metadata)
+        metrics <- cc |>
+          dplyr::filter(abs(precision) < rrank(precision, 2, 2)) |>
+          dplyr::arrange(
+            dplyr::desc(precision_bb), dplyr::desc(specificity_bb),
+            dplyr::desc(km_bb), fp_bb, fn_bb
+          )
+        metrics <- metrics |>
+          dplyr::select(c(all_of(namecolsb), all_of(namesmeta))) |>
+          dplyr::rename_with(~ gsub("_bb", "", .x, fixed = TRUE))
+        res[[i]] <- dplyr::slice_head(metrics, n = 1)
       }
-      res
+      overall <- compute_overall_metric(res)
+      list(fold = res, overall = overall)
     },
-    pattern = map(test_classifiers_self, find_shapelets),
+    pattern = map(self_optimize_classifier, assessment_split),
     iteration = "list"
+  ),
+  tar_target(
+    #### Pipeline: test_holdout - Score the selected shapelets on the testing split. ----
+    test_holdout,
+    {
+      # Here we take the shapelets selected in `test_classifier` and compute their
+      # metrics on `testing_split`
+      fold <- list(data = testing_split)
+      res <- list()
+      for (i in seq_len(var_vfolds_repeats)) {
+        shapelets <- list_dfr(test_classifier[[i]]$fold)
+
+        # the `compute_metrics_topk` function may need testing on the `TRUE` criterion
+        # currently, if `ANY` shapelet matches, it is considered a positive
+        # as an alternative, we can try to use `ALL`, `HALF` or other criteria
+        metric <- list_dfr(compute_metrics_topk(fold, shapelets, 6, TRUE))
+        combined <- dplyr::bind_cols(metric, (shapelets |> dplyr::select(c_total:data)))
+        res[[i]] <- combined
+      }
+
+      overall <- compute_overall_metric(res)
+      list(final = res, overall = overall)
+    }
   )
 )