flowchart

franzbischoff · Jun 6, 2023 · e97479a · e97479a
1 parent 432425c
commit e97479a
Show file tree

Hide file tree

Showing 5 changed files with 240 additions and 326 deletions.
diff --git a/R/contrast_data.R b/R/contrast_data.R
@@ -26,6 +26,37 @@ register_contrast_model <- function() {
   #   )
   # )
 
+  # contrasts
+  # coverages -> creates cov_sum and cov_idxs
+  # platos
+  # thresholds
+  # cov_counts # never used
+  # num_segments
+
+  # c_total
+  # c_median
+  # c_mean
+  # c_sd
+  # cov_con_mean
+  # k_mean
+  # cov_mean
+  # coverage
+  # cov_percent
+  # redundancy
+  # samples
+
+  # data {
+  #  window
+  #  k***
+  #  plato
+  #  contrast***
+  #  threshold
+  #  cov_sum (same as cov_counts) ***
+  #  cov_idxs***
+  #  cov_con***
+  # }
+
+
   parsnip::set_fit(
     model = "contrast_model",
     eng = "contrast_profile",

diff --git a/_contrast_profile/meta/meta b/_contrast_profile/meta/meta
diff --git a/flowcharts/contrast_profile.mmd b/flowcharts/contrast_profile.mmd
@@ -8,7 +8,8 @@ title: Contrast Profile Classifier
     "theme": "dark",
     "fontFamily": "Fira Code Medium, Trebuchet MS, Verdana, Arial, Sans-Serif",
     "flowchart": {
-      "diagramPadding": 10
+      "rankSpacing": 70,
+      "nodeSpacing": 70
     }
   }
 }%%
@@ -23,41 +24,32 @@ classDiagram
 %% --   Link (Solid) (Association without arrows)
 %% ..   Link (Dashed) (Association without arrows and not navigable)
 
-  Data "many * classes" <.. "1 * window_sizes" Contrast
+
+  Data "*" <.. "1" Contrast
+  Data "*" o-- "1" ShapeletMeta
   Contrast "1" <.. "1" Shapelet
   Shapelet "1" <.. "1" ShapeletMeta
-  Data <.. ShapeletMeta
-%%  Shapelet "1..window_sizes" *-- "1" PanContrast_TopK
-%%  Contrast "1..window_sizes" *-- "1" PanContrast_TopK
-
-%%  class PanContrast_TopK {
-%%    Contrast contrasts
-%%    Shapelet shapelets
-%%  }
-
 
   class Data {
     List~Factor~ classes
-    List~Numeric~ ts
+    List~float[]~ ts
     List~int~ ids
   }
 
-
   %% class is the positive class
   class Contrast {
-    List~int~ window_sizes*
     Factor class*
-    List~Numeric~ contrast_profiles
+    List~int~ window_sizes*
+    List~float[]~ contrast_profiles
   }
 
   class Shapelet {
-    List~int~ window_sizes*
     Factor class*
-    List~Numeric~ platos
-    List~int~ platos_indices
-    List~Numeric~ platos_twin
-    List~int~ platos_twin_indices
-    List~float~ plato_nary_contrasts
+    int num_platos*
+    List~int~ window_sizes*
+    List~int[num_platos]~ platos_indices
+    List~int[num_platos]~ platos_twin_indices
+    List~float[num_platos]~ plato_nary_contrasts
   }
 
   %% all Lists have dim m,n where m == num_of_shapelets(k) and n == length(window_sizes)
@@ -66,42 +58,49 @@ classDiagram
   %% TODO: this needs to be reshaped
   %% TODO: num_segments reflects the number of positive samples
   class ShapeletMeta {
-    List~int~ window_sizes*
     Factor class*
-    List~Numeric~ thresholds
-    List~Numeric~ overall_contrasts
-    List~bool~ coverages
-    List~int~ coverages_counts
-    int num_segments
+    int num_segments*
+    int num_platos*
+    List~int~ window_sizes*
+    List~float[num_platos]~ joint_platos
+    List~float[num_platos]~ thresholds
+    List~float[num_platos]~ contrasts
+    List~bool[num_platos]~ coverages
   }
 
-  ShapeletMeta *-- Fitted
-  Fitted *-- Model
-  Terms *-- Model
-  ShapeletMeta <-- Terms : optimizes
+  ShapeletMeta "n" <|.. "1" Candidate : Optimize
+  Data "*" o-- "1" Score
+  Score "1" <|.. "1" Candidate : Optimize
+  Candidate <|-- Fitted : select
 
-  class Fitted {
-    Factor class*
-    ShapeletMeta best_shapelets
-    List~Numeric~ platos
-    List~Numeric~ thresholds
+  class Score {
+    float accuracy
+    float f1
+    float precision
+    float recall
   }
 
-  class Terms {
-  float contrast_total
-  float contrast_median
-  float contrast_mean
-  fload contrast_std
-  fload cov_con_ratio_mean
-  float k_mean
-  float cov_mean
-  fload coverage
-  fload cov_percent
-    int redundancy
-    int num_shapelets
+  class Fitted {
+    Factor class*
+    int num_platos*
+    Candidate best_score_canditate*
+    - List~float[num_platos]~ joint_platos
+    - List~float[num_platos]~ thresholds
   }
 
-  class Model {
-    Fitted fitted_values
-    Terms terms
+  class Candidate {
+    Factor class*
+    Score score*
+    List~ShapeletMeta~ shapelets*
+    float contrast_total
+    float contrast_median
+    float contrast_mean
+    float contrast_std
+    float cov_con_ratio_mean
+    float k_mean
+    float cov_mean
+    float coverage
+    float cov_percent
+      int redundancy
+      int num_shapelets
   }
diff --git a/scripts/_contrast_profile.R b/scripts/_contrast_profile.R
@@ -284,11 +284,13 @@ list(
   #   iteration = "list" # thus the objects keep their attributes
   # ),
   tar_target(
+    #### Pipeline: score_by_segment - Preparation of the data: the model's data is the shapelets with metadata ----
     score_by_segment,
     {
       res <- list()
       for (i in seq_len(var_vfolds)) {
         cli::cli_alert_info("Scores by segment, fold {i}.")
+        # These parameters may be tuned on `recipes`
         tune1 <- 0.1
         tune2 <- 1 / 3
         score <- score_by_segment_window(contrast_profiles[[i]]$positive,
@@ -303,8 +305,15 @@ list(
     iteration = "list"
   ),
   tar_target(
+    #### Pipeline: find_shapelets - This is the model fit. ----
     find_shapelets,
     {
+      # Here we can try: fitting all possible solutions and later score them and finally try
+      # to find which metadata is the best to filter the solutions
+      # Or, we can try to use some heuristics to find the best metadata for the solutions
+      # These parameters are tuned on `parsnip`/`tune`
+      # Currently the parameter `n` randomly draws 1 to `n` samples from the pan contrast profile
+      # We can try to use a fixed number of samples during the parameter optimization
       res <- list()
       for (i in seq_len(var_vfolds)) {
         cli::cli_alert_info("Finding solutions, fold {i}.")
@@ -373,8 +382,11 @@ list(
   #   iteration = "list"
   # ),
   tar_target(
+    #### Pipeline: test_classifiers_self - This is the current score function. ----
     test_classifiers_self,
     {
+      # With the results of this step, plus the fitted solutions, we need to find which
+      # metadata is the best to filter the solutions
       class(analysis_split) <- c("manual_rset", "rset", class(analysis_split))
 
       res <- list()
@@ -384,6 +396,9 @@ list(
         res[[i]] <- list()
         shapelets <- find_shapelets[[i]]
 
+        # the `compute_metrics_topk` function may need testing on the `TRUE` criterion
+        # currently, if `ANY` shapelet matches, it is considered a positive
+        # as an alternative, we can try to use `ALL`, `HALF` or other criteria
         res[[i]] <- compute_metrics_topk(fold, shapelets, var_future_workers, TRUE)
       }
 
@@ -416,34 +431,24 @@ list(
   tar_target(
     test_classifiers,
     {
-      shapelet_sizes <- var_shapelet_sizes
-
+      # Here we test the solutions we chose on the assessment split
+      # The final `model` we need is the shapelet
       class(assessment_split) <- c("manual_rset", "rset", class(assessment_split))
 
       res <- list()
       for (i in seq_len(var_vfolds)) {
         fold <- rsample::get_rsplit(assessment_split, i)
-        shapelets <- best_shapelets[[i]]
-        contrast <- contrast_profiles[[i]]
+        best_shapelets <- purrr::pluck(find_shapelets, i)[1, ]
 
-        res[[i]] <- compute_metrics_topk(fold, shapelets, contrast)
+        res[[i]] <- compute_metrics_topk(fold, best_shapelets, var_future_workers, TRUE)
       }
-      res
       overall <- compute_overall_metric(res)
       list(fold = res, overall = overall)
     },
-    pattern = map(best_shapelets, assessment_split, contrast_profiles),
+    pattern = map(assessment_split, find_shapelets),
     iteration = "list"
   )
   # tar_target(
-  #   best_shapelets,
-  #   {
-  #     # algorithm for selecting the best shapelet
-  #   },
-  #   pattern = map(contrast_profiles),
-  #   iteration = "list"
-  # ),
-  # tar_target(
   #   train_classifier,
   #   {
   #     # train a classifier based on the best shapelets

diff --git a/scripts/classification/pan_contrast.R b/scripts/classification/pan_contrast.R
@@ -134,7 +134,7 @@ score_by_segment_window <- function(true_data, false_data, contrast_profiles, qu
   }
 
   # here we compute the total number of segments that each plato could classify
-  total_counts <- as.matrix(purrr::map_dfr(segs, function(x) apply(x, 1, sum)))
+  total_counts <- as.matrix(purrr::map_dfr(segs, function(x) apply(x, 1, sum))) # never used
   colnames(cont) <- w_sizes # set the column names on the overall contrast matrix
   colnames(thlds) <- w_sizes # set the column names on the thresholds matrix
 
@@ -148,9 +148,13 @@ score_by_segment_window <- function(true_data, false_data, contrast_profiles, qu
     coverage = segs, # segs == coverage of each plato (~sensitivity)
     platos = shapes,
     thresholds = thlds, # thlds == threshold of each plato
-    cov_counts = total_counts, # sum of segs == 1. Best is sum == num_segments
+    cov_counts = total_counts, # sum of segs == 1. Best is sum == num_segments  # never used
     num_segments = (length(segments) - 1)
   )
+
+
+  # score <- score_candidates(score)
+  # return(score)
 }
 
 score_candidates <- function(score) {
@@ -686,11 +690,11 @@ compute_overall_metric <- function(all_folds) {
   tp <- fp <- tn <- fn <- acc <- ff <- 0
 
   for (fold in all_folds) {
-    tp <- tp + fold$tp
-    fp <- fp + fold$fp
-    tn <- tn + fold$tn
-    fn <- fn + fold$fn
-    ff <- ff + fold$f1
+    tp <- tp + fold[[1]]$tp
+    fp <- fp + fold[[1]]$fp
+    tn <- tn + fold[[1]]$tn
+    fn <- fn + fold[[1]]$fn
+    ff <- ff + fold[[1]]$f1
   }
 
   tm <- (2 * tp) / (2 * tp + fp + fn)