Added min samples and parametric or non-parametric option

Changed tree and forest to allow for parametric or non-parametric leaves. Also added min_leaf_samples, while keeping min_leaf_failures, as fitting parameters.
derrynknife · Aug 1, 2023 · aaca459 · aaca459
1 parent 06fd334
commit aaca459
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 47 deletions.
diff --git a/surpyval/regression/forest/forest.py b/surpyval/regression/forest/forest.py
@@ -24,14 +24,17 @@ def __init__(
         Z: NDArray,
         n_trees: int = 100,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
         bootstrap: bool = True,
+        parametric: bool = True,
     ):
         self.data: SurpyvalData = data
         self.Z: NDArray = np.array(Z, ndmin=2)
         self.n_trees = n_trees
         self.bootstrap = bootstrap
+        self.parametric = parametric
 
         # Create Trees
         if self.bootstrap:
@@ -51,8 +54,10 @@ def __init__(
                 data=self.data[bootstrap_indices[i]],
                 Z=self.Z[bootstrap_indices[i]],
                 max_depth=max_depth,
+                min_leaf_samples=min_leaf_samples,
                 min_leaf_failures=min_leaf_failures,
                 n_features_split=n_features_split,
+                parametric=parametric,
             )
             for i in range(self.n_trees)
         )
@@ -67,10 +72,14 @@ def fit(
         t: ArrayLike | None = None,
         n_trees: int = 100,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
         bootstrap: bool = True,
+        leaf_type: str = "parametric",
     ):
+        parametric = parse_leaf_type(leaf_type)
+
         data = xcnt_handler(
             x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
         )
@@ -80,9 +89,11 @@ def fit(
             Z,
             n_trees,
             max_depth,
+            min_leaf_samples,
             min_leaf_failures,
             n_features_split,
             bootstrap,
+            parametric,
         )
 
     def sf(
@@ -151,10 +162,6 @@ def _apply_model_function_to_trees(
         x = np.array(x, ndmin=1)
         Z = np.array(Z, ndmin=2)
 
-        # If there are multiple covariant vector samples, ?
-        # if Z.shape[0] > 1 and x.ndim == 1:
-        #     x = np.array(x, ndmin=2).transpose()
-
         res = np.zeros((Z.shape[0], x.size)).astype(np.float64)
         for i_covariant_vector in range(Z.shape[0]):
             for tree in self.trees:
@@ -169,8 +176,20 @@ def score(
         x: ArrayLike,
         Z: ArrayLike | NDArray,
         c: ArrayLike,
-        tie_tol: float = 1e-8,
+        tie_tol: float = 1e-15,
     ) -> float:
         scores: ArrayLike = self.mortality(x, Z)
         tol: float = 1e-8
         return score(x, c, scores, tol)
+
+
+def parse_leaf_type(leaf_type: str):
+    if leaf_type.lower() == "non-parametric":
+        return False
+    elif leaf_type.lower() == "parametric":
+        return True
+    else:
+        raise ValueError(
+            f"leaf_type={leaf_type} is invalid. Must\
+            'parametric' or 'non-parametric'"
+        )
diff --git a/surpyval/regression/forest/log_rank_split.py b/surpyval/regression/forest/log_rank_split.py
@@ -7,22 +7,17 @@
 from surpyval.utils.surpyval_data import SurpyvalData
 
 
-# Inner function used in ensuring the min_leaf_failures constraint is
-# respected
-def breaks_min_leaf_failures_constraint(Z, u, v, c, min_leaf_failures):
-    left_child_samples = len(*np.where(np.logical_and(Z[:, u] <= v, c == 0)))
-    right_child_samples = len(np.where(np.logical_and(Z[:, u] > v, c == 0))[0])
-    if (
-        left_child_samples < min_leaf_failures
-        or right_child_samples < min_leaf_failures
-    ):
-        return True
-    return False
+def numpy_fill(arr):
+    mask = np.isnan(arr)
+    idx = np.where(~mask, np.arange(mask.shape[0]), 0)
+    np.maximum.accumulate(idx, out=idx)
+    return arr[idx]
 
 
 def log_rank_split(
     data: SurpyvalData,
     Z: NDArray,
+    min_leaf_samples: int,
     min_leaf_failures: int,
     feature_indices_in: Iterable[int],
 ) -> tuple[int, float]:
@@ -87,9 +82,14 @@ def log_rank_split(
         for v in np.unique(Z_u):
             # Discard the (u, v) pair if it means a leaf will
             # have < min_leaf_failures samples
-            if Z_u[Z_u <= v].size < min_leaf_failures:
+            mask = Z_u <= v
+            if Z_u[mask].size < min_leaf_samples:
                 continue
-            elif Z_u[Z_u > v].size < min_leaf_failures:
+            elif Z_u[~mask].size < min_leaf_samples:
+                continue
+            elif (data.c[mask] != 1).sum() <= min_leaf_failures:
+                continue
+            elif (data.c[~mask] != 1).sum() <= min_leaf_failures:
                 continue
 
             abs_log_rank = log_rank(u, v, data, Z)
@@ -112,26 +112,59 @@ def log_rank(
 
     # Get sample-indices (i) of those that would end up in the left child
     left_child_indices = np.where(Z[:, u] <= v)[0]
-    left_child_x = data.x[left_child_indices]
-    left_child_c = data.c[left_child_indices]
-
-    left_child_x, idx = np.unique(left_child_x, return_inverse=True)
-    d_L = np.bincount(idx, weights=1 - left_child_c)
-    do_L = np.bincount(idx, weights=left_child_c)
+    data_left_child = data[left_child_indices]
+    # left_child_x = data.x[left_child_indices]
+    # left_child_c = data.c[left_child_indices]
+
+    # left_child_x, idx = np.unique(left_child_x, return_inverse=True)
+    # d_L = np.bincount(idx, weights=1 - left_child_c)
+    # do_L = np.bincount(idx, weights=left_child_c)
+    x_left, Y_L, d_L = data_left_child.to_xrd()
     all_x, Y, d = data.to_xrd()
 
     # expand d_L to match all_x
+    # idx = np.searchsorted(x_left, all_x, side="right") - 1
     expanded_d_L = np.zeros_like(all_x)
-    expanded_do_L = np.zeros_like(all_x)
-    x_l_indices = np.in1d(all_x, left_child_x).nonzero()[0]
+    expanded_Y_L = np.empty_like(all_x)
+    expanded_Y_L.fill(np.nan)
+    # expanded_do_L = np.zeros_like(all_x)
+    # x_l_indices = np.in1d(all_x, left_child_x).nonzero()[0]
+    x_l_indices = np.in1d(all_x, data_left_child.x).nonzero()[0]
     expanded_d_L[x_l_indices] = d_L
-    expanded_do_L[x_l_indices] = do_L
+    expanded_Y_L[x_l_indices] = Y_L
+    # expanded_do_L[x_l_indices] = do_L
+
+    (index,) = np.where(~np.isnan(expanded_Y_L))
+    first_non_nan = index[0]
+    last_non_nan = index[-1]
+    expanded_Y_L[first_non_nan:last_non_nan] = numpy_fill(
+        expanded_Y_L[first_non_nan:last_non_nan]
+    )
+    if first_non_nan != 0:
+        expanded_Y_L[:first_non_nan] = expanded_Y_L[first_non_nan]
+        # expanded_Y_L[:first_non_nan] = 0
+
+    if last_non_nan != expanded_Y_L.size - 1:
+        # expanded_Y_L[last_non_nan+1:] = 0
+        expanded_Y_L[last_non_nan + 1 :] = (
+            expanded_Y_L[last_non_nan] - expanded_d_L[last_non_nan]
+        )
+
+    Y_L = expanded_Y_L
     d_L = expanded_d_L
-    do_L = expanded_do_L
 
+    # Y_L = expanded_Y_L[first_non_nan:last_non_nan+1:]
+    # d_L = expanded_d_L[first_non_nan:last_non_nan+1:]
+    # Y = Y[first_non_nan:last_non_nan+1:]
+    # d = d[first_non_nan:last_non_nan+1:]
+
+    # do_L = expanded_do_L
+
+    # raise ValueError("STOP")
     # Find the risk set from the expanded d_L and do_L
     # These are the event and censored counts at each x
-    Y_L = (d_L.sum() + do_L.sum()) - d_L.cumsum() + d_L - do_L.cumsum() + do_L
+    # Y_L = (d_L.sum() + do_L.sum()) - d_L.cumsum()
+    # + d_L - do_L.cumsum() + do_L
 
     # Filter to where Y > 1
     mask = Y > 1

diff --git a/surpyval/regression/forest/node.py b/surpyval/regression/forest/node.py
@@ -5,7 +5,7 @@
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from surpyval import Exponential, Weibull
+from surpyval import Exponential, NelsonAalen, Weibull
 from surpyval.parametric import NeverOccurs
 from surpyval.regression.forest.log_rank_split import log_rank_split
 from surpyval.utils.surpyval_data import SurpyvalData
@@ -31,11 +31,13 @@ def __init__(
         Z: NDArray,
         curr_depth: int,
         max_depth: int | float,
+        min_leaf_samples: int,
         min_leaf_failures: int,
         n_features_split: int,
         split_feature_index: int,
         split_feature_value: float,
         feature_indices_in: NDArray,
+        parametric: bool = True,
     ):
         # Set split attributes
         self.split_feature_index = split_feature_index
@@ -54,16 +56,20 @@ def __init__(
             Z[left_indices, :],
             curr_depth=curr_depth + 1,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
             n_features_split=n_features_split,
+            parametric=parametric,
         )
         self.right_child = build_tree(
             data[right_indices],
             Z[right_indices, :],
             curr_depth=curr_depth + 1,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
             n_features_split=n_features_split,
+            parametric=parametric,
         )
 
     def apply_model_function(
@@ -79,18 +85,23 @@ def apply_model_function(
 
 
 class TerminalNode(Node):
-    def __init__(self, data: SurpyvalData):
+    def __init__(self, data: SurpyvalData, parametric: bool = True):
         self.data = deepcopy(data)
+        self.paramettric = parametric
 
     @cached_property
     def model(self):
-        events = self.data.to_xrd()[2].sum()
-        if events == 0:
-            return NeverOccurs
-        elif events <= 1:
-            return Exponential.fit_from_surpyval_data(self.data)
+        if self.paramettric:
+            if self.data.to_xrd()[2].sum() == 0:
+                return NeverOccurs
+            elif self.data.to_xrd()[2].sum() <= 1:
+                return Exponential.fit_from_surpyval_data(self.data)
+            else:
+                return Weibull.fit_from_surpyval_data(self.data)
         else:
-            return Weibull.fit_from_surpyval_data(self.data)
+            return NelsonAalen.fit(
+                self.data.x, self.data.c, self.data.n, self.data.t
+            )
 
     def apply_model_function(
         self,
@@ -106,8 +117,10 @@ def build_tree(
     Z: NDArray,
     curr_depth: int,
     max_depth: int | float,
+    min_leaf_samples: int,
     min_leaf_failures: int,
     n_features_split: int,
+    parametric: bool = True,
 ) -> Node:
     """
     Node factory. Decides to return IntermediateNode object, or its
@@ -125,23 +138,25 @@ def build_tree(
 
     # Figure out best feature-value split
     split_feature_index, split_feature_value = log_rank_split(
-        data, Z, min_leaf_failures, feature_indices_in
+        data, Z, min_leaf_samples, min_leaf_failures, feature_indices_in
     )
 
     # If log_rank_split() can't suggest a feature-value split, return a
     # TerminalNode
     if split_feature_index == -1 and split_feature_value == float("-Inf"):
-        return TerminalNode(data)
+        return TerminalNode(data, parametric)
 
-    # Else, return an IntermediateNode, with the suggested feature-value split
+    # Else, return an IntermediateNode, with the best feature-value split
     return IntermediateNode(
         data=data,
         Z=Z,
         curr_depth=curr_depth,
         max_depth=max_depth,
+        min_leaf_samples=min_leaf_samples,
         min_leaf_failures=min_leaf_failures,
         n_features_split=n_features_split,
         split_feature_index=split_feature_index,
         split_feature_value=split_feature_value,
         feature_indices_in=feature_indices_in,
+        parametric=parametric,
     )
diff --git a/surpyval/regression/forest/tree.py b/surpyval/regression/forest/tree.py
@@ -20,23 +20,31 @@ def __init__(
         data: SurpyvalData,
         Z: NDArray,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
+        parametric: str = "non-parametric",
     ):
         self.data = data
         self.Z = Z
 
-        self.n_features_split = parse_n_features_split(
+        n_features: int = parse_n_features_split(
             n_features_split, self.Z.shape[1]
         )
 
+        self.n_features_split = n_features
+
+        is_parametric: bool = parse_leaf_type(parametric)
+
         self._root = build_tree(
             data=self.data,
             Z=self.Z,
             curr_depth=0,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
-            n_features_split=self.n_features_split,
+            n_features_split=n_features,
+            parametric=is_parametric,
         )
 
     @classmethod
@@ -48,14 +56,19 @@ def fit(
         n: ArrayLike | None = None,
         t: ArrayLike | None = None,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
+        leaf_type: str = "parametric",
     ):
         data = xcnt_handler(
             x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
         )
         Z = np.array(Z, ndmin=2)
-        return cls(data, Z, max_depth, min_leaf_failures, n_features_split)
+        n_features: int = parse_n_features_split(n_features_split, Z.shape[1])
+        return cls(
+            data, Z, max_depth, min_leaf_failures, n_features, leaf_type
+        )
 
     def apply_model_function(
         self,
@@ -113,3 +126,15 @@ def parse_n_features_split(
             f"n_features_split={n_features_split} is invalid. See\
                          `Tree` docstring for valid values."
         )
+
+
+def parse_leaf_type(leaf_type: str):
+    if leaf_type.lower() == "non-parametric":
+        return False
+    elif leaf_type.lower() == "parametric":
+        return True
+    else:
+        raise ValueError(
+            f"leaf_type={leaf_type} is invalid. Must\
+            'parametric' or 'non-parametric'"
+        )