diff --git a/surpyval/regression/forest/forest.py b/surpyval/regression/forest/forest.py
index 5128650..0b5554f 100644
--- a/surpyval/regression/forest/forest.py
+++ b/surpyval/regression/forest/forest.py
@@ -24,14 +24,17 @@ def __init__(
         Z: NDArray,
         n_trees: int = 100,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
         bootstrap: bool = True,
+        parametric: bool = True,
     ):
         self.data: SurpyvalData = data
         self.Z: NDArray = np.array(Z, ndmin=2)
         self.n_trees = n_trees
         self.bootstrap = bootstrap
+        self.parametric = parametric

         # Create Trees
         if self.bootstrap:
@@ -51,8 +54,10 @@ def __init__(
                 data=self.data[bootstrap_indices[i]],
                 Z=self.Z[bootstrap_indices[i]],
                 max_depth=max_depth,
+                min_leaf_samples=min_leaf_samples,
                 min_leaf_failures=min_leaf_failures,
                 n_features_split=n_features_split,
+                parametric=parametric,
             )
             for i in range(self.n_trees)
         )
@@ -67,10 +72,14 @@ def fit(
         t: ArrayLike | None = None,
         n_trees: int = 100,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
         bootstrap: bool = True,
+        leaf_type: str = "parametric",
     ):
+        parametric = parse_leaf_type(leaf_type)
+
         data = xcnt_handler(
             x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
         )
@@ -80,9 +89,11 @@ def fit(
             Z,
             n_trees,
             max_depth,
+            min_leaf_samples,
             min_leaf_failures,
             n_features_split,
             bootstrap,
+            parametric,
         )

     def sf(
@@ -151,10 +162,6 @@ def _apply_model_function_to_trees(
         x = np.array(x, ndmin=1)
         Z = np.array(Z, ndmin=2)

-        # If there are multiple covariant vector samples, ?
-        # if Z.shape[0] > 1 and x.ndim == 1:
-        #     x = np.array(x, ndmin=2).transpose()
-
         res = np.zeros((Z.shape[0], x.size)).astype(np.float64)
         for i_covariant_vector in range(Z.shape[0]):
             for tree in self.trees:
@@ -169,8 +176,19 @@ def score(
         x: ArrayLike,
         Z: ArrayLike | NDArray,
         c: ArrayLike,
-        tie_tol: float = 1e-8,
+        tie_tol: float = 1e-15,
     ) -> float:
         scores: ArrayLike = self.mortality(x, Z)
-        tol: float = 1e-8
-        return score(x, c, scores, tol)
+        return score(x, c, scores, tie_tol)
+
+
+def parse_leaf_type(leaf_type: str):
+    if leaf_type.lower() == "non-parametric":
+        return False
+    elif leaf_type.lower() == "parametric":
+        return True
+    else:
+        raise ValueError(
+            f"leaf_type={leaf_type} is invalid. Must be\
+            'parametric' or 'non-parametric'"
+        )
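A quick usage sketch of the new `leaf_type` knob on the forest's `fit` classmethod. The class name `RandomSurvivalForest`, the import path, and the exact keyword layout of `fit` and `sf` are assumptions here, since the diff only shows fragments of the signatures:

import numpy as np
from surpyval.regression.forest.forest import RandomSurvivalForest  # assumed name/path

x = np.array([10.0, 12.0, 15.0, 20.0, 25.0, 28.0, 30.0, 34.0, 40.0, 45.0])  # durations
c = np.array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0])  # 0 = failure, 1 = right-censored
Z = np.array([[0.4], [1.1], [0.8], [1.9], [0.2], [1.5], [0.6], [1.3], [0.9], [1.7]])

forest = RandomSurvivalForest.fit(
    x=x, Z=Z, c=c,
    n_trees=10,
    min_leaf_samples=5,
    min_leaf_failures=2,
    leaf_type="non-parametric",  # Nelson-Aalen leaves instead of Weibull/Exponential
)
print(forest.sf(x=15.0, Z=[[1.0]]))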
diff --git a/surpyval/regression/forest/log_rank_split.py b/surpyval/regression/forest/log_rank_split.py
index 92cb98a..1e5221b 100644
--- a/surpyval/regression/forest/log_rank_split.py
+++ b/surpyval/regression/forest/log_rank_split.py
@@ -7,22 +7,19 @@
 from surpyval.utils.surpyval_data import SurpyvalData


-# Inner function used in ensuring the min_leaf_failures constraint is
-# respected
-def breaks_min_leaf_failures_constraint(Z, u, v, c, min_leaf_failures):
-    left_child_samples = len(*np.where(np.logical_and(Z[:, u] <= v, c == 0)))
-    right_child_samples = len(np.where(np.logical_and(Z[:, u] > v, c == 0))[0])
-    if (
-        left_child_samples < min_leaf_failures
-        or right_child_samples < min_leaf_failures
-    ):
-        return True
-    return False
+def numpy_fill(arr):
+    # Forward-fill NaNs with the most recent non-NaN value. Assumes
+    # arr[0] is not NaN; leading NaNs would be left in place.
+    mask = np.isnan(arr)
+    idx = np.where(~mask, np.arange(mask.shape[0]), 0)
+    np.maximum.accumulate(idx, out=idx)
+    return arr[idx]


 def log_rank_split(
     data: SurpyvalData,
     Z: NDArray,
+    min_leaf_samples: int,
     min_leaf_failures: int,
     feature_indices_in: Iterable[int],
 ) -> tuple[int, float]:
@@ -87,9 +84,14 @@ def log_rank_split(
         for v in np.unique(Z_u):
-            # Discard the (u, v) pair if it means a leaf will
-            # have < min_leaf_failures samples
-            if Z_u[Z_u <= v].size < min_leaf_failures:
+            # Discard the (u, v) pair if either child would have fewer
+            # than min_leaf_samples samples or min_leaf_failures failures
+            mask = Z_u <= v
+            if Z_u[mask].size < min_leaf_samples:
                 continue
-            elif Z_u[Z_u > v].size < min_leaf_failures:
+            elif Z_u[~mask].size < min_leaf_samples:
+                continue
+            elif (data.c[mask] != 1).sum() < min_leaf_failures:
+                continue
+            elif (data.c[~mask] != 1).sum() < min_leaf_failures:
                 continue

             abs_log_rank = log_rank(u, v, data, Z)
@@ -112,26 +114,35 @@ def log_rank(

     # Get sample-indices (i) of those that would end up in the left child
     left_child_indices = np.where(Z[:, u] <= v)[0]
-    left_child_x = data.x[left_child_indices]
-    left_child_c = data.c[left_child_indices]
-
-    left_child_x, idx = np.unique(left_child_x, return_inverse=True)
-    d_L = np.bincount(idx, weights=1 - left_child_c)
-    do_L = np.bincount(idx, weights=left_child_c)
+    data_left_child = data[left_child_indices]
+    x_left, Y_L, d_L = data_left_child.to_xrd()

     all_x, Y, d = data.to_xrd()

-    # expand d_L to match all_x
+    # Expand d_L and Y_L to match all_x
     expanded_d_L = np.zeros_like(all_x)
-    expanded_do_L = np.zeros_like(all_x)
-    x_l_indices = np.in1d(all_x, left_child_x).nonzero()[0]
+    expanded_Y_L = np.full_like(all_x, np.nan)
+    x_l_indices = np.in1d(all_x, x_left).nonzero()[0]
     expanded_d_L[x_l_indices] = d_L
-    expanded_do_L[x_l_indices] = do_L
+    expanded_Y_L[x_l_indices] = Y_L
+
+    # Forward-fill the left-child risk set between its observed times;
+    # before the first observed time it holds its first value, and after
+    # the last observed time it is that value less the deaths there
+    (index,) = np.where(~np.isnan(expanded_Y_L))
+    first_non_nan = index[0]
+    last_non_nan = index[-1]
+    expanded_Y_L[first_non_nan:last_non_nan] = numpy_fill(
+        expanded_Y_L[first_non_nan:last_non_nan]
+    )
+    if first_non_nan != 0:
+        expanded_Y_L[:first_non_nan] = expanded_Y_L[first_non_nan]
+    if last_non_nan != expanded_Y_L.size - 1:
+        expanded_Y_L[last_non_nan + 1 :] = (
+            expanded_Y_L[last_non_nan] - expanded_d_L[last_non_nan]
+        )

+    Y_L = expanded_Y_L
     d_L = expanded_d_L
-    do_L = expanded_do_L
-
-    # Find the risk set from the expanded d_L and do_L
-    # These are the event and censored counts at each x
-    Y_L = (d_L.sum() + do_L.sum()) - d_L.cumsum() + d_L - do_L.cumsum() + do_L

     # Filter to where Y > 1
     mask = Y > 1
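As a sanity check on the `numpy_fill` helper the new risk-set expansion relies on, a minimal standalone run on a toy array (a leading NaN would be left in place, which is why the caller only fills between the first and last non-NaN indices):

import numpy as np

def numpy_fill(arr):
    # Forward-fill NaNs with the most recent non-NaN value
    mask = np.isnan(arr)
    idx = np.where(~mask, np.arange(mask.shape[0]), 0)
    np.maximum.accumulate(idx, out=idx)
    return arr[idx]

y = np.array([5.0, np.nan, np.nan, 3.0, np.nan])
print(numpy_fill(y))  # [5. 5. 5. 3. 3.]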
diff --git a/surpyval/regression/forest/node.py b/surpyval/regression/forest/node.py
index ce3d6e2..09f6479 100644
--- a/surpyval/regression/forest/node.py
+++ b/surpyval/regression/forest/node.py
@@ -5,7 +5,7 @@
 import numpy as np
 from numpy.typing import ArrayLike, NDArray

-from surpyval import Exponential, Weibull
+from surpyval import Exponential, NelsonAalen, Weibull
 from surpyval.parametric import NeverOccurs
 from surpyval.regression.forest.log_rank_split import log_rank_split
 from surpyval.utils.surpyval_data import SurpyvalData
@@ -31,11 +31,13 @@ def __init__(
         Z: NDArray,
         curr_depth: int,
         max_depth: int | float,
+        min_leaf_samples: int,
         min_leaf_failures: int,
         n_features_split: int,
         split_feature_index: int,
         split_feature_value: float,
         feature_indices_in: NDArray,
+        parametric: bool = True,
     ):
         # Set split attributes
         self.split_feature_index = split_feature_index
@@ -54,16 +56,20 @@ def __init__(
             Z[left_indices, :],
             curr_depth=curr_depth + 1,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
             n_features_split=n_features_split,
+            parametric=parametric,
         )
         self.right_child = build_tree(
             data[right_indices],
             Z[right_indices, :],
             curr_depth=curr_depth + 1,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
             n_features_split=n_features_split,
+            parametric=parametric,
         )

     def apply_model_function(
@@ -79,18 +85,24 @@ def apply_model_function(


 class TerminalNode(Node):
-    def __init__(self, data: SurpyvalData):
+    def __init__(self, data: SurpyvalData, parametric: bool = True):
         self.data = deepcopy(data)
+        self.parametric = parametric

     @cached_property
     def model(self):
         events = self.data.to_xrd()[2].sum()
-        if events == 0:
-            return NeverOccurs
-        elif events <= 1:
-            return Exponential.fit_from_surpyval_data(self.data)
+        if self.parametric:
+            if events == 0:
+                return NeverOccurs
+            elif events <= 1:
+                return Exponential.fit_from_surpyval_data(self.data)
+            else:
+                return Weibull.fit_from_surpyval_data(self.data)
         else:
-            return Weibull.fit_from_surpyval_data(self.data)
+            return NelsonAalen.fit(
+                self.data.x, self.data.c, self.data.n, self.data.t
+            )

     def apply_model_function(
         self,
@@ -106,8 +117,10 @@ def build_tree(
     Z: NDArray,
     curr_depth: int,
     max_depth: int | float,
+    min_leaf_samples: int,
    min_leaf_failures: int,
     n_features_split: int,
+    parametric: bool = True,
 ) -> Node:
     """
     Node factory. Decides to return IntermediateNode object, or its
@@ -125,23 +138,25 @@ def build_tree(

     # Figure out best feature-value split
     split_feature_index, split_feature_value = log_rank_split(
-        data, Z, min_leaf_failures, feature_indices_in
+        data, Z, min_leaf_samples, min_leaf_failures, feature_indices_in
     )

     # If log_rank_split() can't suggest a feature-value split, return a
     # TerminalNode
     if split_feature_index == -1 and split_feature_value == float("-Inf"):
-        return TerminalNode(data)
+        return TerminalNode(data, parametric)

-    # Else, return an IntermediateNode, with the suggested feature-value split
+    # Else, return an IntermediateNode, with the best feature-value split
     return IntermediateNode(
         data=data,
         Z=Z,
         curr_depth=curr_depth,
         max_depth=max_depth,
+        min_leaf_samples=min_leaf_samples,
         min_leaf_failures=min_leaf_failures,
         n_features_split=n_features_split,
         split_feature_index=split_feature_index,
         split_feature_value=split_feature_value,
         feature_indices_in=feature_indices_in,
+        parametric=parametric,
     )
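The leaf-model choice above, reproduced outside the class as a minimal sketch on toy data; `Weibull` and `NelsonAalen` are surpyval's top-level fitters, and the `Exponential` fallback used when a leaf has at most one event is omitted for brevity:

import numpy as np
from surpyval import NelsonAalen, Weibull

x = np.array([3.0, 5.0, 5.0, 8.0, 11.0])
c = np.array([0, 0, 1, 0, 0])  # one right-censored observation

# parametric leaf: a two-parameter Weibull fit
parametric_leaf = Weibull.fit(x, c=c)

# non-parametric leaf: the Nelson-Aalen estimator
non_parametric_leaf = NelsonAalen.fit(x, c=c)

print(parametric_leaf.sf(6.0), non_parametric_leaf.sf(6.0))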
diff --git a/surpyval/regression/forest/tree.py b/surpyval/regression/forest/tree.py
index 733bd8f..c987f34 100644
--- a/surpyval/regression/forest/tree.py
+++ b/surpyval/regression/forest/tree.py
@@ -20,21 +20,25 @@ def __init__(
         data: SurpyvalData,
         Z: NDArray,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
+        parametric: bool = True,
     ):
         self.data = data
         self.Z = Z
         self.n_features_split = parse_n_features_split(
             n_features_split, self.Z.shape[1]
         )
         self._root = build_tree(
             data=self.data,
             Z=self.Z,
             curr_depth=0,
             max_depth=max_depth,
+            min_leaf_samples=min_leaf_samples,
             min_leaf_failures=min_leaf_failures,
             n_features_split=self.n_features_split,
+            parametric=parametric,
         )

     @classmethod
@@ -48,14 +52,24 @@ def fit(
         n: ArrayLike | None = None,
         t: ArrayLike | None = None,
         max_depth: int | float = float("inf"),
-        min_leaf_failures: int = 6,
+        min_leaf_samples: int = 5,
+        min_leaf_failures: int = 2,
         n_features_split: int | float | str = "sqrt",
+        leaf_type: str = "parametric",
     ):
         data = xcnt_handler(
             x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
         )
         Z = np.array(Z, ndmin=2)
-        return cls(data, Z, max_depth, min_leaf_failures, n_features_split)
+        return cls(
+            data,
+            Z,
+            max_depth,
+            min_leaf_samples,
+            min_leaf_failures,
+            n_features_split,
+            parse_leaf_type(leaf_type),
+        )

     def apply_model_function(
         self,
@@ -113,3 +127,15 @@ def parse_n_features_split(
             f"n_features_split={n_features_split} is invalid. See\
             `Tree` docstring for valid values."
         )
+
+
+def parse_leaf_type(leaf_type: str):
+    if leaf_type.lower() == "non-parametric":
+        return False
+    elif leaf_type.lower() == "parametric":
+        return True
+    else:
+        raise ValueError(
+            f"leaf_type={leaf_type} is invalid. Must be\
+            'parametric' or 'non-parametric'"
+        )
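And the single-tree entry point with the new knobs, sketched from the `fit` signature in this hunk; the toy data and keyword usage are illustrative assumptions:

import numpy as np
from surpyval.regression.forest.tree import Tree

x = np.array([5.0, 8.0, 12.0, 16.0, 20.0, 24.0, 30.0, 36.0, 42.0, 50.0])
c = np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0])
Z = np.array([[0.4], [1.1], [0.8], [1.9], [0.2], [1.5], [0.6], [1.3], [0.9], [1.7]])

tree = Tree.fit(
    x=x, Z=Z, c=c,
    max_depth=2,
    min_leaf_samples=5,
    min_leaf_failures=2,
    leaf_type="parametric",  # Weibull/Exponential leaves; "non-parametric" for Nelson-Aalen
)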