Skip to content

Commit

Permalink
Added min samples and parametric or non-parametric option
Browse files Browse the repository at this point in the history
Changed tree and forest to allow for parametric or non-parametric leaves. Also added min_leaf_samples, while keeping min_leaf_failures, as fitting parameters.
  • Loading branch information
derrynknife committed Aug 1, 2023
1 parent 06fd334 commit aaca459
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 47 deletions.
33 changes: 26 additions & 7 deletions surpyval/regression/forest/forest.py
Expand Up @@ -24,14 +24,17 @@ def __init__(
Z: NDArray,
n_trees: int = 100,
max_depth: int | float = float("inf"),
min_leaf_failures: int = 6,
min_leaf_samples: int = 5,
min_leaf_failures: int = 2,
n_features_split: int | float | str = "sqrt",
bootstrap: bool = True,
parametric: bool = True,
):
self.data: SurpyvalData = data
self.Z: NDArray = np.array(Z, ndmin=2)
self.n_trees = n_trees
self.bootstrap = bootstrap
self.parametric = parametric

# Create Trees
if self.bootstrap:
Expand All @@ -51,8 +54,10 @@ def __init__(
data=self.data[bootstrap_indices[i]],
Z=self.Z[bootstrap_indices[i]],
max_depth=max_depth,
min_leaf_samples=min_leaf_samples,
min_leaf_failures=min_leaf_failures,
n_features_split=n_features_split,
parametric=parametric,
)
for i in range(self.n_trees)
)
Expand All @@ -67,10 +72,14 @@ def fit(
t: ArrayLike | None = None,
n_trees: int = 100,
max_depth: int | float = float("inf"),
min_leaf_failures: int = 6,
min_leaf_samples: int = 5,
min_leaf_failures: int = 2,
n_features_split: int | float | str = "sqrt",
bootstrap: bool = True,
leaf_type: str = "parametric",
):
parametric = parse_leaf_type(leaf_type)

data = xcnt_handler(
x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
)
Expand All @@ -80,9 +89,11 @@ def fit(
Z,
n_trees,
max_depth,
min_leaf_samples,
min_leaf_failures,
n_features_split,
bootstrap,
parametric,
)

def sf(
Expand Down Expand Up @@ -151,10 +162,6 @@ def _apply_model_function_to_trees(
x = np.array(x, ndmin=1)
Z = np.array(Z, ndmin=2)

# If there are multiple covariant vector samples, ?
# if Z.shape[0] > 1 and x.ndim == 1:
# x = np.array(x, ndmin=2).transpose()

res = np.zeros((Z.shape[0], x.size)).astype(np.float64)
for i_covariant_vector in range(Z.shape[0]):
for tree in self.trees:
Expand All @@ -169,8 +176,20 @@ def score(
x: ArrayLike,
Z: ArrayLike | NDArray,
c: ArrayLike,
tie_tol: float = 1e-8,
tie_tol: float = 1e-15,
) -> float:
scores: ArrayLike = self.mortality(x, Z)
tol: float = 1e-8
return score(x, c, scores, tol)


def parse_leaf_type(leaf_type: str):
if leaf_type.lower() == "non-parametric":
return False
elif leaf_type.lower() == "parametric":
return True
else:
raise ValueError(
f"leaf_type={leaf_type} is invalid. Must\
'parametric' or 'non-parametric'"
)
81 changes: 57 additions & 24 deletions surpyval/regression/forest/log_rank_split.py
Expand Up @@ -7,22 +7,17 @@
from surpyval.utils.surpyval_data import SurpyvalData


# Inner function used in ensuring the min_leaf_failures constraint is
# respected
def breaks_min_leaf_failures_constraint(Z, u, v, c, min_leaf_failures):
left_child_samples = len(*np.where(np.logical_and(Z[:, u] <= v, c == 0)))
right_child_samples = len(np.where(np.logical_and(Z[:, u] > v, c == 0))[0])
if (
left_child_samples < min_leaf_failures
or right_child_samples < min_leaf_failures
):
return True
return False
def numpy_fill(arr):
mask = np.isnan(arr)
idx = np.where(~mask, np.arange(mask.shape[0]), 0)
np.maximum.accumulate(idx, out=idx)
return arr[idx]


def log_rank_split(
data: SurpyvalData,
Z: NDArray,
min_leaf_samples: int,
min_leaf_failures: int,
feature_indices_in: Iterable[int],
) -> tuple[int, float]:
Expand Down Expand Up @@ -87,9 +82,14 @@ def log_rank_split(
for v in np.unique(Z_u):
# Discard the (u, v) pair if it means a leaf will
# have < min_leaf_failures samples
if Z_u[Z_u <= v].size < min_leaf_failures:
mask = Z_u <= v
if Z_u[mask].size < min_leaf_samples:
continue
elif Z_u[Z_u > v].size < min_leaf_failures:
elif Z_u[~mask].size < min_leaf_samples:
continue
elif (data.c[mask] != 1).sum() <= min_leaf_failures:
continue
elif (data.c[~mask] != 1).sum() <= min_leaf_failures:
continue

abs_log_rank = log_rank(u, v, data, Z)
Expand All @@ -112,26 +112,59 @@ def log_rank(

# Get sample-indices (i) of those that would end up in the left child
left_child_indices = np.where(Z[:, u] <= v)[0]
left_child_x = data.x[left_child_indices]
left_child_c = data.c[left_child_indices]

left_child_x, idx = np.unique(left_child_x, return_inverse=True)
d_L = np.bincount(idx, weights=1 - left_child_c)
do_L = np.bincount(idx, weights=left_child_c)
data_left_child = data[left_child_indices]
# left_child_x = data.x[left_child_indices]
# left_child_c = data.c[left_child_indices]

# left_child_x, idx = np.unique(left_child_x, return_inverse=True)
# d_L = np.bincount(idx, weights=1 - left_child_c)
# do_L = np.bincount(idx, weights=left_child_c)
x_left, Y_L, d_L = data_left_child.to_xrd()
all_x, Y, d = data.to_xrd()

# expand d_L to match all_x
# idx = np.searchsorted(x_left, all_x, side="right") - 1
expanded_d_L = np.zeros_like(all_x)
expanded_do_L = np.zeros_like(all_x)
x_l_indices = np.in1d(all_x, left_child_x).nonzero()[0]
expanded_Y_L = np.empty_like(all_x)
expanded_Y_L.fill(np.nan)
# expanded_do_L = np.zeros_like(all_x)
# x_l_indices = np.in1d(all_x, left_child_x).nonzero()[0]
x_l_indices = np.in1d(all_x, data_left_child.x).nonzero()[0]
expanded_d_L[x_l_indices] = d_L
expanded_do_L[x_l_indices] = do_L
expanded_Y_L[x_l_indices] = Y_L
# expanded_do_L[x_l_indices] = do_L

(index,) = np.where(~np.isnan(expanded_Y_L))
first_non_nan = index[0]
last_non_nan = index[-1]
expanded_Y_L[first_non_nan:last_non_nan] = numpy_fill(
expanded_Y_L[first_non_nan:last_non_nan]
)
if first_non_nan != 0:
expanded_Y_L[:first_non_nan] = expanded_Y_L[first_non_nan]
# expanded_Y_L[:first_non_nan] = 0

if last_non_nan != expanded_Y_L.size - 1:
# expanded_Y_L[last_non_nan+1:] = 0
expanded_Y_L[last_non_nan + 1 :] = (
expanded_Y_L[last_non_nan] - expanded_d_L[last_non_nan]
)

Y_L = expanded_Y_L
d_L = expanded_d_L
do_L = expanded_do_L

# Y_L = expanded_Y_L[first_non_nan:last_non_nan+1:]
# d_L = expanded_d_L[first_non_nan:last_non_nan+1:]
# Y = Y[first_non_nan:last_non_nan+1:]
# d = d[first_non_nan:last_non_nan+1:]

# do_L = expanded_do_L

# raise ValueError("STOP")
# Find the risk set from the expanded d_L and do_L
# These are the event and censored counts at each x
Y_L = (d_L.sum() + do_L.sum()) - d_L.cumsum() + d_L - do_L.cumsum() + do_L
# Y_L = (d_L.sum() + do_L.sum()) - d_L.cumsum()
# + d_L - do_L.cumsum() + do_L

# Filter to where Y > 1
mask = Y > 1
Expand Down
37 changes: 26 additions & 11 deletions surpyval/regression/forest/node.py
Expand Up @@ -5,7 +5,7 @@
import numpy as np
from numpy.typing import ArrayLike, NDArray

from surpyval import Exponential, Weibull
from surpyval import Exponential, NelsonAalen, Weibull
from surpyval.parametric import NeverOccurs
from surpyval.regression.forest.log_rank_split import log_rank_split
from surpyval.utils.surpyval_data import SurpyvalData
Expand All @@ -31,11 +31,13 @@ def __init__(
Z: NDArray,
curr_depth: int,
max_depth: int | float,
min_leaf_samples: int,
min_leaf_failures: int,
n_features_split: int,
split_feature_index: int,
split_feature_value: float,
feature_indices_in: NDArray,
parametric: bool = True,
):
# Set split attributes
self.split_feature_index = split_feature_index
Expand All @@ -54,16 +56,20 @@ def __init__(
Z[left_indices, :],
curr_depth=curr_depth + 1,
max_depth=max_depth,
min_leaf_samples=min_leaf_samples,
min_leaf_failures=min_leaf_failures,
n_features_split=n_features_split,
parametric=parametric,
)
self.right_child = build_tree(
data[right_indices],
Z[right_indices, :],
curr_depth=curr_depth + 1,
max_depth=max_depth,
min_leaf_samples=min_leaf_samples,
min_leaf_failures=min_leaf_failures,
n_features_split=n_features_split,
parametric=parametric,
)

def apply_model_function(
Expand All @@ -79,18 +85,23 @@ def apply_model_function(


class TerminalNode(Node):
def __init__(self, data: SurpyvalData):
def __init__(self, data: SurpyvalData, parametric: bool = True):
self.data = deepcopy(data)
self.paramettric = parametric

@cached_property
def model(self):
events = self.data.to_xrd()[2].sum()
if events == 0:
return NeverOccurs
elif events <= 1:
return Exponential.fit_from_surpyval_data(self.data)
if self.paramettric:
if self.data.to_xrd()[2].sum() == 0:
return NeverOccurs
elif self.data.to_xrd()[2].sum() <= 1:
return Exponential.fit_from_surpyval_data(self.data)
else:
return Weibull.fit_from_surpyval_data(self.data)
else:
return Weibull.fit_from_surpyval_data(self.data)
return NelsonAalen.fit(
self.data.x, self.data.c, self.data.n, self.data.t
)

def apply_model_function(
self,
Expand All @@ -106,8 +117,10 @@ def build_tree(
Z: NDArray,
curr_depth: int,
max_depth: int | float,
min_leaf_samples: int,
min_leaf_failures: int,
n_features_split: int,
parametric: bool = True,
) -> Node:
"""
Node factory. Decides to return IntermediateNode object, or its
Expand All @@ -125,23 +138,25 @@ def build_tree(

# Figure out best feature-value split
split_feature_index, split_feature_value = log_rank_split(
data, Z, min_leaf_failures, feature_indices_in
data, Z, min_leaf_samples, min_leaf_failures, feature_indices_in
)

# If log_rank_split() can't suggest a feature-value split, return a
# TerminalNode
if split_feature_index == -1 and split_feature_value == float("-Inf"):
return TerminalNode(data)
return TerminalNode(data, parametric)

# Else, return an IntermediateNode, with the suggested feature-value split
# Else, return an IntermediateNode, with the best feature-value split
return IntermediateNode(
data=data,
Z=Z,
curr_depth=curr_depth,
max_depth=max_depth,
min_leaf_samples=min_leaf_samples,
min_leaf_failures=min_leaf_failures,
n_features_split=n_features_split,
split_feature_index=split_feature_index,
split_feature_value=split_feature_value,
feature_indices_in=feature_indices_in,
parametric=parametric,
)
35 changes: 30 additions & 5 deletions surpyval/regression/forest/tree.py
Expand Up @@ -20,23 +20,31 @@ def __init__(
data: SurpyvalData,
Z: NDArray,
max_depth: int | float = float("inf"),
min_leaf_failures: int = 6,
min_leaf_samples: int = 5,
min_leaf_failures: int = 2,
n_features_split: int | float | str = "sqrt",
parametric: str = "non-parametric",
):
self.data = data
self.Z = Z

self.n_features_split = parse_n_features_split(
n_features: int = parse_n_features_split(
n_features_split, self.Z.shape[1]
)

self.n_features_split = n_features

is_parametric: bool = parse_leaf_type(parametric)

self._root = build_tree(
data=self.data,
Z=self.Z,
curr_depth=0,
max_depth=max_depth,
min_leaf_samples=min_leaf_samples,
min_leaf_failures=min_leaf_failures,
n_features_split=self.n_features_split,
n_features_split=n_features,
parametric=is_parametric,
)

@classmethod
Expand All @@ -48,14 +56,19 @@ def fit(
n: ArrayLike | None = None,
t: ArrayLike | None = None,
max_depth: int | float = float("inf"),
min_leaf_failures: int = 6,
min_leaf_samples: int = 5,
min_leaf_failures: int = 2,
n_features_split: int | float | str = "sqrt",
leaf_type: str = "parametric",
):
data = xcnt_handler(
x, c, n, t, group_and_sort=False, as_surpyval_dataset=True
)
Z = np.array(Z, ndmin=2)
return cls(data, Z, max_depth, min_leaf_failures, n_features_split)
n_features: int = parse_n_features_split(n_features_split, Z.shape[1])
return cls(
data, Z, max_depth, min_leaf_failures, n_features, leaf_type
)

def apply_model_function(
self,
Expand Down Expand Up @@ -113,3 +126,15 @@ def parse_n_features_split(
f"n_features_split={n_features_split} is invalid. See\
`Tree` docstring for valid values."
)


def parse_leaf_type(leaf_type: str):
if leaf_type.lower() == "non-parametric":
return False
elif leaf_type.lower() == "parametric":
return True
else:
raise ValueError(
f"leaf_type={leaf_type} is invalid. Must\
'parametric' or 'non-parametric'"
)

0 comments on commit aaca459

Please sign in to comment.