In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
import matplotlib.pyplot as plt

In [2]:
# Load the tidy stroke vital-sign table; the path is relative to the notebook's directory.
data = pd.read_csv(os.path.join('..', '..', 'data', 'tidy_Stroke_Vital_Sign.csv'))

# Feature matrix: drop the identifier, date and outcome columns listed below.
data_x = data.drop(['UID', 'Hospital_ID', 'SurvivalWeeks', 'admission_date',
                    'discharge_date', 'death_date', 'Mortality', 'CVDeath'], axis=1)
# Positional indices (into data_x) of the columns that get one-hot encoded.
# NOTE(review): these positions depend on the CSV's column order -- confirm
# them whenever the file layout changes.
categorical_ix = [0, 2, 3, 4, 5, 6, 7, 8, 15, 16, 17]
categorical_columns = data_x.columns[categorical_ix].values
data_x_one_hot = pd.get_dummies(data_x, columns=categorical_columns)

# Outcome frame with the event flag cast to bool. `.assign` returns a new
# DataFrame, so the cast is never written into a slice of `data` (the previous
# in-place column assignment raised SettingWithCopyWarning).
data_y = data[['Mortality', 'SurvivalWeeks']].assign(
    Mortality=lambda frame: frame['Mortality'].astype(bool))
# Convert to a NumPy record array with one (Mortality, SurvivalWeeks) record per row.
data_y = np.array(list(data_y.to_records(index=False)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_y['Mortality'] = data_y['Mortality'].astype(bool)


In [3]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_x_one_hot,
    data_y,
    random_state=369,
    test_size=0.25,
)

# gradient boosting with regression tree base learner

In [6]:
# Gradient-boosted survival model built from 100 depth-1 trees, with a fixed seed.
est_cph_tree = GradientBoostingSurvivalAnalysis(
    max_depth=1,
    n_estimators=100,
    learning_rate=1.0,
    random_state=0,
)
est_cph_tree.fit(X_train, y_train)

# Report the model's score on the held-out split.
tree_test_score = est_cph_tree.score(X_test, y_test)
print(tree_test_score)

0.8090954876647054


# component-wise least squares as base learner

In [8]:
# Component-wise boosting with 100 iterations, same learning rate and seed as
# the tree-based model so the two scores are comparable.
est_cph_ls = ComponentwiseGradientBoostingSurvivalAnalysis(
    random_state=0,
    learning_rate=1.0,
    n_estimators=100,
)
est_cph_ls.fit(X_train, y_train)

# Report the model's score on the held-out split.
ls_test_score = est_cph_ls.score(X_test, y_test)
print(ls_test_score)

0.7843464054391476


In [12]:
# Label each fitted coefficient: an "Intercept" label followed by the encoded feature names.
coef_labels = ["Intercept", *data_x_one_hot.columns.tolist()]
coef = pd.Series(est_cph_ls.coef_, index=coef_labels)

# Keep only the coefficients that are not exactly zero.
is_nonzero = coef != 0
print("Number of non-zero coefficients:", is_nonzero.sum())
coef_nz = coef[is_nonzero]

# Display the retained coefficients, largest absolute value first.
coef_order = coef_nz.abs().sort_values(ascending=False).index
coef_nz.loc[coef_order]

Number of non-zero coefficients: 11


NG_1.0                   0.961946
Cancer before adm_1.0    0.852906
Foley_1.0                0.456165
AF_1.0                   0.355294
Hyperlipidemia_1.0      -0.344887
CHF_1.0                  0.165050
ICU_1.0                  0.109270
MeanHR G                 0.106922
DM_1.0                   0.054483
Age                      0.016088
MPsum                   -0.006751
dtype: float64