In [1]:
import csv
import pandas as pd
import shap
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sksurv.datasets import load_gbsg2
from sksurv.preprocessing import OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest
from xgbse.converters import (
    convert_data_to_xgb_format,
    convert_to_structured
)

# Load the cohort table from a CSV file located one directory above the notebook.
# Later cells read the 'time', 'event' and 'cldl' columns from this frame.
data = pd.read_csv('../cohort.csv')

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [2]:
# Feature matrix: every column of `data` except the outcome columns
# ('time', 'event') and 'cldl'.
X = data.drop(columns=['time', 'event', 'cldl'])

# Outcome built by xgbse's `convert_to_structured` from the time and event
# columns (presumably a structured array usable by the survival model — confirm).
y = convert_to_structured(data['time'], data['event'])
# Optional quick-run subsample, currently disabled:
# X = X.head(5000)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=20
)

In [3]:
# Show the training feature frame as the cell output (the notebook renders a truncated preview).
X_train

Unnamed: 0,age,men,smoker,p.dm,pas,ct,chdl,tg,htn_med,charlson,ckd.epi,c10,hematocrit
62632,81,0,0,0,130.0,271,83,117,1,0,49.769318,1,39.5
56930,79,0,0,0,148.0,173,32,105,1,0,56.965670,0,41.0
69624,81,0,0,0,123.0,194,52,177,1,0,74.753319,0,42.9
58081,76,0,0,0,130.0,255,72,157,1,0,70.546410,0,38.1
60661,78,0,0,0,156.0,218,107,71,0,0,88.283129,1,41.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31962,82,0,0,0,155.0,195,60,107,1,0,61.950790,0,42.7
88988,75,0,0,0,160.0,230,46,214,1,1,46.505077,0,42.9
23775,78,0,0,0,163.0,268,65,123,1,0,78.881374,0,44.1
37135,85,1,0,0,130.0,224,48,172,1,0,77.857231,1,42.1


In [4]:
# Seed shared with the forest so repeated runs build the same trees.
random_state = 20

# Random survival forest: 1000 trees, all available cores (n_jobs=-1).
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# fit was stopped by hand at least once — this configuration may be slow on the
# full cohort.
model = RandomSurvivalForest(
    n_estimators=1000,
    min_samples_split=10,
    min_samples_leaf=15,
    max_features="sqrt",
    n_jobs=-1,
    random_state=random_state,
).fit(X_train, y_train)

# Validation score shown as the cell output (presumably the C-statistic quoted
# in later comments — confirm against the scikit-survival docs).
model.score(X_valid, y_valid)

KeyboardInterrupt: 

In [None]:
# Column labels handed to SHAP so the plots show readable feature names.
feature_names = list(X.columns)

# Model-agnostic explainer around the forest's `predict`, with the validation
# frame supplied as background data.
explainer = shap.Explainer(model.predict, X_valid, feature_names=feature_names)

# Explain only the leading 2000 validation rows.
shaps = explainer(X_valid.iloc[:2000])

In [None]:
# `X_test_sorted` is simply another name for the validation frame; no sorting
# is applied in this cell.
X_test_sorted = X_valid

# Independent copy of the first six validation rows, used for the
# per-individual curves plotted in the following cells.
X_test_sel = X_test_sorted.head(6).copy()

X_test_sel

In [None]:
# Predicted survival functions for the selected individuals, returned as an
# array that is iterated row by row below (one curve per plotted line).
surv = model.predict_survival_function(X_test_sel, return_array=True)

# Time grid for the x-axis. Prefer `unique_times_` when the fitted model has it
# and fall back to `event_times_` otherwise, so the cell runs on either
# attribute name.
# NOTE(review): `event_times_` is reportedly deprecated in newer
# scikit-survival releases — confirm against the installed version.
times = model.unique_times_ if hasattr(model, "unique_times_") else model.event_times_

fig, ax = plt.subplots()
for i, s in enumerate(surv):
    ax.step(times, s, where="post", label=str(i))
# The survival function is the probability of NOT yet having had the event,
# so the axis is labelled as event-free probability rather than event probability.
ax.set_ylabel("Cardiac-event-free survival probability")
ax.set_xlabel("Time in years")
ax.legend()
ax.grid(True)



In [None]:
# Predicted cumulative hazard functions for the selected individuals.
# The result is stored under the name `surv`, which therefore no longer holds
# survival probabilities after this cell.
surv = model.predict_cumulative_hazard_function(X_test_sel, return_array=True)

fig, ax = plt.subplots()
for idx, curve in enumerate(surv):
    # One step curve per individual, labelled by its position in X_test_sel.
    ax.step(model.event_times_, curve, where="post", label=str(idx))
ax.set_ylabel("Cumulative hazard")
ax.set_xlabel("Time in years")
ax.legend()
ax.grid(True)

In [None]:
# Global SHAP summary. `shaps` was computed on the leading rows of X_valid
# only, so the feature matrix passed here is cut to the same number of rows;
# passing the whole validation frame trips shap's row-count check whenever
# X_valid is longer than the explained slice.
shap.summary_plot(shaps, X_valid.iloc[:shaps.values.shape[0]])
# C-stat = 0.76 with 1000 individuals
# C-stat = 0.66 with 5000 individuals

In [None]:
# Load SHAP's JavaScript once so the interactive force plots can render.
shap.initjs()

# One force plot per explained row 0..5, replacing six copy-pasted cells.
# These are the same leading validation rows selected in X_test_sel
# (presumably intentional — confirm). `display` is the IPython built-in; it is
# needed because only the last expression of a cell is rendered automatically.
for row in range(6):
    display(shap.plots.force(shaps[row]))