# **Elaborato per il corso di Health Analytics and Data-Driven Medicine, MD2SL 2024**
## *Dario Comanducci*

## **Survival Analysis**

In [2]:
# librerie di base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

np.random.seed(3131)

In [3]:
# Dataset (Veteran's Administration Lung Cancer Trial)
from sksurv.datasets import load_veterans_lung_cancer

data_x, ys = load_veterans_lung_cancer() 
data_x.head()

Unnamed: 0,Age_in_years,Celltype,Karnofsky_score,Months_from_Diagnosis,Prior_therapy,Treatment
0,69.0,squamous,60.0,7.0,no,standard
1,64.0,squamous,70.0,5.0,yes,standard
2,38.0,squamous,60.0,3.0,no,standard
3,63.0,squamous,60.0,9.0,yes,standard
4,65.0,squamous,70.0,11.0,yes,standard


**X**
- Age in years
- Celltype: squamous, smallcell, adeno, large
- Karnofsky score
- Months from Diagnosis
- Prior therapy: no/yes
- Treatment: standard, test

In [4]:
from sksurv.column import encode_categorical

x = encode_categorical(data_x) #one-hot encoding
x.head()

Unnamed: 0,Age_in_years,Celltype=large,Celltype=smallcell,Celltype=squamous,Karnofsky_score,Months_from_Diagnosis,Prior_therapy=yes,Treatment=test
0,69.0,0.0,0.0,1.0,60.0,7.0,0.0,0.0
1,64.0,0.0,0.0,1.0,70.0,5.0,1.0,0.0
2,38.0,0.0,0.0,1.0,60.0,3.0,0.0,0.0
3,63.0,0.0,0.0,1.0,60.0,9.0,1.0,0.0
4,65.0,0.0,0.0,1.0,70.0,11.0,1.0,0.0


In [5]:
ys[0:5]

array([( True,  72.), ( True, 411.), ( True, 228.), ( True, 126.),
       ( True, 118.)],
      dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

Status = 1 $\equiv$ dead;
Status = 0 $\equiv$ censored

In [6]:
df = x.assign(y=ys['Survival_in_days'], delta=ys['Status'])
df.head()

Unnamed: 0,Age_in_years,Celltype=large,Celltype=smallcell,Celltype=squamous,Karnofsky_score,Months_from_Diagnosis,Prior_therapy=yes,Treatment=test,y,delta
0,69.0,0.0,0.0,1.0,60.0,7.0,0.0,0.0,72.0,True
1,64.0,0.0,0.0,1.0,70.0,5.0,1.0,0.0,411.0,True
2,38.0,0.0,0.0,1.0,60.0,3.0,0.0,0.0,228.0,True
3,63.0,0.0,0.0,1.0,60.0,9.0,1.0,0.0,126.0,True
4,65.0,0.0,0.0,1.0,70.0,11.0,1.0,0.0,118.0,True


In [7]:
def inspect_treated_censored_data(df):
    id_censored = df[df['delta']==False].index 
    id_treated = df[df['Treatment=test']==1].index
    id_not_treated = df[df['Treatment=test']==0].index
    id_censored_treated = id_censored.intersection(id_treated)
    id_censored_not_treated = id_censored.intersection(id_not_treated)
    print('Dataset size: ' + str(df.shape))
    print('Censored: ' + str(len(id_censored)))
    print('Treated: ' + str(len(id_treated)))
    print('Not Treated: ' + str(len(id_not_treated)))
    print('Censored & Treated: ' + str(len(id_censored_treated)))
    print('Censored & Not Treated: ' + str(len(id_censored_not_treated)))
    
inspect_treated_censored_data(df)

Dataset size: (137, 10)
Censored: 9
Treated: 68
Not Treated: 69
Censored & Treated: 4
Censored & Not Treated: 5


**Training set & test set**

In [8]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.33, random_state=1242)

print('Training set:')
inspect_treated_censored_data(df_train)
print('\nTest set:')
inspect_treated_censored_data(df_test)

Training set:
Dataset size: (91, 10)
Censored: 5
Treated: 46
Not Treated: 45
Censored & Treated: 1
Censored & Not Treated: 4

Test set:
Dataset size: (46, 10)
Censored: 4
Treated: 22
Not Treated: 24
Censored & Treated: 3
Censored & Not Treated: 1


# Cox model

In [8]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['Status'], y['Survival_in_days'], prediction)
    return result[0]

#----------------------------------------------------------------------------------------------------------------------
x_train = df_train.iloc[:,0:data_x.shape[1]]
y_train = np.array(list(zip(df_train['delta'], df_train['y'])), dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

x_test = df_test.iloc[:,0:data_x.shape[1]]
y_test = np.array(list(zip(df_test['delta'], df_test['y'])), dtype=[('Status', '?'), ('Survival_in_days', '<f8')])


cph = CoxPHSurvivalAnalysis()
cph.set_params(alpha=0) #Cox standard

cph.fit(x_train, y_train)
cox_train_cindex = score_survival_model(cph, x_train, y_train)
cox_test_cindex = score_survival_model(cph, x_test, y_test)
print('Cox C-index (train): ' + str(cox_train_cindex))
print('Cox C-index (test): ' + str(cox_test_cindex))

Cox C-index (train): 0.7345741968383478
Cox C-index (test): 0.7251828631138976


# Survival SVM (linear)

In [9]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit, LeaveOneOut
from sksurv.svm import FastSurvivalSVM

#estimator = FastSurvivalSVM(max_iter=1000, rank_ratio=0, random_state=0) # regression SVM
estimator = FastSurvivalSVM(max_iter=1000, random_state=0) # ranking SVM
param_grid = {'alpha': 2.0 ** np.arange(-12, 13, 0.5)}

lsvm = GridSearchCV(estimator, param_grid, scoring=score_survival_model, n_jobs=-1, cv=3)
lsvm.fit(x_train, y_train)

lsvm_train_cindex = score_survival_model(lsvm, x_train, y_train)
lsvm_test_cindex = score_survival_model(lsvm, x_test, y_test)
print('Linear SVM C-index (train): ' + str(lsvm_train_cindex))
print('Linear C-index (test): ' + str(lsvm_test_cindex))

Linear SVM C-index (train): 0.7394186639469659
Linear C-index (test): 0.7377220480668757


# Survival SVM (kernel)

In [10]:
from sksurv.kernels import clinical_kernel
from sksurv.svm import FastKernelSurvivalSVM

#estimator = FastKernelSurvivalSVM(optimizer='rbtree', kernel='rbf', rank_ratio=0, random_state=0) # regression SVM
estimator = FastKernelSurvivalSVM(optimizer='rbtree', kernel='rbf', random_state=0) # ranking SVM
param_grid = {'alpha': 2.0 ** np.arange(-12, 13, 0.5), 'gamma':2.0 ** np.arange(-12, 13, 0.5)}
ksvm = GridSearchCV(estimator, param_grid, scoring=score_survival_model, cv=3, n_jobs=-1)
ksvm.fit(x_train, y_train)

ksvm_train_cindex = score_survival_model(ksvm, x_train, y_train)
ksvm_test_cindex = score_survival_model(ksvm, x_test, y_test)
print('Kernel SVM C-index (train): ' + str(ksvm_train_cindex))
print('Kernel C-index (test): ' + str(ksvm_test_cindex))

Kernel SVM C-index (train): 0.719020907700153
Kernel C-index (test): 0.7345872518286312
