# Draft: method evaluation

In [1]:
import numpy

In [2]:
import sklearn

In [3]:
from dataclasses import dataclass

In [4]:
import survwrap
survwrap.datasets.list_available_datasets()

('flchain', 'gbsg2', 'metabric', 'support')

In [5]:
# Smoke test: load survwrap's test data and look at the container types of the
# two objects it returns.
_a,_b = survwrap.load_test_data()
type(_a), type(_b)

(numpy.ndarray, numpy.ndarray)

In [6]:
# Name of the dataset evaluated throughout this notebook; it is reused further
# down as the prefix of the result file names.
my_data='gbsg2'
# Dataset object for that name; the cells below read its `.dataframe` attribute
# and call its `.get_X_y()` method.
my_data_df=survwrap.datasets.get_data(my_data)

In [44]:
print(my_data_df.dataframe.head().to_markdown())

|    |   age |   meno |   size |   grade |   nodes |   pgr |   er |   hormon |   time |   event |
|---:|------:|-------:|-------:|--------:|--------:|------:|-----:|---------:|-------:|--------:|
|  0 |    74 |      1 |      1 |       3 |       0 |    35 |  291 |        0 |   1799 |       0 |
|  1 |    79 |      1 |      2 |       3 |       0 |    36 |  611 |        0 |   2828 |       0 |
|  2 |    44 |      0 |      1 |       2 |       0 |   138 |    0 |        0 |   6012 |       0 |
|  3 |    70 |      1 |      2 |       3 |       0 |     0 |   12 |        0 |   2624 |       0 |
|  4 |    75 |      1 |      1 |       3 |       0 |   260 |  409 |        0 |   4915 |       0 |


In [8]:
# Pull the two arrays used by every later cell out of the dataset object.
# presumably X holds the covariates and y the survival outcome — TODO confirm
X, y= my_data_df.get_X_y()
#X.shape, y.shape

((3668, 8), (3668,))

In [45]:
# One third of the number of rows, rounded down; passed as the absolute
# `test_size` of the first split further down.
a_third=X.shape[0] // 3
#a_third

In [10]:
# Single random seed shared by the PCA, the data splits and the model below.
seed=2311

## Check for possible dimensionality reduction

In [11]:
from sklearn.decomposition import PCA

In [12]:
# A float n_components in (0, 1) asks PCA to keep just enough components to
# explain that fraction (here 99.5%) of the total variance.
# NOTE(review): X has not been standardized at this point (scaling only happens
# further down), so the count reported here is likely dominated by the
# features with the largest raw variance.
pca = PCA(n_components=0.995, random_state=seed).fit(X)
print("Data set name:", my_data)
print("n. of features:", X.shape[1],
      "- PCA-reducible to:", pca.n_components_)

Data set name: gbsg2
n. of features: 8 - PCA-reducible to: 2


### Generate three (stratified) non-overlapping splits of original data

In [13]:
X.shape,y.shape

((3668, 8), (3668,))

In [14]:
@dataclass
class split:
    """Container pairing a feature array with its matching target array.

    Both fields default to None so that an empty instance can be created
    first and filled in afterwards.
    """
    X: numpy.ndarray = None  # features; None until assigned
    y: numpy.ndarray = None  # targets; None until assigned

# Quick check: three separate, still-empty instances.
[split() for _ in range(3)]

[split(X=None, y=None), split(X=None, y=None), split(X=None, y=None)]

In [15]:
# Cut the data into three parts whose sizes add up to the full data set:
# first peel off `a_third` rows, then halve what is left.
z_rest = split()
zsplits = [split(), split(), split()]

# Step 1: zsplits[0] receives the "test" side, z_rest keeps the remainder.
(z_rest.X, zsplits[0].X,
 z_rest.y, zsplits[0].y) = survwrap.survival_train_test_split(
    X, y, rng_seed=seed, test_size=a_third)
assert zsplits[0].X.shape[0] + z_rest.X.shape[0] == X.shape[0]

# Step 2: the remainder is divided 50/50 into zsplits[1] and zsplits[2].
(zsplits[1].X, zsplits[2].X,
 zsplits[1].y, zsplits[2].y) = survwrap.survival_train_test_split(
    z_rest.X, z_rest.y, test_size=0.5, rng_seed=seed)
assert zsplits[1].X.shape[0] + zsplits[2].X.shape[0] == z_rest.X.shape[0]

# Final bookkeeping: no row was lost or duplicated size-wise.
zlen = [part.X.shape[0] for part in zsplits]
print(zlen)
assert sum(zlen) == X.shape[0]

[1222, 1223, 1223]


## Create runs

In [16]:
@dataclass
class ml_run:
    """Bookkeeping record for one train/test run.

    Every field defaults to None and is filled in progressively by the
    cells further down.
    """
    train_set: split = None  # data the model is fitted on
    test_set: split = None  # data the fitted model is scored on
    best_model: survwrap.BaseEstimator = None  # first value returned by optimize()
    best_params: dict = None  # second value returned by optimize()
    # Third value returned by optimize().
    # NOTE(review): it is later handed to survwrap.get_model_scores_df, so it is
    # presumably a search-result object rather than a plain dict — confirm.
    search_params: dict = None

In [17]:
# Build one run per split: split i is the test set, and the remaining splits,
# taken in circular order (i+1, i+2, ...), are concatenated into the training set.
n_splits = len(zsplits)
z_runs = []
for run_idx in range(n_splits):
    held_out = zsplits[run_idx]
    train_parts = [zsplits[(run_idx + offset) % n_splits] for offset in range(1, n_splits)]
    z_runs.append(ml_run(
        train_set=split(
            X=numpy.concatenate([part.X for part in train_parts], axis=0),
            y=numpy.concatenate([part.y for part in train_parts], axis=0),
        ),
        # Use a fresh container instead of aliasing zsplits[run_idx]: the scaling
        # step further down replaces the test features, and that must not
        # silently alter zsplits (re-running this cell would then mix scaled
        # and unscaled data).
        test_set=split(X=held_out.X, y=held_out.y),
    ))
    print('run:', run_idx, z_runs[-1].train_set.X.shape, z_runs[-1].test_set.X.shape)

# Every run must account for each row exactly once, size-wise.
for run in z_runs:
    assert len(X) == len(run.train_set.X) + len(run.test_set.X)
    assert len(y) == len(run.train_set.y) + len(run.test_set.y)
    print('run:', run.train_set.X.shape, run.test_set.X.shape)


run: 0 (2446, 8) (1222, 8)
run: 1 (2445, 8) (1223, 8)
run: 2 (2445, 8) (1223, 8)
run: (2446, 8) (1222, 8)
run: (2445, 8) (1223, 8)
run: (2445, 8) (1223, 8)


### scale the features (only) 

Do the stratified split first, THEN scale: each scaler is fitted on the training set ONLY and afterwards applied to both the training and the test features.

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
#X_train, X_test, y_train, y_test = survwrap.survival_train_test_split(X, y,rng_seed=seed)

In [20]:
# Standardize the features of every run with a scaler fitted on that run's
# training set only; the same fitted scaler is then applied to the test set.
# NOTE: this cell is not idempotent — running it twice scales already-scaled
# features again. Rebuild z_runs before re-running it.
scalers = []
for run in z_runs:
    scaler = StandardScaler().fit(run.train_set.X)
    scalers.append(scaler)
    print('before:', run.train_set.X.shape, run.test_set.X.shape)
    # Swap in new containers rather than rebinding `.X` in place: test_set may
    # be the very same object as an entry of zsplits, which must stay unscaled.
    run.train_set = split(X=scaler.transform(run.train_set.X), y=run.train_set.y)
    run.test_set = split(X=scaler.transform(run.test_set.X), y=run.test_set.y)

# Scaling must not change the number of rows in any run.
for run in z_runs:
    assert len(X) == len(run.train_set.X) + len(run.test_set.X)
    assert len(y) == len(run.train_set.y) + len(run.test_set.y)
    print('run:', run.train_set.X.shape, run.test_set.X.shape)

#assert z_runs[0] != z_runs[1] 
#assert z_runs[1] != z_runs[2]

before: (2446, 8) (1222, 8)
before: (2445, 8) (1223, 8)
before: (2445, 8) (1223, 8)
run: (2446, 8) (1222, 8)
run: (2445, 8) (1223, 8)
run: (2445, 8) (1223, 8)


In [21]:
#survwrap.get_indicator(y).sum(), survwrap.get_indicator(y_train).sum(), survwrap.get_indicator(y_test).sum(),


In [22]:
#splitter = survwrap.survival_crossval_splitter(X_train,y_train,n_splits=3, n_repeats=2,rng_seed=2309)
#print([ (survwrap.get_indicator(y_train[_[1]]).sum()) for _ in splitter])

In [23]:
## Stratified CV splitter for survival analysis

In [24]:
#from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

# test coxnet

## set-up optimization grid

In [25]:
# Model under test, built with its constructor defaults and then given the
# notebook-wide seed. The same instance is reused by every run below.
zmodel=survwrap.CoxNet()
zmodel.rng_seed = seed
zmodel

In [26]:
# Start from the grid supplied by the model itself; override individual
# entries here (see the commented example) to narrow the search.
custom_grid=zmodel.get_parameter_grid()
#custom_grid['l1_ratio']=[0.5]
custom_grid

{'alpha': [0.001,
  0.003,
  0.005,
  0.008,
  0.01,
  0.02,
  0.03,
  0.04,
  0.05,
  0.06,
  0.07,
  0.08,
  0.09,
  0.1,
  0.15,
  0.2,
  0.3],
 'l1_ratio': [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]}

In [27]:
# NOTE(review): `_guess_tries` is a private helper (leading underscore) of
# survwrap.optimization and may change without notice.
# NOTE(review): the grid displayed above has 17 alpha x 7 l1_ratio = 119
# combinations, yet the recorded output says 120 — confirm how the helper counts.
print('Full-grid size:',survwrap.optimization._guess_tries(custom_grid,fraction=1))

Full-grid size: 120


## First, report performance with the default hyperparameters

In [28]:
# Baseline: fit the untuned model on each run's training set and report its
# score on both the training and the test data.
print('## Before optimization\n')
for ndx, current in zip(range(3), z_runs):
    zmodel.fit(current.train_set.X, current.train_set.y)
    for template, part in (("Run {0:2d} - train: {1:5.3f}", current.train_set),
                           ("Run {0:2d} - Test: {1:5.3f}", current.test_set)):
        print(template.format(ndx, zmodel.score(part.X, part.y)))


## Before optimization

Run  0 - train: 0.669
Run  0 - Test: 0.691
Run  1 - train: 0.690
Run  1 - Test: 0.660
Run  2 - train: 0.678
Run  2 - Test: 0.680


## optimize

In [29]:
#survwrap.CoxNet().get_parameter_grid()

In [30]:
from survwrap import optimize
from survwrap.metrics import *

In [31]:
# Metrics handed to the hyper-parameter search, keyed by the label under which
# they are requested (`refit`) and reported. Insertion order is kept:
# c-index-td, neg-brier, auc.
quartiles = (0.25, 0.5, 0.75)
scorers = {'c-index-td': concordance_index_td_scorer}
for label, metric in (('neg-brier', neg_brier_score), ('auc', roc_auc_td_score)):
    # Each scorer gets its own list of quantile values.
    scorers[label] = make_time_dependent_scorer(metric, time_mode='quantiles', time_values=list(quartiles))
#'c-index-median': make_time_dependent_scorer(concordance_index_score, time_mode='quantiles', time_values=[0.5]),

#best_model, best_params, search_results = survwrap.optimize(deephit, X, y, mode='sklearn-random', user_grid=dict(dropout=[0.0, 0.2, 0.9]), scoring=scoring, tries=3, refit='c-index-td')

In [32]:
def save_results(run, basename):
    """Persist the outcome of one optimization run to three files.

    Files written (existing files are overwritten):
      * ``<basename>_params.json``  - ``run.best_params`` as JSON
      * ``<basename>_search.json``  - the table returned by
        ``survwrap.get_model_scores_df(run.search_params)``, via its ``to_json``
      * ``<basename>_model.pickle`` - ``run.best_model``, pickled

    Parameters
    ----------
    run : object exposing ``best_params``, ``search_params`` and ``best_model``
    basename : str
        Path prefix shared by the three output files.

    Raises
    ------
    TypeError
        If ``run.best_params`` holds a value that is neither JSON-serializable
        nor convertible through a ``tolist()`` method.
    """
    import json
    import pickle

    def _to_builtin(value):
        # numpy scalars and arrays are rejected by json, but their tolist()
        # returns plain Python numbers / lists.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(
            'Object of type {0} is not JSON serializable'.format(type(value).__name__))

    # The `with` blocks close each file on exit; no explicit close() is needed.
    with open(basename+'_params.json', "w") as f:
        json.dump(run.best_params, f, default=_to_builtin)
    with open(basename+'_search.json', "w") as f:
        survwrap.get_model_scores_df(run.search_params).to_json(f)
    with open(basename+'_model.pickle', "wb") as f:
        pickle.dump(run.best_model, f)

In [33]:
# Hyper-parameter search for every run: optimize on the training set, score
# the returned model on the test set, then write the results to disk.
# Only `tries=2` candidates are requested per run.
search_options = dict(mode='sklearn-random', user_grid=custom_grid, tries=2,
                      scoring=scorers, refit='c-index-td', n_jobs=4)
for ndx in range(3):
    current = z_runs[ndx]
    # <dataset>_<model class>_<run index>, e.g. gbsg2_CoxNet_0
    base_file_name = f"{my_data}_{type(zmodel).__name__}_{ndx}"
    search_outcome = optimize(zmodel, current.train_set.X, current.train_set.y, **search_options)
    current.best_model, current.best_params, current.search_params = search_outcome

    held_out_score = current.best_model.score(current.test_set.X, current.test_set.y)
    print('run {0:2d} - best model test score:  {1:5.3f}'.format(ndx, held_out_score))
    print("\tparams:", current.best_params)
    print('\nsaving w. basename:', base_file_name)
    save_results(current, base_file_name)

Random search tries: 2
run  0 - best model test score:  0.692
	params: {'l1_ratio': 0.01, 'alpha': 0.02}

saving w. basename: gbsg2_CoxNet_0
Random search tries: 2
run  1 - best model test score:  0.662
	params: {'l1_ratio': 0.9, 'alpha': 0.02}

saving w. basename: gbsg2_CoxNet_1
Random search tries: 2
run  2 - best model test score:  0.680
	params: {'l1_ratio': 0.01, 'alpha': 0.02}

saving w. basename: gbsg2_CoxNet_2


# Final Summary

In [34]:
#opt_coxnet, opt_coxnet_params, opt_coxnet_search = optimize(survwrap.CoxNet(rng_seed=2309), X_train, y_train, n_jobs=4)
#opt_coxnet.score(X_train, y_train), opt_coxnet.score(X_test, y_test), opt_coxnet_params

In [53]:
# Markdown-formatted recap: per run, the test score of the best model followed
# by the first ten rows of the search-results table.
print('# Summary')
for ndx, current in zip(range(3), z_runs):
    print('\n## Best solutions for run', ndx)
    test_score = current.best_model.score(current.test_set.X, current.test_set.y)
    print('\nBest model test score: %5.3f' % test_score)
    print('\n')
    top_rows = survwrap.get_model_scores_df(current.search_params)[:10]
    print(top_rows.to_markdown())

# Summary

## Best solutions for run 0

Best model test score: 0.695


|   rank_test_c-index-td |   mean_test_c-index-td |   std_test_c-index-td |   rank_test_neg-brier |   mean_test_neg-brier |   std_test_neg-brier |   rank_test_auc |   mean_test_auc |   std_test_auc | params                            |   mean_fit_time |   std_fit_time |
|-----------------------:|-----------------------:|----------------------:|----------------------:|----------------------:|---------------------:|----------------:|----------------:|---------------:|:----------------------------------|----------------:|---------------:|
|                      1 |               0.664666 |             0.013991  |                     2 |             -0.162362 |           0.00330083 |               1 |        0.717789 |      0.0231999 | {'l1_ratio': 0.01, 'alpha': 0.02} |        0.471787 |     0.00862079 |
|                      2 |               0.66464  |             0.0138937 |                     1 |             -0.1