In [1]:
import shap
import sklearn
import itertools
import pydotplus
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image

from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, space_eval, hp
import catboost

from util import load_data, cindex


def print_grid_search_results(results):
    """Print the selected hyper-parameters and the lowest mean test log-loss.

    Args:
        results: mapping indexed as ``results['params']`` and
            ``results['cv_results']``; the latter must provide the parallel
            sequences ``'iterations'`` and ``'test-Logloss-mean'``.
    """
    cv_stats = results['cv_results']
    mean_losses = cv_stats['test-Logloss-mean']
    # The position of the smallest mean test log-loss selects the matching iteration number
    best_position = np.argmin(mean_losses)
    print('Best params', results['params'],
          'obtained at iteration', cv_stats['iterations'][best_position],
          'with logloss', np.min(mean_losses))


def print_train_val_test_c_indices(classifier, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print the ROC AUC of `classifier` on the training, validation and test splits.

    For every split, column 1 of `classifier.predict_proba()` (presumably the probability of the
    positive class) is scored against the given labels with `roc_auc_score`.

    Args:
        classifier: fitted model exposing `predict_proba()`.
        X_train, X_val, X_test: features of the three splits.
        y_train, y_val, y_test: labels of the three splits.
    """
    splits = (('Train', X_train, y_train),
              ('Val', X_val, y_val),
              ('Test', X_test, y_test))
    for split_name, features, labels in splits:
        scores = classifier.predict_proba(features)[:, 1]
        print(f'{split_name} ROC AUC: {roc_auc_score(labels, scores)}')


def make_imputed_pool(X, y, imputer, cat_features):
    """Build a CatBoost `Pool` from `X` and `y`, optionally imputing missing values first.

    Args:
        X: DataFrame with the features; it must contain the columns 'Sex' and 'Race'.
        y: labels, passed to the pool as they are.
        imputer: already fitted transformer exposing `transform()`, or None to leave `X` untouched.
        cat_features: specification of the categorical features, forwarded to `Pool`.

    Returns:
        Tuple `(pool, X_imputed)`: the pool and the (possibly imputed) features it was built from.
    """
    if imputer is None:
        X_imputed = X
    else:
        # Carry over the row index together with the column names: rebuilding the frame from the
        # transformed values alone would silently renumber the rows and lose their alignment with `y`
        X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
    # imputer.transform() above has converted the int columns with categories into float, they need to be
    # converted back to int
    X_imputed = X_imputed.astype({'Sex': int, 'Race': int})
    pool = Pool(data=X_imputed, label=y, cat_features=cat_features)
    return pool, X_imputed


seed = 42
iterations = hyper_iterations = 200

# Load the NHANES I epidemiology dataset, already separated into development and test sets
# (the argument 10 is presumably the survival horizon in years; confirm in util.load_data)
X_dev, X_test, y_dev, y_test = load_data(10)

# CatBoost expects integer categorical features, so 'Sex' and 'Race' are cast from float to int;
# the labels are cast to int as well
X_dev, y_dev = X_dev.astype({'Sex': int, 'Race': int}), y_dev.astype(int)
X_test, y_test = X_test.astype({'Sex': int, 'Race': int}), y_test.astype(int)


# Find out how many samples have missing data in one or more variables (columns)
def count_samples_with_missing_data(df):
    """Return how many rows of `df` have a missing value in at least one column.

    Args:
        df: DataFrame to inspect.

    Returns:
        The count as a plain Python int.
    """
    # Series.sum() adds up the per-row boolean mask in vectorised code; the built-in sum() would
    # iterate over it element by element and hand back a NumPy scalar
    return int(df.isnull().any(axis='columns').sum())


# Report, for each set, how many samples have at least one missing value
dev_missing_count, test_missing_count = map(count_samples_with_missing_data, (X_dev, X_test))
print('Dev. set missing data in', dev_missing_count, 'samples out of', len(X_dev))
print('Test set missing data in', test_missing_count, 'samples out of', len(X_test))

cat_features = [3, 11]  # Categorical features are race and sex

# Hold out 20% of the dev set as validation data; it will be used for hyper-parameters tuning
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=0.2, random_state=seed)

# Variants of the datasets without the samples containing missing data; the labels are re-selected through
# the index of the surviving rows (note, no samples with missing data in test set)
X_dev_dropped, X_train_dropped, X_val_dropped = (frame.dropna(axis='rows') for frame in (X_dev, X_train, X_val))
y_dev_dropped = y_dev.loc[X_dev_dropped.index]
y_train_dropped = y_train.loc[X_train_dropped.index]
y_val_dropped = y_val.loc[X_val_dropped.index]

print('Performing grid-search for hyper-parameters optimization, after dropping samples with missing data')

''' Grid-search is done on the dev. set, as the grid-search takes care of splitting it into training and validation.
Note: if `search_by_train_test_split` is set to True, every combination of values of the hyper-parameters is evaluated
with a basic training/val. split of the dataset; if set to False, then every combination is evaluated with
cross-validation. Once method grid_search() has selected the best combination of hyper-parameters, it fits a model
with it. The final model can be evaluated with cross-validation by setting parameter `calc_cv_statistics` to True
(default).

Note 2: CatBoost grid search chooses the best values for the hyper-parameters based on the loss, not on the eval metric
set for the model (AUC). '''


# TODO: Check the above, might be not true!


def run_exp_grid_hyperparams_opt(X, y, cat_features, seed, iterations, param_grid, imputer=None):
    """Run a CatBoost grid search over `param_grid` and print a summary of the outcome.

    The features are first passed through `make_imputed_pool()`. After the search, the ROC AUC of the
    model is printed; it is computed on the very same samples that were handed to the grid search.

    Args:
        X: DataFrame with the features.
        y: pandas Series with the labels (its `.values` are used for scoring).
        cat_features: specification of the categorical features, for the pool and the classifier.
        seed: random seed for the classifier and for the partitioning done by the grid search.
        iterations: number of boosting iterations.
        param_grid: dict mapping hyper-parameter names to the lists of values to try.
        imputer: already fitted imputer applied to `X`, or None to use `X` as it is.
    """
    dev_pool, X_imputed = make_imputed_pool(X, y, imputer, cat_features)
    model = CatBoostClassifier(iterations=iterations,
                               eval_metric='AUC:hints=skip_train~false',
                               cat_features=cat_features,
                               random_state=seed)

    grid_search_results = model.grid_search(X=dev_pool,
                                            param_grid=param_grid,
                                            search_by_train_test_split=True,
                                            calc_cv_statistics=True,
                                            cv=5,
                                            partition_random_seed=seed,
                                            verbose=True)

    print_grid_search_results(grid_search_results)
    # Score on the features that went into the pool: with an imputer the raw `X` still holds the missing
    # values, so predicting on it would evaluate the model on data different from what it was fitted on
    y_preds = model.predict_proba(X_imputed)[:, 1]
    print(f'ROC AUC on best model after grid-search: {roc_auc_score(y.values, y_preds)}')


# Grid of hyper-parameter values shared by the three grid-search experiments below
param_grid = {
    'learning_rate': [.01, .05, .06, .07, 0.08, .1, .2],
    'depth': list(range(2, 9)),
}

# Experiment 1: the samples with missing data have been dropped, hence no imputer is needed
run_exp_grid_hyperparams_opt(X_dev_dropped, y_dev_dropped,
                             cat_features=cat_features, seed=seed, iterations=iterations,
                             param_grid=param_grid, imputer=None)

print('\nPerforming grid-search for hyper-parameters optimization, with missing data replaced with a mean imputer')

# Experiment 2: keep all the samples and replace the missing values with the mean of the respective column
mean_imputer = SimpleImputer(strategy='mean').fit(X_dev)
run_exp_grid_hyperparams_opt(X_dev, y_dev,
                             cat_features=cat_features, seed=seed, iterations=iterations,
                             param_grid=param_grid, imputer=mean_imputer)

print(
    '\nPerforming grid-search for hyper-parameters optimization, with missing data replaced with an iterative imputer')

# Experiment 3: same as above, but with an iterative imputer in place of the mean
iter_imputer = IterativeImputer(random_state=seed, sample_posterior=False, max_iter=1, min_value=0).fit(X_dev)
run_exp_grid_hyperparams_opt(X_dev, y_dev,
                             cat_features=cat_features, seed=seed, iterations=iterations,
                             param_grid=param_grid, imputer=iter_imputer)


def run_exp_bayes_hyperparams_opt(X_train, y_train, X_val, y_val, cat_features, param_space, max_evals, imputer):
    """Tune CatBoost hyper-parameters with hyperopt (TPE), then re-fit and report the ROC AUC scores.

    Every candidate is fitted on the training pool and scored with its best AUC on the validation pool.
    A model is then re-fitted with the best values found and its ROC AUC is printed for the training,
    validation and test sets. The test data comes from the module-level `X_test` and `y_test`.

    Args:
        X_train, y_train: training features (DataFrame) and labels (Series).
        X_val, y_val: validation features (DataFrame) and labels (Series).
        cat_features: specification of the categorical features, used to build the pools.
        param_space: hyperopt search space; it must provide 'learning_rate' and 'depth', and the
            constants 'iterations' and 'seed'.
        max_evals: number of evaluations of the objective function.
        imputer: already fitted imputer applied to training and validation features, or None.
    """
    train_pool, X_train_imputed = make_imputed_pool(X_train,
                                                    y=y_train,
                                                    imputer=imputer,
                                                    cat_features=cat_features)

    val_pool, X_val_imputed = make_imputed_pool(X_val,
                                                y=y_val,
                                                imputer=imputer,
                                                cat_features=cat_features)

    # The objective function, that hyperopt will minimize
    def objective(params):
        model = CatBoostClassifier(iterations=params['iterations'],
                                   eval_metric='AUC',
                                   learning_rate=params['learning_rate'],
                                   # Quantized hyperopt samples are floats (e.g. 3.0), the tree depth must be an int
                                   depth=int(params['depth']),
                                   random_state=params['seed'])
        model.fit(train_pool, eval_set=val_pool, verbose=False)
        return -model.best_score_['validation']['AUC']  # The objective function is minimized

    # Seed and iterations are read from `param_space`, the same source used for the models, rather than
    # from module-level globals.
    # NOTE(review): recent hyperopt releases appear to expect a np.random.Generator here; confirm against
    # the installed version
    rstate = np.random.RandomState(param_space['seed'])
    best = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=max_evals, rstate=rstate)
    print('Re-fitting the model with the best hyper-parameter values found:', best)

    # fmin() reports the raw sampled values, so the depth needs the same int conversion done in objective()
    best_params = dict(best)
    if 'depth' in best_params:
        best_params['depth'] = int(best_params['depth'])

    refit_model = CatBoostClassifier(iterations=param_space['iterations'],
                                     eval_metric='AUC',
                                     **best_params,
                                     random_state=param_space['seed'])
    refit_model.fit(train_pool, eval_set=val_pool, verbose=param_space['iterations'] // 10)

    # NOTE(review): the test features are used as they are, without going through `imputer`
    print_train_val_test_c_indices(refit_model,
                                   X_train_imputed,
                                   y_train.values,
                                   X_val_imputed,
                                   y_val.values,
                                   X_test,
                                   y_test.values)


''' Use the iterative imputer, but use Bayesian optimization for the hyper-parameters, instead of grid search. Here
we use the train/val data sets

Note: passing a CatBoost Pool() instance in the param_space values here below doesn't work, because hyperopt would
throw an exception during optimization.'''

# Search space for hyperopt: the learning rate is sampled uniformly, the depth is quantized with step 1
param_space = {'learning_rate': hp.uniform('learning_rate', .01, .1),
               'depth': hp.quniform('depth', 2, 8, 1),
               'seed': seed,  # hyperopt accepts constant value parameters
               'iterations': iterations
               }

print('Performing Bayesian search for hyper-parameters optimization, with missing data replaced with iterative imputer')

# `iter_imputer` was fitted on the whole dev set, which includes the validation samples used to score every
# candidate. Fit an identically configured copy on the training split only, so that nothing from the
# validation data leaks into the imputation.
train_iter_imputer = sklearn.base.clone(iter_imputer).fit(X_train)

run_exp_bayes_hyperparams_opt(X_train,
                              y_train,
                              X_val,
                              y_val,
                              cat_features=cat_features,
                              param_space=param_space,
                              max_evals=hyper_iterations,
                              imputer=train_iter_imputer)

print('Performing Bayesian search for hyper-parameters optimization, without replacement of missing data')

# Same search, but leaving the missing values in place
run_exp_bayes_hyperparams_opt(X_train,
                              y_train,
                              X_val,
                              y_val,
                              cat_features=cat_features,
                              param_space=param_space,
                              max_evals=hyper_iterations,
                              imputer=None)

''' TODO
Check the loss/ROC issue filed on GitHub
How does CatBoost deal with missing data (None/NaN)?
Unbalanced dataset, try using weights
Leverage Tensorboard
How to display CatBoost charts outside of notebook? Is it possible?
Explore Seaborn
Use the whole NHANES dataset from CDC, and also try with GPU
Try other strategies for imputation based on mean encoding and similar
Instead of checking if survival after 10 years, estimate the number of years of survival
C-index is the same as the ROC AUC for logistic regression.
   see https://www.statisticshowto.com/c-statistic/#:~:text=A%20weighted%20c-index%20is,correctly%20predicting%20a%20negative%20outcome
   and also https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4886856/  and https://bit.ly/3dvUh07

'''



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Dev. set missing data in 4804 samples out of 6863
Test set missing data in 0 samples out of 1716
Performing grid-search for hyper-parameters optimization, after dropping samples with missing data

bestTest = 0.6517794473
bestIteration = 21

0:	loss: 0.6517794	best: 0.6517794 (0)	total: 306ms	remaining: 14.7s

bestTest = 0.6769698703
bestIteration = 186

1:	loss: 0.6769699	best: 0.6769699 (1)	total: 516ms	remaining: 12.1s

bestTest = 0.6880183014
bestIteration = 155

2:	loss: 0.6880183	best: 0.6880183 (2)	total: 714ms	remaining: 10.9s

bestTest = 0.6882002756
bestIteration = 198

3:	loss: 0.6882003	best: 0.6882003 (3)	total: 921ms	remaining: 10.4s

bestTest = 0.6836509216
bestIteration = 194

4:	loss: 0.6836509	best: 0.6882003 (3)	total: 1.12s	remaining: 9.85s

bestTest = 0.6853926742
bestIteration = 199

5:	loss: 0.6853927	best: 0.6882003 (3)	total: 1.33s	remaining: 9.54s

bestTest = 0.6817271947
bestIteration = 71

6:	loss: 0.6817272	best: 0.6882003 (3)	total: 1.53s	remaining: 9.19s




bestTest = 0.7482479784
bestIteration = 142

18:	loss: 0.7482480	best: 0.7498687 (17)	total: 10.4s	remaining: 16.4s

bestTest = 0.7385721197
bestIteration = 75

19:	loss: 0.7385721	best: 0.7498687 (17)	total: 11s	remaining: 15.9s

bestTest = 0.745158615
bestIteration = 74

20:	loss: 0.7451586	best: 0.7498687 (17)	total: 11.6s	remaining: 15.4s

bestTest = 0.7327907941
bestIteration = 199

21:	loss: 0.7327908	best: 0.7498687 (17)	total: 12.2s	remaining: 15s

bestTest = 0.7383198562
bestIteration = 186

22:	loss: 0.7383199	best: 0.7498687 (17)	total: 12.9s	remaining: 14.6s

bestTest = 0.7445262285
bestIteration = 195

23:	loss: 0.7445262	best: 0.7498687 (17)	total: 13.6s	remaining: 14.1s

bestTest = 0.7407008086
bestIteration = 152

24:	loss: 0.7407008	best: 0.7498687 (17)	total: 14.3s	remaining: 13.7s

bestTest = 0.7428260419
bestIteration = 164

25:	loss: 0.7428260	best: 0.7498687 (17)	total: 15s	remaining: 13.2s

bestTest = 0.7450480337
bestIteration = 156

26:	loss: 0.7450480	best: 0

[IterativeImputer] Early stopping criterion not reached.



bestTest = 0.7223132214
bestIteration = 199

0:	loss: 0.7223132	best: 0.7223132 (0)	total: 569ms	remaining: 27.3s

bestTest = 0.7387345359
bestIteration = 181

1:	loss: 0.7387345	best: 0.7387345 (1)	total: 1.1s	remaining: 25.9s

bestTest = 0.7393219988
bestIteration = 199

2:	loss: 0.7393220	best: 0.7393220 (2)	total: 1.62s	remaining: 24.9s

bestTest = 0.7398783606
bestIteration = 198

3:	loss: 0.7398784	best: 0.7398784 (3)	total: 2.15s	remaining: 24.2s

bestTest = 0.7413228281
bestIteration = 193

4:	loss: 0.7413228	best: 0.7413228 (4)	total: 2.68s	remaining: 23.6s

bestTest = 0.7459637846
bestIteration = 199

5:	loss: 0.7459638	best: 0.7459638 (5)	total: 3.22s	remaining: 23.1s

bestTest = 0.7440631695
bestIteration = 132

6:	loss: 0.7440632	best: 0.7459638 (5)	total: 3.77s	remaining: 22.6s

bestTest = 0.7240790656
bestIteration = 199

7:	loss: 0.7240791	best: 0.7459638 (5)	total: 4.38s	remaining: 22.5s

bestTest = 0.7429262561
bestIteration = 195

8:	loss: 0.7429263	best: 0.7459638 

' TODO\nCheck the loss/ROC issue filed on GitHub\nHow does CatBoost deal with missing data (None/NaN)?\nUnbalanced dataset, try using weights\nLeverage Tensorboard\nHow to display CatBoost charts outside of notebook? Is it possible?\nExplore Seaborne\nUse the whole HANES dataset from CDC, and also try with GPU\nTry other strategies for imputation based on mean encoding and similar\nInstead of checking if survival after 10 years, estimate the number of years of survival\nC-index is the same as the ROC AUC for logistic regression.\n   see https://www.statisticshowto.com/c-statistic/#:~:text=A%20weighted%20c-index%20is,correctly%20predicting%20a%20negative%20outcome\n   and also https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4886856/  and https://bit.ly/3dvUh07\n\n'