In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.base import clone

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

In [2]:
def load_split_preprocess(path, seed, apply_ohe = True, scale = False):
    """
    Load the dataset, split it into stratified training and test sets
    and apply basic preprocessing.

    The scaler and the encoders are fitted on the training split only and
    then applied to the test split.

    Parameters
    ----------
    path: str
        Location of the CSV file, forwarded to ``pd.read_csv``.
    seed: int
        Random state of the stratified 80/20 ``train_test_split``.
    apply_ohe: bool
        If True, the two ``calcification_type_*`` columns are one-hot
        encoded with ``pd.get_dummies``; otherwise they are ordinal-encoded
        together with the other categorical columns.
    scale: bool
        If True, the columns in ``TO_SCALE`` are standardised.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Feature frames (``pd.DataFrame``) and label-encoded target arrays.
    """
    TO_DROP = ["site", "patient_ID", "infarct_side", "stroke"]
    TO_SCALE = ["age", "stenosis_left", "stenosis_right"]
    TO_LABEL_ENCODE = ["TIA", "hypertension", "cad", "gender",
                       "diabetes", "hyperlipidemia", "smoker_status",
                       "prs", "calcification", "at_least_4"]
    TO_OHE = ["calcification_type_left", "calcification_type_right"]
    PREFIXES = ["calcification_left", "calcification_right"]
    TARGET = "symptoms"

    df = pd.read_csv(path)

    # Collapse every smoker status other than "no" into "yes".
    df["smoker_status"] = df["smoker_status"].apply(lambda v: "no" if v == "no" else "yes")
    # Strip blanks so the category labels (and dummy column names) are compact.
    df["calcification_type_left"] = df["calcification_type_left"].apply(lambda v: v.replace(" ", ""))
    df["calcification_type_right"] = df["calcification_type_right"].apply(lambda v: v.replace(" ", ""))

    # drop unuseful features
    df = df.drop(TO_DROP, axis=1)

    X, y = df.drop(TARGET, axis=1), df[TARGET]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=seed,
                                                        stratify=y)
    # The splits are modified column-wise below; work on explicit copies so
    # the assignments never target a view of the original frame.
    X_train, X_test = X_train.copy(), X_test.copy()

    le = LabelEncoder()
    oe = OrdinalEncoder()

    # start pre-processing
    if apply_ohe:
        to_encode = list(TO_LABEL_ENCODE)
        X_train = pd.get_dummies(X_train, columns=TO_OHE, prefix=PREFIXES)
        X_test = pd.get_dummies(X_test, columns=TO_OHE, prefix=PREFIXES)
        # get_dummies only creates columns for categories present in the
        # frame it is given, so a rare category can appear in one split
        # only. Align the test frame to the training columns: dummies
        # missing from the test split become all-zero, dummies unseen in
        # training are dropped. The cast restores the dummy dtype on the
        # filled columns and is a no-op for every pre-existing column.
        X_test = (X_test
                  .reindex(columns=X_train.columns, fill_value=0)
                  .astype(X_train.dtypes.to_dict()))
    else:
        # Build a new list instead of extending TO_LABEL_ENCODE in place.
        to_encode = TO_LABEL_ENCODE + TO_OHE

    if scale:
        scaler = StandardScaler()
        X_train[TO_SCALE] = scaler.fit_transform(X_train[TO_SCALE])
        X_test[TO_SCALE] = scaler.transform(X_test[TO_SCALE])

    y_train = le.fit_transform(y_train)
    X_train[to_encode] = oe.fit_transform(X_train[to_encode]).astype(np.uint8)

    # test set: reuse the encoders fitted on the training split
    y_test = le.transform(y_test)
    X_test[to_encode] = oe.transform(X_test[to_encode]).astype(np.uint8)

    return X_train, y_train, X_test, y_test


In [3]:
def load_preprocess(path, seed, apply_ohe = True, scale = False):
    """
    Load the dataset and apply basic preprocessing to the whole table.

    No train/test split is performed here: encoders (and the optional
    scaler) are fitted on all rows.

    Parameters
    ----------
    path: str
        Location of the CSV file, forwarded to ``pd.read_csv``.
    seed: int
        Accepted for call-site compatibility; not read by this function.
    apply_ohe: bool
        If True, one-hot encode the two ``calcification_type_*`` columns;
        otherwise ordinal-encode them with the other categorical columns.
    scale: bool
        If True, standardise the age and stenosis columns.

    Returns
    -------
    X, y
        Feature frame and label-encoded target array.
    """
    drop_cols = ["site", "patient_ID", "infarct_side", "stroke"]
    numeric_cols = ["age", "stenosis_left", "stenosis_right"]
    categorical_cols = ["TIA", "hypertension", "cad", "gender",
                        "diabetes", "hyperlipidemia", "smoker_status",
                        "prs", "calcification", "at_least_4"]
    calc_type_cols = ["calcification_type_left", "calcification_type_right"]
    dummy_prefixes = ["calcification_left", "calcification_right"]
    target_col = "symptoms"

    df = pd.read_csv(path)

    # Anything other than "no" counts as a smoker.
    df["smoker_status"] = df["smoker_status"].apply(lambda v: "no" if v == "no" else "yes")
    # Remove blanks inside the calcification type labels.
    for col in calc_type_cols:
        df[col] = df[col].apply(lambda v: v.replace(" ", ""))

    # Discard the columns that are not used as features.
    df = df.drop(columns=drop_cols)

    y = df[target_col]
    X = df.drop(columns=[target_col])

    if apply_ohe:
        X = pd.get_dummies(X, columns=calc_type_cols, prefix=dummy_prefixes)
        ordinal_cols = categorical_cols
    else:
        ordinal_cols = categorical_cols + calc_type_cols

    if scale:
        X[numeric_cols] = StandardScaler().fit_transform(X[numeric_cols])

    y = LabelEncoder().fit_transform(y)
    X[ordinal_cols] = OrdinalEncoder().fit_transform(X[ordinal_cols]).astype(np.uint8)

    return X, y


In [68]:
# 0.68, seed 58
# Sweep over seeds: each seed controls the train/test split, the CV folds and
# the MLP initialisation. "Val" is the median AUC over the repeated CV folds
# of the training split; "Test" is the AUC on the held-out split.
train_scores_dict, test_scores_dict = {}, {}

for seed in range(1, 100):
    X_train, y_train, X_test, y_test = load_split_preprocess(
        path='../input/calcifications.csv',
        seed=seed,
        apply_ohe=False,
        scale=False,
    )

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=seed)

    mlp = MLPClassifier(
        hidden_layer_sizes=(50,),
        activation="relu",
        solver='adam',
        alpha=0.03,
        learning_rate="adaptive",
        learning_rate_init=0.05,
        max_iter=500,
        random_state=seed,
    )

    # cross_validate fits clones, so `mlp` itself is still unfitted afterwards.
    scores = cross_validate(
        mlp, X_train, y_train,
        cv=cv,
        scoring="roc_auc",
        return_train_score=True,
        n_jobs=-1,
    )

    curr_auc_train = np.median(scores["train_score"]).round(3)
    curr_auc_test = np.median(scores["test_score"]).round(3)
    train_scores_dict[seed] = curr_auc_train
    test_scores_dict[seed] = curr_auc_test

    # Refit on the full training split and score the held-out test split.
    mlp.fit(X_train, y_train)
    probas = mlp.predict_proba(X_test)
    auc = roc_auc_score(y_test, probas[:, 1]).round(3)

    print("Seed: {}, Train: {} | Val: {} | Test: {}".format(
        seed, curr_auc_train, curr_auc_test, auc))

# The winner is chosen on the median CV score, not on the held-out AUC.
best_seed = max(test_scores_dict, key=test_scores_dict.get)
print("Best is seed {} with train AUC {} and test AUC {} ".format(
    best_seed, train_scores_dict[best_seed], test_scores_dict[best_seed]))

Seed: 1, Train: 0.704 | Val: 0.6 | Test: 0.707
Seed: 2, Train: 0.764 | Val: 0.656 | Test: 0.652
Seed: 3, Train: 0.784 | Val: 0.676 | Test: 0.652
Seed: 4, Train: 0.783 | Val: 0.662 | Test: 0.58
Seed: 5, Train: 0.743 | Val: 0.648 | Test: 0.64
Seed: 6, Train: 0.762 | Val: 0.661 | Test: 0.634
Seed: 7, Train: 0.774 | Val: 0.667 | Test: 0.691
Seed: 8, Train: 0.762 | Val: 0.673 | Test: 0.646
Seed: 9, Train: 0.742 | Val: 0.66 | Test: 0.603
Seed: 10, Train: 0.731 | Val: 0.643 | Test: 0.66
Seed: 11, Train: 0.75 | Val: 0.627 | Test: 0.673
Seed: 12, Train: 0.754 | Val: 0.663 | Test: 0.624
Seed: 13, Train: 0.762 | Val: 0.653 | Test: 0.659
Seed: 14, Train: 0.745 | Val: 0.631 | Test: 0.702
Seed: 15, Train: 0.749 | Val: 0.646 | Test: 0.705
Seed: 16, Train: 0.756 | Val: 0.651 | Test: 0.666
Seed: 17, Train: 0.762 | Val: 0.662 | Test: 0.657
Seed: 18, Train: 0.745 | Val: 0.626 | Test: 0.738
Seed: 19, Train: 0.739 | Val: 0.636 | Test: 0.697
Seed: 20, Train: 0.752 | Val: 0.649 | Test: 0.683
Seed: 21, Train:

In [52]:
from sklearn.metrics import roc_auc_score

# Seed reused for the split, the CV folds and the MLP initialisation.
BEST_SEED = 58

X_train, y_train, X_test, y_test = load_split_preprocess(
    path='../input/calcifications.csv',
    seed=BEST_SEED,
    apply_ohe=False,
    scale=False,
)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=BEST_SEED)

# NOTE(review): learning_rate_init is 0.02 here but 0.05 in the seed-sweep
# cells — confirm which value is intended.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation="relu",
    solver='adam',
    alpha=0.03,
    learning_rate="adaptive",
    learning_rate_init=0.02,
    max_iter=500,
    random_state=BEST_SEED,
)

scores = cross_validate(
    mlp, X_train, y_train,
    cv=cv,
    scoring="roc_auc",
    return_train_score=True,
    n_jobs=-1,
)

# Median AUC over all repeated folds, rounded for display.
curr_auc_train = np.median(scores["train_score"]).round(3)
curr_auc_test = np.median(scores["test_score"]).round(3)

print("Seed: {}, Train: {} | Val: {}".format(BEST_SEED, curr_auc_train, curr_auc_test))

Seed: 58, Train: 0.786 | Val: 0.686


In [8]:
# MLPClassifier

train_scores_dict = {}
test_scores_dict = {}

# load_preprocess performs no random split and never reads its `seed`
# argument, so the features are identical for every seed. Load them once
# instead of re-reading and re-encoding the CSV on each iteration.
X, y = load_preprocess(path = '../input/calcifications.csv',
                       seed=None,
                       apply_ohe = False,
                       scale=False)

for seed in range(1, 300):
    # The seed drives the CV fold assignment and the MLP initialisation.
    cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=seed)

    mlp = MLPClassifier(hidden_layer_sizes=(50,),
                    solver='adam',
                    activation="relu",
                    learning_rate="adaptive",
                    learning_rate_init=0.05,
                    max_iter=500,
                    alpha=0.03,
                    random_state=seed)

    scores = cross_validate(mlp,
                         X, y,
                         scoring="roc_auc",
                         cv=cv,
                         return_train_score=True,
                         n_jobs=-1)

    # Median AUC over the 100 folds (10 splits x 10 repeats).
    curr_auc_train = np.median(scores["train_score"]).round(3)
    curr_auc_test = np.median(scores["test_score"]).round(3)
    train_scores_dict[seed] = curr_auc_train
    test_scores_dict[seed] = curr_auc_test

    print("Seed: {}, Train: {} | Val: {}".format(seed, curr_auc_train, curr_auc_test))

# "test AUC" below is the median cross-validated score stored per seed.
best_seed = max(test_scores_dict, key=test_scores_dict.get)
print("Best is seed {} with train AUC {} and test AUC {} ".format(best_seed,
                                                                  train_scores_dict[best_seed],
                                                                  test_scores_dict[best_seed]))

Seed: 1, Train: 0.718 | Val: 0.648
Seed: 2, Train: 0.74 | Val: 0.66
Seed: 3, Train: 0.746 | Val: 0.658
Seed: 4, Train: 0.752 | Val: 0.675
Seed: 5, Train: 0.719 | Val: 0.647
Seed: 6, Train: 0.739 | Val: 0.663
Seed: 7, Train: 0.754 | Val: 0.674
Seed: 8, Train: 0.73 | Val: 0.667
Seed: 9, Train: 0.729 | Val: 0.672
Seed: 10, Train: 0.726 | Val: 0.66
Seed: 11, Train: 0.741 | Val: 0.657
Seed: 12, Train: 0.729 | Val: 0.661
Seed: 13, Train: 0.738 | Val: 0.671
Seed: 14, Train: 0.738 | Val: 0.645
Seed: 15, Train: 0.746 | Val: 0.673
Seed: 16, Train: 0.741 | Val: 0.664
Seed: 17, Train: 0.738 | Val: 0.675
Seed: 18, Train: 0.737 | Val: 0.672
Seed: 19, Train: 0.729 | Val: 0.654
Seed: 20, Train: 0.736 | Val: 0.654
Seed: 21, Train: 0.719 | Val: 0.66
Seed: 22, Train: 0.743 | Val: 0.676
Seed: 23, Train: 0.747 | Val: 0.66
Seed: 24, Train: 0.738 | Val: 0.663
Seed: 25, Train: 0.72 | Val: 0.653
Seed: 26, Train: 0.72 | Val: 0.656
Seed: 27, Train: 0.737 | Val: 0.669
Seed: 28, Train: 0.733 | Val: 0.666
Seed: 29,

Seed: 227, Train: 0.739 | Val: 0.666
Seed: 228, Train: 0.73 | Val: 0.657
Seed: 229, Train: 0.726 | Val: 0.66
Seed: 230, Train: 0.747 | Val: 0.679
Seed: 231, Train: 0.755 | Val: 0.677
Seed: 232, Train: 0.754 | Val: 0.674
Seed: 233, Train: 0.737 | Val: 0.665
Seed: 234, Train: 0.762 | Val: 0.651
Seed: 235, Train: 0.739 | Val: 0.664
Seed: 236, Train: 0.746 | Val: 0.665
Seed: 237, Train: 0.744 | Val: 0.676
Seed: 238, Train: 0.73 | Val: 0.651
Seed: 239, Train: 0.723 | Val: 0.666
Seed: 240, Train: 0.747 | Val: 0.676
Seed: 241, Train: 0.741 | Val: 0.656
Seed: 242, Train: 0.736 | Val: 0.674
Seed: 243, Train: 0.707 | Val: 0.647
Seed: 244, Train: 0.739 | Val: 0.67
Seed: 245, Train: 0.742 | Val: 0.665
Seed: 246, Train: 0.75 | Val: 0.674
Seed: 247, Train: 0.735 | Val: 0.672
Seed: 248, Train: 0.733 | Val: 0.647
Seed: 249, Train: 0.735 | Val: 0.667
Seed: 250, Train: 0.742 | Val: 0.674
Seed: 251, Train: 0.734 | Val: 0.665
Seed: 252, Train: 0.734 | Val: 0.664
Seed: 253, Train: 0.74 | Val: 0.653
Seed: 2

In [6]:
# LGBMClassifier, ordinal-encoded calcification types (apply_ohe=False)

train_scores_dict = {}
test_scores_dict = {}

# load_preprocess performs no random split and never reads its `seed`
# argument, so the features are identical for every seed. Load them once
# instead of re-reading and re-encoding the CSV on each iteration.
X, y = load_preprocess(path = '../input/calcifications.csv',
                       seed=None,
                       apply_ohe = False,
                       scale=False)

for seed in range(1, 300):
    # The seed drives the CV fold assignment and the LightGBM seed.
    cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=seed)

    lgbm = LGBMClassifier(objective="binary",
                        metric="auc",
                        boosting_type="gbdt",
                        seed=seed,
                        feature_pre_filter=False,
                        force_row_wise=True,
                        deterministic=True,
                        learning_rate=0.05,
                        is_unbalance=True,
                        verbosity=-1,
                        reg_alpha=7,
                        reg_lambda=7,
                        max_depth=10,
                        n_estimators=150,
                        colsample_bytree=0.7,
                        n_jobs=1)

    # Parallelism is applied across folds (n_jobs=-1 here), while each
    # LightGBM fit is single-threaded (n_jobs=1 above).
    scores = cross_validate(lgbm,
                         X, y,
                         scoring="roc_auc",
                         cv=cv,
                         return_train_score=True,
                         n_jobs=-1)

    # Median AUC over the 100 folds (10 splits x 10 repeats).
    curr_auc_train = np.median(scores["train_score"]).round(3)
    curr_auc_test = np.median(scores["test_score"]).round(3)
    train_scores_dict[seed] = curr_auc_train
    test_scores_dict[seed] = curr_auc_test

    print("Seed: {}, Train: {} | Val: {}".format(seed, curr_auc_train, curr_auc_test))

# "test AUC" below is the median cross-validated score stored per seed.
best_seed = max(test_scores_dict, key=test_scores_dict.get)
print("Best is seed {} with train AUC {} and test AUC {} ".format(best_seed,
                                                                  train_scores_dict[best_seed],
                                                                  test_scores_dict[best_seed]))

Seed: 1, Train: 0.794 | Val: 0.714
Seed: 2, Train: 0.794 | Val: 0.695
Seed: 3, Train: 0.794 | Val: 0.709
Seed: 4, Train: 0.793 | Val: 0.699
Seed: 5, Train: 0.794 | Val: 0.709
Seed: 6, Train: 0.793 | Val: 0.713
Seed: 7, Train: 0.793 | Val: 0.71
Seed: 8, Train: 0.794 | Val: 0.709
Seed: 9, Train: 0.797 | Val: 0.711
Seed: 10, Train: 0.798 | Val: 0.708
Seed: 11, Train: 0.794 | Val: 0.718
Seed: 12, Train: 0.795 | Val: 0.706
Seed: 13, Train: 0.794 | Val: 0.71
Seed: 14, Train: 0.794 | Val: 0.702
Seed: 15, Train: 0.795 | Val: 0.705
Seed: 16, Train: 0.795 | Val: 0.716
Seed: 17, Train: 0.792 | Val: 0.715
Seed: 18, Train: 0.794 | Val: 0.715
Seed: 19, Train: 0.795 | Val: 0.71
Seed: 20, Train: 0.795 | Val: 0.715
Seed: 21, Train: 0.794 | Val: 0.715
Seed: 22, Train: 0.792 | Val: 0.718
Seed: 23, Train: 0.795 | Val: 0.712
Seed: 24, Train: 0.794 | Val: 0.704
Seed: 25, Train: 0.795 | Val: 0.711
Seed: 26, Train: 0.793 | Val: 0.706
Seed: 27, Train: 0.796 | Val: 0.713
Seed: 28, Train: 0.794 | Val: 0.706
Seed

Seed: 227, Train: 0.794 | Val: 0.708
Seed: 228, Train: 0.794 | Val: 0.714
Seed: 229, Train: 0.794 | Val: 0.704
Seed: 230, Train: 0.795 | Val: 0.708
Seed: 231, Train: 0.795 | Val: 0.71
Seed: 232, Train: 0.794 | Val: 0.71
Seed: 233, Train: 0.792 | Val: 0.706
Seed: 234, Train: 0.795 | Val: 0.713
Seed: 235, Train: 0.794 | Val: 0.724
Seed: 236, Train: 0.794 | Val: 0.71
Seed: 237, Train: 0.794 | Val: 0.703
Seed: 238, Train: 0.796 | Val: 0.714
Seed: 239, Train: 0.792 | Val: 0.708
Seed: 240, Train: 0.793 | Val: 0.711
Seed: 241, Train: 0.792 | Val: 0.707
Seed: 242, Train: 0.794 | Val: 0.711
Seed: 243, Train: 0.792 | Val: 0.708
Seed: 244, Train: 0.793 | Val: 0.718
Seed: 245, Train: 0.794 | Val: 0.708
Seed: 246, Train: 0.793 | Val: 0.712
Seed: 247, Train: 0.794 | Val: 0.711
Seed: 248, Train: 0.793 | Val: 0.703
Seed: 249, Train: 0.794 | Val: 0.709
Seed: 250, Train: 0.793 | Val: 0.709
Seed: 251, Train: 0.795 | Val: 0.706
Seed: 252, Train: 0.793 | Val: 0.704
Seed: 253, Train: 0.793 | Val: 0.706
Seed

In [4]:
# LGBMClassifier, one-hot encoded calcification types (apply_ohe=True)

train_scores_dict = {}
test_scores_dict = {}

# load_preprocess performs no random split and never reads its `seed`
# argument, so the features are identical for every seed. Load them once
# instead of re-reading and re-encoding the CSV on each iteration.
X, y = load_preprocess(path = '../input/calcifications.csv',
                       seed=None,
                       apply_ohe = True,
                       scale=False)

for seed in range(100,200):
    # The seed drives the CV fold assignment and the LightGBM seed.
    cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=seed)

    lgbm = LGBMClassifier(objective="binary",
                        metric="auc",
                        boosting_type="gbdt",
                        seed=seed,
                        feature_pre_filter=False,
                        force_row_wise=True,
                        deterministic=True,
                        learning_rate=0.05,
                        is_unbalance=True,
                        verbosity=-1,
                        reg_alpha=10,
                        reg_lambda=10,
                        max_depth=10,
                        n_estimators=100,
                        colsample_bytree=0.7,
                        n_jobs=1)

    # Parallelism is applied across folds (n_jobs=-1 here), while each
    # LightGBM fit is single-threaded (n_jobs=1 above).
    scores = cross_validate(lgbm,
                         X, y,
                         scoring="roc_auc",
                         cv=cv,
                         return_train_score=True,
                         n_jobs=-1)

    # Median AUC over the 100 folds (10 splits x 10 repeats).
    curr_auc_train = np.median(scores["train_score"]).round(3)
    curr_auc_test = np.median(scores["test_score"]).round(3)
    train_scores_dict[seed] = curr_auc_train
    test_scores_dict[seed] = curr_auc_test

    print("Seed: {}, Train: {} | Val: {}".format(seed, curr_auc_train, curr_auc_test))

# "test AUC" below is the median cross-validated score stored per seed.
best_seed = max(test_scores_dict, key=test_scores_dict.get)
print("Best is seed {} with train AUC {} and test AUC {} ".format(best_seed,
                                                                  train_scores_dict[best_seed],
                                                                  test_scores_dict[best_seed]))

Seed: 100, Train: 0.77 | Val: 0.701
Seed: 101, Train: 0.771 | Val: 0.695
Seed: 102, Train: 0.772 | Val: 0.691
Seed: 103, Train: 0.771 | Val: 0.697
Seed: 104, Train: 0.769 | Val: 0.69
Seed: 105, Train: 0.768 | Val: 0.692
Seed: 106, Train: 0.769 | Val: 0.7
Seed: 107, Train: 0.771 | Val: 0.695
Seed: 108, Train: 0.772 | Val: 0.701
Seed: 109, Train: 0.77 | Val: 0.692
Seed: 110, Train: 0.77 | Val: 0.688
Seed: 111, Train: 0.768 | Val: 0.692
Seed: 112, Train: 0.768 | Val: 0.693
Seed: 113, Train: 0.77 | Val: 0.697
Seed: 114, Train: 0.766 | Val: 0.683
Seed: 115, Train: 0.767 | Val: 0.69
Seed: 116, Train: 0.767 | Val: 0.682
Seed: 117, Train: 0.769 | Val: 0.69
Seed: 118, Train: 0.765 | Val: 0.692
Seed: 119, Train: 0.77 | Val: 0.694
Seed: 120, Train: 0.768 | Val: 0.696
Seed: 121, Train: 0.768 | Val: 0.695
Seed: 122, Train: 0.768 | Val: 0.694
Seed: 123, Train: 0.77 | Val: 0.7
Seed: 124, Train: 0.768 | Val: 0.69
Seed: 125, Train: 0.768 | Val: 0.696
Seed: 126, Train: 0.768 | Val: 0.685
Seed: 127, Trai

In [109]:
# Fit one LightGBM model on the full feature matrix; importance_type="gain"
# makes feature_importances_ report gain-based importances.
# NOTE(review): X and y are not created in this cell — they are whatever a
# previously executed cell left in the kernel. Confirm which preprocessing
# (apply_ohe / scale) produced them before relying on the result.
lgbm = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    metric="auc",
    importance_type="gain",
    learning_rate=0.05,
    n_estimators=100,
    max_depth=10,
    colsample_bytree=0.6,
    reg_alpha=7,
    reg_lambda=7,
    is_unbalance=True,
    feature_pre_filter=False,
    force_row_wise=True,
    deterministic=True,
    seed=123,
    n_jobs=1,
    verbosity=-1,
)

lgbm.fit(X, y)

LGBMClassifier(colsample_bytree=0.6, deterministic=True,
               feature_pre_filter=False, force_row_wise=True,
               importance_type='gain', is_unbalance=True, learning_rate=0.05,
               max_depth=10, metric='auc', n_jobs=1, objective='binary',
               reg_alpha=7, reg_lambda=7, seed=123, verbosity=-1)

In [113]:
# Pair every feature name with its importance from the fitted model and
# display the pairs from most to least important.
feats = list(X.columns)
vals = lgbm.feature_importances_

# zip is a one-shot iterator: `imps` is exhausted once sorted() has run.
imps = zip(feats, vals)
sorted(imps, key=lambda pair: pair[1], reverse=True)

[('age', 412.5417698306857),
 ('calcification_right_Type6', 289.98373451828957),
 ('hyperlipidemia', 106.55086836125702),
 ('at_least_4', 104.61912578344345),
 ('stenosis_left', 102.97196879948024),
 ('stenosis_right', 81.92115885869134),
 ('calcification_right_Type5', 81.78353455662727),
 ('calcification_right_Type1', 76.65921981073916),
 ('calcification_right_Type2', 76.32601677571074),
 ('calcification_left_Type6', 75.60454034805298),
 ('calcification_right_Type3', 64.74391509592533),
 ('calcification_left_Type1', 38.613770335912704),
 ('smoker_status', 34.79370028366975),
 ('calcification_right_Type4', 34.551454305648804),
 ('prs', 29.48992145061493),
 ('calcification_left_Type5', 12.497104823589325),
 ('calcification_left_Type4', 6.985429883003235),
 ('calcification_left_Type2', 4.111389687284827),
 ('gender', 3.524590525776148),
 ('hypertension', 2.9609769731760025),
 ('TIA', 0.0),
 ('cad', 0.0),
 ('diabetes', 0.0),
 ('calcification', 0.0),
 ('calcification_left_Type3', 0.0)]

In [20]:
import optuna
from optuna.integration import OptunaSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import brier_score_loss

In [36]:
# Splitters for nested cross-validation, both seeded with 235:
# the outer one yields 10 stratified folds (a single repeat), the inner one
# 5 shuffled stratified folds used for the hyperparameter search.
outer_cv = RepeatedStratifiedKFold(n_repeats=1, n_splits=10, random_state=235)
inner_cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=235)

In [19]:
def _brier_loss(y_true, y_pred):
    """
    Return the negated Brier score as a ``(name, value, higher_is_better)``
    tuple.

    The score is negated so that larger values are better, consistent with
    the ``True`` flag in the last position.
    NOTE(review): the 3-tuple looks like a LightGBM custom eval-metric
    result — confirm against the caller; it is not called in the visible
    cells.
    """
    return "brier_loss", -brier_score_loss(y_true, y_pred), True

In [14]:
# Base estimator handed to the hyperparameter search. metric="None" is
# passed through to LightGBM as a string; scoring is done by the search.
est = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    metric="None",
    learning_rate=0.05,
    n_estimators=100,
    max_depth=10,
    colsample_bytree=0.7,
    reg_alpha=10,
    reg_lambda=10,
    is_unbalance=True,
    feature_pre_filter=False,
    force_row_wise=True,
    deterministic=True,
    seed=235,
    n_jobs=1,
    verbosity=-1,
)

In [21]:
# Search space for the hyperparameter search. The key order is kept as
# written because it may influence the order in which a seeded sampler
# draws the parameters.
_dist = optuna.distributions
space = dict(
    reg_alpha=_dist.UniformDistribution(6.0, 10.0),
    reg_lambda=_dist.UniformDistribution(6.0, 10.0),
    num_leaves=_dist.IntUniformDistribution(2, 15),
    colsample_bytree=_dist.UniformDistribution(0.6, 0.9),
    max_depth=_dist.IntUniformDistribution(8, 12),
    n_estimators=_dist.IntUniformDistribution(120, 200),
)

In [42]:
# Hyperparameter search run inside every outer fold: 10 trials, scored with
# the negated Brier score on the inner CV, then refit on the whole outer
# training fold.
opt_search = OptunaSearchCV(estimator=est,
                            param_distributions=space,
                            cv=inner_cv,
                            n_trials=10,
                            random_state=235,
                            refit=True,
                            scoring="neg_brier_score",
                            verbose=10,
                            n_jobs=1)

for train_idx, test_idx in outer_cv.split(X, y):
    # The splitter yields positional indices, so both folds must be selected
    # with .iloc; .loc only gives the same rows when X has a default
    # RangeIndex.
    Xtrain, ytrain = X.iloc[train_idx], y[train_idx]
    Xtest, ytest = X.iloc[test_idx], y[test_idx]

    # NOTE(review): the outer test fold (Xtest, ytest) is built but never
    # scored in this cell — confirm whether evaluation happens elsewhere.
    opt_search.fit(Xtrain, ytrain)
    

  opt_search = OptunaSearchCV(estimator=est,
[32m[I 2022-04-28 15:47:52,020][0m A new study created in memory with name: no-name-70ae7ca1-2771-4b4a-adaa-8b515154bc7f[0m
[32m[I 2022-04-28 15:47:52,021][0m Searching the best hyperparameters using 711 samples...[0m
[32m[I 2022-04-28 15:47:52,101][0m Trial 0 finished with value: -0.222063434745692 and parameters: {'reg_alpha': 6.204519696112449, 'reg_lambda': 6.339273611318688, 'num_leaves': 15, 'colsample_bytree': 0.8022694283947998, 'max_depth': 9, 'n_estimators': 176}. Best is trial 0 with value: -0.222063434745692.[0m
[32m[I 2022-04-28 15:47:52,184][0m Trial 1 finished with value: -0.22439022312527226 and parameters: {'reg_alpha': 9.64479834174737, 'reg_lambda': 6.000256887693302, 'num_leaves': 13, 'colsample_bytree': 0.7775431936398253, 'max_depth': 10, 'n_estimators': 182}. Best is trial 0 with value: -0.222063434745692.[0m
[32m[I 2022-04-28 15:47:52,276][0m Trial 2 finished with value: -0.22212094216132333 and paramete

[32m[I 2022-04-28 15:47:54,081][0m Trial 2 finished with value: -0.21992497742685796 and parameters: {'reg_alpha': 6.196777448245388, 'reg_lambda': 6.015666646740372, 'num_leaves': 13, 'colsample_bytree': 0.7635738718795501, 'max_depth': 12, 'n_estimators': 156}. Best is trial 2 with value: -0.21992497742685796.[0m
[32m[I 2022-04-28 15:47:54,182][0m Trial 3 finished with value: -0.2203720304314191 and parameters: {'reg_alpha': 7.542502066840683, 'reg_lambda': 9.079589631471556, 'num_leaves': 14, 'colsample_bytree': 0.6349198416214671, 'max_depth': 8, 'n_estimators': 181}. Best is trial 2 with value: -0.21992497742685796.[0m
[32m[I 2022-04-28 15:47:54,269][0m Trial 4 finished with value: -0.2212206681041534 and parameters: {'reg_alpha': 8.906839216973445, 'reg_lambda': 9.364284033741777, 'num_leaves': 11, 'colsample_bytree': 0.6384791654135209, 'max_depth': 12, 'n_estimators': 191}. Best is trial 2 with value: -0.21992497742685796.[0m
[32m[I 2022-04-28 15:47:54,358][0m Trial 

[32m[I 2022-04-28 15:47:56,068][0m Trial 5 finished with value: -0.2221723065021842 and parameters: {'reg_alpha': 9.330043611674574, 'reg_lambda': 7.104954501816131, 'num_leaves': 8, 'colsample_bytree': 0.6656062229796421, 'max_depth': 12, 'n_estimators': 160}. Best is trial 2 with value: -0.2213783140492495.[0m
[32m[I 2022-04-28 15:47:56,136][0m Trial 6 finished with value: -0.22152122691597048 and parameters: {'reg_alpha': 9.75228077143339, 'reg_lambda': 9.556125404787046, 'num_leaves': 15, 'colsample_bytree': 0.884276401679743, 'max_depth': 9, 'n_estimators': 169}. Best is trial 2 with value: -0.2213783140492495.[0m
[32m[I 2022-04-28 15:47:56,198][0m Trial 7 finished with value: -0.22142142635994566 and parameters: {'reg_alpha': 8.543387876659587, 'reg_lambda': 9.512968184235525, 'num_leaves': 15, 'colsample_bytree': 0.7952041326293483, 'max_depth': 8, 'n_estimators': 152}. Best is trial 2 with value: -0.2213783140492495.[0m
[32m[I 2022-04-28 15:47:56,294][0m Trial 8 fini

[32m[I 2022-04-28 15:47:58,128][0m Trial 8 finished with value: -0.21991676185691253 and parameters: {'reg_alpha': 6.494397747535711, 'reg_lambda': 7.574208180774116, 'num_leaves': 13, 'colsample_bytree': 0.7136495859952233, 'max_depth': 8, 'n_estimators': 129}. Best is trial 8 with value: -0.21991676185691253.[0m
[32m[I 2022-04-28 15:47:58,230][0m Trial 9 finished with value: -0.22259259917812613 and parameters: {'reg_alpha': 9.236622832922126, 'reg_lambda': 7.629127560849012, 'num_leaves': 13, 'colsample_bytree': 0.8012442039932435, 'max_depth': 12, 'n_estimators': 176}. Best is trial 8 with value: -0.21991676185691253.[0m
[32m[I 2022-04-28 15:47:58,232][0m Finished hyperparemeter search![0m
[32m[I 2022-04-28 15:47:58,235][0m Refitting the estimator using 711 samples...[0m
[32m[I 2022-04-28 15:47:58,262][0m Finished refitting! (elapsed time: 0.013 sec.)[0m
[32m[I 2022-04-28 15:47:58,268][0m A new study created in memory with name: no-name-2c01199b-20a9-4ba7-b313-3506

[32m[I 2022-04-28 15:48:00,020][0m Searching the best hyperparameters using 711 samples...[0m
[32m[I 2022-04-28 15:48:00,123][0m Trial 0 finished with value: -0.2145530091092267 and parameters: {'reg_alpha': 6.204519696112449, 'reg_lambda': 6.339273611318688, 'num_leaves': 15, 'colsample_bytree': 0.8022694283947998, 'max_depth': 9, 'n_estimators': 176}. Best is trial 0 with value: -0.2145530091092267.[0m
[32m[I 2022-04-28 15:48:00,194][0m Trial 1 finished with value: -0.21802473546119158 and parameters: {'reg_alpha': 9.64479834174737, 'reg_lambda': 6.000256887693302, 'num_leaves': 13, 'colsample_bytree': 0.7775431936398253, 'max_depth': 10, 'n_estimators': 182}. Best is trial 0 with value: -0.2145530091092267.[0m
[32m[I 2022-04-28 15:48:00,282][0m Trial 2 finished with value: -0.2150368710183787 and parameters: {'reg_alpha': 6.196777448245388, 'reg_lambda': 6.015666646740372, 'num_leaves': 13, 'colsample_bytree': 0.7635738718795501, 'max_depth': 12, 'n_estimators': 156}. Bes

In [33]:
# NOTE(review): looks like leftover scratch arithmetic — confirm it can be
# removed. It rebinds the global `i` and displays i / 40.
i = 1
i / 40

0.025

In [43]:
import pickle

In [44]:
# Load the accumulated outer-CV predictions. The following cell iterates
# `obj` as a mapping from algorithm name to a dict with 'gt' and 'probas'.
# NOTE: pickle.load can execute arbitrary code while deserialising; only
# load files produced by a trusted run of this project.
with open("../output/predictions/outer_cv_accumulated_preds_all_algorithms.pkl", "rb") as f_r:
    obj = pickle.load(f_r)

In [64]:
# For every algorithm, score each outer-CV split separately and report the
# median with the 2.5th-97.5th percentile range of the per-split metrics.
for algo_name, outer_cv_data in obj.items():
    print("Algorithm: ", algo_name)

    gts = outer_cv_data['gt']
    probas = outer_cv_data['probas']

    roc_aucs, pr_aucs, briers = [], [], []
    for split_gts, split_probas in zip(gts, probas):
        # Column 1 is the score passed to all three metrics.
        pos_scores = split_probas[:, 1]
        roc_aucs.append(roc_auc_score(split_gts, pos_scores))
        pr_aucs.append(average_precision_score(split_gts, pos_scores))
        briers.append(brier_score_loss(split_gts, pos_scores))

    # Compute every summary first, then print in the fixed order
    # ROCAUC, PRAUC, Brier.
    summaries = [
        ("ROCAUC: {:.4f} [{:.2f} - {:.2f}]", np.percentile(roc_aucs, [2.5, 50, 97.5])),
        ("PRAUC: {:.4f} [{:.2f} - {:.2f}]", np.percentile(pr_aucs, [2.5, 50, 97.5])),
        ("Brier: {:.4f} [{:.2f} - {:.2f}]", np.percentile(briers, [2.5, 50, 97.5])),
    ]
    for template, (low, med, up) in summaries:
        print(template.format(med, low, up))

Algorithm:  LGBM
ROCAUC: 0.7139 [0.57 - 0.81]
PRAUC: 0.8197 [0.72 - 0.90]
Brier: 0.2165 [0.19 - 0.23]
Algorithm:  MLP
ROCAUC: 0.6551 [0.51 - 0.76]
PRAUC: 0.7823 [0.68 - 0.86]
Brier: 0.2188 [0.19 - 0.25]
Algorithm:  SVC
ROCAUC: 0.5060 [0.39 - 0.62]
PRAUC: 0.6684 [0.59 - 0.77]
Brier: 0.2289 [0.23 - 0.24]
Algorithm:  LR
ROCAUC: 0.6569 [0.55 - 0.77]
PRAUC: 0.7872 [0.71 - 0.86]
Brier: 0.2311 [0.20 - 0.26]
Algorithm:  KNN
ROCAUC: 0.5042 [0.39 - 0.65]
PRAUC: 0.6636 [0.59 - 0.74]
Brier: 0.2382 [0.22 - 0.26]
