# Imports

First we have to import some packages to use down the line

In [1]:
import sys
import xgboost as xgb
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
from lifelines.plotting import add_at_risk_counts
import sys
from tableone import TableOne
import sklearn
import math
import statsmodels.api as sm
import warnings
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, LassoLarsIC, LassoCV, LassoLarsCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.calibration import calibration_curve
import statsmodels.api as sm
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import log_loss
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
import imblearn
import scipy
from imblearn.over_sampling import SMOTE
import time
import datetime
import openpyxl
import numpy.random as rng
import seaborn as sns
from scipy import stats
import pickle

from ncdb_tools import *

  from pandas import MultiIndex, Int64Index


In [2]:
def warn(*args, **kwargs):
    pass

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
warnings.warn = warn

In [3]:
output_dir = "/home/asim/NCDB_Projects/test/gridsearch/"

# Loading the NCDB File

First step is to properly load the NCDB file.

Since loading the full NCDB csv takes quite some time, you can utilize the savefile and loadfile specifications to save a subset of the database once you've narrowed down to the patients / variables you are interested in.

This next cell defines a 'function', we can then call the function later on in this notebook to get the actual results, running the code below just defines (as per def) the function. The function's name is load_data_her2. After the function, we specify parameters (filename, savefile, loadfile, and lower). You can provide default values for these parameters if you want using the equals sign - this means that the user doesn't need to provide these parameters every time the function is called.

Detailed descriptions of all NCDB elements are found here:
<a href="https://www.facs.org/-/media/files/quality-programs/cancer/ncdb/puf_data_dictionary_2017.ashx">https://www.facs.org/-/media/files/quality-programs/cancer/ncdb/puf_data_dictionary_2017.ashx</a>

'Site Specific Factors' are found here:
<a href="https://web2.facs.org/cstage0205/breast/Breastschema.html">https://web2.facs.org/cstage0205/breast/Breastschema.html</a>

Let's demonstrate the use of our above NCDB accessing function. Replace the filename with your copy of NCDB.

This code will take some time to execute! Try not to rerun it

In [4]:
df = load_data_her2(
    filename = r"/mnt/data/NCDB/NCDBPUF_Breast.0.2020.csv",
    lower = True
)

Total Patients in NCDB: 3690015
Excluding diagnoses before 2010: 2594500
Excluding non-invasive cases: 2100249
Excluding cases where HER2+ or triple negative: 1583059
Excluding cases where HER2 IHC isn't 0-2: 1583059
Excluding Stage 0 cases: 1461118


In [5]:
#Adds the relevant important columns to the dataframe
df_new = getNCDBClassifications(df, use_imputation = False)
df_new = addTennNomogram(df_new)

In [11]:
#Minimal Feature Set:
mfs = ['age','sex', 'grade_med', 'grade_high', 'tumor_size', 'er', 'pr', 'regional_nodes_positive', 'skin_changes', 'lvi', 'chest_wall',
                     'ductal', 'lobular', 'ductlob', 'mucinous', 'tubular', 'medullary', 'metaplastic', 'paget', 'sarcoma', 'papillary', 'inflammatory',
                     'hispanic', 'native', 'asian', 'black', 'her2', 'er_num', 'pr_num', 'ki67_num']

In [12]:
print("Excluding stage 0: " + str(len(df_new.index)))

df_train = df_new[(df_new.regional_nodes_positive < 4)]
print("Excluding 4 or more nodes positive: " + str(len(df_train.index)))

df_train = df_train.dropna(subset = ['odx', 'high_odx'])
print("Excluding patients missing ODX: " + str(len(df_train.index)))

df_train = df_train.dropna(subset = mfs)
print("Excluding patients missing minimal feature set: " + str(len(df_train.index)))

# # use a smaller sample to speed up code initially
# frac = 0.1
# df_train = df_train.sample(frac = frac)
# print("Using %.1f%% of the data for initial code testing: %d" % (frac*100, df_train.shape[0]))

Excluding stage 0: 1461118
Excluding 4 or more nodes positive: 1212847
Excluding patients missing ODX: 399284
Excluding patients missing minimal feature set: 53346


# Explore Models

In [13]:
# could check to optimize hyperparameters for logistic, but maybe not necessary
# penalty: {‘l1’, ‘l2’, ‘elasticnet’, None}
# C: [0.01, 0.1, 1, 10]
# fit_intercept: False, True
# solver: 'saga'

In [102]:
%run ncdb_tools.py

In [86]:
model = LogisticRegression(max_iter = 1000, solver ='saga', n_jobs = -1)

params = {
    'model__penalty': {'l1', 'l2', None},
    'model__C': [0.01, 0.1, 1, 10],
    'model__fit_intercept': [False, True]
}

logistic_features = compareClassifiersNCDB(
    df_train[mfs + ['odx', 'high_odx']],
    model = model,
    params = params,
    maximizeAIC = True,
    her2_fish = False,
    ki67 = True,
    numerical = True,
    file_name = output_dir + "logistic_regression.txt"
)

Version Feature


TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'

In [87]:
model = sklearn.ensemble.RandomForestClassifier(
    n_estimators = 1, 
    max_features = None,
    n_jobs = -1
)

params = {
    'model__n_estimators': [100],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [None, 10, 50],
    'model__max_features': ['sqrt', 'log2', None],
    'poly__degree': [1,2]
}

forest_features = compareClassifiersNCDB(
    df_train[mfs + ['odx', 'high_odx']],
    model = model,
    params = params,
    maximizeAIC = True,
    her2_fish = False,
    ki67 = True,
    numerical = True,
    file_name = output_dir + "random_forest.txt"
)

Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24709502 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'gini', 'model__max_depth': None, 'model__max_features': 'sqrt', 'poly__degree': 1} AUC: 0.7210502638111956 AIC: 0.7529049813747406 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24679513 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'gini', 'model__max_depth': None, 'model__max_features': 'sqrt', 'poly__degree': 2} AUC: 0.7209744708291554 AIC: 0.7532048672437668 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24717 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'gini', 'model__max_depth': None, 'model__max_features': 'log2', 'poly__degree': 1} AUC: 0.720944304191509 AIC: 0.7528299987316132 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24687009 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'gini', 'model__max_depth': None, 'm

['pr_num', 'sarcoma'] auc: 99999999.24731997 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'entropy', 'model__max_depth': None, 'model__max_features': 'log2', 'poly__degree': 2} AUC: 0.7216156985388555 AIC: 0.7526800334453583 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24702005 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'entropy', 'model__max_depth': None, 'model__max_features': None, 'poly__degree': 1} AUC: 0.7210469481778397 AIC: 0.7529799491167068 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'sarcoma'] auc: 99999999.24679513 n: 53346
{'model__n_estimators': 100, 'model__criterion': 'entropy', 'model__max_depth': None, 'model__max_features': None, 'poly__degree': 2} AUC: 0.7206665993420807 AIC: 0.7532048672437668 Features: ['pr_num', 'sarcoma']
Version Feature
['pr_num', 'chest_wall'] auc: 99999999.37021579 n: 53346
['pr_num', 'chest_wall', 'paget'] auc: 99999999.4055324 n: 53346
['pr_num', 'chest_wall',

In [None]:
# model = sklearn.neighbors.KNeighborsClassifier(
#     n_neighbors = 1, 
#     weights = 'uniform',
#     n_jobs = -1
# )

# params = {
#     'model__n_neighbors': [1, 5, 10, 25, 50, 100],
#     'model__weights': ['uniform', 'distance'],
#     'poly__degree': [1,2]
# }

# kneighbors_features1 = compareClassifiersNCDB(
#     df_train,
#     model = model,
#     params = params,
#     her2_fish = False,
#     ki67 = True,
#     numerical = True,
#     file_name = output_dir + "kneighbors1.txt"
# )

In [None]:
# model = sklearn.neighbors.KNeighborsClassifier(
#     n_neighbors = 1, 
#     weights = 'uniform',
#     n_jobs = -1
# )

# params = {
#     'model__n_neighbors': [100, 250, 500, 750, 1000],
#     'model__weights': ['uniform', 'distance'],
#     'poly__degree': [1,2]
# }

# kneighbors_features2 = compareClassifiersNCDB(
#     df_train,
#     model = model,
#     params = params,
#     her2_fish = False,
#     ki67 = True,
#     numerical = True,
#     file_name = output_dir + "kneighbors2.txt"
# )

In [97]:
model = sklearn.ensemble.AdaBoostClassifier(
    n_estimators = 1
)

params = {
    'model__n_estimators': [1, 5, 10, 50, 100, 500, 1000],
#     'model__loss': ['linear', 'square', 'exponential'],
    'poly__degree': [1,2]
}

adaboost_features = compareClassifiersNCDB(
    df_train[mfs + ['odx', 'high_odx']],
    model = model,
    params = params,
    maximizeAIC = True,
    her2_fish = False,
    ki67 = True,
    numerical = True,
    file_name = output_dir + "adaboost.txt"
)

Version Feature
['pr_num', 'age'] auc: 99999999.99987854 n: 53346
{'model__n_estimators': 1, 'poly__degree': 1} AUC: 0.6698392332619183 AIC: 0.00012145936489105225 Features: ['pr_num', 'age']
Version Feature
['pr_num', 'age'] auc: 99999999.99987856 n: 53346
{'model__n_estimators': 1, 'poly__degree': 2} AUC: 0.6711466068522338 AIC: 0.0001214444637298584 Features: ['pr_num', 'age']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99942027 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99942052 n: 53346
{'model__n_estimators': 5, 'poly__degree': 1} AUC: 0.7954599739115024 AIC: 0.0005794763565063477 Features: ['pr_num', 'ki67_num', 'grade_high']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99942027 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99942039 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'age'] auc: 99999999.99942051 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'age', 'ductal'] auc: 99999999.99942054 n: 53346
{'model__n_estimators': 5, 'pol

In [103]:
model = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes = (10, 50, 100),
    activation = 'logistic',
    learning_rate_init = 0.01,
    alpha = 0.0001
)

params = {
    'model__hidden_layer_sizes': [(10,), (50,), (100,)],
    'model__activation': ['logistic', 'tanh', 'relu'],
    'model__learning_rate_init': [0.01, 0.001, 0.0001],
    'model__alpha': [0.0001, 0.00001],
    'poly__degree': [1,2]
}

nn_features1 = compareClassifiersNCDB(
    df_train[mfs + ['odx', 'high_odx']],
    model = model,
    params = params,
    maximizeAIC = True,
    her2_fish = False,
    ki67 = True,
    numerical = True,
    file_name = output_dir + "neuralnetwork1.txt"
)

Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924202 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924226 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'pr'] auc: 99999999.99924228 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'pr', 'sex'] auc: 99999999.99924229 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8094877076417358 AIC: 0.0007577091455459595 Features: ['pr_num', 'ki67_num', 'grade_high', 'pr', 'sex']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924126 n: 53346
['pr_num', 'ki67_num', 'tubular'] auc: 99999999.99924183 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 2} AUC: 0.7079017970468113 AIC: 0.0007581710815429688 Features: ['pr_num', 'ki67_num', 'tubular']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924204 n:

['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'ductal', 'ductlob'] auc: 99999999.99924228 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'ductal', 'ductlob', 'pr'] auc: 99999999.99924229 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'tanh', 'model__learning_rate_init': 0.01, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8111459151306827 AIC: 0.0007577091455459595 Features: ['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'ductal', 'ductlob', 'pr']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924126 n: 53346
['pr_num', 'ki67_num', 'lobular'] auc: 99999999.99924181 n: 53346
['pr_num', 'ki67_num', 'lobular', 'pr'] auc: 99999999.99924184 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'tanh', 'model__learning_rate_init': 0.01, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.6580863440213705 AIC: 0.0007581561803817749 Features: ['pr_num', 'ki67_num', 'lobular', 'pr']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.9

{'model__hidden_layer_sizes': (10,), 'model__activation': 'relu', 'model__learning_rate_init': 0.001, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8104116842047306 AIC: 0.0007577091455459595 Features: ['pr_num', 'ki67_num', 'grade_high', 'pr']
Version Feature
['pr_num', 'grade_high'] auc: 99999999.9992418 n: 53346
['pr_num', 'grade_high', 'lobular'] auc: 99999999.99924193 n: 53346
['pr_num', 'grade_high', 'lobular', 'black'] auc: 99999999.99924196 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'relu', 'model__learning_rate_init': 0.001, 'model__alpha': 0.0001, 'poly__degree': 2} AUC: 0.7170491029099855 AIC: 0.0007580369710922241 Features: ['pr_num', 'grade_high', 'lobular', 'black']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924204 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924219 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num'] auc: 99999999.9992423 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'lobular'] auc: 99



['pr_num', 'grade_high', 'pr', 'ki67_num', 'lobular'] auc: 99999999.99924217 n: 53346




{'model__hidden_layer_sizes': (10,), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 0.0001, 'poly__degree': 2} AUC: 0.7750597638126082 AIC: 0.0007578283548355103 Features: ['pr_num', 'grade_high', 'pr', 'ki67_num', 'lobular']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924196 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924219 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med'] auc: 99999999.99924229 n: 53346
{'model__hidden_layer_sizes': (10,), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8096644724517279 AIC: 0.0007577091455459595 Features: ['pr_num', 'ki67_num', 'grade_high', 'grade_med']
Version Feature
['pr_num', 'grade_high'] auc: 99999999.99924187 n: 53346
['pr_num', 'grade_high', 'ki67_num'] auc: 99999999.99924205 n: 53346




['pr_num', 'grade_high', 'ki67_num', 'sarcoma'] auc: 99999999.99924207 n: 53346




['pr_num', 'grade_high', 'ki67_num', 'sarcoma', 'sex'] auc: 99999999.99924208 n: 53346




['pr_num', 'grade_high', 'ki67_num', 'sarcoma', 'sex', 'tubular'] auc: 99999999.99924217 n: 53346




['pr_num', 'grade_high', 'ki67_num', 'sarcoma', 'sex', 'tubular', 'ductlob'] auc: 99999999.9992422 n: 53346




['pr_num', 'grade_high', 'ki67_num', 'sarcoma', 'sex', 'tubular', 'ductlob', 'hispanic'] auc: 99999999.99924226 n: 53346




{'model__hidden_layer_sizes': (10,), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.7632242284573278 AIC: 0.0007577389478683472 Features: ['pr_num', 'grade_high', 'ki67_num', 'sarcoma', 'sex', 'tubular', 'ductlob', 'hispanic']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624272 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99624297 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'lobular'] auc: 99999999.996243 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'lobular', 'grade_med'] auc: 99999999.99624301 n: 53346
{'model__hidden_layer_sizes': (50,), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.813552047497603 AIC: 0.003756985068321228 Features: ['pr_num', 'ki67_num', 'grade_high', 'lobular', 'grade_med']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.9962425 n: 53346
['pr_num', 'ki67_num', 'medullary'] auc: 99999999.996242

{'model__hidden_layer_sizes': (50,), 'model__activation': 'logistic', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.8119772790278945 AIC: 0.003756910562515259 Features: ['pr_num', 'ki67_num', 'grade_high', 'ductal', 'grade_med', 'regional_nodes_positive', 'er']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624257 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99624288 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'mucinous'] auc: 99999999.99624297 n: 53346
{'model__hidden_layer_sizes': (50,), 'model__activation': 'tanh', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8068717830547991 AIC: 0.0037570297718048096 Features: ['pr_num', 'ki67_num', 'grade_high', 'mucinous']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624209 n: 53346
['pr_num', 'ki67_num', 'papillary'] auc: 99999999.99624251 n: 53346
{'model__hidden_layer_sizes': (50,), 'model__activation': 'tanh', 'model__learning_r

['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99624297 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num'] auc: 99999999.99624301 n: 53346
{'model__hidden_layer_sizes': (50,), 'model__activation': 'relu', 'model__learning_rate_init': 0.01, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8133801472862358 AIC: 0.003756985068321228 Features: ['pr_num', 'ki67_num', 'grade_high', 'er_num']
Version Feature
['pr_num', 'grade_high'] auc: 99999999.99624261 n: 53346
['pr_num', 'grade_high', 'ki67_num'] auc: 99999999.99624278 n: 53346
['pr_num', 'grade_high', 'ki67_num', 'er_num'] auc: 99999999.99624296 n: 53346
{'model__hidden_layer_sizes': (50,), 'model__activation': 'relu', 'model__learning_rate_init': 0.01, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.7754947646797474 AIC: 0.0037570446729660034 Features: ['pr_num', 'grade_high', 'ki67_num', 'er_num']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624275 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99624297

['pr_num', 'ki67_num'] auc: 99999999.99249364 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99249388 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med'] auc: 99999999.9924939 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'er_num'] auc: 99999999.99249397 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'er_num', 'age'] auc: 99999999.992494 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'er_num', 'age', 'tumor_size'] auc: 99999999.99249402 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'er_num', 'age', 'tumor_size', 'regional_nodes_positive'] auc: 99999999.99249409 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'er_num', 'age', 'tumor_size', 'regional_nodes_positive', 'papillary'] auc: 99999999.9924941 n: 53346
{'model__hidden_layer_sizes': (100,), 'model__activation': 'logistic', 'model__learning_rate_init': 0.001, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8167202759099853 AIC: 0.007505893707275391 Fe

['pr_num', 'ki67_num', 'grade_high', 'er_num', 'grade_med'] auc: 99999999.99249397 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'grade_med', 'tumor_size'] auc: 99999999.99249402 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'grade_med', 'tumor_size', 'lobular'] auc: 99999999.99249409 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'grade_med', 'tumor_size', 'lobular', 'lvi'] auc: 99999999.9924941 n: 53346
{'model__hidden_layer_sizes': (100,), 'model__activation': 'tanh', 'model__learning_rate_init': 0.001, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8162911586530643 AIC: 0.007505893707275391 Features: ['pr_num', 'ki67_num', 'grade_high', 'er_num', 'grade_med', 'tumor_size', 'lobular', 'lvi']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99249354 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99249361 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'sex'] auc: 99999999.99249364 n: 53346
{'model__hidden_layer_sizes': (100,), 'model__act

['pr_num', 'metaplastic'] auc: 99999999.99249078 n: 53346
{'model__hidden_layer_sizes': (100,), 'model__activation': 'relu', 'model__learning_rate_init': 0.001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.692450315403045 AIC: 0.007509216666221619 Features: ['pr_num', 'metaplastic']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99249364 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99249385 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num'] auc: 99999999.99249391 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'lobular'] auc: 99999999.99249397 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'lobular', 'grade_med'] auc: 99999999.99249399 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num', 'lobular', 'grade_med', 'age'] auc: 99999999.992494 n: 53346
{'model__hidden_layer_sizes': (100,), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8191226842963741 AIC: 0.00750599801540

In [None]:
model = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes = (10, 50, 100),
    activation = 'logistic',
    learning_rate_init = 0.01,
    alpha = 0.0001
)

params = {
    'model__hidden_layer_sizes': [(10,10), (50,50), (100,100)],
    'model__activation': ['logistic', 'tanh', 'relu'],
    'model__learning_rate_init': [0.01, 0.001, 0.0001],
    'model__alpha': [0.0001, 0.00001],
    'poly__degree': [1,2]
}

nn_features2 = compareClassifiersNCDB(
    df_train[mfs + ['odx', 'high_odx']],
    model = model,
    params = params,
    maximizeAIC = True,
    her2_fish = False,
    ki67 = True,
    numerical = True,
    file_name = output_dir + "neuralnetwork2.txt"
)

Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924196 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924222 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'tumor_size'] auc: 99999999.99924228 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8086086654539677 AIC: 0.0007577240467071533 Features: ['pr_num', 'ki67_num', 'grade_high', 'tumor_size']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.9992417 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924181 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'regional_nodes_positive'] auc: 99999999.99924184 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'regional_nodes_positive', 'lobular'] auc: 99999999.99924186 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 2} AUC: 0.69788606222105

['pr_num', 'grade_high'] auc: 99999999.99924125 n: 53346
['pr_num', 'grade_high', 'ki67_num'] auc: 99999999.9992413 n: 53346
['pr_num', 'grade_high', 'ki67_num', 'skin_changes'] auc: 99999999.99924178 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'tanh', 'model__learning_rate_init': 0.01, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.6877495448182944 AIC: 0.0007582157850265503 Features: ['pr_num', 'grade_high', 'ki67_num', 'skin_changes']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924204 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924228 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med'] auc: 99999999.9992423 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'sex'] auc: 99999999.99924232 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'tanh', 'model__learning_rate_init': 0.001, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8150914307584586 AIC: 0.0007576793432235718 Features: ['pr_num', 

['pr_num', 'ki67_num'] auc: 99999999.99924205 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924229 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num'] auc: 99999999.99924232 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'relu', 'model__learning_rate_init': 0.001, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8127369488964595 AIC: 0.0007576793432235718 Features: ['pr_num', 'ki67_num', 'grade_high', 'er_num']
Version Feature
['pr_num', 'lvi'] auc: 99999999.9992414 n: 53346
['pr_num', 'lvi', 'grade_high'] auc: 99999999.99924178 n: 53346
['pr_num', 'lvi', 'grade_high', 'lobular'] auc: 99999999.99924196 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'relu', 'model__learning_rate_init': 0.001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.7329230890964756 AIC: 0.0007580369710922241 Features: ['pr_num', 'lvi', 'grade_high', 'lobular']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924205 n: 53346
['pr_num', 'ki67



['pr_num', 'grade_high', 'lobular', 'grade_med', 'black', 'ki67_num', 'ductlob', 'ductal'] auc: 99999999.99924226 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 0.0001, 'poly__degree': 2} AUC: 0.7983018389336494 AIC: 0.0007577389478683472 Features: ['pr_num', 'grade_high', 'lobular', 'grade_med', 'black', 'ki67_num', 'ductlob', 'ductal']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99924205 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99924219 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'er_num'] auc: 99999999.99924229 n: 53346
{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 1} AUC: 0.8112004574596083 AIC: 0.0007577091455459595 Features: ['pr_num', 'ki67_num', 'grade_high', 'er_num']
Version Feature
['pr_num', 'mucinous'] auc: 99999999.99924135 n: 53346
['pr_num', 'mucinous', 'grade_



['pr_num', 'mucinous', 'grade_high', 'ki67_num', 'lobular'] auc: 99999999.99924208 n: 53346
['pr_num', 'mucinous', 'grade_high', 'ki67_num', 'lobular', 'regional_nodes_positive'] auc: 99999999.9992422 n: 53346




['pr_num', 'mucinous', 'grade_high', 'ki67_num', 'lobular', 'regional_nodes_positive', 'black'] auc: 99999999.99924222 n: 53346




{'model__hidden_layer_sizes': (10, 10), 'model__activation': 'relu', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.7840750800281432 AIC: 0.0007577836513519287 Features: ['pr_num', 'mucinous', 'grade_high', 'ki67_num', 'lobular', 'regional_nodes_positive', 'black']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624275 n: 53346
['pr_num', 'ki67_num', 'grade_high'] auc: 99999999.99624296 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med'] auc: 99999999.996243 n: 53346
['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'tumor_size'] auc: 99999999.99624301 n: 53346
{'model__hidden_layer_sizes': (50, 50), 'model__activation': 'logistic', 'model__learning_rate_init': 0.01, 'model__alpha': 0.0001, 'poly__degree': 1} AUC: 0.8118405071716304 AIC: 0.003756985068321228 Features: ['pr_num', 'ki67_num', 'grade_high', 'grade_med', 'tumor_size']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624237 n: 53346
['pr_num', 'ki67_num', 'lobular']

{'model__hidden_layer_sizes': (50, 50), 'model__activation': 'logistic', 'model__learning_rate_init': 0.0001, 'model__alpha': 1e-05, 'poly__degree': 2} AUC: 0.811315216570771 AIC: 0.003756910562515259 Features: ['pr_num', 'ki67_num', 'grade_high', 'ductal', 'grade_med', 'er']
Version Feature
['pr_num', 'ki67_num'] auc: 99999999.99624266 n: 53346
['pr_num', 'ki67_num', 'lobular'] auc: 99999999.99624267 n: 53346
