In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from model_metrics import *
from fancyimpute import *
import pickle
from sklearn.metrics import roc_curve, auc

Using TensorFlow backend.


In [2]:
full_data = pd.read_csv('data/Christie_diagnosis_20180118.csv')

# Multiple Imputation Testing

For information on algorithms, see [fancyimpute](https://pypi.python.org/pypi/fancyimpute)

Make all the solver objects

In [None]:
KNN_solver = KNN(k=5)
softimpute_solver = SoftImpute()
MICE_solver = MICE()
simple_solver = SimpleFill()
iterativeSVD_solver = IterativeSVD()
matrixfactorization_solver = MatrixFactorization()

Create a dataframe from the complete data in `train_data`. Then, randomly insert some NaNs for MSE testing.

In [None]:
# Drop NaNs
complete_data = train_data.dropna()

In [None]:
# Drop the diagnosis columns (DX, DXSUB) so that only the remaining measures are
# imputed and scored. The result is reassigned rather than dropped in place, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# instead of failing with a KeyError.
complete_data = complete_data.drop(columns=['DX', 'DXSUB'], errors='ignore')

In [None]:
# Blank out 10% of all cells of the complete data so that imputed values can be
# compared against the known originals.
import random

nan_inserted_data = complete_data.copy()
# Every (row, column) position of the frame, as integer offsets for .iat
ix = [(row, col) for row in range(complete_data.shape[0]) for col in range(complete_data.shape[1])]
# A dedicated, seeded generator keeps the masked cells identical on every run,
# so the MSE comparison below is reproducible (56 matches the random_state used
# elsewhere in this notebook).
nan_rng = random.Random(56)
for row, col in nan_rng.sample(ix, int(round(.1*len(ix)))):
    nan_inserted_data.iat[row, col] = np.nan

In [None]:
missing_mask = nan_inserted_data.isna().any(axis=1)

Complete those missing dataframes with the various solvers!

In [None]:
def test_imputation(solver, df):
    """Impute the data using imputation methods"""
    impute_data = df.values
    data_index = df.index
    data_cols = df.columns

    impute_data_filled = solver.complete(impute_data)
    impute_df = pd.DataFrame(impute_data_filled, index=data_index, columns=data_cols)
    return impute_df

In [None]:
KNN_df = test_imputation(KNN_solver, nan_inserted_data)

In [None]:
softimpute_df = test_imputation(softimpute_solver, nan_inserted_data)

In [None]:
MICE_df = test_imputation(MICE_solver, nan_inserted_data)

In [None]:
simple_df = test_imputation(simple_solver, nan_inserted_data)

In [None]:
iterative_df = test_imputation(iterativeSVD_solver, nan_inserted_data)

In [None]:
matrixfact_df = test_imputation(matrixfactorization_solver, nan_inserted_data)

### Now cast as ints

In [None]:
solver_list = [KNN_df, softimpute_df, MICE_df,
               simple_df, iterative_df, matrixfact_df]
solver_names = ['KNN', 'SoftImpute', 'MICE', 'SimpleFill',
                   'IterativeSVD', 'MatrixFactorization']

In [None]:
KNN_df_round = KNN_df.copy()
softimpute_df_round = softimpute_df.copy()
MICE_df_round = MICE_df.copy()
simple_df_round = simple_df.copy()
iterative_df_round = iterative_df.copy()
matrixfact_df_round = matrixfact_df.copy()

In [None]:
round_list = [KNN_df_round, softimpute_df_round, MICE_df_round,
              simple_df_round, iterative_df_round, matrixfact_df_round]
round_names = ['KNN_round', 'SoftImpute_round', 'MICE_round',
            'SimpleFill_round', 'IterativeSVD_round', 'MatrixFact_round']

In [None]:
# Measures that are integer-valued in the raw data; imputation returns floats
# for them, so the "_round" variants convert them back to whole numbers.
int_cols = ['SSBK_NUMCOMPLETE_Y1', 'SSFD_NUMCOMPLETE_Y1',
            'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2', 'Y1_DIGITS_BKWD_RS',
            'Y1_DIGITS_FRWD_RS', 'Y1_TRAILS_COND2', 'Y1_TRAILS_COND3']
for df in round_list:
    for col in int_cols:
        # Round to the nearest integer before casting: astype('int') on its own
        # truncates toward zero (2.9 -> 2), which is not rounding.
        df[col] = df[col].round().astype('int')

### Create MSEs for each

In [None]:
total_df_list = solver_list + round_list
total_df_names = solver_names + round_names

In [None]:
mse_df = pd.DataFrame(index=total_df_names, columns=complete_data.columns)

In [None]:
# Per-column MSE of every imputed frame against the known complete data.
# Only cells that were artificially blanked out are scored. Averaging over whole
# rows would mix in cells that were never missing (zero error) and shrink every
# MSE by roughly the missing fraction.
imputed_cells = nan_inserted_data.isna()
for df, name in zip(total_df_list, total_df_names):
    squared_error = (df - complete_data) ** 2
    # where() turns the non-imputed cells into NaN, which mean() then skips
    mse_df.loc[name] = squared_error.where(imputed_cells).mean()

In [None]:
mse_df

### Which method has the lowest MSEs?

Write `true` for minimums in each col

In [None]:
# For each measure, flag the method(s) whose MSE equals the column minimum
mse_df_bool = mse_df.copy()
for measure in mse_df.columns:
    column_mse = mse_df_bool[measure]
    mse_df_bool[measure] = column_mse.eq(np.min(column_mse))

In [None]:
mse_df_bool.sum(axis=1)

In [None]:
# Integer-valued measures, used to look at the MSE winners on those columns only
int_cols_nodx = [
    'SSBK_NUMCOMPLETE_Y1', 'SSFD_NUMCOMPLETE_Y1',
    'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2',
    'Y1_DIGITS_BKWD_RS', 'Y1_DIGITS_FRWD_RS',
    'Y1_TRAILS_COND2', 'Y1_TRAILS_COND3',
]

In [None]:
mse_df_bool[int_cols_nodx].sum(axis=1)

However, MICE seems to do the best for columns that are technically integers (3 out of 8)

### Conclusions

Looks like MatrixFactorization is the best option. (out of 38 cols, 24 went to MatrixFactorization for the lowest MSE)

Rounding does not improve the MSE. 

# Testing for leaky data!

I had really high accuracy on my first logistic regression model (91% test accuracy on smaller train/test split, and 92% accuracy on cross-validated log models).

So, I want to investigate if I have any leaky data columns.

In [None]:
X_train = train_data.drop(columns=['DX','DXSUB'])
y_train = train_data['DX']

In [None]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
                    X_train, y_train, test_size=0.2, random_state=56)

In [None]:
# Leave-one-column-out check for leaky features: refit a logistic regression
# with each column removed in turn and record the held-out accuracy. A column
# whose removal makes accuracy collapse would be a leakage suspect.
accuracy_list = []
for col in X_train_small.columns:
    # Drop the column under test from both splits
    X_train_dataset = X_train_small.drop(columns=col).values
    X_test_dataset = X_test_small.drop(columns=col).values

    # Impute only when the split actually contains missing values.
    # NOTE(review): train and test are imputed independently of each other.
    if np.isnan(X_train_dataset).any():
        X_train_final = impute_data(X_train_dataset)
    else:
        X_train_final = X_train_dataset.copy()

    if np.isnan(X_test_dataset).any():
        X_test_final = impute_data(X_test_dataset)
    else:
        X_test_final = X_test_dataset.copy()

    # Fit model
    model = LogisticRegression()
    model.fit(X_train_final, y_train_small)

    # Score model: score() takes the held-out feature matrix first, then the
    # true labels.
    accuracy = model.score(X_test_final, y_test_small)
    accuracy_list.append((col, accuracy))

In [None]:
accuracy_list
for col, acc in accuracy_list:
    print("Accuracy removing {}: \t \t {:2.2f}".format(col, acc).expandtabs(10))

In [None]:
X_train_small_impute = impute_data(X_train_small.values)

In [None]:
logreg.fit(X_train_small_impute, y_train_small.values)

In [None]:
logreg.score(impute_data(X_test_small.values), y_test_small.values)

Looks like there aren't any leaky variables. I just have high accuracy.

This makes sense - the lab wouldn't administer tests or behavioral questionnaires that don't have something to do with ADHD. So a plain logistic regression model is already pretty accurate.

Going forward, I want to do a few things:

- How high can I get the accuracy? Test out a few different models (RF, Gradient Boosting)
- What's the spread of the predicted probas like?
- Test out on DXSUB

# Logistic Model Metrics Visualization

In [None]:
X_train_DX = train_data.drop(columns=['DX','DXSUB'])
y_train_DX = train_data['DX']

In [None]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
                    X_train_DX, y_train_DX, test_size=0.2, random_state=56)

In [None]:
logmod = LogisticRegression()
logmod.fit(impute_data(X_train_small.values), y_train_small)

In [None]:
pred_prob_dx = logmod.predict_proba(impute_data(X_test_small.values))

In [None]:
prob_dx = logmod.predict(impute_data(X_test_small.values))

In [None]:
len(pred_prob_dx[:,0])

In [None]:
def make_jitter(data, jitter=0.1):
    """Draw uniform noise with one value per element of ``data``.

    Only the shape of ``data`` is used; its values are ignored. The noise is
    sampled from ``np.random.uniform`` between ``-jitter`` and ``jitter`` using
    NumPy's global random state.
    """
    half_width = jitter
    return np.random.uniform(low=-half_width, high=half_width, size=data.shape)

In [None]:
fig, ax = plt.subplots(figsize=(10,2))

_ = ax.scatter(pred_prob_dx[:,0], make_jitter(pred_prob_dx[:,0]), c=np.vectorize(dx_dict.get)(y_test_small),
           s=40, alpha=0.5)
_ = ax.set_xlim(0,1)
_ = ax.set_title('Predicted Probability of Positive vs Negative Classes')

In [None]:
# ROC needs a continuous score for the positive class, not the hard class
# predictions from predict(): with hard labels there is only one usable
# threshold, so the curve collapses to a single elbow.
roc_pos_label = 3
# predict_proba columns follow logmod.classes_, so look up the positive column
roc_pos_col = list(logmod.classes_).index(roc_pos_label)
fpr, tpr, thresholds = roc_curve(y_test_small, pred_prob_dx[:, roc_pos_col],
                                 pos_label=roc_pos_label, drop_intermediate=False)
roc_auc = auc(fpr, tpr)

In [None]:
plt.plot(fpr, tpr)

# Classification Metrics

I created logistic regression, random forest, and gradient boosting models. I want to see the MSE and accuracy on train/test, cross-validated (k-fold=10), when predicting DX, and when predicting DXSUB.

In [None]:
#train_data = pd.read_csv('data/train_data.csv')

In [None]:
#%%capture
#classifier_metrics = run_classifiers(train_data)

In [None]:
#with open('classifier_metrics.pkl', 'wb') as f:
    #pickle.dump(classifier_metrics, f)

In [None]:
# Load the metrics cached by the (commented-out) run_classifiers cell above.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this
# project.
with open("classifier_metrics.pkl", "rb") as f:
    classification_metrics = pickle.load(f)

In [None]:
# Layout of the loaded metrics: classification_metrics[clf][pred][metric][split]
#   clf:    0 = logistic regression, 1 = random forest, 2 = gradient boosting
#   pred:   0 = DX, 1 = DXSUB
#   metric: 0 = MSE, 1 = accuracy
#   split:  0 = train, 1 = test
def _mean_scores(clf, pred):
    """Return (mse_train, mse_test, acc_train, acc_test) means for one model/target."""
    per_target = classification_metrics[clf][pred]
    return tuple(np.mean(per_target[metric][split])
                 for metric in (0, 1) for split in (0, 1))

# Logistic Regression
(lr_dx_mse_train, lr_dx_mse_test,
 lr_dx_acc_train, lr_dx_acc_test) = _mean_scores(0, 0)
(lr_dxsub_mse_train, lr_dxsub_mse_test,
 lr_dxsub_acc_train, lr_dxsub_acc_test) = _mean_scores(0, 1)

# Random Forest
(rf_dx_mse_train, rf_dx_mse_test,
 rf_dx_acc_train, rf_dx_acc_test) = _mean_scores(1, 0)
(rf_dxsub_mse_train, rf_dxsub_mse_test,
 rf_dxsub_acc_train, rf_dxsub_acc_test) = _mean_scores(1, 1)

# Gradient Boosting
(gb_dx_mse_train, gb_dx_mse_test,
 gb_dx_acc_train, gb_dx_acc_test) = _mean_scores(2, 0)
(gb_dxsub_mse_train, gb_dxsub_mse_test,
 gb_dxsub_acc_train, gb_dxsub_acc_test) = _mean_scores(2, 1)

In [None]:
# Tabulate the DX metrics: one row per classifier, one column per metric/split
dx_metric_cols = ['DX_acc_train', 'DX_acc_test', 'DX_mse_train', 'DX_mse_test']
metrics_dx_dict = dict(zip(dx_metric_cols, [
    [lr_dx_acc_train, rf_dx_acc_train, gb_dx_acc_train],
    [lr_dx_acc_test, rf_dx_acc_test, gb_dx_acc_test],
    [lr_dx_mse_train, rf_dx_mse_train, gb_dx_mse_train],
    [lr_dx_mse_test, rf_dx_mse_test, gb_dx_mse_test],
]))

metrics_DX = pd.DataFrame(metrics_dx_dict,
                          index=['LogReg', 'RandomForest', 'GradBoost'],
                          columns=dx_metric_cols)

In [None]:
# Tabulate the DXSUB metrics: one row per classifier, one column per metric/split
dxsub_metric_cols = ['DXSUB_acc_train', 'DXSUB_acc_test', 'DXSUB_mse_train', 'DXSUB_mse_test']
metrics_dxsub_dict = dict(zip(dxsub_metric_cols, [
    [lr_dxsub_acc_train, rf_dxsub_acc_train, gb_dxsub_acc_train],
    [lr_dxsub_acc_test, rf_dxsub_acc_test, gb_dxsub_acc_test],
    [lr_dxsub_mse_train, rf_dxsub_mse_train, gb_dxsub_mse_train],
    [lr_dxsub_mse_test, rf_dxsub_mse_test, gb_dxsub_mse_test],
]))

metrics_DXSUB = pd.DataFrame(metrics_dxsub_dict,
                             index=['LogReg', 'RandomForest', 'GradBoost'],
                             columns=dxsub_metric_cols)

In [None]:
metrics_DX.round(3)

In [None]:
metrics_DXSUB.round(3)

# Neuropsych vs TMCQ

Now that I've evaluated models on all the data, I want to check out what accuracy and MSE look like for models run JUST on neuropsych, and JUST on TMCQ.

I'll use the same exact procedure as above, just with different X matrices.

In [None]:
train_data = pd.read_csv('data/train_data.csv')

In [None]:
X_TMCQ = train_data[['Y1_P_TMCQ_ACTIVCONT', 'Y1_P_TMCQ_ACTIVITY', 'Y1_P_TMCQ_AFFIL',
       'Y1_P_TMCQ_ANGER', 'Y1_P_TMCQ_FEAR', 'Y1_P_TMCQ_HIP',
       'Y1_P_TMCQ_IMPULS', 'Y1_P_TMCQ_INHIBIT', 'Y1_P_TMCQ_SAD',
       'Y1_P_TMCQ_SHY', 'Y1_P_TMCQ_SOOTHE', 'Y1_P_TMCQ_ASSERT',
       'Y1_P_TMCQ_ATTFOCUS', 'Y1_P_TMCQ_LIP', 'Y1_P_TMCQ_PERCEPT',
       'Y1_P_TMCQ_DISCOMF', 'Y1_P_TMCQ_OPENNESS', 'Y1_P_TMCQ_SURGENCY',
       'Y1_P_TMCQ_EFFCONT', 'Y1_P_TMCQ_NEGAFFECT']]

In [None]:
X_TMCQ.shape

In [None]:
X_neuro = train_data[['STOP_SSRTAVE_Y1', 'DPRIME1_Y1', 'DPRIME2_Y1', 'SSBK_NUMCOMPLETE_Y1',
       'SSFD_NUMCOMPLETE_Y1', 'V_Y1', 'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2',
       'Y1_DIGITS_BKWD_RS', 'Y1_DIGITS_FRWD_RS', 'Y1_TRAILS_COND2',
       'Y1_TRAILS_COND3', 'CW_RES', 'TR_RES', 'Y1_TAP_SD_TOT_CLOCK']]

In [None]:
y_all = train_data[['DX', 'DXSUB']]

In [None]:
# Subjects with every value missing have to be removed before the
# matrix-factorization imputation is applied.
X_TMCQ_nonull, X_neuro_nonull = (
    feature_frame.dropna(how='all') for feature_frame in (X_TMCQ, X_neuro)
)

In [None]:
%%capture
TMCQ_dx, TMCQ_dxsub = run_classifiers(X_TMCQ_nonull, y_all)

In [None]:
TMCQ_dx

In [None]:
TMCQ_dxsub

In [None]:
%%capture
neuro_dx, neuro_dxsub = run_classifiers(X_neuro_nonull, y_all)

In [None]:
neuro_dx

In [None]:
neuro_dxsub

# XGBoost model

Exploring more "modern" boosting techniques, starting with XGBoost. If it looks promising, I'll work on hyperparam tuning on this model.

In [None]:
param = {'max_depth':2, 'eta':0.1, 'objective':'binary:logistic'}

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100)

# Building Pipeline for CV 

I just remembered that sklearn.pipeline is a thing.
So, I'm going to build that so cross-validation and multiple metrics are easier!

From sklearn:
```
from sklearn.pipeline import make_pipeline
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
cross_val_score(clf, iris.data, iris.target, cv=cv)
...                                                 
array([ 0.97...,  0.93...,  0.95...])
```

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from fancyimpute import MICE
from impute_transform import ImputeTransform

In [21]:
train_data = pd.read_csv('data/train_data.csv')
# Small subsample to keep the pipeline experiment fast. The fixed seed makes the
# same 100 rows, and therefore the same CV scores, come back on every run.
train_data_small = train_data.sample(n=100, random_state=56)
X = train_data_small.drop(columns=['DX','DXSUB'])
# Recode DX to a binary target: 3 -> 1, 1 -> 0 (any other code would become NaN)
y = train_data_small['DX'].map({3:1,1:0})

In [33]:
clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()), LogisticRegression(random_state=56))

In [16]:
from sklearn.model_selection import cross_validate

In [18]:
scoring = ['accuracy', 'roc_auc', 'neg_log_loss']

In [34]:
scores = cross_validate(clf, X, y, scoring=scoring, cv=2, return_train_score=True)

Train on 1454 samples, validate on 162 samples
Epoch 1/10000
 - 0s - loss: 1.1129 - val_loss: 0.9870
Epoch 2/10000
 - 0s - loss: 1.1092 - val_loss: 0.9864
Epoch 3/10000
 - 0s - loss: 1.1072 - val_loss: 0.9859
Epoch 4/10000
 - 0s - loss: 1.1054 - val_loss: 0.9854
Epoch 5/10000
 - 0s - loss: 1.1037 - val_loss: 0.9849
Epoch 6/10000
 - 0s - loss: 1.1019 - val_loss: 0.9844
Epoch 7/10000
 - 0s - loss: 1.1000 - val_loss: 0.9839
Epoch 8/10000
 - 0s - loss: 1.0981 - val_loss: 0.9834
Epoch 9/10000
 - 0s - loss: 1.0962 - val_loss: 0.9829
Epoch 10/10000
 - 0s - loss: 1.0943 - val_loss: 0.9824
Epoch 11/10000
 - 0s - loss: 1.0923 - val_loss: 0.9819
Epoch 12/10000
 - 0s - loss: 1.0903 - val_loss: 0.9814
Epoch 13/10000
 - 0s - loss: 1.0882 - val_loss: 0.9810
Epoch 14/10000
 - 0s - loss: 1.0862 - val_loss: 0.9805
Epoch 15/10000
 - 0s - loss: 1.0841 - val_loss: 0.9801
Epoch 16/10000
 - 0s - loss: 1.0821 - val_loss: 0.9796
Epoch 17/10000
 - 0s - loss: 1.0800 - val_loss: 0.9792
Epoch 18/10000
 - 0s - loss

Epoch 107/10000
 - 0s - loss: 0.8959 - val_loss: 1.1513
Epoch 108/10000
 - 0s - loss: 0.8941 - val_loss: 1.1505
Epoch 109/10000
 - 0s - loss: 0.8923 - val_loss: 1.1497
Epoch 110/10000
 - 0s - loss: 0.8905 - val_loss: 1.1489
Epoch 111/10000
 - 0s - loss: 0.8887 - val_loss: 1.1481
Epoch 112/10000
 - 0s - loss: 0.8869 - val_loss: 1.1472
Epoch 113/10000
 - 0s - loss: 0.8851 - val_loss: 1.1464
Epoch 114/10000
 - 0s - loss: 0.8833 - val_loss: 1.1456
Epoch 115/10000
 - 0s - loss: 0.8815 - val_loss: 1.1447
Epoch 116/10000
 - 0s - loss: 0.8797 - val_loss: 1.1439
Epoch 117/10000
 - 0s - loss: 0.8779 - val_loss: 1.1430
Epoch 118/10000
 - 0s - loss: 0.8760 - val_loss: 1.1421
Epoch 119/10000
 - 0s - loss: 0.8742 - val_loss: 1.1412
Epoch 120/10000
 - 0s - loss: 0.8724 - val_loss: 1.1404
Epoch 121/10000
 - 0s - loss: 0.8706 - val_loss: 1.1395
Epoch 122/10000
 - 0s - loss: 0.8687 - val_loss: 1.1386
Epoch 123/10000
 - 0s - loss: 0.8669 - val_loss: 1.1377
Epoch 124/10000
 - 0s - loss: 0.8651 - val_loss:

Epoch 254/10000
 - 0s - loss: 0.6196 - val_loss: 1.0020
Epoch 255/10000
 - 0s - loss: 0.6179 - val_loss: 1.0012
Epoch 256/10000
 - 0s - loss: 0.6162 - val_loss: 1.0004
Epoch 257/10000
 - 0s - loss: 0.6144 - val_loss: 0.9996
Epoch 258/10000
 - 0s - loss: 0.6127 - val_loss: 0.9988
Epoch 259/10000
 - 0s - loss: 0.6110 - val_loss: 0.9980
Epoch 260/10000
 - 0s - loss: 0.6093 - val_loss: 0.9973
Epoch 261/10000
 - 0s - loss: 0.6077 - val_loss: 0.9965
Epoch 262/10000
 - 0s - loss: 0.6060 - val_loss: 0.9958
Epoch 263/10000
 - 0s - loss: 0.6043 - val_loss: 0.9950
Epoch 264/10000
 - 0s - loss: 0.6026 - val_loss: 0.9943
Epoch 265/10000
 - 0s - loss: 0.6009 - val_loss: 0.9936
Epoch 266/10000
 - 0s - loss: 0.5993 - val_loss: 0.9928
Epoch 267/10000
 - 0s - loss: 0.5976 - val_loss: 0.9921
Epoch 268/10000
 - 0s - loss: 0.5960 - val_loss: 0.9914
Epoch 269/10000
 - 0s - loss: 0.5943 - val_loss: 0.9907
Epoch 270/10000
 - 0s - loss: 0.5927 - val_loss: 0.9900
Epoch 271/10000
 - 0s - loss: 0.5910 - val_loss:

Epoch 28/10000
 - 0s - loss: 1.0327 - val_loss: 1.2308
Epoch 29/10000
 - 0s - loss: 1.0305 - val_loss: 1.2305
Epoch 30/10000
 - 0s - loss: 1.0283 - val_loss: 1.2302
Epoch 31/10000
 - 0s - loss: 1.0261 - val_loss: 1.2298
Epoch 32/10000
 - 0s - loss: 1.0239 - val_loss: 1.2295
Epoch 33/10000
 - 0s - loss: 1.0217 - val_loss: 1.2291
Epoch 34/10000
 - 0s - loss: 1.0196 - val_loss: 1.2288
Epoch 35/10000
 - 0s - loss: 1.0174 - val_loss: 1.2284
Epoch 36/10000
 - 0s - loss: 1.0153 - val_loss: 1.2280
Epoch 37/10000
 - 0s - loss: 1.0131 - val_loss: 1.2277
Epoch 38/10000
 - 0s - loss: 1.0110 - val_loss: 1.2273
Epoch 39/10000
 - 0s - loss: 1.0088 - val_loss: 1.2269
Epoch 40/10000
 - 0s - loss: 1.0067 - val_loss: 1.2265
Epoch 41/10000
 - 0s - loss: 1.0046 - val_loss: 1.2261
Epoch 42/10000
 - 0s - loss: 1.0025 - val_loss: 1.2257
Epoch 43/10000
 - 0s - loss: 1.0004 - val_loss: 1.2253
Epoch 44/10000
 - 0s - loss: 0.9982 - val_loss: 1.2249
Epoch 45/10000
 - 0s - loss: 0.9962 - val_loss: 1.2245
Epoch 46/1

Epoch 176/10000
 - 0s - loss: 0.7280 - val_loss: 1.1203
Epoch 177/10000
 - 0s - loss: 0.7259 - val_loss: 1.1191
Epoch 178/10000
 - 0s - loss: 0.7238 - val_loss: 1.1179
Epoch 179/10000
 - 0s - loss: 0.7217 - val_loss: 1.1166
Epoch 180/10000
 - 0s - loss: 0.7196 - val_loss: 1.1154
Epoch 181/10000
 - 0s - loss: 0.7175 - val_loss: 1.1142
Epoch 182/10000
 - 0s - loss: 0.7153 - val_loss: 1.1129
Epoch 183/10000
 - 0s - loss: 0.7132 - val_loss: 1.1117
Epoch 184/10000
 - 0s - loss: 0.7111 - val_loss: 1.1104
Epoch 185/10000
 - 0s - loss: 0.7090 - val_loss: 1.1092
Epoch 186/10000
 - 0s - loss: 0.7069 - val_loss: 1.1079
Epoch 187/10000
 - 0s - loss: 0.7049 - val_loss: 1.1067
Epoch 188/10000
 - 0s - loss: 0.7028 - val_loss: 1.1054
Epoch 189/10000
 - 0s - loss: 0.7007 - val_loss: 1.1042
Epoch 190/10000
 - 0s - loss: 0.6986 - val_loss: 1.1029
Epoch 191/10000
 - 0s - loss: 0.6965 - val_loss: 1.1017
Epoch 192/10000
 - 0s - loss: 0.6944 - val_loss: 1.1004
Epoch 193/10000
 - 0s - loss: 0.6923 - val_loss:

Epoch 323/10000
 - 0s - loss: 0.4722 - val_loss: 0.9682
Epoch 324/10000
 - 0s - loss: 0.4710 - val_loss: 0.9675
Epoch 325/10000
 - 0s - loss: 0.4697 - val_loss: 0.9668
Epoch 326/10000
 - 0s - loss: 0.4685 - val_loss: 0.9661
Epoch 327/10000
 - 0s - loss: 0.4673 - val_loss: 0.9655
Epoch 328/10000
 - 0s - loss: 0.4661 - val_loss: 0.9648
Epoch 329/10000
 - 0s - loss: 0.4648 - val_loss: 0.9641
Epoch 330/10000
 - 0s - loss: 0.4636 - val_loss: 0.9634
Epoch 331/10000
 - 0s - loss: 0.4624 - val_loss: 0.9628
Epoch 332/10000
 - 0s - loss: 0.4612 - val_loss: 0.9621
Epoch 333/10000
 - 0s - loss: 0.4600 - val_loss: 0.9615
Epoch 334/10000
 - 0s - loss: 0.4588 - val_loss: 0.9608
Epoch 335/10000
 - 0s - loss: 0.4576 - val_loss: 0.9602
Epoch 336/10000
 - 0s - loss: 0.4565 - val_loss: 0.9595
Epoch 337/10000
 - 0s - loss: 0.4553 - val_loss: 0.9589
Epoch 338/10000
 - 0s - loss: 0.4541 - val_loss: 0.9582
Epoch 339/10000
 - 0s - loss: 0.4530 - val_loss: 0.9576
Epoch 340/10000
 - 0s - loss: 0.4518 - val_loss:

Epoch 470/10000
 - 0s - loss: 0.3398 - val_loss: 0.8904
Epoch 471/10000
 - 0s - loss: 0.3392 - val_loss: 0.8900
Epoch 472/10000
 - 0s - loss: 0.3386 - val_loss: 0.8896
Epoch 473/10000
 - 0s - loss: 0.3379 - val_loss: 0.8892
Epoch 474/10000
 - 0s - loss: 0.3373 - val_loss: 0.8887
Epoch 475/10000
 - 0s - loss: 0.3367 - val_loss: 0.8883
Epoch 476/10000
 - 0s - loss: 0.3360 - val_loss: 0.8879
Epoch 477/10000
 - 0s - loss: 0.3354 - val_loss: 0.8875
Epoch 478/10000
 - 0s - loss: 0.3348 - val_loss: 0.8870
Epoch 479/10000
 - 0s - loss: 0.3342 - val_loss: 0.8866
Epoch 480/10000
 - 0s - loss: 0.3336 - val_loss: 0.8862
Epoch 481/10000
 - 0s - loss: 0.3330 - val_loss: 0.8858
Epoch 482/10000
 - 0s - loss: 0.3323 - val_loss: 0.8854
Epoch 483/10000
 - 0s - loss: 0.3317 - val_loss: 0.8850
Epoch 484/10000
 - 0s - loss: 0.3311 - val_loss: 0.8846
Epoch 485/10000
 - 0s - loss: 0.3305 - val_loss: 0.8842
Epoch 486/10000
 - 0s - loss: 0.3299 - val_loss: 0.8838
Epoch 487/10000
 - 0s - loss: 0.3293 - val_loss:

Epoch 617/10000
 - 0s - loss: 0.2664 - val_loss: 0.8418
Epoch 618/10000
 - 0s - loss: 0.2660 - val_loss: 0.8415
Epoch 619/10000
 - 0s - loss: 0.2656 - val_loss: 0.8413
Epoch 620/10000
 - 0s - loss: 0.2652 - val_loss: 0.8411
Epoch 621/10000
 - 0s - loss: 0.2648 - val_loss: 0.8409
Epoch 622/10000
 - 0s - loss: 0.2644 - val_loss: 0.8406
Epoch 623/10000
 - 0s - loss: 0.2640 - val_loss: 0.8404
Epoch 624/10000
 - 0s - loss: 0.2636 - val_loss: 0.8401
Epoch 625/10000
 - 0s - loss: 0.2633 - val_loss: 0.8399
Epoch 626/10000
 - 0s - loss: 0.2629 - val_loss: 0.8397
Epoch 627/10000
 - 0s - loss: 0.2625 - val_loss: 0.8395
Epoch 628/10000
 - 0s - loss: 0.2621 - val_loss: 0.8392
Epoch 629/10000
 - 0s - loss: 0.2617 - val_loss: 0.8390
Epoch 630/10000
 - 0s - loss: 0.2613 - val_loss: 0.8388
Epoch 631/10000
 - 0s - loss: 0.2609 - val_loss: 0.8386
Epoch 632/10000
 - 0s - loss: 0.2606 - val_loss: 0.8384
Epoch 633/10000
 - 0s - loss: 0.2602 - val_loss: 0.8382
Epoch 634/10000
 - 0s - loss: 0.2598 - val_loss:

Epoch 69/10000
 - 0s - loss: 0.9673 - val_loss: 1.2938
Epoch 70/10000
 - 0s - loss: 0.9656 - val_loss: 1.2931
Epoch 71/10000
 - 0s - loss: 0.9638 - val_loss: 1.2923
Epoch 72/10000
 - 0s - loss: 0.9620 - val_loss: 1.2916
Epoch 73/10000
 - 0s - loss: 0.9603 - val_loss: 1.2908
Epoch 74/10000
 - 0s - loss: 0.9585 - val_loss: 1.2900
Epoch 75/10000
 - 0s - loss: 0.9567 - val_loss: 1.2893
Epoch 76/10000
 - 0s - loss: 0.9550 - val_loss: 1.2885
Epoch 77/10000
 - 0s - loss: 0.9532 - val_loss: 1.2877
Epoch 78/10000
 - 0s - loss: 0.9515 - val_loss: 1.2869
Epoch 79/10000
 - 0s - loss: 0.9497 - val_loss: 1.2861
Epoch 80/10000
 - 0s - loss: 0.9480 - val_loss: 1.2853
Epoch 81/10000
 - 0s - loss: 0.9462 - val_loss: 1.2845
Epoch 82/10000
 - 0s - loss: 0.9444 - val_loss: 1.2837
Epoch 83/10000
 - 0s - loss: 0.9427 - val_loss: 1.2829
Epoch 84/10000
 - 0s - loss: 0.9409 - val_loss: 1.2820
Epoch 85/10000
 - 0s - loss: 0.9392 - val_loss: 1.2813
Epoch 86/10000
 - 0s - loss: 0.9374 - val_loss: 1.2804
Epoch 87/1

Epoch 216/10000
 - 0s - loss: 0.6848 - val_loss: 1.1245
Epoch 217/10000
 - 0s - loss: 0.6828 - val_loss: 1.1233
Epoch 218/10000
 - 0s - loss: 0.6809 - val_loss: 1.1220
Epoch 219/10000
 - 0s - loss: 0.6789 - val_loss: 1.1208
Epoch 220/10000
 - 0s - loss: 0.6769 - val_loss: 1.1195
Epoch 221/10000
 - 0s - loss: 0.6749 - val_loss: 1.1183
Epoch 222/10000
 - 0s - loss: 0.6730 - val_loss: 1.1170
Epoch 223/10000
 - 0s - loss: 0.6710 - val_loss: 1.1158
Epoch 224/10000
 - 0s - loss: 0.6690 - val_loss: 1.1145
Epoch 225/10000
 - 0s - loss: 0.6671 - val_loss: 1.1133
Epoch 226/10000
 - 0s - loss: 0.6651 - val_loss: 1.1121
Epoch 227/10000
 - 0s - loss: 0.6632 - val_loss: 1.1109
Epoch 228/10000
 - 0s - loss: 0.6612 - val_loss: 1.1097
Epoch 229/10000
 - 0s - loss: 0.6593 - val_loss: 1.1085
Epoch 230/10000
 - 0s - loss: 0.6573 - val_loss: 1.1073
Epoch 231/10000
 - 0s - loss: 0.6554 - val_loss: 1.1061
Epoch 232/10000
 - 0s - loss: 0.6535 - val_loss: 1.1049
Epoch 233/10000
 - 0s - loss: 0.6516 - val_loss:

Epoch 363/10000
 - 0s - loss: 0.4499 - val_loss: 0.9891
Epoch 364/10000
 - 0s - loss: 0.4487 - val_loss: 0.9884
Epoch 365/10000
 - 0s - loss: 0.4476 - val_loss: 0.9878
Epoch 366/10000
 - 0s - loss: 0.4464 - val_loss: 0.9871
Epoch 367/10000
 - 0s - loss: 0.4452 - val_loss: 0.9865
Epoch 368/10000
 - 0s - loss: 0.4441 - val_loss: 0.9859
Epoch 369/10000
 - 0s - loss: 0.4429 - val_loss: 0.9853
Epoch 370/10000
 - 0s - loss: 0.4418 - val_loss: 0.9846
Epoch 371/10000
 - 0s - loss: 0.4406 - val_loss: 0.9840
Epoch 372/10000
 - 0s - loss: 0.4395 - val_loss: 0.9834
Epoch 373/10000
 - 0s - loss: 0.4384 - val_loss: 0.9828
Epoch 374/10000
 - 0s - loss: 0.4373 - val_loss: 0.9822
Epoch 375/10000
 - 0s - loss: 0.4361 - val_loss: 0.9816
Epoch 376/10000
 - 0s - loss: 0.4350 - val_loss: 0.9809
Epoch 377/10000
 - 0s - loss: 0.4339 - val_loss: 0.9804
Epoch 378/10000
 - 0s - loss: 0.4328 - val_loss: 0.9798
Epoch 379/10000
 - 0s - loss: 0.4317 - val_loss: 0.9792
Epoch 380/10000
 - 0s - loss: 0.4306 - val_loss:

Epoch 4/10000
 - 0s - loss: 1.0996 - val_loss: 1.0845
Epoch 5/10000
 - 0s - loss: 1.0977 - val_loss: 1.0845
Epoch 6/10000
 - 0s - loss: 1.0959 - val_loss: 1.0844


In [35]:
scores

{'fit_time': array([ 0.53662777,  0.52442455]),
 'score_time': array([ 2.62700605,  1.87451816]),
 'test_accuracy': array([ 0.72,  0.74]),
 'test_neg_log_loss': array([-0.91636815, -0.65024574]),
 'test_roc_auc': array([ 0.81770833,  0.82638889]),
 'train_accuracy': array([ 1.  ,  0.98]),
 'train_neg_log_loss': array([-0.06120686, -0.05493667]),
 'train_roc_auc': array([ 1.,  1.])}

### Final Thoughts for the day


My brain is pretty fried, so I'm gonna call it a night.

But here's next steps for tomorrow:
- Get this cross_validate function working for log_reg, rf, gb, and xgb
- Get this cross_validate function working for DXSUB
 - cause of the multiclass problem and all that
- Explore TMCQ and neuropsych more
 - The metrics were quite bad on these! And logistic regression actually performed better test-wise than RF and GB!
- Discuss next steps with Matt
 - Clustering ideas
 - How to approach hyperparam tuning

#### Clustering Ideas

[Subtyping ADHD Using Temperament Dimensions](https://jamanetwork.com/journals/jamapsychiatry/fullarticle/1885709)

The above is a paper written by my boss (Dr. Karalunas) that utilized community detection analysis on the Temperament in Middle Childhood Questionnaire (TMCQ).
They had 437 children and used the TMCQ from year 1. 
They specifically used the [Fast Greedy algorithm](https://arxiv.org/abs/cond-mat/0408187) and found 3 profiles of children, which they labeled as "mild", "surgent", and "irritable".

I was thinking of trying to replicate this analysis on the full 901 dataset ([community detection in python](https://yoyoinwanderland.github.io/2017/08/08/Community-Detection-in-Python/)).
Then, I was thinking of trying different clustering algorithms to see if the same profiles seem to exist.

It'd basically be a study in reproducibility.

They used physiological and MRI data to externally validate these profiles, which I don't really have. But I might be able to glean something interesting from the neuropsych data? Maybe? IDK. Focus on "are the profiles there".