In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from model_metrics import *
from fancyimpute import *
import pickle
from sklearn.metrics import roc_curve, auc

Using TensorFlow backend.


In [None]:
# Raw diagnosis export (not referenced again in the visible cells).
full_data = pd.read_csv('data/Christie_diagnosis_20180118.csv')
# The imputation cells below read `train_data`, which no earlier cell defines;
# load it here so the notebook survives Restart Kernel -> Run All.
train_data = pd.read_csv('data/train_data.csv')

# Multiple Imputation Testing

For information on algorithms, see [fancyimpute](https://pypi.python.org/pypi/fancyimpute)

Make all the solver objects

In [None]:
# One instance of every imputation solver being compared
# (presumably all provided by `from fancyimpute import *` -- confirm).
# Only KNN is given an explicit argument (k=5); the rest use their defaults.
KNN_solver = KNN(k=5)
softimpute_solver = SoftImpute()
MICE_solver = MICE()
simple_solver = SimpleFill()
iterativeSVD_solver = IterativeSVD()
matrixfactorization_solver = MatrixFactorization()

Create a dataframe from the complete data in `train_data`. Then, randomly insert some NaNs for MSE testing.

In [None]:
# Drop NaNs: keep only the rows of train_data with no missing value in any column,
# so the true value of every remaining cell is known.
complete_data = train_data.dropna()

In [None]:
# Drop DX and DXSUB.
# Re-derived from train_data instead of dropping in place, so re-running this cell
# is idempotent rather than raising KeyError on the already-removed columns.
complete_data = train_data.dropna().drop(columns=['DX', 'DXSUB'])

In [None]:
# Randomly insert NaNs into 10% of all cells of a copy of the complete data.
import random

# Dedicated, seeded generator: the same cells are blanked on every run, so the
# MSE comparison below is reproducible.
nan_rng = random.Random(56)

nan_inserted_data = complete_data.copy()
# Every (row position, column position) pair in the frame.
ix = [(row, col) for row in range(complete_data.shape[0]) for col in range(complete_data.shape[1])]
for row, col in nan_rng.sample(ix, int(round(.1*len(ix)))):
    nan_inserted_data.iat[row, col] = np.nan

In [None]:
# Row-level boolean mask: True for every row that has at least one NaN in any column.
# NOTE(review): this selects whole rows, not the individual blanked cells.
missing_mask = nan_inserted_data.isna().any(axis=1)

Complete those missing dataframes with the various solvers!

In [None]:
def test_imputation(solver, df):
    """Impute the data using imputation methods"""
    impute_data = df.values
    data_index = df.index
    data_cols = df.columns

    impute_data_filled = solver.complete(impute_data)
    impute_df = pd.DataFrame(impute_data_filled, index=data_index, columns=data_cols)
    return impute_df

In [None]:
KNN_df = test_imputation(KNN_solver, nan_inserted_data)

In [None]:
softimpute_df = test_imputation(softimpute_solver, nan_inserted_data)

In [None]:
MICE_df = test_imputation(MICE_solver, nan_inserted_data)

In [None]:
simple_df = test_imputation(simple_solver, nan_inserted_data)

In [None]:
iterative_df = test_imputation(iterativeSVD_solver, nan_inserted_data)

In [None]:
matrixfact_df = test_imputation(matrixfactorization_solver, nan_inserted_data)

### Now cast as ints

In [None]:
solver_list = [KNN_df, softimpute_df, MICE_df,
               simple_df, iterative_df, matrixfact_df]
solver_names = ['KNN', 'SoftImpute', 'MICE', 'SimpleFill',
                   'IterativeSVD', 'MatrixFactorization']

In [None]:
KNN_df_round = KNN_df.copy()
softimpute_df_round = softimpute_df.copy()
MICE_df_round = MICE_df.copy()
simple_df_round = simple_df.copy()
iterative_df_round = iterative_df.copy()
matrixfact_df_round = matrixfact_df.copy()

In [None]:
round_list = [KNN_df_round, softimpute_df_round, MICE_df_round,
              simple_df_round, iterative_df_round, matrixfact_df_round]
round_names = ['KNN_round', 'SoftImpute_round', 'MICE_round',
            'SimpleFill_round', 'IterativeSVD_round', 'MatrixFact_round']

In [None]:
# Columns treated as integer-valued scores.
int_cols = ['SSBK_NUMCOMPLETE_Y1', 'SSFD_NUMCOMPLETE_Y1',
            'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2', 'Y1_DIGITS_BKWD_RS',
            'Y1_DIGITS_FRWD_RS', 'Y1_TRAILS_COND2', 'Y1_TRAILS_COND3']
for df in round_list:
    for col in int_cols:
        # Round to the nearest integer before casting: a bare astype('int')
        # truncates toward zero, which is not rounding and shifts every
        # imputed value toward zero.
        df[col] = df[col].round().astype('int')

### Create MSEs for each

In [None]:
total_df_list = solver_list + round_list
total_df_names = solver_names + round_names

In [None]:
mse_df = pd.DataFrame(index=total_df_names, columns=complete_data.columns)

In [None]:
# Per-column MSE of every imputed frame against the ground-truth values.
# NOTE(review): missing_mask selects whole rows, so cells that were never blanked
# are averaged in as well -- confirm that is the intended denominator.
for df, name in zip(total_df_list, total_df_names):
    squared_error = (df.loc[missing_mask] - complete_data.loc[missing_mask]) ** 2
    mse_df.loc[name] = squared_error.mean()

In [None]:
mse_df

### Which method has the lowest MSEs?

Write `true` for minimums in each col

In [None]:
# For each column, mark with True the row(s) whose MSE equals that column's minimum.
mse_df_bool = mse_df.copy()
for col in mse_df.columns:
    mse_df_bool[col] = mse_df_bool[col].eq(np.min(mse_df_bool[col]))

In [None]:
mse_df_bool.sum(axis=1)

In [None]:
# Same eight columns as `int_cols` defined above; copy that list instead of
# maintaining a second hand-typed duplicate that could drift out of sync.
int_cols_nodx = list(int_cols)

In [None]:
mse_df_bool[int_cols_nodx].sum(axis=1)

However, MICE seems to do the best for columns that are technically integers (3 out of 8)

### Conclusions

Looks like MatrixFactorization is the best option. (out of 38 cols, 24 went to MatrixFactorization for the lowest MSE)

Rounding does not improve the MSE. 

# Testing for leaky data!

I had really high accuracy on my first logistic regression model (91% test accuracy on smaller train/test split, and 92% accuracy on cross-validated log models).

So, I want to investigate if I have any leaky data columns.

In [None]:
X_train = train_data.drop(columns=['DX','DXSUB'])
y_train = train_data['DX']

In [None]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
                    X_train, y_train, test_size=0.2, random_state=56)

In [None]:
# Leave-one-column-out check: refit the model with each feature removed and record
# the held-out accuracy as (column, accuracy) pairs.
accuracy_list = []
for col in X_train_small.columns:
    # Drop the column
    X_train_dataset = X_train_small.drop(columns=col).values
    X_test_dataset = X_test_small.drop(columns=col).values

    # Impute only when the matrix actually contains missing values
    if np.isnan(X_train_dataset).any():
        X_train_final = impute_data(X_train_dataset)
    else:
        X_train_final = X_train_dataset.copy()

    if np.isnan(X_test_dataset).any():
        X_test_final = impute_data(X_test_dataset)
    else:
        X_test_final = X_test_dataset.copy()

    # Fit model
    model = LogisticRegression()
    model.fit(X_train_final, y_train_small)

    # Score model on the held-out feature matrix. The original passed the
    # undefined name `y_test_final` here, which raises NameError.
    accuracy = model.score(X_test_final, y_test_small)
    accuracy_list.append((col, accuracy))

In [None]:
# One line per removed column with the resulting held-out accuracy.
# (The bare `accuracy_list` expression that used to open this cell was a no-op:
# only the last expression of a cell is displayed.)
for col, acc in accuracy_list:
    print("Accuracy removing {}: \t \t {:2.2f}".format(col, acc).expandtabs(10))

In [None]:
X_train_small_impute = impute_data(X_train_small.values)

In [None]:
# `logreg` is not defined in any visible cell; create it here so the fit (and the
# score in the next cell) works on a fresh kernel.
logreg = LogisticRegression()
logreg.fit(X_train_small_impute, y_train_small.values)

In [None]:
logreg.score(impute_data(X_test_small.values), y_test_small.values)

Looks like there aren't any leaky variables. I just have high accuracy.

This makes sense - the lab wouldn't administer tests or behavioral questionnaires that don't have something to do with ADHD. So a plain logistic regression model is pretty accurate.

Going forward, I want to do a few things:

- How high can I get the accuracy? Test out a few different models (RF, Gradient Boosting)
- What's the spread of the predicted probas like?
- Test out on DXSUB

# Logistic Model Metrics Visualization

In [None]:
X_train_DX = train_data.drop(columns=['DX','DXSUB'])
y_train_DX = train_data['DX']

In [None]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
                    X_train_DX, y_train_DX, test_size=0.2, random_state=56)

In [None]:
logmod = LogisticRegression()
logmod.fit(impute_data(X_train_small.values), y_train_small)

In [None]:
pred_prob_dx = logmod.predict_proba(impute_data(X_test_small.values))

In [None]:
prob_dx = logmod.predict(impute_data(X_test_small.values))

In [None]:
len(pred_prob_dx[:,0])

In [None]:
def make_jitter(data, jitter=0.1):
    """Return uniform random offsets in [-jitter, jitter) with the same shape as ``data``.

    Only ``data.shape`` is used; the values of ``data`` are ignored.
    """
    lower, upper = -jitter, jitter
    return np.random.uniform(low=lower, high=upper, size=data.shape)

In [None]:
# Strip plot of the predicted probabilities for the held-out split; the y-axis is
# random jitter, used only to spread the points apart.
fig, ax = plt.subplots(figsize=(10,2))

# NOTE(review): column 0 of predict_proba belongs to logmod.classes_[0] -- confirm
# this is the class the title refers to. Colours are looked up in dx_dict (defined
# outside the visible cells) by the true label.
_ = ax.scatter(pred_prob_dx[:,0], make_jitter(pred_prob_dx[:,0]), c=np.vectorize(dx_dict.get)(y_test_small),
           s=40, alpha=0.5)
_ = ax.set_xlim(0,1)
_ = ax.set_title('Predicted Probability of Positive vs Negative Classes')

In [None]:
# ROC needs a continuous score, not hard labels. Scoring with logmod.predict output
# collapses the curve to a single operating point, so use the predicted probability
# of the positive class (DX == 3) instead.
pos_col = list(logmod.classes_).index(3)
fpr, tpr, thresholds = roc_curve(y_test_small, pred_prob_dx[:, pos_col],
                                 pos_label=3, drop_intermediate=False)
roc_auc = auc(fpr, tpr)

In [None]:
# ROC curve with the chance diagonal for reference; the AUC computed above is
# reported in the legend so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC (AUC = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right');

# Classification Metrics

I created logistic regression, random forest, and gradient boosting models. I want to see the MSE and accuracy on train/test, cross-validated (k-fold=10), when predicting DX, and when predicting DXSUB.

In [None]:
#train_data = pd.read_csv('data/train_data.csv')

In [None]:
#%%capture
#classifier_metrics = run_classifiers(train_data)

In [None]:
#with open('classifier_metrics.pkl', 'wb') as f:
    #pickle.dump(classifier_metrics, f)

In [None]:
# Load the cached classifier metrics. The context manager closes the file handle,
# which the bare open(...) call leaked.
# NOTE: pickle.load can execute arbitrary code -- only load pickles you created yourself.
with open("classifier_metrics.pkl", "rb") as f:
    classification_metrics = pickle.load(f)

In [None]:
#[clf][pred][metric][train/test]
# Every group below unpacks in the order mse_train, mse_test, acc_train, acc_test:
# metric index 0 then 1, and within each metric, split index 0 then 1.

# Logistic Regression
(lr_dx_mse_train, lr_dx_mse_test,
 lr_dx_acc_train, lr_dx_acc_test) = (
    np.mean(classification_metrics[0][0][metric][split])
    for metric in (0, 1) for split in (0, 1))

(lr_dxsub_mse_train, lr_dxsub_mse_test,
 lr_dxsub_acc_train, lr_dxsub_acc_test) = (
    np.mean(classification_metrics[0][1][metric][split])
    for metric in (0, 1) for split in (0, 1))

# Random Forest
(rf_dx_mse_train, rf_dx_mse_test,
 rf_dx_acc_train, rf_dx_acc_test) = (
    np.mean(classification_metrics[1][0][metric][split])
    for metric in (0, 1) for split in (0, 1))

(rf_dxsub_mse_train, rf_dxsub_mse_test,
 rf_dxsub_acc_train, rf_dxsub_acc_test) = (
    np.mean(classification_metrics[1][1][metric][split])
    for metric in (0, 1) for split in (0, 1))

# Gradient Boosting
(gb_dx_mse_train, gb_dx_mse_test,
 gb_dx_acc_train, gb_dx_acc_test) = (
    np.mean(classification_metrics[2][0][metric][split])
    for metric in (0, 1) for split in (0, 1))

(gb_dxsub_mse_train, gb_dxsub_mse_test,
 gb_dxsub_acc_train, gb_dxsub_acc_test) = (
    np.mean(classification_metrics[2][1][metric][split])
    for metric in (0, 1) for split in (0, 1))

In [None]:
# Make dataframes: one row per classifier, one column per DX metric.
# Each list is ordered LogReg, RandomForest, GradBoost to match the index.
metrics_dx_dict = dict(
    DX_acc_train=[lr_dx_acc_train, rf_dx_acc_train, gb_dx_acc_train],
    DX_acc_test=[lr_dx_acc_test, rf_dx_acc_test, gb_dx_acc_test],
    DX_mse_train=[lr_dx_mse_train, rf_dx_mse_train, gb_dx_mse_train],
    DX_mse_test=[lr_dx_mse_test, rf_dx_mse_test, gb_dx_mse_test],
)

metrics_DX = pd.DataFrame(
    metrics_dx_dict,
    index=['LogReg', 'RandomForest', 'GradBoost'],
    columns=['DX_acc_train', 'DX_acc_test', 'DX_mse_train', 'DX_mse_test'],
)

In [None]:
# Same layout as the DX table, for the DXSUB target.
# Each list is ordered LogReg, RandomForest, GradBoost to match the index.
metrics_dxsub_dict = dict(
    DXSUB_acc_train=[lr_dxsub_acc_train, rf_dxsub_acc_train, gb_dxsub_acc_train],
    DXSUB_acc_test=[lr_dxsub_acc_test, rf_dxsub_acc_test, gb_dxsub_acc_test],
    DXSUB_mse_train=[lr_dxsub_mse_train, rf_dxsub_mse_train, gb_dxsub_mse_train],
    DXSUB_mse_test=[lr_dxsub_mse_test, rf_dxsub_mse_test, gb_dxsub_mse_test],
)

metrics_DXSUB = pd.DataFrame(
    metrics_dxsub_dict,
    index=['LogReg', 'RandomForest', 'GradBoost'],
    columns=['DXSUB_acc_train', 'DXSUB_acc_test', 'DXSUB_mse_train', 'DXSUB_mse_test'],
)

In [None]:
metrics_DX.round(3)

In [None]:
metrics_DXSUB.round(3)

# Neuropsych vs TMCQ

Now that I've evaluated models on all the data, I want to check what accuracy and MSE look like for models run JUST on neuropsych, and JUST on TMCQ.

I'll use the same exact procedure as above, just with different X matrices.

In [None]:
train_data = pd.read_csv('data/train_data.csv')

In [None]:
X_TMCQ = train_data[['Y1_P_TMCQ_ACTIVCONT', 'Y1_P_TMCQ_ACTIVITY', 'Y1_P_TMCQ_AFFIL',
       'Y1_P_TMCQ_ANGER', 'Y1_P_TMCQ_FEAR', 'Y1_P_TMCQ_HIP',
       'Y1_P_TMCQ_IMPULS', 'Y1_P_TMCQ_INHIBIT', 'Y1_P_TMCQ_SAD',
       'Y1_P_TMCQ_SHY', 'Y1_P_TMCQ_SOOTHE', 'Y1_P_TMCQ_ASSERT',
       'Y1_P_TMCQ_ATTFOCUS', 'Y1_P_TMCQ_LIP', 'Y1_P_TMCQ_PERCEPT',
       'Y1_P_TMCQ_DISCOMF', 'Y1_P_TMCQ_OPENNESS', 'Y1_P_TMCQ_SURGENCY',
       'Y1_P_TMCQ_EFFCONT', 'Y1_P_TMCQ_NEGAFFECT']]

In [None]:
X_TMCQ.shape

In [None]:
X_neuro = train_data[['STOP_SSRTAVE_Y1', 'DPRIME1_Y1', 'DPRIME2_Y1', 'SSBK_NUMCOMPLETE_Y1',
       'SSFD_NUMCOMPLETE_Y1', 'V_Y1', 'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2',
       'Y1_DIGITS_BKWD_RS', 'Y1_DIGITS_FRWD_RS', 'Y1_TRAILS_COND2',
       'Y1_TRAILS_COND3', 'CW_RES', 'TR_RES', 'Y1_TAP_SD_TOT_CLOCK']]

In [None]:
y_all = train_data[['DX', 'DXSUB']]

In [None]:
# Must drop subjects where ALL data is missing, due to matrix factorization imputation.
# how='all' removes only the rows in which every selected column is NaN; rows with
# some values present are kept.
# NOTE(review): y_all is not filtered here, so it can have more rows than these frames.
X_TMCQ_nonull = X_TMCQ.dropna(how='all')
X_neuro_nonull = X_neuro.dropna(how='all')

In [None]:
%%capture
TMCQ_dx, TMCQ_dxsub = run_classifiers(X_TMCQ_nonull, y_all)

In [None]:
TMCQ_dx

In [None]:
TMCQ_dxsub

In [None]:
%%capture
neuro_dx, neuro_dxsub = run_classifiers(X_neuro_nonull, y_all)

In [None]:
neuro_dx

In [None]:
neuro_dxsub

# Building Pipeline for CV 

I just remembered that sklearn.pipeline is a thing.
So, I'm going to build that so cross-validation and multiple metrics are easier!

From sklearn:
```
from sklearn.pipeline import make_pipeline
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
cross_val_score(clf, iris.data, iris.target, cv=cv)
...                                                 
array([ 0.97...,  0.93...,  0.95...])
```

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from fancyimpute import MICE
from impute_transform import ImputeTransform

In [None]:
train_data = pd.read_csv('data/train_data.csv')
# Small subsample for a quick pipeline smoke test. Seeded so the same 100 rows
# (and therefore the same CV scores) come back on every run.
train_data_small = train_data.sample(n=100, random_state=56)
X = train_data_small.drop(columns=['DX','DXSUB'])
# Binary target: DX code 3 -> 1, DX code 1 -> 0.
y = train_data_small['DX'].map({3:1,1:0})

In [None]:
clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()), LogisticRegression(random_state=56))

In [None]:
scoring = ['accuracy', 'roc_auc', 'neg_log_loss']

In [None]:
scores = cross_validate(clf, X, y, scoring=scoring, cv=2, return_train_score=True)

In [None]:
scores

### Final Thoughts for the day


My brain is pretty fried, so I'm gonna call it a night.

But here's next steps for tomorrow:
- Get this cross_validate function working for log_reg, rf, gb, and xgb
- Get this cross_validate function working for DXSUB
 - cause of the multiclass problem and all that
- Explore TMCQ and neuropsych more
 - The metrics were quite bad on these! And logistic regression actually performed better test-wise than RF and GB!
- Discuss next steps with Matt
 - Clustering ideas
 - How to approach hyperparam tuning

#### Clustering Ideas

[Subtyping ADHD Using Temperament Dimensions](https://jamanetwork.com/journals/jamapsychiatry/fullarticle/1885709)

The above is a paper written by my boss (Dr. Karalunas) that utilized community detection analysis on the Temperament in Middle Childhood Questionnaire (TMCQ).
They had 437 children and used the TMCQ from year 1. 
They specifically used the [Fast Greedy algorithm](https://arxiv.org/abs/cond-mat/0408187) and found 3 profiles of children, which they labeled as "mild", "surgent", and "irritable".

I was thinking of trying to replicate this analysis on the full 901 dataset ([community detection in python](https://yoyoinwanderland.github.io/2017/08/08/Community-Detection-in-Python/)).
Then, I was thinking of trying different clustering algorithms to see if the same profiles seem to exist.

It'd basically be a study in reproducibility.

They used physiological and MRI data to externally validate these profiles, which I don't really have. But I might be able to glean something interesting from the neuropsych data? Maybe? IDK. Focus on "are the profiles there".

# Pipeline Continuing

I want to test the following models:
- Logistic Regression
- Random Forest
- Gradient Boosting
- XGBoost

With the following metrics:
- ROC AUC
- Accuracy
- Log Loss

On the following data:
- DX
- DXSUB
- Neuropsych
- TMCQ

In [13]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from collections import defaultdict
from sklearn.metrics import make_scorer, accuracy_score, log_loss
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from impute_transform import ImputeTransform
from xgboost import XGBClassifier

# Running the models for real

In [12]:
train_data = pd.read_csv('data/train_data.csv')
X = train_data.drop(columns=['DX','DXSUB'])
y = train_data['DX'].map({3:1,1:0})

In [14]:
log_reg_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                        LogisticRegression(random_state=56))

rf_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                       RandomForestClassifier(n_jobs=-1, random_state=56))

gb_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                       GradientBoostingClassifier(random_state=56))

xgb_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                        XGBClassifier(max_depth=3, learning_rate=0.1,
                        random_state=56))

In [15]:
scoring = ['accuracy', 'roc_auc', 'neg_log_loss']

In [16]:
classifier_list = [log_reg_clf, rf_clf, gb_clf, xgb_clf]
classifier_name = ['LogReg', 'RandomForest', 'GradientBoosting', 'XGB']

In [18]:
%%capture
classifier_metrics = {}
for clf, name in zip(classifier_list, classifier_name):
    scores = cross_validate(clf, X, y, scoring=scoring, cv=5, return_train_score=False)
    classifier_metrics[name] = scores

In [19]:
classifier_metrics

{'GradientBoosting': {'fit_time': array([ 5.61365485,  5.96402812,  5.96304774,  6.71043992,  5.83259368]),
  'score_time': array([  8.31407118,   5.76415157,  10.13945103,  10.34139943,   7.99768829]),
  'test_accuracy': array([ 0.91735537,  0.90909091,  0.94214876,  0.95      ,  0.94166667]),
  'test_neg_log_loss': array([-0.25958926, -0.28074731, -0.22154339, -0.15010377, -0.21449213]),
  'test_roc_auc': array([ 0.95877193,  0.95467836,  0.96432749,  0.9763756 ,  0.97159091])},
 'LogReg': {'fit_time': array([ 3.91597986,  4.13191342,  3.81056094,  4.5563941 ,  4.3751905 ]),
  'score_time': array([ 2.40666628,  5.38883543,  2.07561421,  5.07782388,  4.31213188]),
  'test_accuracy': array([ 0.8677686 ,  0.90909091,  0.91735537,  0.95      ,  0.9       ]),
  'test_neg_log_loss': array([-0.303425  , -0.20680224, -0.17379683, -0.12765787, -0.24626423]),
  'test_roc_auc': array([ 0.94678363,  0.96783626,  0.97807018,  0.99162679,  0.96979665])},
 'RandomForest': {'fit_time': array([ 5.144

In [20]:
# Collapse every per-fold score array to its mean, updating each classifier's
# result dict in place.
for classifier, dictionary in classifier_metrics.items():
    dictionary.update({metric: np.mean(score) for metric, score in dictionary.items()})

In [21]:
name_of_metrics = ['fit_time', 'score_time', 'test_accuracy',
                   'test_neg_log_loss', 'test_roc_auc']

In [22]:
metrics_df = pd.DataFrame(data=None,
                          index=classifier_name,
                          columns=name_of_metrics)

In [23]:
# Copy the mean scores into the summary table, one cell at a time.
for clf in classifier_name:
    for metric in name_of_metrics:
        # Single .loc[row, col] assignment: the chained form
        # metrics_df[metric].loc[clf] = ... may write to a temporary copy and
        # leave metrics_df unchanged.
        metrics_df.loc[clf, metric] = classifier_metrics[clf][metric]

In [24]:
metrics_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_neg_log_loss,test_roc_auc
LogReg,4.15801,3.85221,0.908843,-0.211589,0.970823
RandomForest,5.21141,5.45477,0.922052,-0.536413,0.959773
GradientBoosting,6.01675,8.51135,0.932052,-0.225295,0.965149
XGB,6.5238,9.73487,0.920427,-0.213703,0.973333


In [25]:
# testing on holdout to see if its somewhat similar or
# super off like it was for tmcq...
holdout_data = pd.read_csv('data/holdout_data.csv')
X_test = holdout_data.drop(columns=['DX','DXSUB'])
y_test = holdout_data['DX'].map({3:1,1:0})
# yep, it worked there...

XGB and LogReg have the best AUC scores, and log_loss scores. Even though GB had the highest test accuracy, I think XGB and LogReg are the models to explore.

I also want to run this on Neuropsych and TMCQ data. It seemed like last time, RF and GB overfit - but XGBoost has regularization so this might mitigate that problem.

In [31]:
X_TMCQ = train_data[['Y1_P_TMCQ_ACTIVCONT', 'Y1_P_TMCQ_ACTIVITY', 'Y1_P_TMCQ_AFFIL',
       'Y1_P_TMCQ_ANGER', 'Y1_P_TMCQ_FEAR', 'Y1_P_TMCQ_HIP',
       'Y1_P_TMCQ_IMPULS', 'Y1_P_TMCQ_INHIBIT', 'Y1_P_TMCQ_SAD',
       'Y1_P_TMCQ_SHY', 'Y1_P_TMCQ_SOOTHE', 'Y1_P_TMCQ_ASSERT',
       'Y1_P_TMCQ_ATTFOCUS', 'Y1_P_TMCQ_LIP', 'Y1_P_TMCQ_PERCEPT',
       'Y1_P_TMCQ_DISCOMF', 'Y1_P_TMCQ_OPENNESS', 'Y1_P_TMCQ_SURGENCY',
       'Y1_P_TMCQ_EFFCONT', 'Y1_P_TMCQ_NEGAFFECT']]

In [35]:
# Keep only the subjects with every TMCQ column present, plus their matching labels.
X_TMCQ_nonull = X_TMCQ[X_TMCQ.notnull().all(axis=1)]
y_TMCQ_nonull = y[X_TMCQ.notnull().all(axis=1)]

In [50]:
# Don't need imputation for TMCQ because I removed NaNs
log_reg_clf = LogisticRegression(random_state=56)

rf_clf = RandomForestClassifier(n_jobs=-1, random_state=56)

gb_clf = GradientBoostingClassifier(random_state=56)

xgb_clf = XGBClassifier(max_depth=3, learning_rate=0.1,
                        random_state=56)

In [38]:
classifier_list = [log_reg_clf, rf_clf, gb_clf, xgb_clf]
classifier_name = ['LogReg', 'RandomForest', 'GradientBoosting', 'XGB']

In [39]:
%%capture
classifier_metrics_TMCQ = {}
for clf, name in zip(classifier_list, classifier_name):
    scores = cross_validate(clf, X_TMCQ_nonull, y_TMCQ_nonull, scoring=scoring, cv=5, return_train_score=True)
    classifier_metrics_TMCQ[name] = scores

In [40]:
# Collapse every per-fold score array to its mean, updating each classifier's
# result dict in place.
for classifier, dictionary in classifier_metrics_TMCQ.items():
    dictionary.update({metric: np.mean(score) for metric, score in dictionary.items()})

In [41]:
name_of_metrics = ['fit_time', 'score_time', 'test_accuracy',
                   'test_neg_log_loss', 'test_roc_auc']

In [42]:
metrics_df_TMCQ = pd.DataFrame(data=None,
                          index=classifier_name,
                          columns=name_of_metrics)

In [43]:
# Copy the mean scores into the TMCQ summary table, one cell at a time.
for clf in classifier_name:
    for metric in name_of_metrics:
        # Single .loc[row, col] assignment: the chained form
        # metrics_df_TMCQ[metric].loc[clf] = ... may write to a temporary copy
        # and leave metrics_df_TMCQ unchanged.
        metrics_df_TMCQ.loc[clf, metric] = classifier_metrics_TMCQ[clf][metric]

In [44]:
metrics_df_TMCQ

Unnamed: 0,fit_time,score_time,test_accuracy,test_neg_log_loss,test_roc_auc
LogReg,0.00237722,0.00162859,0.927743,-0.178276,0.978986
RandomForest,0.112036,0.318826,0.92941,-0.632891,0.956898
GradientBoosting,0.0756029,0.00187788,0.926091,-0.218861,0.969757
XGB,0.0295064,0.00283685,0.927715,-0.207033,0.971983


In [45]:
# testing on holdout to see if its somewhat similar
X_test_TMCQ = holdout_data[['Y1_P_TMCQ_ACTIVCONT', 'Y1_P_TMCQ_ACTIVITY', 'Y1_P_TMCQ_AFFIL',
       'Y1_P_TMCQ_ANGER', 'Y1_P_TMCQ_FEAR', 'Y1_P_TMCQ_HIP',
       'Y1_P_TMCQ_IMPULS', 'Y1_P_TMCQ_INHIBIT', 'Y1_P_TMCQ_SAD',
       'Y1_P_TMCQ_SHY', 'Y1_P_TMCQ_SOOTHE', 'Y1_P_TMCQ_ASSERT',
       'Y1_P_TMCQ_ATTFOCUS', 'Y1_P_TMCQ_LIP', 'Y1_P_TMCQ_PERCEPT',
       'Y1_P_TMCQ_DISCOMF', 'Y1_P_TMCQ_OPENNESS', 'Y1_P_TMCQ_SURGENCY',
       'Y1_P_TMCQ_EFFCONT', 'Y1_P_TMCQ_NEGAFFECT']]
# Binary target: DX code 3 -> 1, DX code 1 -> 0.
y_test_TMCQ = holdout_data['DX'].map({3:1,1:0})

# Keep only the holdout subjects with every TMCQ column present.
# Labels are taken from y_test_TMCQ (defined just above) rather than from y_test,
# which lives in a different cell, so this cell no longer depends on that one.
holdout_complete_rows = X_test_TMCQ.isnull().sum(axis=1) == 0
X_test_TMCQ_nonull = X_test_TMCQ[holdout_complete_rows]
y_test_TMCQ_nonull = y_test_TMCQ[holdout_complete_rows]
# also get similar results! yay!