# Deep learning model Cross-Tab Analysis
This notebook contains the implementation and evaluation of various neural network models with differing parameters.

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

%aimport deepsvr

Populating the interactive namespace from numpy and matplotlib


In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import os
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

from analysis_utils.ClassifierPlots import create_reliability_diagram, create_roc_curve, create_feature_importance_plot, make_model_output_plot
from analysis_utils.Analysis import determine_feature_importance, print_accuracy_and_classification_report, predict_classes, get_somatic_error_type, calculate_kappa

# Set seaborn's global plot style to "white" and its plotting context to 'talk'
sns.set_style("white")
sns.set_context('talk')

Using TensorFlow backend.


In [4]:
# Pull in training data
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code; only load pickles that come from a trusted source.
training_data = pd.read_pickle('../data/training_data_preprocessed.pkl')

In [5]:
# Remove AML31 case: rows whose index contains the string below are set aside
# in aml31_training and excluded from training_data.
# The mask is computed once and the string is matched literally (regex=False)
# rather than being interpreted as a regular expression.
# NOTE: training_data is overwritten here, so re-running this cell after the
# first run leaves aml31_training empty.
is_aml31 = training_data.index.str.contains('fSsMNn1DZ3AIDGk=', regex=False)
aml31_training = training_data[is_aml31]
training_data = training_data[~is_aml31]

In [6]:
# Show the number of rows for each call label in the training data
training_data.groupby('call').size()

call
a    10643
f     8854
g     3122
s    18381
dtype: int64

In [7]:
# Fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; presumably the
# Keras/TensorFlow backend keeps its own RNG state — confirm if exactly
# reproducible training is required.
seed = 7
np.random.seed(seed)

In [8]:
# Set parameters for cross-validation fold splits: 10 shuffled folds, seeded
# NOTE(review): kfold is not referenced again in the visible part of this
# notebook — confirm it is still needed.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [9]:
# Re-label the germline calls ('g') as failed calls ('f').
# The replacement is restricted to the 'call' column so a literal 'g' in any
# other column can never be rewritten, and a new frame is built instead of
# sorting in place. Columns end up sorted by name, as before.
three_class = training_data.assign(
    call=training_data['call'].replace('g', 'f')
).sort_index(axis=1)

In [10]:
# Show the number of rows for each call label after re-labelling 'g' as 'f'
three_class.groupby('call').size()

call
a    10643
f    11976
s    18381
dtype: int64

In [114]:
# One-hot encode the call labels as a float array; columns are ordered a, f, s
Y = pd.get_dummies(three_class.call)[['a', 'f', 's']].astype(float).values

# Feature matrix: every column of training_data except 'call', cast to float
X = training_data.drop(['call'], axis=1).astype(float).values

In [115]:
# Hold out 33% of the rows as a test set; the split is seeded with `seed`
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=seed)

In [116]:
# Shape of the feature matrix for the training split
X_train.shape

(27470, 71)

In [117]:
# Shape of the one-hot label matrix for the training split
Y_train.shape

(27470, 3)

In [15]:
# Define baseline model
def three_class_model(input_dim=71, hidden_units=20, n_hidden_layers=4,
                      n_classes=3, l2_strength=0.001):
    """Build and compile the fully connected softmax classifier.

    Called with no arguments it builds the same network as before: a tanh
    layer as wide as the input (71), four tanh layers of 20 units, and a
    3-way softmax output, with L2 weight regularisation (0.001) on every
    layer except the output.

    Parameters:
        input_dim (int): number of input features; also the width of the
            first layer
        hidden_units (int): width of each subsequent hidden layer
        n_hidden_layers (int): number of ``hidden_units``-wide layers
        n_classes (int): number of output classes
        l2_strength (float): L2 regularisation factor for the tanh layers

    Returns:
        keras.models.Sequential: compiled model (categorical cross-entropy
        loss, adam optimizer, accuracy metric)
    """
    model = Sequential()

    # First layer is as wide as the feature vector
    model.add(Dense(input_dim, input_dim=input_dim, kernel_initializer='normal',
                    activation='tanh', kernel_regularizer=l2(l2_strength)))

    # Stack of identical hidden layers; each gets its own regulariser instance
    for _ in range(n_hidden_layers):
        model.add(Dense(hidden_units, activation='tanh',
                        kernel_regularizer=l2(l2_strength)))

    # Softmax output, one unit per class
    model.add(Dense(n_classes, kernel_initializer='normal', activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [16]:
# Set deep learning model parameters: wrap the model builder in the
# scikit-learn style KerasClassifier, training for 700 epochs with a batch
# size of 2000 and verbose=0
estimator = KerasClassifier(build_fn=three_class_model, epochs=700, batch_size=2000, verbose=0)

In [118]:
# Train the classifier on the training split only (X_test / Y_test stay held out)
estimator.fit(X_train, Y_train)

<keras.callbacks.History at 0x1a27d87940>

# Perform Cross Tab Analysis

In [320]:
def cross_tab(estimator, filter_feature, metric, dataframe):
    """Evaluate the classifier on the rows of ``dataframe`` selected by one feature.

    Parameters:
        estimator: fitted classifier exposing ``predict_proba``; its output
            columns are labelled ambiguous, fail, somatic in that order
        filter_feature (str): column of ``dataframe`` to filter on
        metric: a scalar selects rows where the column equals that value;
            a sequence selects rows where the column lies in the inclusive
            range ``[metric[0], metric[1]]``
        dataframe (pandas.DataFrame): feature columns plus the one-hot label
            columns 'ambiguous', 'fail' and 'somatic'

    Returns:
        tuple: ``(somatic_count, ambiguous_count, fail_count, roc_auc)`` for
        the selected rows, where ``roc_auc`` is the AUC of the somatic class.
        ``roc_auc`` is NaN when no rows are selected.

    Raises:
        ValueError: if ``metric`` is a sequence with fewer than two elements
    """
    label_columns = ['ambiguous', 'fail', 'somatic']

    if np.isscalar(metric):
        # Any scalar (not only 1) is treated as an exact-match filter
        selected = dataframe[dataframe[filter_feature] == metric]
    else:
        if len(metric) < 2:
            raise ValueError(
                'metric must be a scalar or a (lower, upper) sequence, got %r' % (metric,))
        lower, upper = metric[0], metric[1]
        in_range = (dataframe[filter_feature] >= lower) & (dataframe[filter_feature] <= upper)
        selected = dataframe[in_range]

    s = int((selected['somatic'] == 1).sum())
    f = int((selected['fail'] == 1).sum())
    a = int((selected['ambiguous'] == 1).sum())

    # Nothing to score: skip the model call and the ROC computation
    if selected.empty:
        return s, a, f, float('nan')

    # Truth labels and features as numpy arrays
    # (column selection + .values replaces the deprecated DataFrame.as_matrix)
    Y = selected[label_columns].values
    X = selected.drop(label_columns, axis=1).astype(float).values

    probs = estimator.predict_proba(X)

    # Give truth and predictions the same column names so that the
    # notebook's roc_curve helper can pick out the 'somatic' column of each
    probs = pd.DataFrame(probs, columns=['amb', 'fail', 'somatic'])
    Y = pd.DataFrame(Y, columns=['amb', 'fail', 'somatic'])
    class_lookup = {0: 'Ambiguous', 1: 'Fail', 2: 'Somatic'}
    roc_auc, fpr, tpr = roc_curve(Y, probs, class_lookup)

    return s, a, f, roc_auc
                                      

def roc_curve(Y, probabilities, class_lookup):
    '''Compute the ROC curve and its AUC for the 'somatic' class.

    Parameters:
        Y: truth labels, indexable by the key 'somatic'
        probabilities: model outputs, indexable by the key 'somatic'
        class_lookup (dict): accepted but not used by this function

    Returns:
        tuple: (roc_auc, fpr, tpr)
    '''
    somatic_truth = Y['somatic']
    somatic_scores = probabilities['somatic']
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(
        somatic_truth, somatic_scores)
    area = metrics.auc(false_pos_rate, true_pos_rate)

    return area, false_pos_rate, true_pos_rate

In [321]:
# Column names for the merged test frame: every column of training_data except
# 'call' (in its existing order, as in the frame X was built from), followed
# by the three one-hot label columns.
columns_list = [col for col in training_data.columns if col != 'call']
columns_list += ['ambiguous', 'fail', 'somatic']

In [322]:
# Rebuild a labelled DataFrame for the held-out test split: feature columns
# followed by the one-hot label columns. Uses the explicitly imported `np`
# instead of the bare `numpy` name, which is only in scope because of %pylab.
merged_dataframe = pd.DataFrame(np.hstack((X_test, Y_test)), columns=columns_list)

In [325]:
# Filters for the cross-tab analysis, keyed by feature column.
# cross_tab treats a value of 1 as "rows where the column equals 1" and a
# two-element list as "rows whose value lies in the inclusive range
# [lower, upper]".
features_for_analysis = {'disease_AML':1,'disease_GST':1,'disease_MPNST':1,
                         'disease_SCLC':1,'disease_breast':1,'disease_colorectal':1,
                         'disease_glioblastoma':1,'disease_lymphoma':1,'disease_melanoma':1,
                         'reviewer_1':1,'reviewer_2':1,'reviewer_3':1,'reviewer_4':1,
                         'normal_depth':[0, 0.01],
                         'tumor_depth':[0, 0.01]}

In [326]:
# Run cross_tab once per filter on the held-out test frame and collect one row
# per filter: [filter name, filter value, somatic count, ambiguous count,
# fail count, somatic ROC AUC]
final = []
for k,v in features_for_analysis.items():
    filter_feature = k
    metric = v
    s, a, f, roc_auc = cross_tab(estimator, filter_feature, metric, merged_dataframe)
    final.append([k, v, s, a, f, roc_auc])

In [327]:
# Tabulate the per-filter class counts and somatic ROC AUC
pd.DataFrame(final, columns=['filter', 'value', 'somatic', 'ambiguous', 'fail', 'roc_auc'])

Unnamed: 0,filter,value,somatic,ambiguous,fail,roc_auc
0,disease_AML,1,872,581,1424,0.970537
1,disease_GST,1,14,13,4,0.978992
2,disease_MPNST,1,6,12,124,0.992647
3,disease_SCLC,1,2465,1463,673,0.961696
4,disease_breast,1,2319,600,1401,0.956193
5,disease_colorectal,1,12,283,124,0.756962
6,disease_glioblastoma,1,150,205,57,0.908448
7,disease_lymphoma,1,242,238,148,0.964309
8,disease_melanoma,1,35,59,6,0.894505
9,reviewer_1,1,1282,871,1347,0.967902
