In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

%aimport manual_review_classifier

Populating the interactive namespace from numpy and matplotlib


In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import os
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

from manual_review_classifier.ClassifierPlots import create_reliability_diagram, create_roc_curve, create_feature_importance_plot, make_model_output_plot
from manual_review_classifier.Analysis import determine_feature_importance, print_accuracy_and_classification_report, predict_classes, get_somatic_error_type, calculate_kappa

sns.set_style("white")
sns.set_context('talk')

Using TensorFlow backend.


In [4]:
training_data = pd.read_pickle('../data/training_data3.pkl')

In [5]:
aml31_training = training_data[training_data.index.str.contains('H_KA-452198')]
training_data = training_data[~training_data.index.str.contains('H_KA-452198')]

In [6]:
training_data.groupby('call').size()

call
a    10643
f     8854
g     3122
s    18381
dtype: int64

In [7]:
three_class = training_data.replace('g','f')

s_v_b = three_class
#s_v_b['blood_tumor'] = s_v_b[['disease_AML','disease_lymphoma']].apply(any, axis=1).astype(int)
s_v_b['solid_tumor'] = s_v_b[['disease_GST', 'disease_MPNST', 'disease_SCLC',
                              'disease_breast', 'disease_colorectal', 
                              'disease_glioblastoma', 'disease_melanoma']].apply(any, axis=1).astype(int)
s_v_b.drop(['disease_AML', 'disease_GST', 'disease_MPNST', 'disease_SCLC',
       'disease_breast', 'disease_colorectal', 'disease_glioblastoma',
       'disease_lymphoma', 'disease_melanoma'], axis=1, inplace=True)

s_v_b.groupby('call').size()

call
a    10643
f    11976
s    18381
dtype: int64

In [8]:
# Get Labels
Y = pd.get_dummies(three_class.call).astype(float).values
# Get training data as numpy array
X = s_v_b.sort_index(axis=1).drop(['call', 'reviewer_Lee',
                'reviewer_Avi', 'reviewer_Heather', 
                'reviewer_Nick'], axis=1).astype(float).values

In [9]:
# define baseline model
def test_model():
    # create model
    model = Sequential()
    model.add(Dense(59, input_dim=59, kernel_initializer='normal', activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(20, activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(20, activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(20, activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(20, activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(3, kernel_initializer='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [10]:
test_estimator = KerasClassifier(build_fn=test_model, epochs=700, batch_size=2000, verbose=0)

In [18]:
test_estimator.fit(X, Y)

<keras.callbacks.History at 0x160c01940>

In [12]:
aml31 = pd.read_pickle('../data/aml31/AML31/train.pkl')

In [13]:
aml31['solid_tomor']=1

In [14]:
aml31_X = aml31.sort_index(axis=1).drop(['disease_AML', 'reviewer_None'], axis=1).astype(float).values

In [20]:
predictions = test_estimator.predict_proba(aml31_X)

array([[ 0.83176953,  0.06802499,  0.10020553],
       [ 0.81903011,  0.08769801,  0.09327181],
       [ 0.82265919,  0.00493509,  0.17240576],
       ..., 
       [ 0.84179646,  0.00863952,  0.14956395],
       [ 0.84154475,  0.00844866,  0.15000664],
       [ 0.84806657,  0.01679234,  0.13514104]], dtype=float32)

In [21]:
predictions = pd.DataFrameFrame(predictions, columns=['amb', 'fail', 'somatic'])

In [23]:
predictions.index = aml31.index

In [24]:
predictions['individual_name'] = predictions.index.str.extract('(.*)~', expand=False)
predictions['chr'] = predictions.index.str.extract('~(.*):', expand=False)
predictions['start'] = predictions.index.str.extract(':(.*)-\d+[A-z-]+>', expand=False)
predictions['stop'] = predictions.index.str.extract('\d+-(\d*)[A-z-]+>', expand=False)
predictions['ref'] = predictions.index.str.extract('-\d+([A-z-]+)>', expand=False)
predictions['var'] = predictions.index.str.extract('>([A-z-]+)', expand=False)

In [26]:
predictions['prediction'] = test_estimator.predict(aml31_X)

In [40]:
platinum = pd.read_csv('../data/aml31/Supplemental_Dataset_3-PlatinumSnvList.tsv', sep='\t')
platinum.rename(columns={'chromosome_name':'chr',  'reference':'ref', 'variant':'var'}, inplace=True)

In [52]:
predictions['start'] = predictions.start.astype(int)
predictions['stop'] = predictions.stop.astype(int)

In [54]:
m = pd.merge(predictions, platinum, how='left', on=['chr', 'start', 'stop',
                                                'ref', 'var'])

In [75]:
len(m[(m.prediction==2)& (m.gene_name.notnull())])/len(platinum)

0.9054355919583023

In [97]:
print('0.7 somatic threshold')
print('False Negatives: ', len(m[(m.somatic<=.7)& (m.gene_name.notnull())]))
print('True Positives: ', len(m[(m.somatic>=.7)& (m.gene_name.notnull())]))
print('False Positives: ', len(m[(m.somatic>=.7)& (m.gene_name.isnull())]))
print('True Negatives: ', len(m[(m.somatic<=.7)& (m.gene_name.isnull())]))
print('\nSimple majority threshold')
print('False Negatives: ', len(m[(m.prediction<2)& (m.gene_name.notnull())]))
print('True Positives: ', len(m[(m.prediction==2)& (m.gene_name.notnull())]))
print('False Positives: ', len(m[(m.prediction==2)& (m.gene_name.isnull())]))
print('True Negatives: ', len(m[(m.prediction<2)& (m.gene_name.isnull())]))

0.7 somatic threshold
False Negatives:  324
True Positives:  1012
False Positives:  2351
True Negatives:  188554

Simple majority threshold
False Negatives:  120
True Positives:  1216
False Positives:  44305
True Negatives:  146600


In [80]:
len(predictions)

192241