In [1]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
from IPython.display import HTML

from virtual_screening.function import *
from prospective_screening_model_names import *
from prospective_screening_metric_names import *



In [2]:
# Build `complete_df`: one row per screened molecule with its id, its binary
# activity label, its measured inhibition, and one prediction column per model
# whose .npz output file exists on disk.

# Column of the export that holds the measured % inhibition.
INHIBITION_COLUMN = 'PriA-SSB AS, normalized for plate and edge effects, correct plate map: % inhibition Alpha, normalized (%)'
# A molecule is labelled active (1) when its inhibition is >= this value.
HIT_THRESHOLD = 35

dataframe = pd.read_excel('../../output/stage_2_predictions/Keck_LC4_export.xlsx')

supplier_id = dataframe['Supplier ID'].tolist()
# Molecules listed here are always labelled 0, whatever their inhibition.
failed_id = ['F0401-0050', 'F2964-1411', 'F2964-1523']
inhibits = dataframe[INHIBITION_COLUMN].tolist()

# Materialise real lists (not lazy map/filter objects) so the result is the
# same on Python 2 and Python 3, and look rows up through a set so labelling
# is linear instead of scanning the positive list once per molecule.
positive_enumerate = [
    (row_idx, inhibition)
    for row_idx, inhibition in enumerate(inhibits)
    if inhibition >= HIT_THRESHOLD and supplier_id[row_idx] not in failed_id
]
positive_idx = [pair[0] for pair in positive_enumerate]
positive_idx_set = set(positive_idx)
actual_label = [1 if row_idx in positive_idx_set else 0
                for row_idx in range(len(supplier_id))]

complete_df = pd.DataFrame({'molecule id': supplier_id, 'label': actual_label, 'inhibition': inhibits})
column_names = ['molecule id', 'label', 'inhibition']
complete_df = complete_df[column_names]

dir_ = '../../output/stage_2_predictions/RMI'

# The molecule ids are read from this single file and reused for every model
# below. NOTE(review): this assumes all model files store predictions in the
# same molecule order as 'vanilla_lstm_19' -- confirm.
file_path = '{}/{}.npz'.format(dir_, 'vanilla_lstm_19')
with np.load(file_path) as data:
    molecule_id = data['molecule_id']

model_names = []
# Models whose file names contain one of these substrings store their
# predictions under 'y_pred_on_test' rather than 'y_pred'.
special_models = ['irv', 'random_forest', 'dockscore', 'consensus', 'baseline']

for model_name in model_name_mapping.keys():
    file_path = '{}/{}.npz'.format(dir_, model_name)
    if not os.path.exists(file_path):
        # No prediction file for this model: leave it out of the table.
        continue

    if any(x in model_name for x in special_models):
        prediction_key = 'y_pred_on_test'
    else:
        prediction_key = 'y_pred'

    # The context manager closes the archive once the array has been read.
    with np.load(file_path) as data:
        y_pred = data[prediction_key]
    if y_pred.ndim == 2:
        # Keep only the first column of a 2-D prediction array.
        y_pred = y_pred[:, 0]

    display_name = model_name_mapping[model_name]
    temp_df = pd.DataFrame({'molecule id': molecule_id,
                            display_name: y_pred})

    model_names.append(display_name)
    # Left-join on molecule id so molecules without a prediction get NaN.
    complete_df = complete_df.join(temp_df.set_index('molecule id'), on='molecule id')

model_names = sorted(model_names)
column_names.extend(model_names)

# Final column order: id, label, inhibition, then models alphabetically.
complete_df = complete_df[column_names]

In [3]:
# Evaluate every model column of `complete_df` with every metric in
# `metric_name_mapping` and collect the results in `metric_df`
# (one row per model, one column per metric).

# `.values` replaces the deprecated `.as_matrix()` (removed in pandas 1.0).
true_label = complete_df['label'].values
true_label = reshape_data_into_2_dim(true_label)

# NOTE(review): never filled in this cell; kept in case a later cell uses it.
roc_auc_list = []
metric_df = pd.DataFrame({'Model': model_names})

# `.items()` and `print(...)` with a single argument behave the same on
# Python 2 and also run on Python 3.
for metric_name, metric_ in metric_name_mapping.items():
    print(metric_name)
    metric_values = []
    for model_name in model_names:
        pred = complete_df[model_name].values
        pred = reshape_data_into_2_dim(pred)

        # Drop NaN entries from labels and predictions together before
        # scoring (e.g. molecules with no prediction after the join).
        actual, pred = collectively_drop_nan(true_label, pred)
        # Each mapping entry supplies the metric callable and its keyword
        # arguments.
        value = metric_['function'](actual, pred, **metric_['argument'])
        metric_values.append(value)
    metric_df[metric_name] = metric_values

roc_auc
bed_roc_auc
precision_auc_single
nef_001
nef_01
nef_02
number_of_hit_250
number_of_hit_500
number_of_hit_1000
ratio_of_hit_001
ratio_of_hit_01
ratio_of_hit_02


In [4]:
# Render the per-model metric table as an HTML cell output.
metric_table_markup = metric_df.to_html()
HTML(metric_table_markup)

Unnamed: 0,Model,roc_auc,bed_roc_auc,precision_auc_single,nef_001,nef_01,nef_02,number_of_hit_250,number_of_hit_500,number_of_hit_1000,ratio_of_hit_001,ratio_of_hit_01,ratio_of_hit_02
0,CBF_a,0.526642,0.144061,0.011286,0.047619,0.142857,0.27381,4,4,5,4,12,23
1,CBF_b,0.482274,0.132979,0.005194,0.047619,0.166667,0.22619,4,6,6,4,14,19
2,CBF_c,0.48315,0.119252,0.004964,0.035714,0.142857,0.178571,3,5,5,3,12,15
3,CBF_d,0.594106,0.154198,0.013976,0.035714,0.166667,0.261905,3,5,8,3,14,22
4,CBF_e,0.569842,0.137479,0.004989,0.035714,0.142857,0.22619,3,5,7,3,12,19
5,CBF_f,0.567843,0.136228,0.005704,0.035714,0.130952,0.22619,3,4,8,3,11,19
6,ConsensusDocking_efr1_opt,0.454964,0.079837,0.002922,0.0,0.095238,0.154762,0,2,2,0,8,13
7,ConsensusDocking_max,0.524284,0.098762,0.00349,0.0,0.095238,0.166667,0,4,5,0,8,14
8,ConsensusDocking_mean,0.503629,0.096666,0.003306,0.0,0.107143,0.190476,0,2,3,0,9,16
9,ConsensusDocking_median,0.503286,0.101187,0.003397,0.02381,0.095238,0.178571,2,3,4,2,8,15
