In [2]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Set BASE_PATH to the base directory where all the model data will be stored
BASE_PATH = os.path.expanduser('~/models')

model_dir = os.path.join(BASE_PATH, 'models_full_sim/saureus/saureus_organism_ic50')

In [4]:
map_submodel_abbreviation = {
    'pred-value':'pooled',
    'individual_full_descriptors-cc-signaturizer':'cc_signaturizer',
    'individual_full_descriptors-morgan-counts':'morgan_counts',
    'individual_full_descriptors-mordred':'mordred',
    'individual_full_descriptors-grover-embedding':'grover'
}

In [5]:
df = pd.read_csv(os.path.join(model_dir, 'test', 'output_table.csv'),
        usecols=['true-value', 'pred-value',
                 'individual_full_descriptors-cc-signaturizer',
                 'individual_full_descriptors-morgan-counts',
                 'individual_full_descriptors-mordred',
                 'individual_full_descriptors-grover-embedding',
                 'manifolds', 'classic', 'reference_embedding', 'fingerprint', 'molmap'])
df.rename(inplace=True, columns=map_submodel_abbreviation)
print('\nShape:', df.shape)


Shape: (349, 11)


In [6]:
df_performance = pd.read_csv(os.path.join(model_dir, 'test', 'performance_table.csv'))
print('\nShape:', df_performance.shape)


Shape: (10, 25)


In [7]:
# Create variable with abbreviated submodel names
df_performance['submodel_short'] = df_performance.model.replace(map_submodel_abbreviation)

In [8]:
df_performance

Unnamed: 0,model,num_train,num_test,num_train_0,num_train_1,num_test_0,num_test_1,auroc,aupr,cutoff,...,precision,recall,f1_score,mcc,precision_at_1,precision_at_5,precision_at_10,precision_at_50,precision_at_100,submodel_short
0,pooled,1225,349,0,0,0,0,0.868657,0.81357,0.4,...,0.675,0.771429,0.72,0.590532,1.0,1.0,1.0,0.94,0.74,pooled
1,individual_full_descriptors-cc-signaturizer,1225,349,0,0,0,0,0.820531,0.733185,0.45,...,0.65,0.619048,0.634146,0.482432,1.0,1.0,1.0,0.86,0.65,cc_signaturizer
2,individual_full_descriptors-morgan-counts,1225,349,0,0,0,0,0.856714,0.793902,0.4,...,0.678571,0.72381,0.700461,0.56615,1.0,1.0,1.0,0.96,0.71,morgan_counts
3,individual_full_descriptors-mordred,1225,349,0,0,0,0,0.85767,0.795648,0.05,...,0.333333,0.971429,0.49635,0.188884,1.0,1.0,1.0,0.9,0.74,mordred
4,individual_full_descriptors-grover-embedding,1225,349,0,0,0,0,0.84918,0.740357,0.35,...,0.589552,0.752381,0.661088,0.496941,0.0,0.8,0.9,0.9,0.7,grover
5,manifolds,1225,349,0,0,0,0,0.824824,0.732266,0.55,...,0.612069,0.67619,0.642534,0.478784,1.0,1.0,1.0,0.8,0.68,manifolds
6,classic,1225,349,0,0,0,0,0.809758,0.686677,0.05,...,0.527027,0.742857,0.616601,0.423154,1.0,1.0,0.9,0.76,0.65,classic
7,reference_embedding,1225,349,0,0,0,0,0.850644,0.794494,0.5,...,0.65,0.742857,0.693333,0.551073,1.0,1.0,1.0,0.94,0.73,reference_embedding
8,fingerprint,1225,349,0,0,0,0,0.856265,0.794635,0.4,...,0.68932,0.67619,0.682692,0.548068,1.0,1.0,1.0,0.94,0.7,fingerprint
9,molmap,1225,349,0,0,0,0,0.818755,0.728427,0.45,...,0.626263,0.590476,0.607843,0.446483,1.0,1.0,1.0,0.84,0.62,molmap


In [9]:
# Read cutoff value of each submodel into a dictionary (key=submodel)
submodel_cutoff = {}
for _, row in df_performance.iterrows():
    submodel_cutoff[row.submodel_short] = row.cutoff
    
submodel_cutoff

{'pooled': 0.4,
 'cc_signaturizer': 0.45,
 'morgan_counts': 0.4,
 'mordred': 0.05,
 'grover': 0.35,
 'manifolds': 0.55,
 'classic': 0.05,
 'reference_embedding': 0.5,
 'fingerprint': 0.4,
 'molmap': 0.45}

In [12]:
# Calculate values predicted by each submodel using the cutoff of the submodel

df['pred_pooled'] = df.pooled.apply(lambda x: 1 if x >= submodel_cutoff['pooled'] else 0)
df['pred_cc_signaturizer'] = df.cc_signaturizer.apply(lambda x: 1 if x >= submodel_cutoff['cc_signaturizer'] else 0)
df['pred_morgan_counts'] = df.morgan_counts.apply(lambda x: 1 if x >= submodel_cutoff['morgan_counts'] else 0)
df['pred_mordred'] = df.mordred.apply(lambda x: 1 if x >= submodel_cutoff['mordred'] else 0)
df['pred_grover'] = df.mordred.apply(lambda x: 1 if x >= submodel_cutoff['grover'] else 0)
df['pred_manifolds'] = df.manifolds.apply(lambda x: 1 if x >= submodel_cutoff['manifolds'] else 0)
df['pred_classic'] = df.classic.apply(lambda x: 1 if x >= submodel_cutoff['classic'] else 0)
df['pred_reference_embedding'] = df.reference_embedding.apply(lambda x: 1 if x >= submodel_cutoff['reference_embedding'] else 0)
df['pred_fingerprint'] = df.fingerprint.apply(lambda x: 1 if x >= submodel_cutoff['fingerprint'] else 0)
df['pred_molmap'] = df.fingerprint.apply(lambda x: 1 if x >= submodel_cutoff['molmap'] else 0)



In [13]:
# Calculate if the prediction of each submodel is correct

df['correct_pooled'] = df.apply(lambda row: 1 if row['true-value']==row.pred_pooled else 0, axis = 1)
df['correct_cc_signaturizer'] = df.apply(lambda row: 1 if row['true-value']==row.pred_cc_signaturizer else 0, axis = 1)
df['correct_morgan_counts'] = df.apply(lambda row: 1 if row['true-value']==row.pred_morgan_counts else 0, axis = 1)
df['correct_mordred'] = df.apply(lambda row: 1 if row['true-value']==row.pred_mordred else 0, axis = 1)
df['correct_grover'] = df.apply(lambda row: 1 if row['true-value']==row.pred_grover else 0, axis = 1)
df['correct_manifolds'] = df.apply(lambda row: 1 if row['true-value']==row.pred_manifolds else 0, axis = 1)
df['correct_classic'] = df.apply(lambda row: 1 if row['true-value']==row.pred_classic else 0, axis = 1)
df['correct_reference_embedding'] = df.apply(lambda row: 1 if row['true-value']==row.pred_reference_embedding else 0, axis = 1)
df['correct_fingerprint'] = df.apply(lambda row: 1 if row['true-value']==row.pred_fingerprint else 0, axis = 1)
df['correct_molmap'] = df.apply(lambda row: 1 if row['true-value']==row.pred_molmap else 0, axis = 1)


In [14]:
df.head()

Unnamed: 0,true-value,pooled,cc_signaturizer,morgan_counts,mordred,grover,manifolds,classic,reference_embedding,fingerprint,...,correct_pooled,correct_cc_signaturizer,correct_morgan_counts,correct_mordred,correct_grover,correct_manifolds,correct_classic,correct_reference_embedding,correct_fingerprint,correct_molmap
0,0,0.415223,0.396259,0.41583,0.456857,0.217651,0.623189,0.008554,0.733694,0.468976,...,0,1,0,0,0,0,1,0,0,0
1,0,0.554912,0.464286,0.337862,0.54723,0.380803,0.645859,0.737548,0.842505,0.542982,...,0,0,1,0,0,0,0,0,0,0
2,1,0.491452,0.614286,0.482045,0.27248,0.530525,0.853384,0.113647,0.906651,0.369434,...,1,1,1,1,0,1,1,1,0,0
3,0,0.489071,0.492603,0.52979,0.570391,0.520485,0.272483,0.023673,0.90394,0.581721,...,0,0,0,0,0,1,1,0,0,0
4,0,0.236841,0.080613,0.105125,0.111717,0.174273,0.110253,0.966397,0.042104,0.08044,...,1,1,1,0,1,1,0,1,1,1


In [16]:
df1 = df[[
    'true-value',
    'pred_pooled',
    'correct_pooled',
    'correct_cc_signaturizer',
    'correct_morgan_counts',
    'correct_mordred',
    'correct_grover',
    'correct_manifolds',
    'correct_classic',
    'correct_reference_embedding',
    'correct_fingerprint',
    'correct_molmap'
]]

In [17]:
# For the cases where the pool model is wrong, see % of correct results of each submodel
pd.options.display.float_format = '{:,.0f}%'.format
df1[df1.correct_pooled==0].drop(columns=['true-value', 'pred_pooled']).mean() * 100

correct_pooled                 0%
correct_cc_signaturizer       35%
correct_morgan_counts         24%
correct_mordred               33%
correct_grover                16%
correct_manifolds             22%
correct_classic               22%
correct_reference_embedding   22%
correct_fingerprint           24%
correct_molmap                30%
dtype: float64

In [18]:
# For false positives in pool model, see % of correct results of each submodel
pd.options.display.float_format = '{:,.0f}%'.format
df1[(df.pred_pooled==1) & (df['true-value']==0)].drop(columns=['true-value', 'pred_pooled']).mean() * 100

correct_pooled                 0%
correct_cc_signaturizer       46%
correct_morgan_counts         33%
correct_mordred                0%
correct_grover                15%
correct_manifolds             26%
correct_classic               18%
correct_reference_embedding   28%
correct_fingerprint           33%
correct_molmap                46%
dtype: float64

In [19]:
# For false negatives in pool model, see % of correct results of each submodel
pd.options.display.float_format = '{:,.0f}%'.format
df1[(df.pred_pooled==0) & (df['true-value']==1)].drop(columns=['true-value', 'pred_pooled']).mean() * 100

correct_pooled                 0%
correct_cc_signaturizer       17%
correct_morgan_counts          8%
correct_mordred               88%
correct_grover                17%
correct_manifolds             17%
correct_classic               29%
correct_reference_embedding   12%
correct_fingerprint            8%
correct_molmap                 4%
dtype: float64