In [1]:
import glob
import json
import os
import pathlib
import sys
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

sys.path.insert(0, '../src/')

from function import *
from util import output_classification_result
from evaluation import *

def extract_feature_and_label(data_pd,
                              feature_name,
                              label_name_list):
    """Build the feature matrix and the label matrix from a data frame.

    Every entry of ``data_pd[feature_name]`` is decoded character by
    character: each character is mapped to its offset from ``'0'``, so a
    digit string such as ``'0101'`` becomes the row ``[0., 1., 0., 1.]``.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame holding the feature column and the label columns.
    feature_name : str
        Name of the column of equal-length digit strings
        (by default, fingerprints).
    label_name_list : list of str
        Names of the label columns to extract.

    Returns
    -------
    X_data : numpy.ndarray
        Float matrix with one decoded row per row of ``data_pd``.
    y_data : numpy.ndarray
        Float labels, as returned by ``reshape_data_into_2_dim``.
    """
    # np.fromstring in binary mode is deprecated (DeprecationWarning since
    # NumPy 1.14); np.frombuffer over the ASCII bytes yields the same uint8
    # character codes.
    zero_code = ord('0')
    X_data = np.vstack([
        np.frombuffer(fingerprint.encode('ascii'), dtype='u1') - zero_code
        for fingerprint in data_pd[feature_name]
    ]).astype(float)

    y_data = np.array(data_pd[label_name_list].values.tolist())
    y_data = reshape_data_into_2_dim(y_data)
    y_data = y_data.astype(float)

    return X_data, y_data

def extract(file_path):
    """Parse the test ROC, test precision and 1% NEF out of a run log.

    The log is scanned line by line for three markers:

    * ``test precision`` -- value taken after the first ``:``
    * ``test roc``       -- value taken after the first ``:``
    * ``ratio: 0.01, NEF:`` -- value taken after the first ``,`` once the
      ``NEF:`` tag has been removed

    When a marker occurs several times, the last occurrence wins.

    Parameters
    ----------
    file_path : str
        Path of the log file to parse.

    Returns
    -------
    tuple
        ``(test_roc, test_precision, test_NEF)``. A metric that is never
        found keeps the sentinel ``-1``; a missing file yields
        ``(-1, -1, -1)``.

    Raises
    ------
    ValueError, IndexError
        If a matching line does not have the expected ``key: value`` layout.
    """
    if not os.path.isfile(file_path):
        return -1, -1, -1

    test_roc, test_precision, test_NEF = -1, -1, -1
    with open(file_path, 'r') as f:
        # Iterate lazily and keep the raw text in its own variable so that
        # every marker test below is a substring test on the same string.
        for raw_line in f:
            text = raw_line.strip()
            if 'test precision' in raw_line:
                test_precision = float(text.split(':')[1])
            if 'test roc' in raw_line:
                test_roc = float(text.split(':')[1])
            if 'ratio: 0.01, NEF:' in raw_line:
                test_NEF = float(text.replace('NEF:', '').split(',')[1])
    return test_roc, test_precision, test_NEF

def get_metrics(y_pred_on_test, y_test, EF_ratio=0.01):
    """Score test-set predictions with the three metrics used in this notebook.

    Calls ``precision_auc_single``, ``roc_auc_single`` and
    ``normalized_enrichment_factor_single`` (in that order) and returns the
    results as ``(test_roc, test_precision, test_NEF)``. ``EF_ratio`` is
    forwarded to the enrichment-factor helper only.
    """
    scores = {
        'precision': precision_auc_single(y_pred_on_test, y_test),
        'roc': roc_auc_single(y_pred_on_test, y_test),
        'nef': normalized_enrichment_factor_single(y_pred_on_test, y_test, EF_ratio),
    }
    return scores['roc'], scores['precision'], scores['nef']

# Dataset layout: K fold files named fold_<i>.csv.
K = 10
directory = '../datasets/keck_pria/fold_{}.csv'
file_list = np.array([directory.format(fold) for fold in range(K)])

label_name_list = ["PriA-SSB AS Activity"]
EF_ratio_list =[0.01]

# Fold 9 is the test fold; only that file is loaded here.
test_index = 9
test_file_list = [file_list[test_index]]
print('test files ', test_file_list)

# Read the test fold, drop rows with missing labels, then split into X / y.
test_pd = read_merged_data(test_file_list)
test_pd = filter_out_missing_values(test_pd, label_list=label_name_list)
X_test, y_test = extract_feature_and_label(
    test_pd,
    feature_name='1024 MorganFP Radius 2',
    label_name_list=label_name_list)

test files  ['../datasets/keck_pria/fold_9.csv']


In [2]:
# Process ids per model class whose stored fold-9 predictions are checked
# against the metrics written to the original run logs.
# The 'ensemble' and 'ensemble_02' classes (ids 0-12) are not checked here.
model_process_num_list = {
    'random_forest_classification': [139, 69, 111, 212, 210, 148, 28, 61, 124, 130, 131, 141, 14, 38, 165, 65, 123, 94, 3, 88, 72],
    'xgboost_classification': [140, 967, 960, 807, 263, 694, 440, 47, 116, 792, 663, 32, 564, 950, 735, 84, 364, 605, 431, 55, 388],
    'xgboost_regression': [187, 6, 514, 507, 880, 440, 605, 718, 754, 409, 586, 214, 753, 65, 294, 911, 721, 81, 321, 545, 280],
    'single_deep_classification': [356, 404, 215, 93, 254, 88, 423, 47, 363, 132, 5, 385, 370, 29, 415, 54, 124, 183, 180, 416],
    'single_deep_regression': [199, 323, 114, 123, 47, 175, 17, 178, 106, 265, 67, 157, 369, 115, 191, 20, 27, 108, 270, 45]}

for model_class, process_ids in model_process_num_list.items():
    print('--------------------------------------------------------------------------------------')
    print(model_class)

    for mid in process_ids:
        pred_file = '../model_preds/chtc_full/{}/{}_9.npy'.format(model_class, mid)
        orig_output = '../output/{}/{}_{}_4.out'.format(model_class, model_class, mid)
        if model_class in ['ensemble', 'ensemble_02']:
            # Ensemble logs use a shorter file name.
            orig_output = '../output/{}/{}.out'.format(model_class, mid)

        # This cell only reads: no directories are created and no weight or
        # config files are touched.
        y_pred_on_test = np.load(pred_file)

        test_roc, test_precision, test_NEF = get_metrics(y_pred_on_test, y_test, EF_ratio=0.01)
        ref_roc, ref_precision, ref_NEF = extract(orig_output)

        # extract() reports -1 for a metric it could not find (or for a
        # missing log file), so name the offending model in the message.
        for metric, recomputed, logged in (('roc', test_roc, ref_roc),
                                           ('precision', test_precision, ref_precision),
                                           ('NEF', test_NEF, ref_NEF)):
            assert np.abs(recomputed - logged) <= 1e-6, (
                '{} mismatch for {} #{}: recomputed {} vs logged {} (log: {})'.format(
                    metric, model_class, mid, recomputed, logged, orig_output))

--------------------------------------------------------------------------------------
random_forest_classification
--------------------------------------------------------------------------------------
xgboost_classification
--------------------------------------------------------------------------------------
xgboost_regression
--------------------------------------------------------------------------------------
single_deep_classification
--------------------------------------------------------------------------------------
single_deep_regression


In [3]:
# Recompute the fold-9 metrics of every listed model and tabulate them.
model_nef_list = []
for model_class, process_ids in model_process_num_list.items():
    for mid in process_ids:
        pred_file = '../model_preds/chtc_full/{}/{}_9.npy'.format(model_class, mid)
        y_pred_on_test = np.load(pred_file)
        test_roc, test_precision, test_NEF = get_metrics(y_pred_on_test, y_test, EF_ratio=0.01)
        model_nef_list.append([model_class, mid, pred_file, test_roc, test_precision, test_NEF])

metric_columns = ['class', 'id', 'file', 'roc', 'pr', 'nef']
model_df = pd.DataFrame(model_nef_list, columns=metric_columns)

# Best model per class: rank by NEF (highest first) and keep the first row
# seen for each class.
top_models = (
    model_df
    .sort_values('nef', ascending=False)
    .drop_duplicates('class', keep='first')
)

In [4]:
# Metrics of process ids 140, 356, 187, 139 and 199. The 'id' column is built
# from the integer ids in model_process_num_list, so match on integers:
# isin() does not treat the string '140' as equal to the integer 140.
model_df[model_df['id'].isin([140, 356, 187, 139, 199])].sort_values('class')

Unnamed: 0,class,id,file,roc,pr,nef
0,random_forest_classification,139,../model_preds/chtc_full/random_forest_classif...,0.934515,0.173999,0.618182
63,single_deep_classification,356,../model_preds/chtc_full/single_deep_classific...,0.814814,0.157134,0.545455
83,single_deep_regression,199,../model_preds/chtc_full/single_deep_regressio...,0.922721,0.071245,0.418182
21,xgboost_classification,140,../model_preds/chtc_full/xgboost_classificatio...,0.923256,0.171148,0.581818
42,xgboost_regression,187,../model_preds/chtc_full/xgboost_regression/18...,0.842132,0.018381,0.254545


In [10]:
# Print the per-class best models as a LaTeX table, ordered by class name.
latex_columns = ['class', 'id', 'roc', 'pr', 'nef']
top_models_by_class = top_models.sort_values('class')
print(top_models_by_class[latex_columns].to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                        class &   id &       roc &        pr &       nef \\
\midrule
 random\_forest\_classification &   14 &  0.890389 &  0.190374 &  0.636364 \\
   single\_deep\_classification &   47 &  0.827064 &  0.130725 &  0.581818 \\
       single\_deep\_regression &  191 &  0.902766 &  0.069076 &  0.490909 \\
       xgboost\_classification &  140 &  0.923256 &  0.171148 &  0.581818 \\
           xgboost\_regression &   81 &  0.876637 &  0.049912 &  0.418182 \\
\bottomrule
\end{tabular}



In [12]:
# Show the metrics row(s) whose process id is 139.
model_df.loc[model_df['id'].eq(139)]

Unnamed: 0,class,id,file,roc,pr,nef
0,random_forest_classification,139,../model_preds/chtc_full/random_forest_classif...,0.934515,0.173999,0.618182


In [57]:
# Indices of the test rows whose first label column equals 1.
active_idx = np.where(y_test[:,0] == 1)[0]

# Number of top-ranked predictions inspected per model.
topk = 427

# Prediction file used for the random forest class instead of the file
# recorded in top_models.
rf_pred_file = '../model_preds/chtc/random_forest_classification/139.npy'


def _load_class_predictions(row):
    """Return the first prediction column for one row of ``top_models``."""
    if row['class'] == 'random_forest_classification':
        pred_file = rf_pred_file
    else:
        pred_file = row['file']
    return np.load(pred_file)[:,0]


def _top_ranked_actives(scores):
    """Return the members of ``active_idx`` among the ``topk`` highest scores."""
    topsort = np.argsort(scores)[::-1][:topk]
    return np.intersect1d(topsort, active_idx)


# Load every model's predictions exactly once and keep its top-k actives.
class_names = []
class_preds = []
class_actives = []
for _, row in top_models.iterrows():
    y_pred = _load_class_predictions(row)
    class_names.append(row['class'])
    class_preds.append(y_pred)
    class_actives.append(_top_ranked_actives(y_pred))

# Union of the actives found by any single model.
all_actives = np.unique(np.hstack(class_actives))

# Max vote: score every row by the highest prediction across the models.
all_preds = np.max(np.column_stack(class_preds), axis=1)
all_topsort = np.argsort(all_preds)[::-1][:topk]
mv_actives = np.intersect1d(all_topsort, active_idx)

# For each model, count the actives it misses relative to the union and
# relative to the max-vote ranking.
cov = []
for class_name, m_actives in zip(class_names, class_actives):
    m_coverage = np.setdiff1d(all_actives, m_actives).shape[0]
    mv_coverage = np.setdiff1d(mv_actives, m_actives).shape[0]
    cov.append([class_name, m_coverage, mv_coverage])

# The backslash (set difference) is escaped; '\ ' is an invalid escape sequence.
cdf = pd.DataFrame(cov, columns=['Class', 'All Actives \\ Model Actives', 'Max-Vote Actives \\ Model Actives'])

In [58]:
# Display the per-class coverage table built in the previous cell.
cdf

Unnamed: 0,Class,All Actives \ Model Actives,Max-Vote Actives \ Model Actives
0,random_forest_classification,4,1
1,xgboost_classification,5,2
2,xgboost_regression,11,3
3,single_deep_classification,6,3
4,single_deep_regression,11,0


---
# Ensemble

In [163]:
# One reference process id per base-model class.
# NOTE(review): this rebinds model_process_num_list from the validation cell
# above; re-running earlier cells afterwards will use this shorter mapping.
model_process_num_list = {
    'random_forest_classification': [139],
    'xgboost_classification': [140],
    'xgboost_regression': [187],
    'single_deep_classification': [356],
    'single_deep_regression': [199]}

# Short display names used in the result tables.
model_pname = {
    'random_forest_classification': 'RF-C',
    'xgboost_classification': 'XGB-C',
    'xgboost_regression': 'XGB-R',
    'single_deep_classification': 'NN-C',
    'single_deep_regression': 'NN-R'}

ensembles = {
   'ensemble': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
  'ensemble_02': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
}

# Indices of the test rows whose first label column equals 1.
active_idx = np.where(y_test[:,0] == 1)[0]

# Number of top-ranked predictions inspected per model.
topk = 427


def _topk_actives(scores):
    """Return the members of ``active_idx`` among the ``topk`` highest scores."""
    topsort = np.argsort(scores)[::-1][:topk]
    return np.intersect1d(topsort, active_idx)


# Load each ensemble's predictions once; they do not depend on the base model.
ensemble_actives = []
for ens, ens_ids in ensembles.items():
    ens_label = 'Model-Based' if ens == 'ensemble' else 'Max Vote'
    for ens_mid in ens_ids:
        y_ens = np.load('../model_preds/chtc_full/{}/{}_9.npy'.format(ens, ens_mid))
        if y_ens.ndim == 2:
            y_ens = y_ens[:,0]
        ensemble_actives.append(('{} #{}'.format(ens_label, ens_mid), _topk_actives(y_ens)))

res = []
for model_class, process_ids in model_process_num_list.items():
    mid = process_ids[0]
    y_pred = np.load('../model_preds/chtc_full/{}/{}_9.npy'.format(model_class, mid))[:,0]

    # The base model's top-k actives are the same for every ensemble.
    model_actives = _topk_actives(y_pred)
    model_name = '{}'.format(model_pname[model_class])

    for ens_name, ens_actives in ensemble_actives:
        # Actives found by the ensemble but not by the model, and vice versa.
        model_coverage = np.setdiff1d(ens_actives, model_actives)
        ens_coverage = np.setdiff1d(model_actives, ens_actives)

        res.append([model_name, ens_name, model_coverage.shape[0], ens_coverage.shape[0]])

# The backslash (set difference) is escaped; '\ ' is an invalid escape sequence.
df = pd.DataFrame(res, columns=['Model', 'Ensemble', 'Ensemble \\ Model', 'Model \\ Ensemble'])

In [164]:
# Print one LaTeX table per base model.
for model_class in df['Model'].unique():
    # Exact comparison: str.contains() would interpret the name as a regular
    # expression and also match any longer name that merely contains it.
    tmp_df = df[df['Model'] == model_class]
    print(tmp_df.to_latex(index=False))

\begin{tabular}{llrr}
\toprule
Model &         Ensemble &  Ensemble \textbackslash  Model &  Model \textbackslash  Ensemble \\
\midrule
 RF-C &   Model-Based \#0 &                 0 &                 0 \\
 RF-C &   Model-Based \#1 &                 1 &                 3 \\
 RF-C &   Model-Based \#2 &                 2 &                22 \\
 RF-C &   Model-Based \#3 &                 0 &                 4 \\
 RF-C &   Model-Based \#4 &                 2 &                13 \\
 RF-C &   Model-Based \#5 &                 2 &                13 \\
 RF-C &   Model-Based \#6 &                 3 &                13 \\
 RF-C &   Model-Based \#7 &                 2 &                12 \\
 RF-C &   Model-Based \#8 &                 2 &                13 \\
 RF-C &   Model-Based \#9 &                 3 &                11 \\
 RF-C &  Model-Based \#10 &                 2 &                 8 \\
 RF-C &  Model-Based \#11 &                 2 &                 7 \\
 RF-C &  Model-Based \#12 &         

In [159]:
# NOTE(review): tmp_df is the variable left over from the LaTeX-printing loop,
# so this shows only the rows of the last model iterated there. The execution
# count (In [159]) is lower than that of the cells above, which suggests this
# output is stale -- confirm with Restart & Run All.
tmp_df

Unnamed: 0,Model,Ensemble,Ensemble Actives \ Model Actives,Model Actives \ Ensemble Actives
