In [1]:
from NN_Tree_Comparison import *

%load_ext autoreload
%autoreload 2
from virtual_screening.function import *
from virtual_screening.evaluation import *
from virtual_screening.models.deep_classification import *

Using Theano backend.
Using gpu device 1: Tesla K40m (CNMeM is disabled, CuDNN 4004)


In [2]:
# Experiment configuration.
k = 5               # number of folds; the dataset is read from fold_{k}/file_0..file_{k-1}
running_index = 1   # selects the (test fold, validation fold) pair and the weight file
# NOTE(review): pos_num / neg_num are not referenced by any visible cell --
# presumably sampling sizes used elsewhere; confirm before removing.
pos_num, neg_num = 50, 200

# Summary

| model-model comparison | val or test set | prediction summary | bad on NN, better on tree-like | good on both |
| --- | --- | --- | --- | --- |
| NN - XGBoost | validation set | cell: #15 | cell: #16 | cell: #17 |
| NN - XGBoost | test set | cell: #18 | cell: #19 | cell: #20 |
| NN - RF | validation set | cell: #26 | cell: #27 | cell: #28 |
| NN - RF | test set | cell: #29 | cell: #30 | cell: #31 |

# Get Input Feature

In [3]:
# Paths of the k per-fold CSV files of the fixed cross-validation split.
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = np.array(['{}file_{}.csv'.format(directory, i) for i in range(k)])

# Decode running_index into a (test fold, validation fold) pair: the quotient
# picks the test fold, the remainder picks the validation fold among the
# remaining k - 1 folds (the boolean offset steps over the test fold).
# `//` keeps both indices integral whatever the interpreter's division
# semantics are; the slices below require integers.
folds_left = k - 1
test_index = running_index // folds_left
val_index = running_index % folds_left + (running_index % folds_left >= test_index)
complete_index = np.arange(k)
train_index = np.where((complete_index != test_index) & (complete_index != val_index))[0]

# One-element slices keep the val/test selections as arrays of paths,
# the same kind of object as train_file_list.
train_file_list = file_list[train_index]
val_file_list = file_list[val_index:val_index+1]
test_file_list = file_list[test_index:test_index+1]

In [4]:
label_name_list = ['Keck_Pria_AS_Retest']

# Read each split (train, then val, then test) and pass it through
# filter_out_missing_values for the label column(s) above.
train_pd, val_pd, test_pd = (
    filter_out_missing_values(read_merged_data(split_files), label_list=label_name_list)
    for split_files in (train_file_list, val_file_list, test_file_list)
)

# Extract the fingerprint features and the labels of each already-separated split.
extract_kwargs = dict(feature_name='Fingerprints', label_name_list=label_name_list)
X_train, y_train = extract_feature_and_label(train_pd, **extract_kwargs)
X_val, y_val = extract_feature_and_label(val_pd, **extract_kwargs)
X_test, y_test = extract_feature_and_label(test_pd, **extract_kwargs)

(43453, 1)
(14484, 1)
(14486, 1)


# Test Single-task

In [5]:
# Load the JSON configuration of the single-task classifier and instantiate it.
conf_path = '../../output/cross_validation/single_classification_22.json'
with open(conf_path, 'r') as conf_file:
    conf = json.load(conf_file)
single_task = SingleClassification(conf=conf)

In [6]:
# Weight file of the single-task network that belongs to this running_index.
PMTNN_weight_file = '../../output/cross_validation/single_classification_22/45540945/{}.weight'.format(running_index)
# Parenthesised single-argument print: same output under Python 2, valid Python 3.
print(PMTNN_weight_file)
# Evaluate the stored weights on all three splits (metrics are printed below).
single_task.predict_with_existing(X_train, y_train, X_val, y_val, X_test, y_test,
                                  PMTNN_weight_file=PMTNN_weight_file)

../../output/cross_validation/single_classification_22/45540945/1.weight

train precision: 0.998314670017
train roc: 0.999998039294
train bedroc: 0.994706144082

validation precision: 0.307938220156
validation roc: 0.783930484518
validation bedroc: 0.612508723339

test precision: 0.0959068095274
test roc: 0.837264599171
test bedroc: 0.628970145806

ratio: 0.02, EF: 25.0,	active: 16.0
ratio: 0.01, EF: 43.75,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 312.5,	active: 16.0


In [7]:
# Obtain the model object from single_task and load the stored weights into it,
# so that the following cells can call predict() on it directly.
single_model = single_task.setup_model()
single_model.load_weights(PMTNN_weight_file)

## Show val and test rank on single NN

In [8]:
# Single-task NN scores for the validation and test splits; reused by the
# rank listings and the NN-vs-tree comparisons further down.
y_pred_on_val_single_NN = single_model.predict(X_val)
y_pred_on_test_single_NN = single_model.predict(X_test)

In [9]:
# Rank listing for the single-task NN. Per the output below, each line shows a
# predicted score, its rank out of the split size, and that rank as a fraction.
# Parenthesised prints give the same output under Python 2 and are valid
# Python 3; print('') replaces the bare `print` that emitted a blank line.
print('For validation')
get_rank(y_val, y_pred_on_val_single_NN)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_single_NN)

For validation
0.999667	     1/ 14484		Rank: 0.000069
0.000004	 13523/ 14484		Rank: 0.933651
0.007614	    26/ 14484		Rank: 0.001795
0.000007	 11032/ 14484		Rank: 0.761668
0.003222	    34/ 14484		Rank: 0.002347
0.012485	    23/ 14484		Rank: 0.001588
0.992665	     4/ 14484		Rank: 0.000276
0.000008	 10578/ 14484		Rank: 0.730323
0.997849	     3/ 14484		Rank: 0.000207
0.000013	  5067/ 14484		Rank: 0.349834
0.000012	  6246/ 14484		Rank: 0.431234
0.000013	  5766/ 14484		Rank: 0.398094
0.164386	     9/ 14484		Rank: 0.000621
0.149422	    10/ 14484		Rank: 0.000690
0.000069	   306/ 14484		Rank: 0.021127
0.809995	     6/ 14484		Rank: 0.000414

For test
0.000033	   769/ 14486		Rank: 0.053086
0.000002	 14220/ 14486		Rank: 0.981637
0.000029	   999/ 14486		Rank: 0.068963
0.160746	     9/ 14486		Rank: 0.000621
0.000026	  1262/ 14486		Rank: 0.087119
0.708671	     6/ 14486		Rank: 0.000414
0.000322	    94/ 14486		Rank: 0.006489
0.000028	  1089/ 14486		Rank: 0.075176
0.000078	   254/ 14486		Rank: 0.017534


# XGBoost

In [10]:
import xgboost as xgb
from xgboost import DMatrix

In [11]:
# Gradient-boosted trees on the same train/validation split as the NN.
# The labels come back as column vectors of shape (n, 1) (see the shapes printed
# when the data was loaded); passing them as-is triggers scikit-learn's
# column_or_1d conversion warning, so flatten them to 1-D first.
clf = xgb.XGBClassifier(max_depth=10, n_estimators=100, silent=True, objective='binary:logistic')
clf.fit(X_train, np.ravel(y_train),
        eval_set=[(X_train, np.ravel(y_train)), (X_val, np.ravel(y_val))],
        eval_metric='auc', verbose=False)

  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [12]:
# Evaluate the fitted XGBoost classifier on train / validation / test; the
# output below reports precision, ROC, BEDROC and enrichment factors.
predict_with_existing(clf, X_train, y_train, X_val, y_val, X_test, y_test)

train precision: 0.930939315435
train roc: 0.999898288402
train bedroc: 0.993722514348

validation precision: 0.289237329326
validation roc: 0.89217151645
validation bedroc: 0.7628689951

test precision: 0.149050688356
test roc: 0.880422857636
test bedroc: 0.78269282675

ratio: 0.02, EF: 37.5,	active: 16.0
ratio: 0.01, EF: 68.75,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 250.0,	active: 16.0


In [13]:
# Keep column 1 of predict_proba (presumably the positive class, given 0/1
# labels) for each split and reshape it into a 2-D array.
y_pred_on_train_xgboost, y_pred_on_val_xgboost, y_pred_on_test_xgboost = (
    reshape_data_into_2_dim(clf.predict_proba(features)[:, 1])
    for features in (X_train, X_val, X_test)
)

## Show val and test rank on XGBoost

In [14]:
# Rank listing for XGBoost, in the same format as the NN listing above.
# Parenthesised prints give the same output under Python 2 and are valid
# Python 3; print('') replaces the bare `print` that emitted a blank line.
print('For validation')
get_rank(y_val, y_pred_on_val_xgboost)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_xgboost)

For validation
0.255025	     8/ 14484		Rank: 0.000552
0.006654	    98/ 14484		Rank: 0.006766
0.000973	   902/ 14484		Rank: 0.062276
0.004228	   138/ 14484		Rank: 0.009528
0.128536	    13/ 14484		Rank: 0.000898
0.034595	    29/ 14484		Rank: 0.002002
0.482657	     2/ 14484		Rank: 0.000138
0.001720	   450/ 14484		Rank: 0.031069
0.302829	     7/ 14484		Rank: 0.000483
0.000115	  9162/ 14484		Rank: 0.632560
0.000396	  2706/ 14484		Rank: 0.186827
0.000083	 11536/ 14484		Rank: 0.796465
0.303580	     6/ 14484		Rank: 0.000414
0.397426	     4/ 14484		Rank: 0.000276
0.032834	    31/ 14484		Rank: 0.002140
0.315210	     5/ 14484		Rank: 0.000345

For test
0.011174	    58/ 14486		Rank: 0.004004
0.000630	  1526/ 14486		Rank: 0.105343
0.004896	   131/ 14486		Rank: 0.009043
0.604062	     2/ 14486		Rank: 0.000138
0.001474	   522/ 14486		Rank: 0.036035
0.188084	    11/ 14486		Rank: 0.000759
0.018850	    44/ 14486		Rank: 0.003037
0.002573	   264/ 14486		Rank: 0.018224
0.004952	   128/ 14486		Rank: 0.008836


## Compare Rank

### Compare Rank on Val

In [15]:
# Side-by-side validation ranks: single-task NN on the left of the `|`,
# XGBoost on the right; the leading number is the row index of the molecule.
compare_rank(y_val, y_pred_on_val_single_NN, y_pred_on_val_xgboost)

14423 	0.999667	     1		Rank: 0.000069	|	0.255025	     8		Rank: 0.000552
14424 	0.000004	 13523		Rank: 0.933651	|	0.006654	    98		Rank: 0.006766
14425 	0.007614	    26		Rank: 0.001795	|	0.000973	   902		Rank: 0.062276
14426 	0.000007	 11032		Rank: 0.761668	|	0.004228	   138		Rank: 0.009528
14427 	0.003222	    34		Rank: 0.002347	|	0.128536	    13		Rank: 0.000898
14428 	0.012485	    23		Rank: 0.001588	|	0.034595	    29		Rank: 0.002002
14429 	0.992665	     4		Rank: 0.000276	|	0.482657	     2		Rank: 0.000138
14430 	0.000008	 10578		Rank: 0.730323	|	0.001720	   450		Rank: 0.031069
14431 	0.997849	     3		Rank: 0.000207	|	0.302829	     7		Rank: 0.000483
14432 	0.000013	  5067		Rank: 0.349834	|	0.000115	  9162		Rank: 0.632560
14433 	0.000012	  6246		Rank: 0.431234	|	0.000396	  2706		Rank: 0.186827
14434 	0.000013	  5766		Rank: 0.398094	|	0.000083	 11536		Rank: 0.796465
14435 	0.164386	     9		Rank: 0.000621	|	0.303580	     6		Rank: 0.000414
14436 	0.149422	    10		Rank: 0.000690	|	0.397426	 

#### Poorly predicted by NN but better on XGBoost

In [16]:
# Validation molecules (indices taken from the compare_rank output above) that
# the NN ranks far down while XGBoost ranks them much higher.
# For each one, list similar training molecules with their true labels;
# top=30 presumably raises the number of neighbours shown -- confirm in print_list.
test_list = [14424, 14426, 14430, 14437]
print_list(test_list, X_val, X_train, y_train, top=30)

testing Molecule:  14424
  22955		sim: 0.854409		true label: 0.0
  40475		sim: 0.832050		true label: 0.0
  35534		sim: 0.784465		true label: 0.0
  35424		sim: 0.768658		true label: 0.0
  32188		sim: 0.752618		true label: 0.0
   3197		sim: 0.750092		true label: 0.0
  17711		sim: 0.748931		true label: 0.0
   3226		sim: 0.745241		true label: 0.0
  11511		sim: 0.736075		true label: 0.0
   6299		sim: 0.734718		true label: 0.0
   3255		sim: 0.732997		true label: 0.0
  32150		sim: 0.732997		true label: 0.0
   3263		sim: 0.723536		true label: 0.0
  32157		sim: 0.715628		true label: 0.0
  18263		sim: 0.706018		true label: 0.0
   3223		sim: 0.695182		true label: 0.0
  17741		sim: 0.693375		true label: 0.0
  17679		sim: 0.693037		true label: 0.0
  32170		sim: 0.692308		true label: 0.0
  17759		sim: 0.691861		true label: 0.0
  17720		sim: 0.689900		true label: 0.0
  37627		sim: 0.688379		true label: 0.0
  32164		sim: 0.685656		true label: 0.0
   3237		sim: 0.680545		true label: 0.0
[31m  28909		s

#### Good predictions on both NN and XGBoost

In [17]:
# Validation molecules ranked near the top by both the NN and XGBoost; list
# similar training molecules for each (default neighbour count of print_list).
test_list = [14423, 14429, 14431, 14435, 14436, 14438] # NN is less confident on 14435, 14436
print_list(test_list, X_val, X_train, y_train)

testing Molecule:  14423
[31m  14427		sim: 0.840168		true label: 1.0[0m
   3286		sim: 0.755732		true label: 0.0
  35467		sim: 0.749613		true label: 0.0
  17790		sim: 0.740148		true label: 0.0
   3320		sim: 0.732709		true label: 0.0
  17828		sim: 0.732709		true label: 0.0
  25942		sim: 0.729204		true label: 0.0
  32200		sim: 0.722610		true label: 0.0
   3346		sim: 0.717741		true label: 0.0
  17802		sim: 0.711057		true label: 0.0
   3338		sim: 0.701964		true label: 0.0
  32225		sim: 0.693103		true label: 0.0
  17778		sim: 0.692436		true label: 0.0
  40715		sim: 0.686352		true label: 0.0
  32260		sim: 0.677930		true label: 0.0
  22981		sim: 0.677930		true label: 0.0
  17791		sim: 0.666973		true label: 0.0
   3304		sim: 0.661976		true label: 0.0
  29616		sim: 0.660225		true label: 0.0
  37474		sim: 0.660132		true label: 0.0

testing Molecule:  14429
[31m  28917		sim: 0.808138		true label: 1.0[0m
  27466		sim: 0.701793		true label: 0.0
  20334		sim: 0.697518		true label: 0.0
  12965		si

### Compare Rank on Test

In [18]:
# Side-by-side test-set ranks: single-task NN on the left of the `|`,
# XGBoost on the right; the leading number is the row index of the molecule.
compare_rank(y_test, y_pred_on_test_single_NN, y_pred_on_test_xgboost)

14424 	0.000033	   769		Rank: 0.053086	|	0.011174	    58		Rank: 0.004004
14425 	0.000002	 14220		Rank: 0.981637	|	0.000630	  1526		Rank: 0.105343
14426 	0.000029	   999		Rank: 0.068963	|	0.004896	   131		Rank: 0.009043
14427 	0.160746	     9		Rank: 0.000621	|	0.604062	     2		Rank: 0.000138
14428 	0.000026	  1262		Rank: 0.087119	|	0.001474	   522		Rank: 0.036035
14429 	0.708671	     6		Rank: 0.000414	|	0.188084	    11		Rank: 0.000759
14430 	0.000322	    94		Rank: 0.006489	|	0.018850	    44		Rank: 0.003037
14431 	0.000028	  1089		Rank: 0.075176	|	0.002573	   264		Rank: 0.018224
14432 	0.000078	   254		Rank: 0.017534	|	0.004952	   128		Rank: 0.008836
14433 	0.000026	  1308		Rank: 0.090294	|	0.005711	   115		Rank: 0.007939
14434 	0.000009	  8551		Rank: 0.590294	|	0.000092	 10853		Rank: 0.749206
14435 	0.000008	  9950		Rank: 0.686870	|	0.000053	 14108		Rank: 0.973906
14436 	0.920472	     5		Rank: 0.000345	|	0.393397	     5		Rank: 0.000345
14437 	0.069630	    11		Rank: 0.000759	|	0.035639	 

#### Poorly predicted by NN but better on XGBoost

In [19]:
# Test molecules (indices taken from the compare_rank output above) that the NN
# ranks clearly worse than XGBoost; list similar training molecules for each.
# top=30 presumably raises the number of neighbours shown -- confirm in print_list.
test_list = [14425, 14428, 14431, 14438]
print_list(test_list, X_test, X_train, y_train, top=30)

testing Molecule:  14425
  33815		sim: 0.953313		true label: 0.0
  33834		sim: 0.880771		true label: 0.0
  33792		sim: 0.872872		true label: 0.0
  19401		sim: 0.866025		true label: 0.0
    925		sim: 0.827535		true label: 0.0
  33820		sim: 0.819382		true label: 0.0
  33825		sim: 0.813838		true label: 0.0
  33833		sim: 0.811107		true label: 0.0
  19423		sim: 0.811107		true label: 0.0
   4912		sim: 0.808290		true label: 0.0
  19388		sim: 0.807373		true label: 0.0
  19398		sim: 0.805316		true label: 0.0
   4858		sim: 0.789024		true label: 0.0
  15347		sim: 0.781271		true label: 0.0
   6270		sim: 0.779525		true label: 0.0
  15664		sim: 0.777616		true label: 0.0
  29836		sim: 0.775058		true label: 0.0
  33817		sim: 0.774139		true label: 0.0
    898		sim: 0.773723		true label: 0.0
    906		sim: 0.768347		true label: 0.0
  33850		sim: 0.766032		true label: 0.0
  33799		sim: 0.763763		true label: 0.0
    933		sim: 0.762216		true label: 0.0
    936		sim: 0.750555		true label: 0.0
  20814		sim: 0

#### Good predictions on both NN and XGBoost

In [20]:
# Test molecules ranked near the top by both the NN and XGBoost; list similar
# training molecules for each (default neighbour count of print_list).
test_list = [14427, 14429, 14436, 14437, 14439]
print_list(test_list, X_test, X_train, y_train)

testing Molecule:  14427
  20334		sim: 0.755742		true label: 0.0
[31m  28908		sim: 0.739130		true label: 1.0[0m
[31m  28907		sim: 0.739130		true label: 1.0[0m
  34811		sim: 0.695182		true label: 0.0
[31m  28917		sim: 0.647179		true label: 1.0[0m

testing Molecule:  14429
[31m  28915		sim: 0.737865		true label: 1.0[0m
  40486		sim: 0.703562		true label: 0.0
[31m  14429		sim: 0.693103		true label: 1.0[0m
  11519		sim: 0.692820		true label: 0.0
  11518		sim: 0.686904		true label: 0.0
  23068		sim: 0.679366		true label: 0.0
  40488		sim: 0.676475		true label: 0.0
[31m  14430		sim: 0.673610		true label: 1.0[0m
  35352		sim: 0.673610		true label: 0.0
  28330		sim: 0.666701		true label: 0.0
  22929		sim: 0.653497		true label: 0.0
  11899		sim: 0.642364		true label: 0.0
  15995		sim: 0.640000		true label: 0.0
  39928		sim: 0.639010		true label: 0.0
  11657		sim: 0.636878		true label: 0.0
  18959		sim: 0.632785		true label: 0.0
  26334		sim: 0.632785		true label: 0.0
  26037		sim: 0

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Random forest baseline on the same training split; the seed is fixed so the
# fit is reproducible.
rnd_state = 1337
clf = RandomForestClassifier(n_estimators=4000,
                             max_features='log2',
                             min_samples_leaf=1,
                             n_jobs=3,
                             class_weight='balanced',
                             random_state=rnd_state,
                             oob_score=False,
                             verbose=0)
# The labels are column vectors of shape (n, 1) (see the shapes printed when the
# data was loaded); scikit-learn expects a 1-D target for single-output
# classification and warns otherwise, so flatten them first.
clf.fit(X_train, np.ravel(y_train))



RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='log2',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4000, n_jobs=3,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [23]:
# Evaluate the fitted random forest on train / validation / test; the output
# below reports precision, ROC, BEDROC and enrichment factors.
predict_with_existing(clf, X_train, y_train, X_val, y_val, X_test, y_test)

train precision: 1.0
train roc: 1.0
train bedroc: 0.994725515647

validation precision: 0.351985010494
validation roc: 0.961587641692
validation bedroc: 0.829019613984

test precision: 0.130721818647
test roc: 0.872224861783
test bedroc: 0.689646653184

ratio: 0.02, EF: 31.25,	active: 16.0
ratio: 0.01, EF: 62.5,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 187.5,	active: 16.0


## Show val and test rank on RF

In [24]:
# Keep column 1 of predict_proba (presumably the positive class, given 0/1
# labels) for the validation and test splits, reshaped into 2-D arrays.
y_pred_on_val_rf, y_pred_on_test_rf = (
    reshape_data_into_2_dim(clf.predict_proba(features)[:, 1])
    for features in (X_val, X_test)
)

In [25]:
# Rank listing for the random forest. Per the output below, each line shows a
# predicted score, its rank out of the split size, and that rank as a fraction.
# Parenthesised prints give the same output under Python 2 and are valid
# Python 3; print('') replaces the bare `print` that emitted a blank line.
print('For validation')
get_rank(y_val, y_pred_on_val_rf)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_rf)

For validation
0.175000	     8/ 14484		Rank: 0.000552
0.010500	   173/ 14484		Rank: 0.011944
0.033250	    46/ 14484		Rank: 0.003176
0.000500	  4584/ 14484		Rank: 0.316487
0.133250	    15/ 14484		Rank: 0.001036
0.083750	    21/ 14484		Rank: 0.001450
0.307500	     1/ 14484		Rank: 0.000069
0.054500	    30/ 14484		Rank: 0.002071
0.198250	     5/ 14484		Rank: 0.000345
0.001750	  1086/ 14484		Rank: 0.074979
0.008500	   208/ 14484		Rank: 0.014361
0.000750	  3469/ 14484		Rank: 0.239506
0.121250	    16/ 14484		Rank: 0.001105
0.170750	     9/ 14484		Rank: 0.000621
0.044750	    38/ 14484		Rank: 0.002624
0.295500	     2/ 14484		Rank: 0.000138

For test
0.021250	    93/ 14486		Rank: 0.006420
0.000000	  8667/ 14486		Rank: 0.598302
0.000750	  3383/ 14486		Rank: 0.233536
0.251750	     2/ 14486		Rank: 0.000138
0.001500	  1441/ 14486		Rank: 0.099475
0.147250	     8/ 14486		Rank: 0.000552
0.063000	    33/ 14486		Rank: 0.002278
0.004000	   443/ 14486		Rank: 0.030581
0.019500	    97/ 14486		Rank: 0.006696


## Compare Rank

### Compare Rank on Val

In [26]:
# Side-by-side validation ranks: single-task NN on the left of the `|`,
# random forest on the right; the leading number is the row index of the molecule.
compare_rank(y_val, y_pred_on_val_single_NN, y_pred_on_val_rf)

14423 	0.999667	     1		Rank: 0.000069	|	0.175000	     8		Rank: 0.000552
14424 	0.000004	 13523		Rank: 0.933651	|	0.010500	   173		Rank: 0.011944
14425 	0.007614	    26		Rank: 0.001795	|	0.033250	    46		Rank: 0.003176
14426 	0.000007	 11032		Rank: 0.761668	|	0.000500	  4584		Rank: 0.316487
14427 	0.003222	    34		Rank: 0.002347	|	0.133250	    15		Rank: 0.001036
14428 	0.012485	    23		Rank: 0.001588	|	0.083750	    21		Rank: 0.001450
14429 	0.992665	     4		Rank: 0.000276	|	0.307500	     1		Rank: 0.000069
14430 	0.000008	 10578		Rank: 0.730323	|	0.054500	    30		Rank: 0.002071
14431 	0.997849	     3		Rank: 0.000207	|	0.198250	     5		Rank: 0.000345
14432 	0.000013	  5067		Rank: 0.349834	|	0.001750	  1086		Rank: 0.074979
14433 	0.000012	  6246		Rank: 0.431234	|	0.008500	   208		Rank: 0.014361
14434 	0.000013	  5766		Rank: 0.398094	|	0.000750	  3469		Rank: 0.239506
14435 	0.164386	     9		Rank: 0.000621	|	0.121250	    16		Rank: 0.001105
14436 	0.149422	    10		Rank: 0.000690	|	0.170750	 

#### Poorly predicted by NN but better on RF

In [27]:
# Validation molecules (indices taken from the compare_rank output above) that
# the NN ranks far down while the random forest ranks them higher.
# For each one, list similar training molecules with their true labels;
# top=30 presumably raises the number of neighbours shown -- confirm in print_list.
test_list = [14424, 14426, 14430, 14432, 14433, 14434]
print_list(test_list, X_val, X_train, y_train, top=30)

testing Molecule:  14424
  22955		sim: 0.854409		true label: 0.0
  40475		sim: 0.832050		true label: 0.0
  35534		sim: 0.784465		true label: 0.0
  35424		sim: 0.768658		true label: 0.0
  32188		sim: 0.752618		true label: 0.0
   3197		sim: 0.750092		true label: 0.0
  17711		sim: 0.748931		true label: 0.0
   3226		sim: 0.745241		true label: 0.0
  11511		sim: 0.736075		true label: 0.0
   6299		sim: 0.734718		true label: 0.0
   3255		sim: 0.732997		true label: 0.0
  32150		sim: 0.732997		true label: 0.0
   3263		sim: 0.723536		true label: 0.0
  32157		sim: 0.715628		true label: 0.0
  18263		sim: 0.706018		true label: 0.0
   3223		sim: 0.695182		true label: 0.0
  17741		sim: 0.693375		true label: 0.0
  17679		sim: 0.693037		true label: 0.0
  32170		sim: 0.692308		true label: 0.0
  17759		sim: 0.691861		true label: 0.0
  17720		sim: 0.689900		true label: 0.0
  37627		sim: 0.688379		true label: 0.0
  32164		sim: 0.685656		true label: 0.0
   3237		sim: 0.680545		true label: 0.0
[31m  28909		s

#### Good predictions on both NN and RF

In [28]:
# Validation molecules ranked near the top by both the NN and the random forest;
# list similar training molecules for each (default neighbour count of print_list).
test_list = [14423, 14429, 14431, 14435, 14436, 14438] # NN is less confident on 14435, 14436
print_list(test_list, X_val, X_train, y_train)

testing Molecule:  14423
[31m  14427		sim: 0.840168		true label: 1.0[0m
   3286		sim: 0.755732		true label: 0.0
  35467		sim: 0.749613		true label: 0.0
  17790		sim: 0.740148		true label: 0.0
   3320		sim: 0.732709		true label: 0.0
  17828		sim: 0.732709		true label: 0.0
  25942		sim: 0.729204		true label: 0.0
  32200		sim: 0.722610		true label: 0.0
   3346		sim: 0.717741		true label: 0.0
  17802		sim: 0.711057		true label: 0.0
   3338		sim: 0.701964		true label: 0.0
  32225		sim: 0.693103		true label: 0.0
  17778		sim: 0.692436		true label: 0.0
  40715		sim: 0.686352		true label: 0.0
  32260		sim: 0.677930		true label: 0.0
  22981		sim: 0.677930		true label: 0.0
  17791		sim: 0.666973		true label: 0.0
   3304		sim: 0.661976		true label: 0.0
  29616		sim: 0.660225		true label: 0.0
  37474		sim: 0.660132		true label: 0.0

testing Molecule:  14429
[31m  28917		sim: 0.808138		true label: 1.0[0m
  27466		sim: 0.701793		true label: 0.0
  20334		sim: 0.697518		true label: 0.0
  12965		si

### Compare Rank on Test

In [29]:
# Side-by-side test-set ranks: single-task NN on the left of the `|`,
# random forest on the right; the leading number is the row index of the molecule.
compare_rank(y_test, y_pred_on_test_single_NN, y_pred_on_test_rf)

14424 	0.000033	   769		Rank: 0.053086	|	0.021250	    93		Rank: 0.006420
14425 	0.000002	 14220		Rank: 0.981637	|	0.000000	  8667		Rank: 0.598302
14426 	0.000029	   999		Rank: 0.068963	|	0.000750	  3383		Rank: 0.233536
14427 	0.160746	     9		Rank: 0.000621	|	0.251750	     2		Rank: 0.000138
14428 	0.000026	  1262		Rank: 0.087119	|	0.001500	  1441		Rank: 0.099475
14429 	0.708671	     6		Rank: 0.000414	|	0.147250	     8		Rank: 0.000552
14430 	0.000322	    94		Rank: 0.006489	|	0.063000	    33		Rank: 0.002278
14431 	0.000028	  1089		Rank: 0.075176	|	0.004000	   443		Rank: 0.030581
14432 	0.000078	   254		Rank: 0.017534	|	0.019500	    97		Rank: 0.006696
14433 	0.000026	  1308		Rank: 0.090294	|	0.017750	   103		Rank: 0.007110
14434 	0.000009	  8551		Rank: 0.590294	|	0.000250	  6975		Rank: 0.481499
14435 	0.000008	  9950		Rank: 0.686870	|	0.000250	  7019		Rank: 0.484537
14436 	0.920472	     5		Rank: 0.000345	|	0.169500	     6		Rank: 0.000414
14437 	0.069630	    11		Rank: 0.000759	|	0.122250	 

#### Poorly predicted by NN but better on RF

In [30]:
# Test molecules meant to be ranked poorly by the NN but better by the random
# forest; list similar training molecules with their true labels for each.
# NOTE(review): this list is identical to the one used in the XGBoost section.
# In the NN-vs-RF compare_rank output above, 14428 ranks 1262 under the NN but
# 1441 under RF, i.e. RF is not better for that molecule -- confirm the
# selection against the RF comparison rather than the XGBoost one.
test_list = [14425, 14428, 14431, 14438]
print_list(test_list, X_test, X_train, y_train, top=30)

testing Molecule:  14425
  33815		sim: 0.953313		true label: 0.0
  33834		sim: 0.880771		true label: 0.0
  33792		sim: 0.872872		true label: 0.0
  19401		sim: 0.866025		true label: 0.0
    925		sim: 0.827535		true label: 0.0
  33820		sim: 0.819382		true label: 0.0
  33825		sim: 0.813838		true label: 0.0
  33833		sim: 0.811107		true label: 0.0
  19423		sim: 0.811107		true label: 0.0
   4912		sim: 0.808290		true label: 0.0
  19388		sim: 0.807373		true label: 0.0
  19398		sim: 0.805316		true label: 0.0
   4858		sim: 0.789024		true label: 0.0
  15347		sim: 0.781271		true label: 0.0
   6270		sim: 0.779525		true label: 0.0
  15664		sim: 0.777616		true label: 0.0
  29836		sim: 0.775058		true label: 0.0
  33817		sim: 0.774139		true label: 0.0
    898		sim: 0.773723		true label: 0.0
    906		sim: 0.768347		true label: 0.0
  33850		sim: 0.766032		true label: 0.0
  33799		sim: 0.763763		true label: 0.0
    933		sim: 0.762216		true label: 0.0
    936		sim: 0.750555		true label: 0.0
  20814		sim: 0

#### Good predictions on both NN and RF

In [31]:
# Test molecules ranked near the top by both the NN and the random forest;
# list similar training molecules for each (default neighbour count of print_list).
test_list = [14427, 14429, 14436, 14437, 14439]
print_list(test_list, X_test, X_train, y_train)

testing Molecule:  14427
  20334		sim: 0.755742		true label: 0.0
[31m  28908		sim: 0.739130		true label: 1.0[0m
[31m  28907		sim: 0.739130		true label: 1.0[0m
  34811		sim: 0.695182		true label: 0.0
[31m  28917		sim: 0.647179		true label: 1.0[0m

testing Molecule:  14429
[31m  28915		sim: 0.737865		true label: 1.0[0m
  40486		sim: 0.703562		true label: 0.0
[31m  14429		sim: 0.693103		true label: 1.0[0m
  11519		sim: 0.692820		true label: 0.0
  11518		sim: 0.686904		true label: 0.0
  23068		sim: 0.679366		true label: 0.0
  40488		sim: 0.676475		true label: 0.0
[31m  14430		sim: 0.673610		true label: 1.0[0m
  35352		sim: 0.673610		true label: 0.0
  28330		sim: 0.666701		true label: 0.0
  22929		sim: 0.653497		true label: 0.0
  11899		sim: 0.642364		true label: 0.0
  15995		sim: 0.640000		true label: 0.0
  39928		sim: 0.639010		true label: 0.0
  11657		sim: 0.636878		true label: 0.0
  18959		sim: 0.632785		true label: 0.0
  26334		sim: 0.632785		true label: 0.0
  26037		sim: 0