In [1]:
from NN_Tree_Comparison import *

%load_ext autoreload
%autoreload 2
from virtual_screening.function import *
from virtual_screening.evaluation import *
from virtual_screening.models.deep_classification import *

Using Theano backend.
Using gpu device 1: Tesla K40m (CNMeM is disabled, CuDNN 4004)


In [2]:
# Cross-validation setup: k is the number of data folds (one CSV file per fold),
# and running_index selects which test/validation fold pairing this run uses.
k = 5
running_index = 1

# NOTE(review): pos_num / neg_num are not referenced by any cell visible in this
# notebook -- presumably sample-size settings; confirm before removing.
pos_num, neg_num = 50, 200

# Summary

| model-model comparison | val or test set | prediction summary | bad on NN, better on tree-like | good on both |
| --- | --- | --- | --- | --- |
| NN - XGBoost | validation set | cell: [#15](#15) | cell: [#16](#16) | cell: [#17](#17) |
| NN - XGBoost | test set | cell: [#18](#18)| cell: [#19](#19) | cell: [#20](#20) |
| NN - RF | validation set | cell: [#26](#26) | cell: [#27](#27) | cell: [#28](#28) |
| NN - RF | test set | cell: [#29](#29) | cell: [#30](#30) | cell: [#31](#31) |

# Get Input Feature

In [3]:
# One CSV per fold: <directory>/file_0.csv ... <directory>/file_{k-1}.csv.
directory = '../../dataset/fixed_dataset/fold_{}/'.format(k)
file_list = np.array(['{}file_{}.csv'.format(directory, i) for i in range(k)])

# Decode running_index into a (test fold, validation fold) pair. Every test fold
# is paired with each of the other k - 1 folds, so the valid range is
# 0 <= running_index < k * (k - 1).
fold_pair_count = k * (k - 1)
if not 0 <= running_index < fold_pair_count:
    raise ValueError('running_index must be in [0, {}), got {}'.format(fold_pair_count, running_index))

# `//` keeps the quotient an integer on both Python 2 and Python 3; plain `/`
# would yield a float on Python 3, which cannot be used to slice file_list.
other_fold_count = k - 1
test_index = running_index // other_fold_count
val_offset = running_index % other_fold_count
# Shift past the test fold so the validation fold never coincides with it.
val_index = val_offset + (val_offset >= test_index)
complete_index = np.arange(k)
train_index = np.where((complete_index != test_index) & (complete_index != val_index))[0]

# One-element slices keep the val/test selections as arrays, like train_file_list.
train_file_list = file_list[train_index]
val_file_list = file_list[val_index:val_index+1]
test_file_list = file_list[test_index:test_index+1]

In [4]:
label_name_list = ['Keck_Pria_AS_Retest']

# Load every split the same way (train, then val, then test): merge its CSV
# files and pass the result through filter_out_missing_values for the label above.
train_pd, val_pd, test_pd = (
    filter_out_missing_values(read_merged_data(split_files), label_list=label_name_list)
    for split_files in (train_file_list, val_file_list, test_file_list)
)

# Pull the 'Fingerprints' feature and the label column out of each split,
# again in train -> val -> test order. Validation and test come from their own
# folds; nothing is carved out of the training data here.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    extract_feature_and_label(split_pd,
                              feature_name='Fingerprints',
                              label_name_list=label_name_list)
    for split_pd in (train_pd, val_pd, test_pd)
)

(43453, 1)
(14484, 1)
(14486, 1)


# Test Single-task

In [5]:
# JSON configuration handed to SingleClassification.
conf_path = '../../output/cross_validation/single_classification_22.json'
with open(conf_path, 'r') as conf_file:
    conf = json.load(conf_file)

single_task = SingleClassification(conf=conf)

In [6]:
# Weight file for this run; the file name is the running_index.
PMTNN_weight_file = '../../output/cross_validation/single_classification_22/45540945/{}.weight'.format(running_index)
print(PMTNN_weight_file)

# Evaluate the stored weights on all three splits (metrics are printed below).
single_task.predict_with_existing(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    PMTNN_weight_file=PMTNN_weight_file)

../../output/cross_validation/single_classification_22/45540945/1.weight

train precision: 0.998314670017
train roc: 0.999998039294
train bedroc: 0.994706144082

validation precision: 0.307938220156
validation roc: 0.783930484518
validation bedroc: 0.612508723339

test precision: 0.0959068095274
test roc: 0.837264599171
test bedroc: 0.628970145806

ratio: 0.02, EF: 25.0,	active: 16.0
ratio: 0.01, EF: 43.75,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 312.5,	active: 16.0


In [7]:
# Build the model via single_task and load the weight file named by
# PMTNN_weight_file, so predictions can be taken directly from the model object.
single_model = single_task.setup_model()
single_model.load_weights(PMTNN_weight_file)

## Show val and test rank on single NN

In [8]:
# Single-task NN predictions on the validation and test folds; reused by the
# get_rank / compare_rank cells further down.
y_pred_on_val_single_NN = single_model.predict(X_val)
y_pred_on_test_single_NN = single_model.predict(X_test)

In [9]:
# NN ranking output from get_rank: validation fold first, then the test fold,
# separated by an empty line.
print('For validation')
get_rank(y_val, y_pred_on_val_single_NN)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_single_NN)

For validation
0.999667	     1/ 14484		Rank: 0.000069
0.000004	 13523/ 14484		Rank: 0.933651
0.007614	    26/ 14484		Rank: 0.001795
0.000007	 11032/ 14484		Rank: 0.761668
0.003222	    34/ 14484		Rank: 0.002347
0.012485	    23/ 14484		Rank: 0.001588
0.992665	     4/ 14484		Rank: 0.000276
0.000008	 10578/ 14484		Rank: 0.730323
0.997849	     3/ 14484		Rank: 0.000207
0.000013	  5067/ 14484		Rank: 0.349834
0.000012	  6246/ 14484		Rank: 0.431234
0.000013	  5766/ 14484		Rank: 0.398094
0.164386	     9/ 14484		Rank: 0.000621
0.149422	    10/ 14484		Rank: 0.000690
0.000069	   306/ 14484		Rank: 0.021127
0.809995	     6/ 14484		Rank: 0.000414

For test
0.000033	   769/ 14486		Rank: 0.053086
0.000002	 14220/ 14486		Rank: 0.981637
0.000029	   999/ 14486		Rank: 0.068963
0.160746	     9/ 14486		Rank: 0.000621
0.000026	  1262/ 14486		Rank: 0.087119
0.708671	     6/ 14486		Rank: 0.000414
0.000322	    94/ 14486		Rank: 0.006489
0.000028	  1089/ 14486		Rank: 0.075176
0.000078	   254/ 14486		Rank: 0.017534


# XGBoost

In [10]:
import xgboost as xgb
from xgboost import DMatrix

In [11]:
# The labels arrive as a single-column 2-D array, which makes scikit-learn's
# label handling emit the `column_or_1d` conversion warning during fit.
# Flatten them once so the classifier receives the 1-D vector it expects;
# the label values themselves are unchanged.
y_train_1d = np.ravel(y_train)
y_val_1d = np.ravel(y_val)

xgb_clf = xgb.XGBClassifier(max_depth=10, n_estimators=100, silent=True, objective='binary:logistic')
# AUC is tracked on both the training and the validation fold while fitting.
xgb_clf.fit(X_train, y_train_1d,
            eval_set=[(X_train, y_train_1d), (X_val, y_val_1d)],
            eval_metric='auc', verbose=False)

  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [12]:
# Evaluate the fitted XGBoost model on the train / validation / test splits;
# the precision, ROC, BEDROC and EF figures appear in the cell output.
predict_with_existing(xgb_clf, X_train, y_train, X_val, y_val, X_test, y_test)

train precision: 0.930939315435
train roc: 0.999898288402
train bedroc: 0.993722514348

validation precision: 0.289237329326
validation roc: 0.89217151645
validation bedroc: 0.7628689951

test precision: 0.149050688356
test roc: 0.880422857636
test bedroc: 0.78269282675

ratio: 0.02, EF: 37.5,	active: 16.0
ratio: 0.01, EF: 68.75,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 250.0,	active: 16.0


In [13]:
# Feature scores reported by the trained booster, keyed by feature name.
fscore = xgb_clf.booster().get_fscore()
important_feature_number = len(fscore)
print('XGBoost consider {} features as important'.format(important_feature_number))

# Dense importance vector; features missing from fscore keep a score of 0.
# 1024 is presumably the fingerprint length -- TODO confirm against X_train.
feature_importances_ = np.zeros(1024)
# The loop variables are deliberately NOT named `k`: `k` is the fold count from
# the configuration cell, and rebinding it here breaks any later re-run of the
# fold-selection cell.
for feature_key, feature_score in fscore.items():
    # Keys carry a one-character prefix before the integer feature index.
    feature_importances_[int(feature_key[1:])] = feature_score
# Feature indices ordered from highest to lowest score.
importance_index = np.argsort(feature_importances_)[::-1]

# Column 1 of predict_proba, reshaped with reshape_data_into_2_dim so it can be
# passed to get_rank / compare_rank alongside the NN predictions.
y_pred_on_val_xgboost = reshape_data_into_2_dim(xgb_clf.predict_proba(X_val)[:, 1])
y_pred_on_test_xgboost = reshape_data_into_2_dim(xgb_clf.predict_proba(X_test)[:, 1])

XGBoost consider 266 features as important


## Show val and test rank on XGBoost

In [14]:
# XGBoost ranking output from get_rank: validation fold first, then the test
# fold, separated by an empty line.
print('For validation')
get_rank(y_val, y_pred_on_val_xgboost)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_xgboost)

For validation
0.255025	     8/ 14484		Rank: 0.000552
0.006654	    98/ 14484		Rank: 0.006766
0.000973	   902/ 14484		Rank: 0.062276
0.004228	   138/ 14484		Rank: 0.009528
0.128536	    13/ 14484		Rank: 0.000898
0.034595	    29/ 14484		Rank: 0.002002
0.482657	     2/ 14484		Rank: 0.000138
0.001720	   450/ 14484		Rank: 0.031069
0.302829	     7/ 14484		Rank: 0.000483
0.000115	  9162/ 14484		Rank: 0.632560
0.000396	  2706/ 14484		Rank: 0.186827
0.000083	 11536/ 14484		Rank: 0.796465
0.303580	     6/ 14484		Rank: 0.000414
0.397426	     4/ 14484		Rank: 0.000276
0.032834	    31/ 14484		Rank: 0.002140
0.315210	     5/ 14484		Rank: 0.000345

For test
0.011174	    58/ 14486		Rank: 0.004004
0.000630	  1526/ 14486		Rank: 0.105343
0.004896	   131/ 14486		Rank: 0.009043
0.604062	     2/ 14486		Rank: 0.000138
0.001474	   522/ 14486		Rank: 0.036035
0.188084	    11/ 14486		Rank: 0.000759
0.018850	    44/ 14486		Rank: 0.003037
0.002573	   264/ 14486		Rank: 0.018224
0.004952	   128/ 14486		Rank: 0.008836


## Compare Rank

### Compare Rank on Val
<a id=15></a>

In [15]:
# Side-by-side validation ranking: NN columns first, XGBoost columns second.
compare_rank(y_val, y_pred_on_val_single_NN, y_pred_on_val_xgboost)

14423 	|	0.999667	1	Rank: 0.0001	|	0.255025	8	Rank: 0.0006
14424 	|	0.000004	13523	Rank: 0.9337	|	0.006654	98	Rank: 0.0068
14425 	|	0.007614	26	Rank: 0.0018	|	0.000973	902	Rank: 0.0623
14426 	|	0.000007	11032	Rank: 0.7617	|	0.004228	138	Rank: 0.0095
14427 	|	0.003222	34	Rank: 0.0023	|	0.128536	13	Rank: 0.0009
14428 	|	0.012485	23	Rank: 0.0016	|	0.034595	29	Rank: 0.0020
14429 	|	0.992665	4	Rank: 0.0003	|	0.482657	2	Rank: 0.0001
14430 	|	0.000008	10578	Rank: 0.7303	|	0.001720	450	Rank: 0.0311
14431 	|	0.997849	3	Rank: 0.0002	|	0.302829	7	Rank: 0.0005
14432 	|	0.000013	5067	Rank: 0.3498	|	0.000115	9162	Rank: 0.6326
14433 	|	0.000012	6246	Rank: 0.4312	|	0.000396	2706	Rank: 0.1868
14434 	|	0.000013	5766	Rank: 0.3981	|	0.000083	11536	Rank: 0.7965
14435 	|	0.164386	9	Rank: 0.0006	|	0.303580	6	Rank: 0.0004
14436 	|	0.149422	10	Rank: 0.0007	|	0.397426	4	Rank: 0.0003
14437 	|	0.000069	306	Rank: 0.0211	|	0.032834	31	Rank: 0.0021
14438 	|	0.809995	6	Rank: 0.0004	|	0.315210	5	Rank: 0.0003


#### Poorly predicted by NN but better on XGBoost
<a id=16></a>

In [16]:
# Validation molecules ranked poorly by the NN but much higher by XGBoost;
# the IDs are the first column of the compare_rank output above.
test_list = [14424, 14426, 14430, 14437]
# Uses the XGBoost-derived importance_index and the number of features
# XGBoost reported a score for.
print_list(test_list, X_val, X_train, y_train, top=30,
           importance_index=importance_index, important_fetcher=important_feature_number)

testing Molecule:  14424
  22955 |  sim: 0.854409	sim(only important): 0.91644		true label: 0
  40475 |  sim: 0.832050	sim(only important): 0.920751		true label: 0
  35534 |  sim: 0.784465	sim(only important): 0.800445		true label: 0
  35424 |  sim: 0.768658	sim(only important): 0.774917		true label: 0
  32188 |  sim: 0.752618	sim(only important): 0.833797		true label: 0
   3197 |  sim: 0.750092	sim(only important): 0.843221		true label: 0
  17711 |  sim: 0.748931	sim(only important): 0.833797		true label: 0
   3226 |  sim: 0.745241	sim(only important): 0.833797		true label: 0
  11511 |  sim: 0.736075	sim(only important): 0.711556		true label: 0
   6299 |  sim: 0.734718	sim(only important): 0.750366		true label: 0
   3255 |  sim: 0.732997	sim(only important): 0.833797		true label: 0
  32150 |  sim: 0.732997	sim(only important): 0.833797		true label: 0
   3263 |  sim: 0.723536	sim(only important): 0.819782		true label: 0
  32157 |  sim: 0.715628	sim(only important): 0.806452		true label

#### Good predictions on both NN and XGBoost
<a id=17></a>

In [17]:
# Validation molecules ranked near the top by both the NN and XGBoost
# (IDs from the compare_rank output above).
test_list = [14423, 14429, 14431, 14435, 14436, 14438] # NN is less confident on 14435, 14436
# No `top` argument here, unlike the "bad on NN" cell, so print_list's default applies.
print_list(test_list, X_val, X_train, y_train,
           importance_index=importance_index, important_fetcher=important_feature_number)

testing Molecule:  14423
[31m  14427 |  sim: 0.840168	sim(only important): 0.861111		true label: 1  *[0m
   3286 |  sim: 0.755732	sim(only important): 0.760726		true label: 0
  35467 |  sim: 0.749613	sim(only important): 0.7298		true label: 0
  17790 |  sim: 0.740148	sim(only important): 0.714577		true label: 0
  17828 |  sim: 0.732709	sim(only important): 0.760726		true label: 0
   3320 |  sim: 0.732709	sim(only important): 0.754337		true label: 0
  25942 |  sim: 0.729204	sim(only important): 0.833333		true label: 0
  32200 |  sim: 0.722610	sim(only important): 0.724432		true label: 0
   3346 |  sim: 0.717741	sim(only important): 0.70565		true label: 0
  17802 |  sim: 0.711057	sim(only important): 0.729996		true label: 0
   3338 |  sim: 0.701964	sim(only important): 0.711832		true label: 0
  32225 |  sim: 0.693103	sim(only important): 0.677644		true label: 0
  17778 |  sim: 0.692436	sim(only important): 0.714577		true label: 0
  40715 |  sim: 0.686352	sim(only important): 0.714435		

### Compare Rank on Test
<a id=18></a>

In [18]:
# Side-by-side test ranking: NN columns first, XGBoost columns second.
compare_rank(y_test, y_pred_on_test_single_NN, y_pred_on_test_xgboost)

14424 	|	0.000033	769	Rank: 0.0531	|	0.011174	58	Rank: 0.0040
14425 	|	0.000002	14220	Rank: 0.9816	|	0.000630	1526	Rank: 0.1053
14426 	|	0.000029	999	Rank: 0.0690	|	0.004896	131	Rank: 0.0090
14427 	|	0.160746	9	Rank: 0.0006	|	0.604062	2	Rank: 0.0001
14428 	|	0.000026	1262	Rank: 0.0871	|	0.001474	522	Rank: 0.0360
14429 	|	0.708671	6	Rank: 0.0004	|	0.188084	11	Rank: 0.0008
14430 	|	0.000322	94	Rank: 0.0065	|	0.018850	44	Rank: 0.0030
14431 	|	0.000028	1089	Rank: 0.0752	|	0.002573	264	Rank: 0.0182
14432 	|	0.000078	254	Rank: 0.0175	|	0.004952	128	Rank: 0.0088
14433 	|	0.000026	1308	Rank: 0.0903	|	0.005711	115	Rank: 0.0079
14434 	|	0.000009	8551	Rank: 0.5903	|	0.000092	10853	Rank: 0.7492
14435 	|	0.000008	9950	Rank: 0.6869	|	0.000053	14108	Rank: 0.9739
14436 	|	0.920472	5	Rank: 0.0003	|	0.393397	5	Rank: 0.0003
14437 	|	0.069630	11	Rank: 0.0008	|	0.035639	28	Rank: 0.0019
14438 	|	0.000193	117	Rank: 0.0081	|	0.059066	19	Rank: 0.0013
14439 	|	0.040926	14	Rank: 0.0010	|	0.298602	7	Rank: 0.0005


#### Poorly predicted by NN but better on XGBoost
<a id=19></a>

In [19]:
# Test molecules ranked poorly by the NN but higher by XGBoost;
# the IDs are the first column of the compare_rank output above.
test_list = [14425, 14428, 14431, 14438]
# Queries come from X_test; the lookup arrays passed alongside are still
# X_train / y_train.
print_list(test_list, X_test, X_train, y_train, top=30,
           importance_index=importance_index, important_fetcher=important_feature_number)

testing Molecule:  14425
  33815 |  sim: 0.953313	sim(only important): 0.963624		true label: 0
  33834 |  sim: 0.880771	sim(only important): 0.928571		true label: 0
  33792 |  sim: 0.872872	sim(only important): 0.872872		true label: 0
  19401 |  sim: 0.866025	sim(only important): 0.872872		true label: 0
    925 |  sim: 0.827535	sim(only important): 0.852437		true label: 0
  33820 |  sim: 0.819382	sim(only important): 0.831522		true label: 0
  33825 |  sim: 0.813838	sim(only important): 0.866921		true label: 0
  33833 |  sim: 0.811107	sim(only important): 0.872872		true label: 0
  19423 |  sim: 0.811107	sim(only important): 0.857143		true label: 0
   4912 |  sim: 0.808290	sim(only important): 0.831522		true label: 0
  19388 |  sim: 0.807373	sim(only important): 0.836502		true label: 0
  19398 |  sim: 0.805316	sim(only important): 0.848555		true label: 0
   4858 |  sim: 0.789024	sim(only important): 0.793575		true label: 0
  15347 |  sim: 0.781271	sim(only important): 0.907115		true labe

#### Good predictions on both NN and XGBoost
<a id=20></a>

In [20]:
# Test molecules ranked near the top by both the NN and XGBoost
# (IDs from the compare_rank output above).
test_list = [14427, 14429, 14436, 14437, 14439]
print_list(test_list, X_test, X_train, y_train,
           importance_index=importance_index, important_fetcher=important_feature_number)

testing Molecule:  14427
  20334 |  sim: 0.755742	sim(only important): 0.966092		true label: 0
[31m  28908 |  sim: 0.739130	sim(only important): 1.0		true label: 1  *[0m
[31m  28907 |  sim: 0.739130	sim(only important): 0.907485		true label: 1  *[0m
  34811 |  sim: 0.695182	sim(only important): 0.858395		true label: 0
[31m  28917 |  sim: 0.647179	sim(only important): 0.816497		true label: 1  *[0m
  10626 |  sim: 0.595880	sim(only important): 0.713024		true label: 0
  25495 |  sim: 0.591520	sim(only important): 0.771517		true label: 0
[31m  14433 |  sim: 0.546019	sim(only important): 0.601338		true label: 1  *[0m
  37261 |  sim: 0.542137	sim(only important): 0.785714		true label: 0
  28939 |  sim: 0.512272	sim(only important): 0.667124		true label: 0
  38906 |  sim: 0.510754	sim(only important): 0.601338		true label: 0
  41876 |  sim: 0.473557	sim(only important): 0.600099		true label: 0
  39570 |  sim: 0.466252	sim(only important): 0.617213		true label: 0
  27466 |  sim: 0.4562

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Fixed seed so the forest is reproducible across runs.
rnd_state = 1337

# Random forest hyper-parameters, gathered in one place for readability.
rf_params = dict(
    n_estimators=4000,
    max_features='log2',
    min_samples_leaf=1,
    n_jobs=3,
    class_weight='balanced',
    random_state=rnd_state,
    oob_score=False,
    verbose=0,
)
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='log2',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4000, n_jobs=3,
            oob_score=False, random_state=1337, verbose=0,
            warm_start=False)

In [23]:
# Evaluate the fitted random forest on the train / validation / test splits;
# the precision, ROC, BEDROC and EF figures appear in the cell output.
predict_with_existing(rf_clf, X_train, y_train, X_val, y_val, X_test, y_test)

train precision: 1.0
train roc: 1.0
train bedroc: 0.994725515647

validation precision: 0.351985010494
validation roc: 0.961587641692
validation bedroc: 0.829019613984

test precision: 0.130721818647
test roc: 0.872224861783
test bedroc: 0.689646653184

ratio: 0.02, EF: 31.25,	active: 16.0
ratio: 0.01, EF: 62.5,	active: 16.0
ratio: 0.0015, EF: 208.333333333,	active: 16.0
ratio: 0.001, EF: 187.5,	active: 16.0


## Show val and test rank on RF

In [24]:
# NOTE(review): this rebinds importance_index, which the XGBoost section set
# from the booster's feature scores. From this cell on, print_list receives the
# RF ordering; re-running the XGBoost print_list cells after this one would
# silently use RF importances instead. Consider separate names per model.
importance_index = np.argsort(rf_clf.feature_importances_)[::-1]
# Column 1 of predict_proba, reshaped with reshape_data_into_2_dim so it can be
# passed to get_rank / compare_rank alongside the NN predictions.
y_pred_on_val_rf = reshape_data_into_2_dim(rf_clf.predict_proba(X_val)[:, 1])
y_pred_on_test_rf = reshape_data_into_2_dim(rf_clf.predict_proba(X_test)[:, 1])

In [25]:
# Random-forest ranking output from get_rank: validation fold first, then the
# test fold, separated by an empty line.
print('For validation')
get_rank(y_val, y_pred_on_val_rf)
print('')
print('For test')
get_rank(y_test, y_pred_on_test_rf)

For validation
0.175000	     8/ 14484		Rank: 0.000552
0.010500	   173/ 14484		Rank: 0.011944
0.033250	    46/ 14484		Rank: 0.003176
0.000500	  4584/ 14484		Rank: 0.316487
0.133250	    15/ 14484		Rank: 0.001036
0.083750	    21/ 14484		Rank: 0.001450
0.307500	     1/ 14484		Rank: 0.000069
0.054500	    30/ 14484		Rank: 0.002071
0.198250	     5/ 14484		Rank: 0.000345
0.001750	  1086/ 14484		Rank: 0.074979
0.008500	   208/ 14484		Rank: 0.014361
0.000750	  3469/ 14484		Rank: 0.239506
0.121250	    16/ 14484		Rank: 0.001105
0.170750	     9/ 14484		Rank: 0.000621
0.044750	    38/ 14484		Rank: 0.002624
0.295500	     2/ 14484		Rank: 0.000138

For test
0.021250	    93/ 14486		Rank: 0.006420
0.000000	  8667/ 14486		Rank: 0.598302
0.000750	  3383/ 14486		Rank: 0.233536
0.251750	     2/ 14486		Rank: 0.000138
0.001500	  1441/ 14486		Rank: 0.099475
0.147250	     8/ 14486		Rank: 0.000552
0.063000	    33/ 14486		Rank: 0.002278
0.004000	   443/ 14486		Rank: 0.030581
0.019500	    97/ 14486		Rank: 0.006696


## Compare Rank

### Compare Rank on Val
<a id=26></a>

In [26]:
# Side-by-side validation ranking: NN columns first, random-forest columns second.
compare_rank(y_val, y_pred_on_val_single_NN, y_pred_on_val_rf)

14423 	|	0.999667	1	Rank: 0.0001	|	0.175000	8	Rank: 0.0006
14424 	|	0.000004	13523	Rank: 0.9337	|	0.010500	173	Rank: 0.0119
14425 	|	0.007614	26	Rank: 0.0018	|	0.033250	46	Rank: 0.0032
14426 	|	0.000007	11032	Rank: 0.7617	|	0.000500	4584	Rank: 0.3165
14427 	|	0.003222	34	Rank: 0.0023	|	0.133250	15	Rank: 0.0010
14428 	|	0.012485	23	Rank: 0.0016	|	0.083750	21	Rank: 0.0014
14429 	|	0.992665	4	Rank: 0.0003	|	0.307500	1	Rank: 0.0001
14430 	|	0.000008	10578	Rank: 0.7303	|	0.054500	30	Rank: 0.0021
14431 	|	0.997849	3	Rank: 0.0002	|	0.198250	5	Rank: 0.0003
14432 	|	0.000013	5067	Rank: 0.3498	|	0.001750	1086	Rank: 0.0750
14433 	|	0.000012	6246	Rank: 0.4312	|	0.008500	208	Rank: 0.0144
14434 	|	0.000013	5766	Rank: 0.3981	|	0.000750	3469	Rank: 0.2395
14435 	|	0.164386	9	Rank: 0.0006	|	0.121250	16	Rank: 0.0011
14436 	|	0.149422	10	Rank: 0.0007	|	0.170750	9	Rank: 0.0006
14437 	|	0.000069	306	Rank: 0.0211	|	0.044750	38	Rank: 0.0026
14438 	|	0.809995	6	Rank: 0.0004	|	0.295500	2	Rank: 0.0001


#### Poorly predicted by NN but better on RF
<a id=27></a>

In [27]:
# Validation molecules ranked poorly by the NN but higher by the random forest;
# the IDs are the first column of the compare_rank output above.
test_list = [14424, 14426, 14430, 14432, 14433, 14434]
# importance_index is the RF-based ordering at this point. `important_fetcher`
# is not passed (unlike the XGBoost cells), so print_list's default applies.
print_list(test_list, X_val, X_train, y_train, top=30, importance_index=importance_index)

testing Molecule:  14424
  22955 |  sim: 0.854409	sim(only important): 0.970143		true label: 0
  40475 |  sim: 0.832050	sim(only important): 0.968246		true label: 0
  35534 |  sim: 0.784465	sim(only important): 0.935414		true label: 0
  35424 |  sim: 0.768658	sim(only important): 0.801784		true label: 0
  32188 |  sim: 0.752618	sim(only important): 0.903696		true label: 0
   3197 |  sim: 0.750092	sim(only important): 0.866025		true label: 0
  17711 |  sim: 0.748931	sim(only important): 0.848875		true label: 0
   3226 |  sim: 0.745241	sim(only important): 0.868599		true label: 0
  11511 |  sim: 0.736075	sim(only important): 0.866025		true label: 0
   6299 |  sim: 0.734718	sim(only important): 0.802955		true label: 0
   3255 |  sim: 0.732997	sim(only important): 0.839146		true label: 0
  32150 |  sim: 0.732997	sim(only important): 0.839146		true label: 0
   3263 |  sim: 0.723536	sim(only important): 0.903696		true label: 0
  32157 |  sim: 0.715628	sim(only important): 0.848875		true labe

#### Good predictions on both NN and RF
<a id=28></a>

In [28]:
# Validation molecules ranked near the top by both the NN and the random forest
# (IDs from the compare_rank output above).
test_list = [14423, 14429, 14431, 14435, 14436, 14438] # NN is less confident on 14435, 14436
print_list(test_list, X_val, X_train, y_train, importance_index=importance_index)

testing Molecule:  14423
[31m  14427 |  sim: 0.840168	sim(only important): 0.919255		true label: 1  *[0m
   3286 |  sim: 0.755732	sim(only important): 0.794719		true label: 0
  35467 |  sim: 0.749613	sim(only important): 0.688247		true label: 0
  17790 |  sim: 0.740148	sim(only important): 0.710819		true label: 0
  17828 |  sim: 0.732709	sim(only important): 0.735767		true label: 0
   3320 |  sim: 0.732709	sim(only important): 0.842105		true label: 0
  25942 |  sim: 0.729204	sim(only important): 0.95119		true label: 0
  32200 |  sim: 0.722610	sim(only important): 0.763542		true label: 0
   3346 |  sim: 0.717741	sim(only important): 0.699913		true label: 0
  17802 |  sim: 0.711057	sim(only important): 0.778981		true label: 0
   3338 |  sim: 0.701964	sim(only important): 0.745601		true label: 0
  32225 |  sim: 0.693103	sim(only important): 0.735767		true label: 0
  17778 |  sim: 0.692436	sim(only important): 0.710819		true label: 0
  40715 |  sim: 0.686352	sim(only important): 0.858395

### Compare Rank on Test
<a id=29></a>

In [29]:
# Side-by-side test ranking: NN columns first, random-forest columns second.
compare_rank(y_test, y_pred_on_test_single_NN, y_pred_on_test_rf)

14424 	|	0.000033	769	Rank: 0.0531	|	0.021250	93	Rank: 0.0064
14425 	|	0.000002	14220	Rank: 0.9816	|	0.000000	8667	Rank: 0.5983
14426 	|	0.000029	999	Rank: 0.0690	|	0.000750	3383	Rank: 0.2335
14427 	|	0.160746	9	Rank: 0.0006	|	0.251750	2	Rank: 0.0001
14428 	|	0.000026	1262	Rank: 0.0871	|	0.001500	1441	Rank: 0.0995
14429 	|	0.708671	6	Rank: 0.0004	|	0.147250	8	Rank: 0.0006
14430 	|	0.000322	94	Rank: 0.0065	|	0.063000	33	Rank: 0.0023
14431 	|	0.000028	1089	Rank: 0.0752	|	0.004000	443	Rank: 0.0306
14432 	|	0.000078	254	Rank: 0.0175	|	0.019500	97	Rank: 0.0067
14433 	|	0.000026	1308	Rank: 0.0903	|	0.017750	103	Rank: 0.0071
14434 	|	0.000009	8551	Rank: 0.5903	|	0.000250	6975	Rank: 0.4815
14435 	|	0.000008	9950	Rank: 0.6869	|	0.000250	7019	Rank: 0.4845
14436 	|	0.920472	5	Rank: 0.0003	|	0.169500	6	Rank: 0.0004
14437 	|	0.069630	11	Rank: 0.0008	|	0.122250	15	Rank: 0.0010
14438 	|	0.000193	117	Rank: 0.0081	|	0.089500	21	Rank: 0.0014
14439 	|	0.040926	14	Rank: 0.0010	|	0.082250	26	Rank: 0.0018


#### Poorly predicted by NN but better on RF
<a id=30></a>

In [30]:
# Test molecules inspected as "bad on NN, better on RF".
# NOTE(review): this is the same list as the XGBoost test-set cell. In the
# compare_rank output above, 14428 is ranked 1262 by the NN and 1441 by the RF,
# so the RF is not actually better on it -- confirm whether it belongs here.
test_list = [14425, 14428, 14431, 14438]
print_list(test_list, X_test, X_train, y_train, top=30, importance_index=importance_index)

testing Molecule:  14425
  33815 |  sim: 0.953313	sim(only important): 1.0		true label: 0
  33834 |  sim: 0.880771	sim(only important): 0.963624		true label: 0
  33792 |  sim: 0.872872	sim(only important): 0.963624		true label: 0
  19401 |  sim: 0.866025	sim(only important): 0.800641		true label: 0
    925 |  sim: 0.827535	sim(only important): 0.846154		true label: 0
  33820 |  sim: 0.819382	sim(only important): 0.741249		true label: 0
  33825 |  sim: 0.813838	sim(only important): 0.784465		true label: 0
  33833 |  sim: 0.811107	sim(only important): 0.836242		true label: 0
  19423 |  sim: 0.811107	sim(only important): 0.889499		true label: 0
   4912 |  sim: 0.808290	sim(only important): 0.800641		true label: 0
  19388 |  sim: 0.807373	sim(only important): 0.889499		true label: 0
  19398 |  sim: 0.805316	sim(only important): 1.0		true label: 0
   4858 |  sim: 0.789024	sim(only important): 0.880705		true label: 0
  15347 |  sim: 0.781271	sim(only important): 0.877058		true label: 0
   62

#### Good predictions on both NN and RF
<a id=31></a>

In [31]:
# Test molecules ranked near the top by both the NN and the random forest
# (IDs from the compare_rank output above).
test_list = [14427, 14429, 14436, 14437, 14439]
print_list(test_list, X_test, X_train, y_train, importance_index=importance_index)

testing Molecule:  14427
  20334 |  sim: 0.755742	sim(only important): 1.0		true label: 0
[31m  28908 |  sim: 0.739130	sim(only important): 1.0		true label: 1  *[0m
[31m  28907 |  sim: 0.739130	sim(only important): 1.0		true label: 1  *[0m
  34811 |  sim: 0.695182	sim(only important): 0.83666		true label: 0
[31m  28917 |  sim: 0.647179	sim(only important): 0.881917		true label: 1  *[0m
  10626 |  sim: 0.595880	sim(only important): 0.755929		true label: 0
  25495 |  sim: 0.591520	sim(only important): 0.92582		true label: 0
  22138 |  sim: 0.563926	sim(only important): 0.606092		true label: 0
[31m  14433 |  sim: 0.546019	sim(only important): 0.845154		true label: 1  *[0m
  37261 |  sim: 0.542137	sim(only important): 0.717137		true label: 0
  28939 |  sim: 0.512272	sim(only important): 0.771517		true label: 0
  38906 |  sim: 0.510754	sim(only important): 0.668153		true label: 0
  20375 |  sim: 0.500435	sim(only important): 0.617213		true label: 0
  19591 |  sim: 0.478365	sim(only 