# 5. Best of all

Since the best score obtained, in general, has been thanks to those models that were training all the dataset (without dividing it into time-dose combinations), we won't be using them. What will be done in here will be the following:
- Compute the probabilities of the same training set we have used to create the models of those models which obtained the lowest score.
- Compute the log loss score for each column (i.e. MoA).
- Decide the best model for each column (depending on the last results).
- Compute the probabilities given by the best model of each column using the test set (last step).

At this point we have already proved there's no point in taking the probabilities before forcing the controls to be 0.0 into account. So, we can get rid off them!

MODELS TO BE USED:
- Random Forest Classifier: `1b_model_200_64.joblib` 
- Support Vector Classifier: `2a_model.joblib`
- Naive Bayes: `3a1_model.joblib` (with the imp features from the RFC, top 15)
- Logistic Regression: `4a1_model.joblib` (with the imp features from the RFC, top 15)

In [33]:
from sklearn.metrics import log_loss
from joblib import dump, load
import pandas as pd
import numpy as np

In [34]:
# Data to be used for the model:
xtrain = pd.read_csv("xtrain.csv", index_col=0)
ytrain = pd.read_csv("ytrain.csv", index_col=0)

# Data to be used for the predictions:
xtest = pd.read_csv("xtest.csv", index_col=0)
ytest = pd.read_csv("ytest.csv", index_col=0)

In [35]:
def list_logloss(ytest, proba_pred):
    log_loss_list = []
    for i in range(ytest.shape[1]):
        log_loss_list.append(log_loss(ytest.iloc[:, i], proba_pred.iloc[:, i], labels = [0, 1]))
    return log_loss_list

## a) RFC probabilities

In [36]:
RFC_model = load('output/1b_model_200_64.joblib')

#new dataframe for saving the predictions
proba_pred_RFC = pd.DataFrame(columns=ytest.columns)

name_col = ytest.columns.tolist()
for i in range(ytest.shape[1]): 
    proba_pred_RFC[name_col[i]] = RFC_model[i].predict_proba(xtrain)[:, 1]
    print(i, end=' ', flush=True)



0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 

In [37]:
proba_pred_RFC["sig_id"]= xtrain.index.tolist()
proba_pred_RFC = proba_pred_RFC.set_index('sig_id')
proba_pred_RFC.loc[xtrain.cp_type == 1, :] = 0
proba_pred_RFC.head(5)

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,5.2e-05,0.002261,0.000154,0.006034,0.018302,0.005222,0.002925,0.003602,0.000153,0.01295,...,0.0,0.0,0.001549,0.000704,0.002174,0.000101,0.002232,0.000311,0.001226,0.001743
id_000779bfc,0.000191,0.000225,0.000172,0.007732,0.008358,0.000774,0.001099,0.002804,5.5e-05,0.007892,...,0.000187,0.000248,0.000209,0.000206,0.001549,0.0,0.000707,0.001094,0.000432,0.001471
id_000a6266a,0.001285,0.000859,0.001869,0.010876,0.017281,0.004576,0.002748,0.005809,0.001605,0.0206,...,5.4e-05,0.00108,0.003089,0.010551,0.00148,0.0002,0.042972,0.000409,0.002561,0.002897
id_0015fd391,0.000159,0.001088,0.001446,0.007361,0.00843,0.001133,0.000811,0.001772,0.000519,0.00618,...,0.0,0.004625,0.002334,0.168074,0.00469,0.000425,0.008898,0.001288,0.000669,0.000851
id_001626bd3,0.001015,0.0018,0.004536,0.015056,0.02037,0.007301,0.006812,0.008186,0.00104,0.029784,...,0.001055,0.002397,0.003628,0.013834,0.005307,0.000321,0.01034,0.004865,0.00232,0.002047


In [38]:
list_logloss_RFC = list_logloss(ytrain, proba_pred_RFC)
print('Log loss: ', np.mean(np.array(list_logloss_RFC)))

Log loss:  0.012397412065333286


This last output is the log loss average. Since we want to know how well each column is predicted, we'll save the log loss score for each MoA in a new dataframe.

In [39]:
#dataframe with the scores
columns_score = pd.DataFrame(index=ytest.columns)
columns_score['RFC'] = list_logloss_RFC

columns_score

Unnamed: 0,RFC
5-alpha_reductase_inhibitor,0.003690
11-beta-hsd1_inhibitor,0.004028
acat_inhibitor,0.005102
acetylcholine_receptor_agonist,0.032621
acetylcholine_receptor_antagonist,0.048619
...,...
ubiquitin_specific_protease_inhibitor,0.001429
vegfr_inhibitor,0.026086
vitamin_b,0.005267
vitamin_d_receptor_agonist,0.007170


## b) SVC probabilities

In [40]:
SVC_model = load('output/2a_model.joblib') #list with the models for each target column

#new dataframe for saving the predictions
proba_pred_SVC = pd.DataFrame(columns=ytest.columns)

name_col = ytest.columns.tolist()
for i in range(ytest.shape[1]): 
    proba_pred_SVC[name_col[i]] = SVC_model[i].predict_proba(xtrain)[:, 1]
    print(i, end=' ', flush=True)



0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 

In [41]:
proba_pred_SVC["sig_id"]= xtrain.index.tolist()
proba_pred_SVC = proba_pred_SVC.set_index('sig_id')
proba_pred_SVC.loc[xtrain.cp_type == 1, :] = 0
proba_pred_SVC.head(5)

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0.0002674565,0.001048,0.000995,0.0059,0.018258,0.004052,0.002882,0.001362,9e-05,0.012619,...,9e-05,1.2e-05,0.001716,0.002081,0.001761,0.000177,0.000322,0.001253,6e-06,0.002579
id_000779bfc,9.670806e-05,0.000625,0.001038,0.0106,0.014467,0.004083,0.002052,0.003766,3.5e-05,0.003309,...,0.000257,0.000418,0.001898,0.002544,0.001993,0.000183,0.001706,0.001683,0.000574,0.002159
id_000a6266a,1.410812e-05,0.000766,0.00104,0.004229,0.018293,0.003214,0.00252,0.003568,0.000659,0.00515,...,0.000158,0.000478,0.000915,0.001263,0.001014,0.000395,0.020407,0.000691,0.000273,0.001726
id_0015fd391,9.965451e-07,0.000858,0.000993,0.007734,0.007578,0.002374,0.002375,0.002091,2e-06,0.000909,...,0.000136,0.004174,0.001341,0.01841,0.001972,0.000327,5e-05,0.000996,2e-06,0.000443
id_001626bd3,4.280354e-05,0.000794,0.001109,0.009555,0.01022,0.002744,0.00277,0.002512,0.000278,0.027426,...,0.00047,0.000167,0.000813,0.00279,0.001506,0.000303,0.002815,0.001555,8e-05,0.002082


In [42]:
list_logloss_SVC = list_logloss(ytrain, proba_pred_SVC)
print('Log loss: ', np.mean(np.array(list_logloss_SVC)))

Log loss:  0.009247943751795414


In [43]:
#dataframe with the scores
columns_score['SVC'] = list_logloss_SVC

columns_score

Unnamed: 0,RFC,SVC
5-alpha_reductase_inhibitor,0.003690,0.000804
11-beta-hsd1_inhibitor,0.004028,0.004654
acat_inhibitor,0.005102,0.007580
acetylcholine_receptor_agonist,0.032621,0.031486
acetylcholine_receptor_antagonist,0.048619,0.051494
...,...,...
ubiquitin_specific_protease_inhibitor,0.001429,0.001007
vegfr_inhibitor,0.026086,0.009281
vitamin_b,0.005267,0.005255
vitamin_d_receptor_agonist,0.007170,0.000733


## c) NB probabilities

In [44]:
NB_model = load('output/3a1_model.joblib')
imp_features = load('important_features_RFC_15.joblib')

proba_pred_NB = pd.DataFrame(columns=ytest.columns)

name_col = ytest.columns.tolist()
for i in range(ytest.shape[1]): 
    proba_pred_NB[name_col[i]] = NB_model[i].predict_proba(xtrain.iloc[:, imp_features[i].tolist()])[:, 1]
    print(i, end=' ', flush=True)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 

In [45]:
proba_pred_NB["sig_id"]= xtrain.index.tolist()
proba_pred_NB = proba_pred_NB.set_index('sig_id')
proba_pred_NB.loc[xtrain.cp_type == 1, :] = 0
proba_pred_NB.head(5)

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0.0006085114,0.45634,0.005417,0.01816787,0.965132,0.393794,0.051739,0.4511197,1.593294e-06,0.0674114,...,0.0002123343,0.000171,0.02377669,1.483583e-05,0.000417,9.193571e-05,0.028762,0.001596,4.982138e-06,0.2700689
id_000779bfc,1.623319e-12,0.029283,0.003072,0.2240646,0.8652744,0.339391,0.112679,0.1233626,0.0002409643,0.8509354,...,2.991058e-05,0.007808,0.06319533,1.555344e-07,0.00083,0.0001045505,0.030814,0.006839,0.009165235,0.01384688
id_000a6266a,2.335837e-53,0.079823,0.107604,0.003206329,0.1521984,0.256203,0.000381,0.01385686,0.0004910642,0.8421274,...,2.314166e-12,3.3e-05,0.0002777754,7.363143e-08,0.003615,1.656176e-06,0.008645,0.004145,1.456416e-07,0.002727999
id_0015fd391,4.58617e-24,0.000255,0.000347,4.061703e-11,7.208057999999999e-34,0.00593,9e-06,1.32169e-08,1.1673549999999999e-37,1.995761e-12,...,1.3116380000000001e-17,0.003713,2.613815e-09,0.425434,0.019688,5.504843e-09,1.4e-05,0.005542,4.975834e-20,1.12084e-12
id_001626bd3,8.098875e-12,0.004615,0.04591,0.004912132,0.1937106,0.022405,0.159328,0.5472007,8.647079e-07,0.3858927,...,3.211983e-09,0.000407,0.00172488,8.578195e-06,0.003474,0.000454745,0.073191,0.127168,0.0006847485,0.01856768


In [46]:
list_logloss_NB = list_logloss(ytrain, proba_pred_NB)
print('Log loss: ', np.mean(np.array(list_logloss_NB)))

Log loss:  0.20017029734522845


In [47]:
#dataframe with the scores
columns_score['NB'] = list_logloss_NB

columns_score

Unnamed: 0,RFC,SVC,NB
5-alpha_reductase_inhibitor,0.003690,0.000804,0.052357
11-beta-hsd1_inhibitor,0.004028,0.004654,0.020467
acat_inhibitor,0.005102,0.007580,0.031101
acetylcholine_receptor_agonist,0.032621,0.031486,0.108851
acetylcholine_receptor_antagonist,0.048619,0.051494,1.193241
...,...,...,...
ubiquitin_specific_protease_inhibitor,0.001429,0.001007,0.061496
vegfr_inhibitor,0.026086,0.009281,0.053477
vitamin_b,0.005267,0.005255,0.026283
vitamin_d_receptor_agonist,0.007170,0.000733,0.006298


## d) LR probabilities

In [48]:
LR_model = load('output/4a1_model.joblib')
imp_features = load('important_features_RFC_15.joblib')

proba_pred_LR = pd.DataFrame(columns=ytest.columns)

name_col = ytest.columns.tolist()
for i in range(ytest.shape[1]): 
    proba_pred_LR[name_col[i]] = LR_model[i].predict_proba(xtrain.iloc[:, imp_features[i].tolist()])[:, 1]
    print(i, end=' ', flush=True)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 

In [49]:
proba_pred_LR["sig_id"]= xtrain.index.tolist()
proba_pred_LR = proba_pred_LR.set_index('sig_id')
proba_pred_LR.loc[xtrain.cp_type == 1, :] = 0
proba_pred_LR.head(5)

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,0.0001109856,0.003396,0.001813,0.006501,0.051071,0.004706,0.000977,0.00407,4.607191e-06,0.006953,...,0.0001584836,1.4e-05,0.001319,0.009592,0.000969,0.00036,0.004299,0.00024,5.958408e-05,0.004744
id_000779bfc,3.295886e-05,0.002191,0.000592,0.011266,0.008747,0.00146,0.00114,0.002816,4.882609e-05,0.02131,...,9.126641e-05,0.000253,0.0016,0.002606,0.001757,8.3e-05,0.002917,0.000813,0.0005319878,0.001684
id_000a6266a,1.454779e-05,0.002867,0.013971,0.001284,0.010914,0.00212,0.000854,0.003204,3.754485e-05,0.028398,...,2.37689e-06,0.000391,0.000764,0.001911,0.003468,3.2e-05,0.028466,0.000426,1.748122e-05,0.00013
id_0015fd391,1.083275e-07,0.000719,0.00014,0.000601,0.000709,0.001417,0.000525,0.00656,9.457401e-14,0.000787,...,1.868039e-07,0.001977,0.000326,0.028163,0.01719,1.1e-05,7.6e-05,0.000661,1.211446e-07,2.6e-05
id_001626bd3,3.392684e-05,0.000835,0.006963,0.009277,0.015422,0.002484,0.0019,0.011153,1.206347e-05,0.019165,...,6.480296e-05,0.006965,0.000121,0.000919,0.005501,7.2e-05,0.004054,0.059541,0.0007838522,0.002954


In [50]:
list_logloss_LR = list_logloss(ytrain, proba_pred_LR)
print('Log loss: ', np.mean(np.array(list_logloss_LR)))

Log loss:  0.015301122889885995


In [51]:
#dataframe with the scores
columns_score['LR'] = list_logloss_LR

columns_score

Unnamed: 0,RFC,SVC,NB,LR
5-alpha_reductase_inhibitor,0.003690,0.000804,0.052357,0.003761
11-beta-hsd1_inhibitor,0.004028,0.004654,0.020467,0.005590
acat_inhibitor,0.005102,0.007580,0.031101,0.007251
acetylcholine_receptor_agonist,0.032621,0.031486,0.108851,0.042931
acetylcholine_receptor_antagonist,0.048619,0.051494,1.193241,0.062662
...,...,...,...,...
ubiquitin_specific_protease_inhibitor,0.001429,0.001007,0.061496,0.001756
vegfr_inhibitor,0.026086,0.009281,0.053477,0.028154
vitamin_b,0.005267,0.005255,0.026283,0.006871
vitamin_d_receptor_agonist,0.007170,0.000733,0.006298,0.005187


We want to save this dataframe so as to be able to know at any moment the MoAs that are easier or harder to predict (we will leave the NB scores out).

In [53]:
scores = columns_score[['RFC', 'SVC', 'LR']].copy()
scores['mean'] = scores.mean(axis=1)
scores['median'] = scores.median(axis=1)
scores

Unnamed: 0,RFC,SVC,LR,mean,median
5-alpha_reductase_inhibitor,0.003690,0.000804,0.003761,0.002751,0.003221
11-beta-hsd1_inhibitor,0.004028,0.004654,0.005590,0.004757,0.004706
acat_inhibitor,0.005102,0.007580,0.007251,0.006644,0.006947
acetylcholine_receptor_agonist,0.032621,0.031486,0.042931,0.035679,0.034150
acetylcholine_receptor_antagonist,0.048619,0.051494,0.062662,0.054258,0.052876
...,...,...,...,...,...
ubiquitin_specific_protease_inhibitor,0.001429,0.001007,0.001756,0.001397,0.001413
vegfr_inhibitor,0.026086,0.009281,0.028154,0.021173,0.023629
vitamin_b,0.005267,0.005255,0.006871,0.005798,0.005533
vitamin_d_receptor_agonist,0.007170,0.000733,0.005187,0.004363,0.004775


In [54]:
scores.to_csv('scores_prediction_MoA.csv')

## Best model for each column (MoA)

Deciding the best model for each column:

In [21]:
name_col = ytest.columns.tolist()

In [22]:
list_min_score = []

for r in range(columns_score.shape[0]):
    current_row = columns_score.iloc[r,:].tolist()
    model_min = current_row.index(min(current_row))
    if model_min == 0:
        list_min_score.append('RFC')
    if model_min == 1:
        list_min_score.append('SVC')
    if model_min == 2:
        list_min_score.append('NB')
    if model_min == 3:
        list_min_score.append('LR')

list_min_score

['SVC',
 'RFC',
 'RFC',
 'SVC',
 'RFC',
 'RFC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'RFC',
 'RFC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'NB',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'RFC',
 'RFC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'NB',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'RFC',
 'SVC',
 'SVC',
 'SVC',
 'SVC',
 'LR',
 'SVC',
 'SVC',
 'RFC',
 'SVC',
 'S

In [23]:
dump(list_min_score, 'list_min_score.joblib')

['list_min_score.joblib']

## Probabilites for the test set

In [29]:
proba_pred_best = pd.DataFrame(columns=ytest.columns)

name_col = ytest.columns.tolist()
for i, best in zip(range(ytest.shape[1]), list_min_score): 
    if best=='RFC':  
        proba_pred_best[name_col[i]] = RFC_model[i].predict_proba(xtest)[:, 1]
    if best=='SVC':
        proba_pred_best[name_col[i]] = SVC_model[i].predict_proba(xtest)[:, 1]
    if best=='NB':  
        proba_pred_best[name_col[i]] = NB_model[i].predict_proba(xtest.iloc[:, imp_features[i].tolist()])[:, 1]
    if best=='LR':  
        proba_pred_best[name_col[i]] = LR_model[i].predict_proba(xtest.iloc[:, imp_features[i].tolist()])[:, 1]

    print(i, end=' ', flush=True)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 

In [30]:
proba_pred_best

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0.000677,0.000246,0.000863,0.007985,0.012511,0.004340,0.004140,0.003205,1.643145e-04,0.023980,...,0.000308,0.000284,0.003525,0.001783,0.005807,0.000353,0.001197,0.000579,4.084030e-05,0.000336
1,0.000414,0.000301,0.000706,0.011380,0.013796,0.002217,0.002779,0.007874,8.936935e-06,0.009102,...,0.000303,0.000138,0.002969,0.001939,0.003654,0.000377,0.003436,0.001509,3.342885e-05,0.000962
2,0.001359,0.000263,0.000285,0.008003,0.017532,0.002997,0.001498,0.010359,5.453278e-06,0.017884,...,0.000174,0.000320,0.001842,0.003110,0.001995,0.000768,0.001851,0.000926,2.176002e-05,0.001201
3,0.001481,0.000111,0.001020,0.004104,0.015061,0.000957,0.000734,0.004658,4.448741e-05,0.002333,...,0.000090,0.000102,0.001842,0.001464,0.003366,0.000427,0.000683,0.000740,9.828005e-04,0.000809
4,0.000194,0.001488,0.001648,0.005316,0.027394,0.005101,0.005345,0.003223,4.889386e-07,0.012203,...,0.000159,0.000470,0.000869,0.002805,0.001777,0.000158,0.003450,0.001087,3.217573e-05,0.000999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1186,0.000652,0.000109,0.000182,0.006833,0.018098,0.000924,0.000560,0.001981,4.363302e-07,0.010471,...,0.000188,0.000642,0.003003,0.004462,0.000681,0.000402,0.000588,0.000758,3.486932e-03,0.002016
1187,0.000006,0.000588,0.001826,0.006344,0.010557,0.002197,0.005113,0.001840,5.564583e-06,0.001169,...,0.000306,0.000043,0.000707,0.001275,0.003126,0.000089,0.000044,0.000415,1.638456e-07,0.000840
1188,0.000177,0.000255,0.000183,0.005596,0.010701,0.000687,0.000752,0.004191,7.100289e-06,0.013814,...,0.000151,0.000444,0.002085,0.004324,0.000991,0.000185,0.000801,0.001075,1.258943e-04,0.002689
1189,0.000209,0.002090,0.002602,0.016383,0.018218,0.005474,0.006333,0.002584,4.125846e-06,0.005525,...,0.000123,0.000391,0.005261,0.004655,0.002146,0.000131,0.000311,0.001217,1.852360e-07,0.000888


In [32]:
#Saving the probabilities:
proba_pred_best.to_csv(r'output/5_probas.csv')