Screen the following datasets to predict active compounds using the `xgboost_5` model <br>
1. Dataset of known inhibitors (validation of models)
2. Dataset of Peptidomimetics from ChemDiv
3. Dataset of Peptidomimetics from Life Chemicals
4. Dataset of Anti-inflammatory compounds


In [1]:
# importing libraries
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the pickled "vt" object saved for the xgboost_5 model. Below it is used
# as a column selector (`df.loc[:, mask]`) to pick the descriptors fed to `clf`.
# NOTE(review): presumably a feature-selection support mask saved at training time - confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('xgboost_5_vt_pickle', 'rb') as f:
    mask = pickle.load(f)

In [3]:
# Load the pickled xgboost_5 classifier; below it is used via
# clf.predict(...) and clf.predict_proba(...).
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('xgboost_5_pickle', 'rb') as f:
    clf = pickle.load(f)

1. Dataset of known inhibitors (validation of models)

In [4]:
# Read the molecular-descriptor table for the known inhibitors (validation set)
df_inhi = pd.read_csv('../../../../../data/Molecular_Descriptors_Known_Inhibitors.csv')

# Show (rows, columns) and the column labels. The recorded output lists two
# identifier columns (ReadIn_ID, USER_ID) followed by descriptor columns up to D777.
df_inhi.shape, df_inhi.columns

((9, 779),
 Index(['ReadIn_ID', 'USER_ID', 'D001', 'D002', 'D003', 'D004', 'D005', 'D006',
        'D007', 'D008',
        ...
        'D768', 'D769', 'D770', 'D771', 'D772', 'D773', 'D774', 'D775', 'D776',
        'D777'],
       dtype='object', length=779))

In [5]:
# Drop the ReadIn_ID identifier column; it is not referenced anywhere below.
# Reassign rather than using inplace=True so the cell reads as
# "new frame derived from old frame", and name the axis via `columns=`.
df_inhi = df_inhi.drop(columns=['ReadIn_ID'])
df_inhi.shape

(9, 778)

In [6]:
# Use the compound identifier (USER_ID) as the row index so it is no longer
# a data column. Reassign rather than mutating with inplace=True.
df_inhi = df_inhi.set_index('USER_ID')
df_inhi.shape

(9, 777)

In [7]:
# Subset the dataset for prediction: keep only the columns selected by `mask`.
# NOTE(review): assumes `mask` lines up with the descriptor columns in file order - confirm.
df_inhi_pred = df_inhi.loc[:, mask]

# check shape (the recorded output shows 356 columns remaining)
df_inhi_pred.shape

(9, 356)

In [8]:
# Predict a class label and class probabilities for every known inhibitor
pred_1 = clf.predict(df_inhi_pred)
pred_prob_1 = clf.predict_proba(df_inhi_pred)

# Sum of the predicted labels; the labels displayed below are 0/1, so this is
# the number of compounds predicted as 1 (active).
pred_1.sum()

6

In [9]:
# Attach the predicted label and a probability column to the descriptor frame.
# NOTE(review): this adds two columns to df_inhi, so re-running the earlier
# `df_inhi.loc[:, mask]` cell afterwards may no longer line up with `mask`.
df_inhi['Predictions'] = pred_1
# Column 1 of predict_proba is stored as the probability.
# NOTE(review): presumably the class-1 (active) column - confirm against clf.classes_.
df_inhi['Probabilities'] = pred_prob_1[:, 1]
df_inhi.head()

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8087741,2,0,0,2,3,0,0,0,0,0,...,1,0,0,0,5.459,-0.313,0.353,3.386,1,0.995676
2545304,2,0,0,0,3,0,0,0,0,0,...,0,0,0,0,5.358,0.322,0.4,2.651,1,0.941898
2534913,2,0,0,1,4,0,0,0,0,0,...,1,0,0,0,5.644,-0.737,0.308,1.511,1,0.999892
11703255,1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,4.954,-0.643,0.25,2.766,1,0.920193
53480255,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,6.209,4.027,0.0,0.974,0,0.002491


In [10]:
# Show only the rows whose predicted label is 1 (active)
df_inhi.loc[df_inhi['Predictions'].eq(1)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8087741,2,0,0,2,3,0,0,0,0,0,...,1,0,0,0,5.459,-0.313,0.353,3.386,1,0.995676
2545304,2,0,0,0,3,0,0,0,0,0,...,0,0,0,0,5.358,0.322,0.4,2.651,1,0.941898
2534913,2,0,0,1,4,0,0,0,0,0,...,1,0,0,0,5.644,-0.737,0.308,1.511,1,0.999892
11703255,1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,4.954,-0.643,0.25,2.766,1,0.920193
44224215,2,0,0,1,2,0,0,0,0,0,...,0,0,0,0,4.755,0.437,0.6,2.149,1,0.994203
DB01136,3,0,0,1,3,0,0,0,0,0,...,0,0,0,0,5.392,-0.379,0.545,2.339,1,0.992609


In [11]:
# Save the predicted actives: index (USER_ID) plus the Probabilities column only
(
    df_inhi
    .loc[df_inhi['Predictions'].eq(1)]
    .to_csv(
        '../../data/xgboost_5_predicted_known_inhibitors.csv',
        index=True,
        columns=['Probabilities'],
    )
)

In [12]:
# Show the predicted actives (label 1) whose probability is above 0.7
df_inhi.loc[df_inhi['Predictions'].eq(1) & df_inhi['Probabilities'].gt(0.7)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8087741,2,0,0,2,3,0,0,0,0,0,...,1,0,0,0,5.459,-0.313,0.353,3.386,1,0.995676
2545304,2,0,0,0,3,0,0,0,0,0,...,0,0,0,0,5.358,0.322,0.4,2.651,1,0.941898
2534913,2,0,0,1,4,0,0,0,0,0,...,1,0,0,0,5.644,-0.737,0.308,1.511,1,0.999892
11703255,1,0,0,0,2,0,0,0,0,0,...,0,0,0,0,4.954,-0.643,0.25,2.766,1,0.920193
44224215,2,0,0,1,2,0,0,0,0,0,...,0,0,0,0,4.755,0.437,0.6,2.149,1,0.994203
DB01136,3,0,0,1,3,0,0,0,0,0,...,0,0,0,0,5.392,-0.379,0.545,2.339,1,0.992609


2. Dataset of Peptidomimetics from ChemDiv

In [13]:
# Read the molecular-descriptor table for the ChemDiv peptidomimetics library
df_pep_cd = pd.read_csv('../../../../../data/Molecular_Descriptors_Peptidomimetics_ChemDiv.csv')

# Show (rows, columns) and the column labels. The recorded output lists three
# identifier columns (ReadIn_ID, USER_ID, USER_ID_2) followed by descriptor columns up to D777.
df_pep_cd.shape, df_pep_cd.columns

((36711, 780),
 Index(['ReadIn_ID', 'USER_ID', 'USER_ID_2', 'D001', 'D002', 'D003', 'D004',
        'D005', 'D006', 'D007',
        ...
        'D768', 'D769', 'D770', 'D771', 'D772', 'D773', 'D774', 'D775', 'D776',
        'D777'],
       dtype='object', length=780))

In [14]:
# Drop the ReadIn_ID and USER_ID_2 identifier columns; neither is referenced below.
# Reassign rather than using inplace=True so the cell reads as
# "new frame derived from old frame", and name the axis via `columns=`.
df_pep_cd = df_pep_cd.drop(columns=['ReadIn_ID', 'USER_ID_2'])
df_pep_cd.shape

(36711, 778)

In [15]:
# Use the compound identifier (USER_ID) as the row index so it is no longer
# a data column. Reassign rather than mutating with inplace=True.
df_pep_cd = df_pep_cd.set_index('USER_ID')
df_pep_cd.shape

(36711, 777)

In [16]:
# Subset the dataset for prediction: keep only the columns selected by `mask`.
# NOTE(review): assumes `mask` lines up with the descriptor columns in file order - confirm.
df_pep_cd_pred = df_pep_cd.loc[:, mask]

# check shape (the recorded output shows 356 columns remaining)
df_pep_cd_pred.shape

(36711, 356)

In [17]:
# Predict a class label and class probabilities for every ChemDiv compound
pred_2 = clf.predict(df_pep_cd_pred)
pred_prob_2 = clf.predict_proba(df_pep_cd_pred)

# Sum of the predicted labels; the labels displayed below are 0/1, so this is
# the number of compounds predicted as 1 (active).
pred_2.sum()

12

In [18]:
# Attach the predicted label and a probability column to the descriptor frame.
# NOTE(review): this adds two columns to df_pep_cd, so re-running the earlier
# `df_pep_cd.loc[:, mask]` cell afterwards may no longer line up with `mask`.
df_pep_cd['Predictions'] = pred_2
# Column 1 of predict_proba is stored as the probability.
# NOTE(review): presumably the class-1 (active) column - confirm against clf.classes_.
df_pep_cd['Probabilities'] = pred_prob_2[:, 1]
df_pep_cd.head()

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M652-0486,1,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.42626,-0.648642,0.181818,0.498807,0,0.000112
L258-0120,2,0,0,0,3,0,0,0,0,0,...,0,0,0,0,5.35755,-0.726331,0.375,2.81018,0,0.000288
L834-0670,2,0,0,1,3,0,0,0,0,0,...,0,0,0,0,5.2854,-0.696023,0.4,2.89726,0,0.000655
L258-0113,2,0,0,0,3,0,0,0,0,0,...,0,0,0,0,5.35755,-0.726331,0.375,2.81018,0,5.2e-05
L834-0614,2,0,0,1,3,0,0,0,0,0,...,0,0,0,0,5.2854,-0.696023,0.4,3.01043,0,6.4e-05


In [19]:
# Show only the ChemDiv rows whose predicted label is 1 (active)
df_pep_cd.loc[df_pep_cd['Predictions'].eq(1)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F037-0026,3,0,0,0,3,1,0,0,0,0,...,0,0,0,1,5.70044,-0.708671,0.473684,2.37194,1,0.51077
G622-0800,2,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.64386,-0.781209,0.324324,3.24668,1,0.638913
L036-0317,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.70044,-0.816815,0.473684,3.33826,1,0.525448
L834-1079,2,0,0,2,3,0,0,0,0,0,...,0,0,0,0,5.61471,-0.721116,0.324324,2.84723,1,0.670045
F144-0356,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.70044,-0.744309,0.461538,3.96095,1,0.726722
F144-0520,3,1,0,0,4,0,0,0,0,0,...,0,0,0,0,5.72792,-0.744309,0.45,3.96095,1,0.563392
F144-0264,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.75489,-0.757747,0.439024,3.51617,1,0.504879
F144-0051,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.72792,-0.751209,0.45,3.32071,1,0.539736
D517-1329,3,0,0,2,4,0,0,0,0,0,...,0,0,0,0,5.75489,-0.793711,0.45,3.66712,1,0.769916
F089-0084,3,0,0,0,3,1,0,0,0,0,...,0,0,0,1,5.67243,-0.700127,0.486486,3.00755,1,0.712711


In [20]:
# create a dataframe of predictions of 1 only with probabilities greater than 0.69
# (the cut-off used here differs from the 0.7 applied to the known inhibitors)
df_pep_cd[(df_pep_cd['Predictions'] == 1) & (df_pep_cd['Probabilities'] > 0.69)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F144-0356,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.70044,-0.744309,0.461538,3.96095,1,0.726722
D517-1329,3,0,0,2,4,0,0,0,0,0,...,0,0,0,0,5.75489,-0.793711,0.45,3.66712,1,0.769916
F089-0084,3,0,0,0,3,1,0,0,0,0,...,0,0,0,1,5.67243,-0.700127,0.486486,3.00755,1,0.712711
L036-0366,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.75489,-0.799443,0.45,3.04257,1,0.751666


3. Dataset of Peptidomimetics from Life Chemicals

In [21]:
# Read the molecular-descriptor table for the Life Chemicals peptidomimetics library
df_pep_lc = pd.read_csv('../../../../../data/Molecular_Descriptors_Peptidomimetics_Life_Chemicals.csv')

# Show (rows, columns) and the column labels. The recorded output lists two
# identifier columns (ReadIn_ID, USER_ID) followed by descriptor columns up to D777.
df_pep_lc.shape, df_pep_lc.columns

((5836, 779),
 Index(['ReadIn_ID', 'USER_ID', 'D001', 'D002', 'D003', 'D004', 'D005', 'D006',
        'D007', 'D008',
        ...
        'D768', 'D769', 'D770', 'D771', 'D772', 'D773', 'D774', 'D775', 'D776',
        'D777'],
       dtype='object', length=779))

In [22]:
# Drop the ReadIn_ID identifier column; it is not referenced anywhere below.
# Reassign rather than using inplace=True so the cell reads as
# "new frame derived from old frame", and name the axis via `columns=`.
df_pep_lc = df_pep_lc.drop(columns=['ReadIn_ID'])
df_pep_lc.shape

(5836, 778)

In [23]:
# Use the compound identifier (USER_ID) as the row index so it is no longer
# a data column. Reassign rather than mutating with inplace=True.
df_pep_lc = df_pep_lc.set_index('USER_ID')
df_pep_lc.shape

(5836, 777)

In [24]:
# Subset the dataset for prediction: keep only the columns selected by `mask`.
# NOTE(review): assumes `mask` lines up with the descriptor columns in file order - confirm.
df_pep_lc_pred = df_pep_lc.loc[:, mask]

# check shape (the recorded output shows 356 columns remaining)
df_pep_lc_pred.shape

(5836, 356)

In [25]:
# Predict a class label and class probabilities for every Life Chemicals compound
pred_3 = clf.predict(df_pep_lc_pred)
pred_prob_3 = clf.predict_proba(df_pep_lc_pred)

# Sum of the predicted labels; the labels displayed below are 0/1, so this is
# the number of compounds predicted as 1 (active).
# NOTE(review): the recorded output is 3071 of 5836 compounds, a much higher
# active rate than the other libraries screened in this notebook - worth
# checking that this file's descriptors match the training data's format.
pred_3.sum()

3071

In [26]:
# Attach the predicted label and a probability column to the descriptor frame.
# NOTE(review): this adds two columns to df_pep_lc, so re-running the earlier
# `df_pep_lc.loc[:, mask]` cell afterwards may no longer line up with `mask`.
df_pep_lc['Predictions'] = pred_3
# Column 1 of predict_proba is stored as the probability.
# NOTE(review): presumably the class-1 (active) column - confirm against clf.classes_.
df_pep_lc['Probabilities'] = pred_prob_3[:, 1]
df_pep_lc.head()

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F8881-1049,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,4.087,-0.626,0.0,0.259,0,0.072959
F6513-5720,0,0,0,1,2,0,0,0,0,0,...,0,0,0,0,4.644,-0.776,0.0,3.033,1,0.762628
F6619-2116,0,0,0,2,1,0,0,0,0,0,...,0,0,0,0,4.644,-0.244,0.0,0.265,0,0.03298
F6497-5659,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,4.459,-0.135,0.0,-0.512,0,0.177678
F6559-1463,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,4.459,-0.692,0.0,1.597,0,0.013581


In [27]:
# Show only the Life Chemicals rows whose predicted label is 1 (active)
df_pep_lc.loc[df_pep_lc['Predictions'].eq(1)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F6513-5720,0,0,0,1,2,0,0,0,0,0,...,0,0,0,0,4.644,-0.776,0.000,3.033,1,0.762628
F6545-0060,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,4.524,-0.692,0.000,0.836,1,0.719536
F6521-7740,0,1,0,1,0,0,0,0,0,0,...,1,0,0,0,4.322,-0.673,0.000,1.415,1,0.726198
F6556-2677,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,4.459,-0.192,0.333,1.726,1,0.831613
F6548-0173,1,0,0,0,3,0,0,0,0,0,...,0,0,0,0,4.585,-0.750,0.316,1.645,1,0.508943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F0410-0004,5,0,0,0,6,0,0,0,0,0,...,0,0,0,0,6.044,-0.852,0.638,6.654,1,0.552399
F0411-0012,5,0,0,0,6,0,0,0,0,0,...,0,0,0,0,6.044,-0.824,0.652,6.743,1,0.999452
F0410-0003,5,0,0,0,6,0,0,0,0,0,...,0,0,0,0,6.044,-0.828,0.638,6.922,1,0.993724
F0410-0005,5,0,0,0,6,0,0,0,0,0,...,0,0,0,0,6.044,-0.828,0.638,7.011,1,0.737636


In [28]:
# create a dataframe of predictions of 1 only with probabilities greater than 0.95
# (a stricter cut-off than the 0.7 applied to the known inhibitors)
df_pep_lc[(df_pep_lc['Predictions'] == 1) & (df_pep_lc['Probabilities'] > 0.95)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F6521-7751,0,0,0,2,0,0,0,0,0,0,...,1,0,0,0,4.459,-0.709,0.000,1.951,1,0.996115
F6475-4409,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,4.807,-0.171,0.000,-0.690,1,0.966716
F6521-7936,0,0,0,1,1,0,0,0,0,0,...,1,0,0,0,4.585,-0.724,0.000,3.028,1,0.985307
F6204-2182,0,0,0,1,2,0,0,0,0,0,...,1,0,0,0,4.700,-0.787,0.000,3.278,1,0.985985
F6507-8857,1,0,0,0,2,0,0,0,0,0,...,1,0,0,0,4.700,-0.797,0.273,2.609,1,0.988106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F1280-0010,4,0,0,1,5,0,0,0,0,0,...,1,0,0,0,5.919,-0.789,0.523,5.183,1,0.973810
F0554-0832,4,0,0,2,4,0,0,0,0,0,...,1,0,0,0,5.845,-0.784,0.535,4.177,1,0.994496
F0554-0828,4,0,0,1,4,0,0,0,0,0,...,1,0,0,0,5.895,-0.795,0.523,4.625,1,0.995296
F0411-0012,5,0,0,0,6,0,0,0,0,0,...,0,0,0,0,6.044,-0.824,0.652,6.743,1,0.999452


In [29]:
# Save the predicted actives with probability above 0.95:
# index (USER_ID) plus the Probabilities column only
(
    df_pep_lc
    .loc[df_pep_lc['Predictions'].eq(1) & df_pep_lc['Probabilities'].gt(0.95)]
    .to_csv(
        '../../data/xgboost_5_predicted_peptidomimetics_life_chemicals.csv',
        index=True,
        columns=['Probabilities'],
    )
)

4. Dataset of Anti-inflammatory compounds

In [30]:
# Read the molecular-descriptor table for the anti-inflammatory compound library
df_infla = pd.read_csv('../../../../../data/Molecular_Descriptors_Antiinflammatory.csv')

# Show (rows, columns) and the column labels. The recorded output lists two
# identifier columns (ReadIn_ID, USER_ID) followed by descriptor columns up to D777.
df_infla.shape, df_infla.columns

((23839, 779),
 Index(['ReadIn_ID', 'USER_ID', 'D001', 'D002', 'D003', 'D004', 'D005', 'D006',
        'D007', 'D008',
        ...
        'D768', 'D769', 'D770', 'D771', 'D772', 'D773', 'D774', 'D775', 'D776',
        'D777'],
       dtype='object', length=779))

In [31]:
# Drop the ReadIn_ID identifier column; it is not referenced anywhere below.
# Reassign rather than using inplace=True so the cell reads as
# "new frame derived from old frame", and name the axis via `columns=`.
df_infla = df_infla.drop(columns=['ReadIn_ID'])
df_infla.shape

(23839, 778)

In [32]:
# Use the compound identifier (USER_ID) as the row index so it is no longer
# a data column. Reassign rather than mutating with inplace=True.
df_infla = df_infla.set_index('USER_ID')
df_infla.shape

(23839, 777)

In [33]:
# Subset the dataset for prediction: keep only the columns selected by `mask`.
# NOTE(review): assumes `mask` lines up with the descriptor columns in file order - confirm.
df_infla_pred = df_infla.loc[:, mask]

# check shape (the recorded output shows 356 columns remaining)
df_infla_pred.shape

(23839, 356)

In [34]:
# Predict a class label and class probabilities for every anti-inflammatory compound
pred_4 = clf.predict(df_infla_pred)
pred_prob_4 = clf.predict_proba(df_infla_pred)

# Sum of the predicted labels; the labels displayed below are 0/1, so this is
# the number of compounds predicted as 1 (active).
pred_4.sum()

150

In [35]:
# Attach the predicted label and a probability column to the descriptor frame.
# NOTE(review): this adds two columns to df_infla, so re-running the earlier
# `df_infla.loc[:, mask]` cell afterwards may no longer line up with `mask`.
df_infla['Predictions'] = pred_4
# Column 1 of predict_proba is stored as the probability.
# NOTE(review): presumably the class-1 (active) column - confirm against clf.classes_.
df_infla['Probabilities'] = pred_prob_4[:, 1]
df_infla.head()

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F987-0090,2,0,0,2,2,0,0,0,0,0,...,1,0,0,0,5.49185,-0.703176,0.352941,2.63176,0,0.024043
F279-0513,1,0,0,2,2,0,0,0,0,0,...,0,0,0,0,5.32193,-0.706842,0.193548,1.8641,0,0.000172
G300-0457,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.58496,-0.7434,0.5,4.46192,0,0.000365
G732-2734,3,0,0,2,3,0,0,0,0,0,...,0,0,0,0,5.70044,-0.737017,0.461538,2.03623,0,0.000175
F838-0741,2,0,0,2,2,0,0,0,0,0,...,0,0,0,1,5.52356,-0.671214,0.352941,2.08026,0,0.00399


In [36]:
# Show only the anti-inflammatory rows whose predicted label is 1 (active)
df_infla.loc[df_infla['Predictions'].eq(1)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G334-0434,2,0,0,2,3,0,0,0,0,0,...,0,0,0,0,5.64386,-0.816815,0.315789,4.42719,1,0.656106
C680-0603,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.75489,-0.270150,0.450000,3.21352,1,0.870455
C893-0646,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.61471,-0.811255,0.486486,4.28037,1,0.749681
F432-0488,3,0,0,1,3,0,0,0,0,0,...,1,0,0,0,5.61471,-0.681450,0.514286,2.99648,1,0.719729
L322-0443,3,0,0,1,3,0,0,0,0,0,...,0,0,0,1,5.61471,-0.661023,0.500000,4.15144,1,0.735843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4896-5362,3,0,0,2,3,0,0,0,0,0,...,0,0,0,0,5.42626,-0.426120,0.562500,3.48450,1,0.711505
Y040-7511,2,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.24793,-0.798443,0.400000,5.16370,1,0.908452
D143-0194,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.44294,-0.883552,0.531250,4.36997,1,0.593424
G646-0666,2,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.58496,-0.712425,0.342857,2.82994,1,0.753424


In [37]:
# create a dataframe of predictions of 1 only with probabilities greater than 0.9
# (a stricter cut-off than the 0.7 applied to the known inhibitors)
df_infla[(df_infla['Predictions'] == 1) & (df_infla['Probabilities'] > 0.9)]

Unnamed: 0_level_0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D770,D771,D772,D773,D774,D775,D776,D777,Predictions,Probabilities
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3793-3202,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.65821,-0.751162,0.459459,3.61264,1,0.933813
8006-5235,4,0,0,0,6,0,0,0,0,0,...,0,0,0,0,5.87036,-0.783876,0.534884,4.57107,1,0.951641
7999-1527,3,0,0,0,5,0,0,0,0,0,...,0,0,0,0,5.64386,-0.816815,0.473684,3.74519,1,0.942519
3807-1819,3,0,0,1,4,0,0,0,0,0,...,0,0,0,0,5.64386,-0.774374,0.5,3.79728,1,0.904023
C849-0801,2,0,0,1,3,0,0,0,0,0,...,2,0,0,0,5.2854,-0.732656,0.4,5.68169,1,0.932215
F845-0450,2,0,0,2,2,0,0,0,0,0,...,1,0,0,0,5.49185,-0.627286,0.363636,3.25798,1,0.92592
D145-0157,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.49185,-0.7434,0.514286,5.71498,1,0.951739
Y040-7718,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.61471,-0.729299,0.486486,4.86269,1,0.988936
Y040-6965,3,0,0,0,4,0,0,0,0,0,...,0,0,0,0,5.45943,-0.693316,0.545455,4.6388,1,0.957052
8539-0699,3,0,0,0,6,0,0,0,0,0,...,0,0,0,0,5.70044,-0.811255,0.473684,4.88988,1,0.930438
