In [71]:
# import pandas
import pandas as pd

**Combining CSV files for known inhibitors**

In [76]:
# Load the known-inhibitor prediction CSVs, one file per model run.
known_inhibitor_csvs = [
    'gbc_5_predicted_known_inhibitors.csv',
    'lightgbc_3_predicted_known_inhibitors.csv',
    'lightgbc_5_predicted_known_inhibitors.csv',
    'xgboost_1_predicted_known_inhibitors.csv',
    'xgboost_2_predicted_known_inhibitors.csv',
    'xgboost_3_predicted_known_inhibitors.csv',
    'xgboost_4_predicted_known_inhibitors.csv',
    'xgboost_5_predicted_known_inhibitors.csv',
]

# Frames are unpacked in the same order as the paths listed above.
(
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
) = [pd.read_csv(csv_path) for csv_path in known_inhibitor_csvs]

In [77]:
# Stack the per-model frames on top of each other (axis=0 appends rows,
# it does not add columns).
known_inhibitor_frames = [
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
]
df_known_inhibitors = pd.concat(known_inhibitor_frames, axis=0)

# Store every USER_ID as a string so identifiers can be compared and grouped
# uniformly across the concatenated files.
user_ids_as_text = df_known_inhibitors['USER_ID'].astype(str)
df_known_inhibitors['USER_ID'] = user_ids_as_text

In [79]:
# Count the distinct compound IDs present in the combined table.
# dropna=False keeps the count identical to len(Series.unique()).
n_unique_known_inhibitors = df_known_inhibitors['USER_ID'].nunique(dropna=False)
print('Total number of unique compounds: ', n_unique_known_inhibitors)

Total number of unique compounds:  8


In [80]:
# Number of rows per compound in the combined table
# (presumably one row per model that predicted it -- TODO confirm).
known_inhibitor_ids = df_known_inhibitors['USER_ID']
known_inhibitor_ids.value_counts()

8087741     8
2534913     8
44224215    8
2545304     7
11703255    7
DB01136     2
53480255    1
6912404     1
Name: USER_ID, dtype: int64

From inspection of the counts above, the known inhibitor `DB00704` does not appear in the combined table, i.e. it was not predicted by any of the 8 models.

In [81]:
# For each compound, summarise the lowest and highest value found in the
# 'Probabilities' column across the combined rows.
df_known_inhibitors_range = (
    df_known_inhibitors
    .groupby('USER_ID')['Probabilities']
    .agg(['min', 'max'])
    .reset_index()  # turn USER_ID back into a regular column
)
df_known_inhibitors_range

Unnamed: 0,USER_ID,min,max
0,11703255,0.907255,0.999985
1,2534913,0.968404,1.0
2,2545304,0.922072,0.994888
3,44224215,0.841027,0.997138
4,53480255,0.731497,0.731497
5,6912404,0.504077,0.504077
6,8087741,0.897402,0.999999
7,DB01136,0.992609,0.995041


**Find the peptidomimetic compounds that were predicted by all 8 models**

In [82]:
# Load the peptidomimetic (Life Chemicals) prediction CSVs, one file per model run.
# NOTE: the names assigned below are the same ones used earlier for the
# known-inhibitor frames, so those earlier frames are rebound here.
peptidomimetic_csvs = [
    'gbc_5_predicted_peptidomimetics_life_chemicals.csv',
    'lightgbc_3_predicted_peptidomimetics_life_chemicals.csv',
    'lightgbc_5_predicted_peptidomimetics_life_chemicals.csv',
    'xgboost_1_predicted_peptidomimetics_life_chemicals.csv',
    'xgboost_2_predicted_peptidomimetics_life_chemicals.csv',
    'xgboost_3_predicted_peptidomimetics_life_chemicals.csv',
    'xgboost_4_predicted_peptidomimetics_life_chemicals.csv',
    'xgboost_5_predicted_peptidomimetics_life_chemicals.csv',
]

# Frames are unpacked in the same order as the paths listed above.
(
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
) = [pd.read_csv(csv_path) for csv_path in peptidomimetic_csvs]

In [83]:
# Stack the per-model peptidomimetic frames row-wise into one long table.
peptidomimetic_frames = [
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
]
df_peptidomimetics = pd.concat(peptidomimetic_frames, axis=0)

In [84]:
# Store the USER_ID column as strings so identifiers are handled uniformly.
df_peptidomimetics = df_peptidomimetics.astype({'USER_ID': str})

In [85]:
# Count the distinct compounds in the combined table, i.e. every compound that
# appears in at least one of the concatenated prediction files.
peptidomimetic_ids = df_peptidomimetics['USER_ID']
peptidomimetic_ids.nunique()

3415

A total of 3415 unique compounds were predicted as active by at least one of the 8 models with confidence above 0.95.

In [86]:
# Number of rows per compound in the combined table
# (presumably the number of models that predicted it as active -- TODO confirm).
df_peptidomimetics.loc[:, 'USER_ID'].value_counts()

F1885-0077    8
F3225-8556    8
F0623-0305    8
F2721-0639    8
F5222-0010    8
             ..
F2781-0076    1
F2781-0134    1
F2721-0626    1
F2750-0108    1
F3411-4808    1
Name: USER_ID, Length: 3415, dtype: int64

In [89]:
# Keep only the rows of compounds that occur N_MODELS times in the combined table.
# Assumes each prediction file lists a compound at most once, so a count of
# N_MODELS means "present in every file" -- TODO confirm no per-file duplicates.
N_MODELS = 8  # number of prediction frames concatenated into df_peptidomimetics

# Compute the per-compound row counts once and reuse them for the filter.
prediction_counts = df_peptidomimetics['USER_ID'].value_counts()
ids_in_all_models = prediction_counts[prediction_counts == N_MODELS].index

in_all_models = df_peptidomimetics['USER_ID'].isin(ids_in_all_models)
df_peptidomimetics_8 = df_peptidomimetics[in_all_models]

In [91]:
# For the compounds kept in df_peptidomimetics_8, show the lowest and highest
# value found in the 'Probabilities' column for each USER_ID.
(
    df_peptidomimetics_8
    .groupby('USER_ID')['Probabilities']
    .agg(['min', 'max'])
    .reset_index()  # turn USER_ID back into a regular column
)

Unnamed: 0,USER_ID,min,max
0,F0291-0021,0.951411,0.999998
1,F0375-0109,0.980746,1.000000
2,F0421-0040,0.966963,0.999998
3,F0423-0108,0.971748,1.000000
4,F0537-0332,0.964406,1.000000
...,...,...,...
154,F6451-1593,0.961244,1.000000
155,F6451-2637,0.987643,1.000000
156,F6451-2638,0.980010,1.000000
157,F6469-1029,0.997595,1.000000


159 compounds were predicted by all 8 models