In [2]:
# import pandas
import pandas as pd

**Combining csv for known inhibitors**

In [3]:
# Load the known-inhibitor predictions written by each of the eight models.
# The files are read in the order listed and bound to the matching names.
(
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
) = map(
    pd.read_csv,
    (
        'gbc_5_predicted_known_inhibitors.csv',
        'lightgbc_3_predicted_known_inhibitors.csv',
        'lightgbc_5_predicted_known_inhibitors.csv',
        'xgboost_1_predicted_known_inhibitors.csv',
        'xgboost_2_predicted_known_inhibitors.csv',
        'xgboost_3_predicted_known_inhibitors.csv',
        'xgboost_4_predicted_known_inhibitors.csv',
        'xgboost_5_predicted_known_inhibitors.csv',
    ),
)

In [4]:
# Stack the eight per-model prediction tables row-wise (axis=0), then store
# USER_ID as text so every identifier is compared as a string.
df_known_inhibitors = pd.concat(
    [
        df_gbc_5,
        df_lightgbc_3,
        df_lightgbc_5,
        df_xgboost_1,
        df_xgboost_2,
        df_xgboost_3,
        df_xgboost_4,
        df_xgboost_5,
    ],
    axis=0,
).assign(USER_ID=lambda stacked: stacked['USER_ID'].astype(str))

In [5]:
# Report how many distinct USER_ID values occur in the stacked predictions.
# dropna=False matches len(unique()), which would also count a missing value.
print(
    'Total number of unique compounds: ',
    df_known_inhibitors['USER_ID'].nunique(dropna=False),
)

Total number of unique compounds:  8


In [6]:
# Number of rows of the stacked table that carry each USER_ID, most frequent first.
df_known_inhibitors.loc[:, 'USER_ID'].value_counts()

8087741     8
2534913     8
44224215    8
2545304     7
11703255    7
DB01136     2
53480255    1
6912404     1
Name: USER_ID, dtype: int64

Inspecting the counts above shows that `DB00704` was not predicted by any of the 8 models.

In [7]:
# For every compound, collect the lowest and highest probability found among
# its rows in the stacked predictions.
df_known_inhibitors_range = (
    df_known_inhibitors
    .groupby('USER_ID')['Probabilities']
    .agg(min='min', max='max')
    .reset_index()
)
df_known_inhibitors_range

Unnamed: 0,USER_ID,min,max
0,11703255,0.907255,0.999985
1,2534913,0.968404,1.0
2,2545304,0.922072,0.994888
3,44224215,0.841027,0.997138
4,53480255,0.731497,0.731497
5,6912404,0.504077,0.504077
6,8087741,0.897402,0.999999
7,DB01136,0.992609,0.995041


**Find the peptidomimetic compounds that were predicted by all 8 models**

In [8]:
# Load the peptidomimetic predictions written by each of the eight models.
# NOTE: this rebinds the df_<model> names used earlier for the known inhibitors.
(
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
) = map(
    pd.read_csv,
    (
        'gbc_5_predicted_peptidomimetics.csv',
        'lightgbc_3_predicted_peptidomimetics.csv',
        'lightgbc_5_predicted_peptidomimetics.csv',
        'xgboost_1_predicted_peptidomimetics.csv',
        'xgboost_2_predicted_peptidomimetics.csv',
        'xgboost_3_predicted_peptidomimetics.csv',
        'xgboost_4_predicted_peptidomimetics.csv',
        'xgboost_5_predicted_peptidomimetics.csv',
    ),
)

In [9]:
# Stack the eight per-model peptidomimetic prediction tables row-wise into a
# single long table; each frame keeps its own row labels.
df_peptidomimetics_nt = pd.concat(
    objs=[
        df_gbc_5,
        df_lightgbc_3,
        df_lightgbc_5,
        df_xgboost_1,
        df_xgboost_2,
        df_xgboost_3,
        df_xgboost_4,
        df_xgboost_5,
    ],
    axis='index',
)
df_peptidomimetics_nt

Unnamed: 0,USER_ID,Probabilities
0,F6513-5720,0.865767
1,F6492-0089,0.806874
2,F6507-8802,0.672406
3,F6507-8808,0.783268
4,F6521-7740,0.713178
...,...,...
3066,F0410-0004,0.552399
3067,F0411-0012,0.999452
3068,F0410-0003,0.993724
3069,F0410-0005,0.737636


In [10]:
# Store the USER_ID column as strings.
df_peptidomimetics_nt = df_peptidomimetics_nt.assign(
    USER_ID=df_peptidomimetics_nt['USER_ID'].astype(str)
)

In [11]:
# Number of distinct compounds present in the stacked table, i.e. compounds
# that occur in at least one model's prediction file (missing IDs excluded).
len(df_peptidomimetics_nt['USER_ID'].dropna().unique())

4287

When no confidence threshold is applied, a total of 4287 unique compounds are predicted as active by at least one of the 8 models.

In [23]:
# Load the Life Chemicals peptidomimetic predictions written by each of the
# eight models. NOTE: this rebinds the df_<model> names used by earlier cells.
(
    df_gbc_5,
    df_lightgbc_3,
    df_lightgbc_5,
    df_xgboost_1,
    df_xgboost_2,
    df_xgboost_3,
    df_xgboost_4,
    df_xgboost_5,
) = map(
    pd.read_csv,
    (
        'gbc_5_predicted_peptidomimetics_life_chemicals.csv',
        'lightgbc_3_predicted_peptidomimetics_life_chemicals.csv',
        'lightgbc_5_predicted_peptidomimetics_life_chemicals.csv',
        'xgboost_1_predicted_peptidomimetics_life_chemicals.csv',
        'xgboost_2_predicted_peptidomimetics_life_chemicals.csv',
        'xgboost_3_predicted_peptidomimetics_life_chemicals.csv',
        'xgboost_4_predicted_peptidomimetics_life_chemicals.csv',
        'xgboost_5_predicted_peptidomimetics_life_chemicals.csv',
    ),
)

In [24]:
# Stack the eight per-model prediction tables row-wise into one long table.
df_peptidomimetics = pd.concat(
    objs=[
        df_gbc_5,
        df_lightgbc_3,
        df_lightgbc_5,
        df_xgboost_1,
        df_xgboost_2,
        df_xgboost_3,
        df_xgboost_4,
        df_xgboost_5,
    ],
    axis='index',
)

In [25]:
# Store the USER_ID column as strings.
df_peptidomimetics = df_peptidomimetics.assign(
    USER_ID=df_peptidomimetics['USER_ID'].astype(str)
)

In [26]:
# Number of distinct compounds present in the stacked table, i.e. compounds
# that occur in at least one model's prediction file (missing IDs excluded).
len(df_peptidomimetics['USER_ID'].dropna().unique())

3415

A total of 3415 unique compounds were predicted as active, with confidence above 0.95, by at least one of the 8 models.

In [27]:
# Number of rows of the stacked table that carry each USER_ID, most frequent
# first (presumably one row per model that predicted the compound — confirm
# that no single model file lists a compound twice).
df_peptidomimetics.loc[:, 'USER_ID'].value_counts()

F1885-0077    8
F3225-8556    8
F0623-0305    8
F2721-0639    8
F5222-0010    8
             ..
F2781-0076    1
F2781-0134    1
F2721-0626    1
F2750-0108    1
F3411-4808    1
Name: USER_ID, Length: 3415, dtype: int64

In [28]:
# Keep only the rows whose USER_ID occurs exactly 8 times in the stacked
# table. This equals "predicted by all 8 models" provided each model file
# lists a compound at most once (not checked here).
df_peptidomimetics_8 = df_peptidomimetics[
    df_peptidomimetics.groupby('USER_ID')['USER_ID'].transform('size') == 8
]

In [29]:
# For each compound that occurs 8 times, show the lowest and highest
# probability found among its rows.
(
    df_peptidomimetics_8
    .groupby('USER_ID')['Probabilities']
    .agg(min='min', max='max')
    .reset_index()
)

Unnamed: 0,USER_ID,min,max
0,F0291-0021,0.951411,0.999998
1,F0375-0109,0.980746,1.000000
2,F0421-0040,0.966963,0.999998
3,F0423-0108,0.971748,1.000000
4,F0537-0332,0.964406,1.000000
...,...,...,...
154,F6451-1593,0.961244,1.000000
155,F6451-2637,0.987643,1.000000
156,F6451-2638,0.980010,1.000000
157,F6469-1029,0.997595,1.000000


159 compounds were predicted by all 8 models

**Get the row indices of the compounds predicted by the ML models** <br>
To select these compounds in DataWarrior, we need the row positions of the active compounds in the whole dataset, so that they can be selected there and exported as an SDF file.

In [30]:
# Load the full descriptor table and preview its first five rows
# (five is the default for DataFrame.head).
df_whole = pd.read_csv(
    filepath_or_buffer='../../../../data/Molecular_Descriptors_Peptidomimetics_Life_Chemicals.csv'
)
df_whole.head()

Unnamed: 0,ReadIn_ID,USER_ID,D001,D002,D003,D004,D005,D006,D007,D008,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,1,F8881-1049,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,4.087,-0.626,0.0,0.259
1,2,F6513-5720,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,4.644,-0.776,0.0,3.033
2,3,F6619-2116,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,4.644,-0.244,0.0,0.265
3,4,F6497-5659,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,4.459,-0.135,0.0,-0.512
4,5,F6559-1463,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,4.459,-0.692,0.0,1.597


In [31]:
# Remove the ReadIn_ID column so that USER_ID becomes the first column.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column has already been
# dropped no longer raises a KeyError.
df_whole = df_whole.drop(columns=['ReadIn_ID'], errors='ignore')
df_whole.head(5)

Unnamed: 0,USER_ID,D001,D002,D003,D004,D005,D006,D007,D008,D009,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,F8881-1049,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,4.087,-0.626,0.0,0.259
1,F6513-5720,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,4.644,-0.776,0.0,3.033
2,F6619-2116,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,4.644,-0.244,0.0,0.265
3,F6497-5659,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,4.459,-0.135,0.0,-0.512
4,F6559-1463,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,4.459,-0.692,0.0,1.597


In [32]:
# Store the USER_ID column as strings so it can be matched against the
# string-typed USER_ID values of the prediction tables.
df_whole = df_whole.assign(USER_ID=df_whole['USER_ID'].astype(str))

In [33]:
# Select the rows of the full table whose USER_ID appears in
# df_peptidomimetics_8; the original row labels of df_whole are kept.
df_whole_8 = df_whole.loc[
    df_whole['USER_ID'].isin(df_peptidomimetics_8['USER_ID'])
]
df_whole_8

Unnamed: 0,USER_ID,D001,D002,D003,D004,D005,D006,D007,D008,D009,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
297,F6420-0636,1,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,4.700,-0.657,0.273,3.582
314,F6420-0562,1,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,4.524,-0.709,0.316,2.664
339,F1301-0059,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,4.524,-0.654,0.000,0.135
430,F6521-7868,1,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,4.907,-0.643,0.250,3.900
489,F0291-0021,1,0,0,0,2,0,0,0,0,...,0,0,1,0,0,0,4.954,-0.216,0.261,-0.231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5677,F3406-1383,3,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,5.700,-0.737,0.474,3.723
5678,F0554-0824,4,0,0,1,4,0,0,0,0,...,0,0,1,0,0,0,5.768,-0.799,0.575,4.938
5709,F1161-0075,3,0,0,1,4,0,0,0,0,...,0,0,1,0,0,0,5.524,-0.825,0.529,4.433
5775,F0554-0835,3,0,0,2,3,0,0,0,0,...,0,0,1,0,0,0,5.714,-0.794,0.436,3.407


In [44]:
# Shift the row labels of the selected compounds by one and print them one
# per line (presumably because the target tool numbers rows from 1 — confirm).
ind = df_whole_8.index + 1
for row_number in ind.tolist():
    print(row_number)

298
315
340
431
490
558
572
667
671
684
746
827
897
907
971
975
1060
1067
1104
1117
1139
1231
1297
1353
1375
1383
1393
1423
1433
1503
1540
1544
1601
1612
1625
1636
1661
1668
1742
1787
1788
1790
1800
1825
1961
1980
2016
2018
2049
2083
2127
2131
2186
2256
2300
2335
2338
2371
2411
2414
2439
2465
2481
2537
2563
2576
2578
2585
2595
2597
2619
2625
2641
2653
2667
2677
2679
2685
2734
2775
2781
2876
2894
2936
2962
2969
3090
3131
3165
3223
3317
3378
3388
3403
3491
3533
3681
3729
3758
3799
3828
3853
3862
3864
3872
3874
3901
3909
3930
3969
4019
4126
4174
4217
4347
4403
4662
4707
4711
4714
4720
4757
4781
4813
4834
4843
4870
4896
4934
4937
4945
4994
5014
5033
5048
5075
5102
5120
5139
5140
5141
5187
5267
5301
5308
5487
5528
5542
5553
5555
5579
5587
5670
5675
5678
5679
5710
5776
5805


[298,
 315,
 340,
 431,
 490,
 558,
 572,
 667,
 671,
 684,
 746,
 827,
 897,
 907,
 971,
 975,
 1060,
 1067,
 1104,
 1117,
 1139,
 1231,
 1297,
 1353,
 1375,
 1383,
 1393,
 1423,
 1433,
 1503,
 1540,
 1544,
 1601,
 1612,
 1625,
 1636,
 1661,
 1668,
 1742,
 1787,
 1788,
 1790,
 1800,
 1825,
 1961,
 1980,
 2016,
 2018,
 2049,
 2083,
 2127,
 2131,
 2186,
 2256,
 2300,
 2335,
 2338,
 2371,
 2411,
 2414,
 2439,
 2465,
 2481,
 2537,
 2563,
 2576,
 2578,
 2585,
 2595,
 2597,
 2619,
 2625,
 2641,
 2653,
 2667,
 2677,
 2679,
 2685,
 2734,
 2775,
 2781,
 2876,
 2894,
 2936,
 2962,
 2969,
 3090,
 3131,
 3165,
 3223,
 3317,
 3378,
 3388,
 3403,
 3491,
 3533,
 3681,
 3729,
 3758,
 3799,
 3828,
 3853,
 3862,
 3864,
 3872,
 3874,
 3901,
 3909,
 3930,
 3969,
 4019,
 4126,
 4174,
 4217,
 4347,
 4403,
 4662,
 4707,
 4711,
 4714,
 4720,
 4757,
 4781,
 4813,
 4834,
 4843,
 4870,
 4896,
 4934,
 4937,
 4945,
 4994,
 5014,
 5033,
 5048,
 5075,
 5102,
 5120,
 5139,
 5140,
 5141,
 5187,
 5267,
 5301,
 5308,
 