In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the class-labelled bioactivity table from disk
df = pd.read_csv(filepath_or_buffer="psmb5_class_bioactivity.csv")

In [None]:
# Display the loaded table as the cell output
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL381735,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](...,2.4,active
1,CHEMBL207670,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N1CCC[C@...,150000.0,inactive
2,CHEMBL383529,CC(=O)N[C@@H](CCC(=O)OC(C)(C)C)C(=O)N[C@@H](C)...,260.0,active
3,CHEMBL207336,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,17.0,active
4,CHEMBL208015,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,6.1,active
...,...,...,...,...
813,CHEMBL4519899,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](...,3.0,active
814,CHEMBL4555159,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,10.0,active
815,CHEMBL4646371,CC(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1...,10000.0,inactive
816,CHEMBL4649310,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,10000.0,inactive


In [None]:
# Set the raw SMILES column aside; a cleaned version is attached again further down
df_no_smiles = df.drop(columns=['canonical_smiles'])

In [None]:
# For every SMILES string, split on '.' and keep only the longest fragment
smiles = pd.Series(
    [
        max(str(entry).split('.'), key=len)
        for entry in df.canonical_smiles.tolist()
    ],
    name='canonical_smiles',
)

In [None]:
# Re-attach the cleaned SMILES column next to the remaining columns
df_clean_smiles = pd.concat(objs=[df_no_smiles, smiles], axis='columns')
df_clean_smiles

Unnamed: 0,molecule_chembl_id,standard_value,class,canonical_smiles
0,CHEMBL381735,2.4,active,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](...
1,CHEMBL207670,150000.0,inactive,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N1CCC[C@...
2,CHEMBL383529,260.0,active,CC(=O)N[C@@H](CCC(=O)OC(C)(C)C)C(=O)N[C@@H](C)...
3,CHEMBL207336,17.0,active,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...
4,CHEMBL208015,6.1,active,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...
...,...,...,...,...
813,CHEMBL4519899,3.0,active,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](...
814,CHEMBL4555159,10.0,active,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...
815,CHEMBL4646371,10000.0,inactive,CC(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1...
816,CHEMBL4649310,10000.0,inactive,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...


In [None]:
# transformation function

def pIC50(input):
    """Convert the ``standard_value_norm`` column (nM) into a ``pIC50`` column.

    pIC50 is computed as ``-log10(value * 10**-9)``, i.e. the value is first
    converted from nM to M and then put on a negative log10 scale.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``standard_value_norm`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``pIC50`` appended as the last column and
        ``standard_value_norm`` removed. ``input`` itself is left untouched,
        so re-running the calling cell always starts from the same data.

    Notes
    -----
    A value of 0 yields ``inf`` and negative values yield ``NaN`` (numpy
    emits a RuntimeWarning in both cases).
    """
    molar = input['standard_value_norm'] * (10**-9)  # Converts nM to M

    return (
        input
        .assign(pIC50=-np.log10(molar))
        # `columns=` keyword: a positional axis argument is rejected by pandas >= 2.0
        .drop(columns='standard_value_norm')
    )

In [None]:
# Inspect the spread of the raw values before capping very high ones
df['standard_value'].describe()

count    8.180000e+02
mean     2.199089e+04
std      2.560980e+05
min      2.000000e-04
25%      1.700000e+01
50%      2.455000e+02
75%      5.890250e+03
max      7.100000e+06
Name: standard_value, dtype: float64

In [None]:
# Sanity check: a value of 100,000,000 (times 10**-9) maps to a pIC50 of 1
-np.log10(100000000 * (10**-9))

1.0

In [None]:
# Sanity check: a value of 10,000,000,000 (times 10**-9) would give a negative pIC50
-np.log10(10000000000 * (10**-9))

-1.0

In [None]:
# Function to normalize values
def norm_value(input, cap=100000000):
    """Cap ``standard_value`` at ``cap`` and store it as ``standard_value_norm``.

    Values above ``cap`` are replaced by ``cap``; all other values (including
    missing ones) pass through unchanged.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``standard_value`` column.
    cap : int or float, default 100000000
        Upper bound applied to ``standard_value``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``standard_value_norm`` appended as the last column
        and ``standard_value`` removed. ``input`` itself is left untouched.
    """
    capped = input['standard_value'].clip(upper=cap)

    return (
        input
        .assign(standard_value_norm=capped)
        # `columns=` keyword: a positional axis argument is rejected by pandas >= 2.0
        .drop(columns='standard_value')
    )

In [None]:
# Use the table with the cleaned SMILES (longest '.'-separated fragment) rather
# than the raw `df`; otherwise the SMILES cleanup never reaches the saved file.
# reindex() restores the original column order of `df`.
df_norm = norm_value(df_clean_smiles.reindex(columns=df.columns))
df_norm

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,standard_value_norm
0,CHEMBL381735,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](...,active,2.4
1,CHEMBL207670,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N1CCC[C@...,inactive,150000.0
2,CHEMBL383529,CC(=O)N[C@@H](CCC(=O)OC(C)(C)C)C(=O)N[C@@H](C)...,active,260.0
3,CHEMBL207336,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,17.0
4,CHEMBL208015,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,6.1
...,...,...,...,...
813,CHEMBL4519899,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](...,active,3.0
814,CHEMBL4555159,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,active,10.0
815,CHEMBL4646371,CC(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1...,inactive,10000.0
816,CHEMBL4649310,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,inactive,10000.0


In [None]:
# Summary statistics of the capped values
df_norm['standard_value_norm'].describe()

count    8.180000e+02
mean     2.199089e+04
std      2.560980e+05
min      2.000000e-04
25%      1.700000e+01
50%      2.455000e+02
75%      5.890250e+03
max      7.100000e+06
Name: standard_value_norm, dtype: float64

In [None]:
# Derive the pIC50 column from the capped values
df_final = df_norm.pipe(pIC50)
df_final

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,pIC50
0,CHEMBL381735,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](...,active,8.619789
1,CHEMBL207670,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N1CCC[C@...,inactive,3.823909
2,CHEMBL383529,CC(=O)N[C@@H](CCC(=O)OC(C)(C)C)C(=O)N[C@@H](C)...,active,6.585027
3,CHEMBL207336,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,7.769551
4,CHEMBL208015,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,8.214670
...,...,...,...,...
813,CHEMBL4519899,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](...,active,8.522879
814,CHEMBL4555159,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,active,8.000000
815,CHEMBL4646371,CC(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1...,inactive,5.000000
816,CHEMBL4649310,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,inactive,5.000000


In [None]:
# Summary statistics of the computed pIC50 values
df_final['pIC50'].describe()

count    818.000000
mean       6.542848
std        1.487884
min        2.148742
25%        5.229872
50%        6.609971
75%        7.769551
max       12.698970
Name: pIC50, dtype: float64

Remove the rows whose class is `intermediate`, keeping only active and inactive compounds

In [None]:
# Keep every row whose class label is anything other than 'intermediate'
not_intermediate = df_final['class'].ne('intermediate')
df_act_inac = df_final.loc[not_intermediate]
df_act_inac

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,pIC50
0,CHEMBL381735,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](...,active,8.619789
1,CHEMBL207670,CC[C@H](C)[C@H](NC(=O)OCc1ccccc1)C(=O)N1CCC[C@...,inactive,3.823909
2,CHEMBL383529,CC(=O)N[C@@H](CCC(=O)OC(C)(C)C)C(=O)N[C@@H](C)...,active,6.585027
3,CHEMBL207336,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,7.769551
4,CHEMBL208015,CC(C)C[C@@H](/C=C/S(C)(=O)=O)NC(=O)[C@H](C)NC(...,active,8.214670
...,...,...,...,...
813,CHEMBL4519899,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](...,active,8.522879
814,CHEMBL4555159,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,active,8.000000
815,CHEMBL4646371,CC(=O)NCC(=O)N1CCC[C@H]1C(=O)N[C@@H](Cc1ccccc1...,inactive,5.000000
816,CHEMBL4649310,CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,inactive,5.000000


In [None]:
# Write the active/inactive table to CSV without the index column
df_act_inac.to_csv(path_or_buf='psmb5_active_inactive_pIC50.csv', index=False)