In [1]:
from bioservices import UniProt
u = UniProt(verbose=False)
data = u.search("Estrogen receptor+and+human+and+alpha", frmt="tab", limit=3,
                 columns="entry name,length,id, genes") # zap70 specy in human organism
print(data)

Entry name	Length	Entry	Gene names
ESR1_HUMAN	595	P03372	ESR1 ESR NR3A1
ANDR_HUMAN	920	P10275	AR DHTR NR3C4
EGFR_HUMAN	1210	P00533	EGFR ERBB ERBB1 HER1



In [2]:
import numpy as np
import pandas as pd
from time import time

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
%matplotlib inline

In [3]:
# Target object handler for API access and check connection to the db
from chembl_webresource_client import *

targets = TargetResource()
print targets.status()

True


In [4]:
# Load targets in data frame

targetsDF = pd.DataFrame.from_dict(targets.get(uniprot=['P03372'])) #ESR1_HUMAN
targetsDF

Unnamed: 0,bioactivityCount,chemblId,compoundCount,description,geneNames,organism,preferredName,proteinAccession,synonyms,targetType
0,10936,CHEMBL206,5942,Estrogen receptor alpha,Unspecified,Homo sapiens,Estrogen receptor alpha,P03372,"ESR1,Estradiol receptor,ER,ER-alpha,ESR ,Estro...",SINGLE PROTEIN


In [5]:
# Download all bioactivities for targets

bioactsDF = pd.DataFrame.from_dict(targets.bioactivities('CHEMBL206'))
len(bioactsDF)

10936

In [6]:
bioactsDF.head(1)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,ingredient_cmpd_chemblid,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value
0,Unspecified,CHEMBL677870,Binding affinity for estrogen receptor alpha,B,IC50,CHEMBL219763,4,=,Homo sapiens,CHEMBL219763,"Bioorg. Med. Chem. Lett., (2003) 13:22:4089",CHEMBL206,9,Estrogen receptor alpha,nM,9.57


In [7]:
bioactsDF = bioactsDF[(bioactsDF['bioactivity_type'] == 'IC50')]  # keep ony IC50
len(bioactsDF)

3641

In [8]:
bioactsDF.units.shape

(3641,)

In [9]:
bioactsDF['units'].value_counts()

nM             2686
Unspecified     955
Name: units, dtype: int64

In [10]:
bioactsDF = bioactsDF[(bioactsDF['units'] == 'nM')]
len(bioactsDF)

2686

In [11]:
fix = []

for i in bioactsDF.value:
    fix.append(float(i))


In [12]:
# define active, inactive, intermediate compounds

STATUS = []

for i in fix:
    if i <=1000:
        STATUS.append("active") #active
        
    elif i >=10000:
        STATUS.append("inactive") #inactive
        
    else:
        STATUS.append("intermediate") #intermediate

In [13]:
bioactsDF = bioactsDF.drop('value', 1)
bioactsDF['value'] = fix
bioactsDF['STATUS'] = STATUS
bioactsDF.head(3)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,ingredient_cmpd_chemblid,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
0,Unspecified,CHEMBL677870,Binding affinity for estrogen receptor alpha,B,IC50,CHEMBL219763,4,=,Homo sapiens,CHEMBL219763,"Bioorg. Med. Chem. Lett., (2003) 13:22:4089",CHEMBL206,9,Estrogen receptor alpha,nM,9.57,active
9,Unspecified,CHEMBL873603,Displacement of [3H]17-beta-estradiol from ful...,B,IC50,CHEMBL92720,19,=,Homo sapiens,CHEMBL92720,"Bioorg. Med. Chem. Lett., (2004) 14:11:2741",CHEMBL206,8,Estrogen receptor alpha,nM,41.1,active
10,Unspecified,CHEMBL679886,Potency in cellular transactivation assay util...,F,IC50,CHEMBL92720,19,=,Homo sapiens,CHEMBL92720,"Bioorg. Med. Chem. Lett., (2004) 14:11:2741",CHEMBL206,8,Estrogen receptor alpha,nM,274.0,active


In [14]:
bioactsDF['STATUS'] = STATUS
bioactsDF.head(3)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,ingredient_cmpd_chemblid,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
0,Unspecified,CHEMBL677870,Binding affinity for estrogen receptor alpha,B,IC50,CHEMBL219763,4,=,Homo sapiens,CHEMBL219763,"Bioorg. Med. Chem. Lett., (2003) 13:22:4089",CHEMBL206,9,Estrogen receptor alpha,nM,9.57,active
9,Unspecified,CHEMBL873603,Displacement of [3H]17-beta-estradiol from ful...,B,IC50,CHEMBL92720,19,=,Homo sapiens,CHEMBL92720,"Bioorg. Med. Chem. Lett., (2004) 14:11:2741",CHEMBL206,8,Estrogen receptor alpha,nM,41.1,active
10,Unspecified,CHEMBL679886,Potency in cellular transactivation assay util...,F,IC50,CHEMBL92720,19,=,Homo sapiens,CHEMBL92720,"Bioorg. Med. Chem. Lett., (2004) 14:11:2741",CHEMBL206,8,Estrogen receptor alpha,nM,274.0,active


In [15]:
bioactsDF['operator'].value_counts()

=              2237
>               404
<                29
Unspecified      16
Name: operator, dtype: int64

In [16]:
bioactsDF['assay_description'].value_counts()

PUBCHEM_BIOASSAY: Estrogen Receptor-alpha Coactivator Binding Inhibitors Dose Response Confirmation. (Class of assay: confirmatory) [Related pubchem assays: 629 (Primary screen preceding this dose response confirmation assay.)]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               439
Inhibitory concentration against human ER alpha expressed in Esche

In [17]:
bioactsDF = bioactsDF.rename(columns={'ingredient_cmpd_chemblid': 'chemblId'})

In [18]:
# Filter bioactivities
bioactsDF_Train = bioactsDF[(bioactsDF['operator'] == '=') & # only exact measurements
                            (bioactsDF['assay_type'] == 'B') & # only binding data
                            (bioactsDF['target_confidence'] == 9)] # only high target confidence
bioactsDF_Test_gra = bioactsDF[(bioactsDF['operator'] == '>') & 
                            (bioactsDF['assay_type'] == 'B') & # only binding data
                            (bioactsDF['target_confidence'] == 9)] # only high target confidence
bioactsDF_Test_les = bioactsDF[(bioactsDF['operator'] == '<') & 
                            (bioactsDF['assay_type'] == 'B') & # only binding data
                            (bioactsDF['target_confidence'] == 9)] # only high target confidence

print len(bioactsDF_Train), len(bioactsDF_Test_gra), len(bioactsDF_Test_les)

1708 294 19


In [19]:
bioactsDF_Train[(bioactsDF_Train['assay_description'] == 'Inhibition of estrogen receptor alpha-mediated human MCF7 cell growth inhibition')]

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
9601,Unspecified,CHEMBL2445311,Inhibition of estrogen receptor alpha-mediated...,B,IC50,CHEMBL2441817,S58,=,Homo sapiens,CHEMBL2441817,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,64908.26,inactive
9608,Unspecified,CHEMBL2445311,Inhibition of estrogen receptor alpha-mediated...,B,IC50,CHEMBL2441818,S59,=,Homo sapiens,CHEMBL2441818,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,64908.26,inactive
9609,Unspecified,CHEMBL2445311,Inhibition of estrogen receptor alpha-mediated...,B,IC50,CHEMBL2441819,S60,=,Homo sapiens,CHEMBL2441819,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,64908.26,inactive


In [20]:
bioactsDF_Test_les['assay_description'].value_counts()

Inhibition of estrogen receptor alpha-mediated human MCF7 cell growth inhibition                                                                                                                                                       16
PUBCHEM_BIOASSAY: Estrogen Receptor-alpha Coactivator Binding Inhibitors Dose Response Confirmation. (Class of assay: confirmatory) [Related pubchem assays: 629 (Primary screen preceding this dose response confirmation assay.)]     2
Binding affinity to ER alpha (unknown origin) by LanthaScreen TR-FRET competitive binding assay                                                                                                                                         1
Name: assay_description, dtype: int64

In [21]:
bioactsDF_Test_gra['assay_description'].value_counts()

PUBCHEM_BIOASSAY: Estrogen Receptor-alpha Coactivator Binding Inhibitors Dose Response Confirmation. (Class of assay: confirmatory) [Related pubchem assays: 629 (Primary screen preceding this dose response confirmation assay.)]                                                                        157
Binding affinity to human ERalpha                                                                                                                                                                                                                                                                           16
Binding affinity to ERalpha                                                                                                                                                                                                                                                                                 11
Binding affinity to human recombinant ERalpha by scintillation proximity assay             

In [22]:
len(bioactsDF_Train), len(bioactsDF_Train['chemblId'].unique())

(1708, 1424)

In [23]:
bioactsDF_Train.tail(2)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
10934,Unspecified,CHEMBL3829134,Displacement of fluorescein-labeled estrogen l...,B,IC50,CHEMBL3828117,20d,=,Homo sapiens,CHEMBL3828117,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,38300.0,inactive
10935,Unspecified,CHEMBL3825990,Inhibition of ER-alpha (unknown origin) by Lan...,B,IC50,CHEMBL3823192,86,=,Homo sapiens,CHEMBL3823192,"Bioorg. Med. Chem., (2016) 24:18:4075",CHEMBL206,9,Estrogen receptor alpha,nM,574.0,active


In [24]:
len(bioactsDF_Test_gra), len(bioactsDF_Test_gra['chemblId'].unique())

(294, 290)

In [25]:
len(bioactsDF_Test_les), len(bioactsDF_Test_les['chemblId'].unique())

(19, 19)

In [26]:
bioactsDF_Train_dup = pd.concat(g for _, 
                                g in bioactsDF_Train.groupby("chemblId") 
                                  if len(g) > 1)
bioactsDF_Train_non = bioactsDF_Train.loc[~bioactsDF_Train.index.isin(bioactsDF_Train_dup.index)]

print (str(len(bioactsDF_Train_non)), 
       str(len(bioactsDF_Train_dup)), 
       str(len(bioactsDF_Train_dup)+len(bioactsDF_Train_non)))

('1259', '449', '1708')


In [27]:
len(bioactsDF_Train_non), len(bioactsDF_Train_non['chemblId'].unique())

(1259, 1259)

In [28]:
bioactsDF_Train_dup

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
6530,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL100231,13,=,Homo sapiens,CHEMBL100231,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,5.2000,active
6531,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL100231,13,=,Homo sapiens,CHEMBL100231,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,70.9600,active
3114,Unspecified,CHEMBL3736957,Antagonist activity at ERalpha in human T47D-K...,B,IC50,CHEMBL10041,4-OHT,=,Homo sapiens,CHEMBL10041,"Bioorg. Med. Chem., (2015) 23:24:7597",CHEMBL206,9,Estrogen receptor alpha,nM,10.0000,active
3115,Unspecified,CHEMBL3736956,Antagonist activity at luciferase-fused ERalph...,B,IC50,CHEMBL10041,4-OHT,=,Homo sapiens,CHEMBL10041,"Bioorg. Med. Chem., (2015) 23:24:7597",CHEMBL206,9,Estrogen receptor alpha,nM,70.0000,active
6798,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL100595,19,=,Homo sapiens,CHEMBL100595,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,16.1100,active
6799,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL100595,19,=,Homo sapiens,CHEMBL100595,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,46.0300,active
1739,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL100617,15,=,Homo sapiens,CHEMBL100617,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,0.6998,active
1740,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL100617,15,=,Homo sapiens,CHEMBL100617,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,33.9600,active
7559,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL100763,10,=,Homo sapiens,CHEMBL100763,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,3.3040,active
7560,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL100763,10,=,Homo sapiens,CHEMBL100763,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,26.9800,active


In [29]:
# mean and std of all duplicae

mean_std = bioactsDF_Train_dup.groupby(['chemblId'], as_index=False).agg(
                      {'value':['mean','std']})
mean_std.head(2)

Unnamed: 0_level_0,chemblId,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
0,CHEMBL100231,38.08,46.499342
1,CHEMBL10041,40.0,42.426407


In [30]:
len(mean_std)

165

In [31]:
bioactsDF_Train_dup = bioactsDF_Train_dup.merge(mean_std, on='chemblId', how='inner')
len(bioactsDF_Train_dup)

  self.right = self.right.drop(right_drop, axis=1)


449

In [32]:
bioactsDF_Train_dup.head(3)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS,"(value, mean)","(value, std)"
0,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL100231,13,=,Homo sapiens,CHEMBL100231,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,5.2,active,38.08,46.499342
1,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL100231,13,=,Homo sapiens,CHEMBL100231,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,70.96,active,38.08,46.499342
2,Unspecified,CHEMBL3736957,Antagonist activity at ERalpha in human T47D-K...,B,IC50,CHEMBL10041,4-OHT,=,Homo sapiens,CHEMBL10041,"Bioorg. Med. Chem., (2015) 23:24:7597",CHEMBL206,9,Estrogen receptor alpha,nM,10.0,active,40.0,42.426407


In [34]:
#keep only SD less thab 2SD

bioactsDF_Train_dup = bioactsDF_Train_dup[(bioactsDF_Train_dup[('value', 'std')] < 2)]
len(bioactsDF_Train_dup), len(bioactsDF_Train_dup['chemblId'].unique())

(80, 31)

In [35]:
bioactsDF_Train_dup['select'] = (bioactsDF_Train_dup['value']- bioactsDF_Train_dup[('value', 'mean')]).abs()
#value_median = keep.groupby('chemblId')['value'].transform('median')

bioactsDF_Train_dup = bioactsDF_Train_dup.groupby(["chemblId"]).apply(lambda x: x.sort_values(["select"], ascending = True)).reset_index(drop=True)
bioactsDF_Train_dup.head(5)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS,"(value, mean)","(value, std)",select
0,Unspecified,CHEMBL831007,Inhibition of 17-beta-estradiol mediated lucif...,B,IC50,CHEMBL101382,60,=,Homo sapiens,CHEMBL101382,"J. Med. Chem., (2005) 48:2:364",CHEMBL206,9,Estrogen receptor alpha,nM,16.5,active,17.7475,1.452065,1.2475
1,Unspecified,CHEMBL831528,Inhibition of [3H]estradiol binding to human e...,B,IC50,CHEMBL101382,60,=,Homo sapiens,CHEMBL101382,"J. Med. Chem., (2005) 48:2:364",CHEMBL206,9,Estrogen receptor alpha,nM,19.0,active,17.7475,1.452065,1.2525
2,Unspecified,CHEMBL831661,Inhibition of binding to human estrogen recept...,B,IC50,CHEMBL101382,2,=,Homo sapiens,CHEMBL101382,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,19.01,active,17.7475,1.452065,1.2625
3,Unspecified,CHEMBL831663,Inhibition of transcriptional activation by hu...,B,IC50,CHEMBL101382,2,=,Homo sapiens,CHEMBL101382,"Bioorg. Med. Chem. Lett., (2005) 15:4:957",CHEMBL206,9,Estrogen receptor alpha,nM,16.48,active,17.7475,1.452065,1.2675
4,Unspecified,CHEMBL832667,Inhibition of bindign to recombinant human est...,B,IC50,CHEMBL180792,19,=,Homo sapiens,CHEMBL180792,"Bioorg. Med. Chem. Lett., (2005) 15:1:107",CHEMBL206,9,Estrogen receptor alpha,nM,1.7,active,1.5,0.282843,0.2


In [36]:
bioactsDF_Train_dup = bioactsDF_Train_dup.drop_duplicates(subset='chemblId', keep='first')
bioactsDF_Train_dup.head(2)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS,"(value, mean)","(value, std)",select
0,Unspecified,CHEMBL831007,Inhibition of 17-beta-estradiol mediated lucif...,B,IC50,CHEMBL101382,60,=,Homo sapiens,CHEMBL101382,"J. Med. Chem., (2005) 48:2:364",CHEMBL206,9,Estrogen receptor alpha,nM,16.5,active,17.7475,1.452065,1.2475
4,Unspecified,CHEMBL832667,Inhibition of bindign to recombinant human est...,B,IC50,CHEMBL180792,19,=,Homo sapiens,CHEMBL180792,"Bioorg. Med. Chem. Lett., (2005) 15:1:107",CHEMBL206,9,Estrogen receptor alpha,nM,1.7,active,1.5,0.282843,0.2


In [37]:
len(bioactsDF_Train_dup), len(bioactsDF_Train_dup['chemblId'].unique())

(31, 31)

In [38]:
bioactsDF_Train_dup.tail(2)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS,"(value, mean)","(value, std)",select
76,Unspecified,CHEMBL3776153,Downregulation of ERalpha in human MCF7 cells ...,B,IC50,CHEMBL3775908,22,=,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,0.37,active,0.835,0.657609,0.465
78,Unspecified,CHEMBL1909145,DRUGMATRIX: Estrogen ERalpha radioligand bindi...,B,IC50,CHEMBL691,ETHINYLESTRADIOL,=,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,0.448,active,1.224,1.09743,0.776


In [39]:
bioactsDF_Train_dup = bioactsDF_Train_dup.drop('select', 1)
bioactsDF_Train_dup = bioactsDF_Train_dup.drop(('value', 'mean'), 1)
bioactsDF_Train_dup = bioactsDF_Train_dup.drop(('value', 'std'), 1)

In [40]:
bioactsDF_Train_final = pd.concat([bioactsDF_Train_non, bioactsDF_Train_dup])
len(bioactsDF_Train_dup), len(bioactsDF_Train_non), len(bioactsDF_Train_final)

(31, 1259, 1290)

In [41]:
bioactsDF_Train_final.tail(2)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
76,Unspecified,CHEMBL3776153,Downregulation of ERalpha in human MCF7 cells ...,B,IC50,CHEMBL3775908,22,=,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,0.37,active
78,Unspecified,CHEMBL1909145,DRUGMATRIX: Estrogen ERalpha radioligand bindi...,B,IC50,CHEMBL691,ETHINYLESTRADIOL,=,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,0.448,active


In [42]:
bioactsDF_Test_gra_dup = pd.concat(g for _, 
                                g in bioactsDF_Test_gra.groupby("chemblId") 
                                if len(g) > 1)
bioactsDF_Test_gra_non = bioactsDF_Train.loc[~bioactsDF_Test_gra.index.isin(bioactsDF_Test_gra_dup.index)]

print (str(len(bioactsDF_Test_gra_non)), 
       str(len(bioactsDF_Test_gra_dup)), 
       str(len(bioactsDF_Test_gra_dup)+len(bioactsDF_Test_gra_non)))

('286', '8', '294')


In [43]:
len(bioactsDF_Test_gra_dup), len(bioactsDF_Test_gra_non)

(8, 286)

In [44]:
check = bioactsDF_Test_gra_dup[['chemblId','value']]
check

Unnamed: 0,chemblId,value
7577,CHEMBL103,1000.0
7578,CHEMBL103,1000.0
8294,CHEMBL246140,10000.0
8297,CHEMBL246140,10000.0
5597,CHEMBL33899,100000.0
5598,CHEMBL33899,1270.0
4200,CHEMBL386630,10000.0
4201,CHEMBL386630,10000.0


In [45]:
# drop variable ID 
bioactsDF_Test_gra = bioactsDF_Test_gra[~bioactsDF_Test_gra['chemblId'].isin(['CHEMBL33899'])]
len(bioactsDF_Test_gra)

292

In [46]:
print (str(len(bioactsDF_Train_final)), 
       str(len(bioactsDF_Test_gra)), 
       str(len(bioactsDF_Test_les)))

('1290', '292', '19')


In [47]:
# Load the compounds in the dataframe

compounds = CompoundResource()

cpdsDF_Train_final = pd.DataFrame.from_dict(compounds.get(
                list(bioactsDF_Train_final['chemblId'])))

cpdsDF_Test_gra = pd.DataFrame.from_dict(compounds.get(
                list(bioactsDF_Test_gra['chemblId'].unique())))

cpdsDF_Test_les = pd.DataFrame.from_dict(compounds.get(
                list(bioactsDF_Test_les['chemblId'])))

print (str(len(cpdsDF_Train_final)), 
       str(len(cpdsDF_Test_gra)), 
       str(len(cpdsDF_Test_les)))

('1290', '289', '19')


In [48]:
# making sure everything is float

bioactsDF_Train_final['value'] = bioactsDF_Train_non['value'].astype(float)
bioactsDF_Test_gra   ['value'] = bioactsDF_Test_gra ['value'].astype(float)
bioactsDF_Test_les   ['value'] = bioactsDF_Test_les ['value'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [49]:
bioactsDF_Train_final.tail(2)

Unnamed: 0,activity_comment,assay_chemblid,assay_description,assay_type,bioactivity_type,chemblId,name_in_reference,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
76,Unspecified,CHEMBL3776153,Downregulation of ERalpha in human MCF7 cells ...,B,IC50,CHEMBL3775908,22,=,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,,active
78,Unspecified,CHEMBL1909145,DRUGMATRIX: Estrogen ERalpha radioligand bindi...,B,IC50,CHEMBL691,ETHINYLESTRADIOL,=,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,,active


In [50]:
cpdsDF_Train_final.head(2)

Unnamed: 0,acdAcidicPka,acdBasicPka,acdLogd,acdLogp,alogp,chemblId,knownDrug,molecularFormula,molecularWeight,numRo5Violations,passesRuleOfThree,preferredCompoundName,rotatableBonds,smiles,species,stdInChiKey,synonyms
0,,,,,,CHEMBL219763,No,C8H5B10O,,,No,,,,,,
1,7.69,,0.33,0.5,3.31,CHEMBL370037,No,C16H12O3,252.26,0.0,No,,1.0,CC1=C(C(=O)c2ccc(O)cc12)c3ccc(O)cc3,NEUTRAL,IOAPPLGKQYAVIJ-UHFFFAOYSA-N,


In [52]:
print (str(len(bioactsDF_Train_final)), str(len(cpdsDF_Train_final)))
print (str(len(bioactsDF_Test_gra)),  str(len(cpdsDF_Test_gra)))
print (str(len(bioactsDF_Test_les)),  str(len(cpdsDF_Test_les)))

('1290', '1290')
('292', '289')
('19', '19')


In [53]:
# merge cpd to bio activity

Train_ER_alpha_RAW    = cpdsDF_Train_final.merge(bioactsDF_Train_final, on='chemblId', how='inner')
Test_gra_ER_alpha_RAW = cpdsDF_Test_gra.merge (bioactsDF_Test_gra , on='chemblId', how='inner')
Test_les_ER_alpha_RAW = cpdsDF_Test_les.merge (bioactsDF_Test_les , on='chemblId', how='inner')

print len(Train_ER_alpha_RAW)
print len(Test_gra_ER_alpha_RAW)
print len(Test_les_ER_alpha_RAW)

1290
292
19


In [54]:
Train_ER_alpha_RAW.tail(2)

Unnamed: 0,acdAcidicPka,acdBasicPka,acdLogd,acdLogp,alogp,chemblId,knownDrug,molecularFormula,molecularWeight,numRo5Violations,...,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
1288,4.06,6.57,-0.54,2.45,2.11,CHEMBL3775908,No,C23H24F3NO3,419.44,0.0,...,=,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,,active
1289,10.24,,4.11,4.11,4.89,CHEMBL691,Yes,C20H24O2,296.4,0.0,...,=,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,,active


In [55]:
# drop Nan in SMILES

Train_ER_alpha_RAW1     = Train_ER_alpha_RAW   [pd.notnull(Train_ER_alpha_RAW   ['smiles'])]
Test_gra_ER_alpha_RAW1  = Test_gra_ER_alpha_RAW[pd.notnull(Test_gra_ER_alpha_RAW['smiles'])]
Test_les_ER_alpha_RAW1  = Test_les_ER_alpha_RAW[pd.notnull(Test_les_ER_alpha_RAW['smiles'])]

print 'Train_ER_alpha of '+str(len(Train_ER_alpha_RAW))+' after drop duplicate SMILEs are reduce to '+ str(len(Train_ER_alpha_RAW1)) 
print 'Test_gra_ER_alpha of '+str(len(Test_gra_ER_alpha_RAW))+' after drop duplicate SMILEs are reduce to '+ str(len(Test_gra_ER_alpha_RAW1)) 
print 'Test_les_ER_alpha of '+str(len(Test_les_ER_alpha_RAW))+' after drop duplicate SMILEs are reduce to '+ str(len(Test_les_ER_alpha_RAW1))

Train_ER_alpha of 1290 after drop duplicate SMILEs are reduce to 1282
Test_gra_ER_alpha of 292 after drop duplicate SMILEs are reduce to 292
Test_les_ER_alpha of 19 after drop duplicate SMILEs are reduce to 19


In [56]:
Train_ER_alpha_RAW1.tail(2)

Unnamed: 0,acdAcidicPka,acdBasicPka,acdLogd,acdLogp,alogp,chemblId,knownDrug,molecularFormula,molecularWeight,numRo5Violations,...,operator,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS
1288,4.06,6.57,-0.54,2.45,2.11,CHEMBL3775908,No,C23H24F3NO3,419.44,0.0,...,=,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,,active
1289,10.24,,4.11,4.11,4.89,CHEMBL691,Yes,C20H24O2,296.4,0.0,...,=,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,,active


In [57]:
len(Test_gra_ER_alpha_RAW1), len(Test_gra_ER_alpha_RAW1['chemblId'].unique())

(292, 289)

In [58]:
Test_gra_ER_alpha_RAW2 = Test_gra_ER_alpha_RAW1.drop_duplicates(subset='chemblId', keep='last')

len(Test_gra_ER_alpha_RAW1), len(Test_gra_ER_alpha_RAW2)

(292, 289)

In [59]:
# only smiles exist

Train_ER_alpha = Train_ER_alpha_RAW1[pd.notnull(
                    Train_ER_alpha_RAW1['smiles'])]

Test_gra_ER_alpha = Test_gra_ER_alpha_RAW2[pd.notnull(
                    Test_gra_ER_alpha_RAW2['smiles'])]


Test_les_ER_alpha = Test_les_ER_alpha_RAW1[pd.notnull(
                    Test_les_ER_alpha_RAW1['smiles'])]


print "RAW data of " + str(len(Train_ER_alpha_RAW1)) + \
          " Train compounds has been reduced to " \
          + str(len(Train_ER_alpha)) + " Compounds."

print "RAW data of " + str(len(Test_gra_ER_alpha_RAW2)) + \
          " Test_gracompounds has been reduced to " \
          + str(len(Test_gra_ER_alpha)) + " Compounds."
        
print "RAW data of " + str(len(Test_les_ER_alpha_RAW1)) + \
          " Test_les compounds has been reduced to " \
          + str(len(Test_les_ER_alpha)) + " Compounds."

RAW data of 1282 Train compounds has been reduced to 1282 Compounds.
RAW data of 289 Test_gracompounds has been reduced to 289 Compounds.
RAW data of 19 Test_les compounds has been reduced to 19 Compounds.


In [60]:
# clean smiles

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import MolFromSmiles,MolToSmiles

def clean_smiles (ListSMILEs):
    remover = SaltRemover()
    len(remover.salts)

    SMILES_desalt = []

    for i in ListSMILEs:
        mol = MolFromSmiles(i) 
        mol_desalt = remover.StripMol(mol)
        mol_SMILES = MolToSmiles(mol_desalt)
        SMILES_desalt.append(mol_SMILES)
    return SMILES_desalt

In [61]:
Train_ER_alpha['SMILES_desalt'] = clean_smiles(Train_ER_alpha.smiles)

In [62]:
Test_gra_ER_alpha['SMILES_desalt'] = clean_smiles(Test_gra_ER_alpha.smiles)

In [63]:
Test_les_ER_alpha['SMILES_desalt'] = clean_smiles(Test_les_ER_alpha.smiles)

In [64]:
Train_ER_alpha2 = Train_ER_alpha.drop_duplicates(\
                    subset='SMILES_desalt', keep='last')

print "RAW data of " + str(len(Train_ER_alpha)) + \
          " SMILES has been reduced to " \
          + str(len(Train_ER_alpha2)) + " SMILES."

RAW data of 1282 SMILES has been reduced to 1238 SMILES.


In [65]:
Test_gra_ER_alpha2 = Test_gra_ER_alpha.drop_duplicates(\
                    subset='SMILES_desalt', keep='last')

print "RAW data of " + str(len(Test_gra_ER_alpha)) + \
          " SMILES has been reduced to " \
          + str(len(Test_gra_ER_alpha2)) + " SMILES."

RAW data of 289 SMILES has been reduced to 283 SMILES.


In [66]:
Test_les_ER_alpha2 = Test_les_ER_alpha.drop_duplicates(\
                    subset='SMILES_desalt', keep='last')

print "RAW data of " + str(len(Test_les_ER_alpha)) + \
          " SMILES has been reduced to " \
          + str(len(Test_les_ER_alpha2)) + " SMILES."

RAW data of 19 SMILES has been reduced to 19 SMILES.


In [67]:
# save model
Train_ER_alpha2.to_csv   ('model/Train_ER_alpha.csv'   , sep=',' ,index=False)
Test_gra_ER_alpha2.to_csv('model/Test_gra_ER_alpha.csv', sep=',' ,index=False)
Test_les_ER_alpha2.to_csv('model/Test_les_ER_alpha.csv', sep=',' ,index=False)

In [68]:
Train_smiles = Train_ER_alpha2[['SMILES_desalt','chemblId']]
Train_smiles.to_csv('smiles/Train_ER_alpha.smi', sep='\t' ,header=False ,index=False)

In [69]:
Test_gra = Test_gra_ER_alpha2[['SMILES_desalt','chemblId']]
Test_gra.to_csv('smiles/Test_gra_ER_alpha.smi', sep='\t' ,header=False ,index=False)

In [70]:
Test_les = Test_les_ER_alpha2[['SMILES_desalt','chemblId']]
Test_les.to_csv('smiles/Test_les_ER_alpha.smi', sep='\t' ,header=False ,index=False)

In [71]:
Train_QSAR = Train_ER_alpha2[['chemblId','value']]

Train_QSAR.to_csv('Train_QSAR.csv', sep=',' ,index=False)

In [72]:
Train_ER_alpha2.tail(3)

Unnamed: 0,acdAcidicPka,acdBasicPka,acdLogd,acdLogp,alogp,chemblId,knownDrug,molecularFormula,molecularWeight,numRo5Violations,...,organism,parent_cmpd_chemblid,reference,target_chemblid,target_confidence,target_name,units,value,STATUS,SMILES_desalt
1287,4.06,7.14,0.31,3.11,2.6,CHEMBL3775766,No,C24H27F2NO3,415.47,0.0,...,Homo sapiens,CHEMBL3775766,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,,active,CC(C)CN1C(c2c(F)cc(C=CC(=O)O)cc2F)c2ccc(O)cc2C...
1288,4.06,6.57,-0.54,2.45,2.11,CHEMBL3775908,No,C23H24F3NO3,419.44,0.0,...,Homo sapiens,CHEMBL3775908,"ACS Med. Chem. Lett., (2016) 7:1:94",CHEMBL206,9,Estrogen receptor alpha,nM,,active,CC(CF)CN1C(C)Cc2cc(O)ccc2C1c1c(F)cc(C=CC(=O)O)...
1289,10.24,,4.11,4.11,4.89,CHEMBL691,Yes,C20H24O2,296.4,0.0,...,Homo sapiens,CHEMBL691,Unspecified,CHEMBL206,9,Estrogen receptor alpha,nM,,active,C#CC1(O)CCC2C3CCc4cc(O)ccc4C3CCC21C
