<a href="https://colab.research.google.com/github/gowravmannem/Aromatase-Drug-Discovery/blob/main/aromatase_project_part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Libraries

In [None]:
# importing Chembl Database
! pip install chembl_webresource_client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing Libraries


In [None]:
import pandas as pd
from chembl_webresource_client.new_client import new_client


## Target Search for Aromatase

In [None]:
# Ask the ChEMBL target endpoint for every record matching 'aromatase'
# and display the hits as a table.
target = new_client.target
target_query = target.search('aromatase')
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P11511', 'xref_name': None, 'xre...",Homo sapiens,Cytochrome P450 19A1,20.0,False,CHEMBL1978,"[{'accession': 'P11511', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'P22443', 'xref_name': None, 'xre...",Rattus norvegicus,Cytochrome P450 19A1,20.0,False,CHEMBL3859,"[{'accession': 'P22443', 'component_descriptio...",SINGLE PROTEIN,10116


In [None]:
# Pick the aromatase target for Homo sapiens (humans).
# The row is selected by its `organism` value rather than by position, so the
# choice does not silently change if the search results come back in a
# different order.
human_targets = targets[targets.organism == 'Homo sapiens']
if human_targets.empty:
    raise ValueError("No Homo sapiens target found in the 'aromatase' search results")
aromatase_target = human_targets.target_chembl_id.iloc[0]
aromatase_target

'CHEMBL1978'

In [None]:
# Restrict the activity records to the chosen target and to IC50 measurements
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=aromatase_target)
    .filter(standard_type="IC50")
)

In [None]:
# Materialise the IC50 query results as the DataFrame `aromatase_df`
aromatase_df = pd.DataFrame(res)

# Checking Our Work

In [None]:
# Sanity check: preview the first three rows of the downloaded data
aromatase_df.head(n=3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,82585,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,7.1
1,,94540,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,50.0
2,,112960,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.238


In [None]:
# Convert standard_value from object dtype to float; entries that cannot be
# parsed as numbers become NaN because of errors='coerce'.
aromatase_df['standard_value'] = pd.to_numeric(
    aromatase_df.standard_value,
    errors='coerce',
)

In [None]:
# Display the standard_value column to inspect its values and dtype
aromatase_df['standard_value']

0        7100.00
1       50000.00
2         238.00
3          57.00
4          54.00
          ...   
2889        5.24
2890      104.00
2891       17.70
2892       60.40
2893      439.00
Name: standard_value, Length: 2894, dtype: float64

In [None]:
# Write the downloaded activity table to a CSV file, leaving out the row index
aromatase_df.to_csv(path_or_buf='aromatase_data_raw.csv', index=False)

# Handling Missing Data

In [None]:
# Because the dataframe is large, the most efficient way to clean it is to
# drop every row that is missing 'standard_value' or 'canonical_smiles'.
# A single dropna(subset=...) replaces the two chained boolean filters: the
# second of those indexed the already-filtered frame with a mask built from
# the unfiltered `aromatase_df`, so the mask's index did not match the frame.
# The original row index is kept as-is (no reset).
aromatase_df_clean = aromatase_df.dropna(subset=['standard_value', 'canonical_smiles'])
aromatase_df_clean

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,82585,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,7.1
1,,94540,[],CHEMBL666794,Inhibition of Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,50.0
2,,112960,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.238
3,,116766,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.057
4,,118017,[],CHEMBL661700,In vitro inhibition of human Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,uM,UO_0000065,,0.054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2889,,22819795,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4716146,Inhibition of recombinant human aromatase incu...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,nM,,,5.24
2890,,22819796,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4716146,Inhibition of recombinant human aromatase incu...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,nM,,,104.0
2891,,22819797,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4716146,Inhibition of recombinant human aromatase incu...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,nM,,,17.7
2892,,22819798,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4716146,Inhibition of recombinant human aromatase incu...,B,,,BAO_0000179,BAO_0000357,...,Homo sapiens,Cytochrome P450 19A1,9606,,,IC50,nM,,,60.4


# Data pre-processing

In [None]:
# Label every data point with one of three bioactivity classes, based on its
# standard_value:
#   standard_value > 10000   -> "inactive"
#   standard_value <= 1000   -> "active"
#   anything else            -> "intermediate"
bioactivity_class = [
    "inactive" if value > 10000
    else "active" if value <= 1000
    else "intermediate"
    for value in aromatase_df_clean.standard_value
]


In [None]:
# Collect the molecule_chembl_id of every cleaned data point into a plain list
mol_cid = list(aromatase_df_clean.molecule_chembl_id)

In [None]:
# Collect the canonical_smiles string of every cleaned data point into a list
canonical_smiles = list(aromatase_df_clean.canonical_smiles)

In [None]:
# Collect the standard_value of every cleaned data point as Python floats
standard_value = [float(value) for value in aromatase_df_clean.standard_value]

In [None]:
# Combine the four lists isolated above (mol_cid, canonical_smiles,
# bioactivity_class, standard_value) into a single dataframe, one row per
# data point.
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
aromatase_currated = pd.DataFrame(
    data=data_tuples,
    columns=[
        'molecule_chembl_id',
        'canonical_smiles',
        'bioactivity_class',
        'standard_value',
    ],
)
aromatase_currated

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL341591,CC12CCC(O)CC1=CCC1C2CCC2(C)C(CC3CN3)CCC12,intermediate,7100.00
1,CHEMBL2111947,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@@]43...,inactive,50000.00
2,CHEMBL431859,CCn1c(C(c2ccc(F)cc2)n2ccnc2)c(C)c2cc(Br)ccc21,active,238.00
3,CHEMBL113637,CCn1cc(C(c2ccc(F)cc2)n2ccnc2)c2ccccc21,active,57.00
4,CHEMBL112021,Clc1ccccc1Cn1cc(Cn2ccnc2)c2ccccc21,active,54.00
...,...,...,...,...
2813,CHEMBL4755831,Nc1ccc(C(Cn2ccnc2)=C(c2ccc(O)cc2)c2ccc(O)cc2)cc1,active,5.24
2814,CHEMBL4750835,Nc1ccc(C(Cn2cncn2)=C(c2ccc(O)cc2)c2ccc(O)cc2)cc1,active,104.00
2815,CHEMBL4745681,Nc1ccc(C(Cn2cnnc2)=C(c2ccc(O)cc2)c2ccc(O)cc2)cc1,active,17.70
2816,CHEMBL4778401,Oc1ccc(C(=C(Cn2ccnc2)c2cccc(O)c2)c2ccc(O)cc2)cc1,active,60.40


# Saving Curated DataFrame to CSV File

In [None]:
aromatase_currated.to_csv('aromatase_currated.csv', index=False)
! ls -l

total 1604
-rw-r--r-- 1 root root  203145 Jul  8 15:46 aromatase_currated.csv
-rw-r--r-- 1 root root 1432391 Jul  8 15:45 aromatase_data_raw.csv
drwxr-xr-x 1 root root    4096 Jul  6 13:22 sample_data
