In [None]:
# Import the needed libraries
from pathlib import Path

import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client
from molvs import standardize_smiles
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
# Initiate important functions
## pIC50 conversion. Source: https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb
def pIC50(input):
    """Convert capped IC50 values into pIC50 and drop the source column.

    pIC50 is ``-log10(IC50 in mol/L)``. The values in ``standard_value_norm``
    are treated as nanomolar, so they are scaled by 1e-9 before the logarithm.
    A value of 0 yields ``inf`` and a negative value yields ``NaN``; filter
    such rows out beforehand if they are not wanted.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame holding a numeric ``standard_value_norm`` column.
        The frame itself is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``standard_value_norm`` removed and a ``pIC50``
        column appended as the last column.
    """
    molar = input['standard_value_norm'].astype(float) * 1e-9  # nM -> M
    with_pic50 = input.assign(pIC50=-np.log10(molar))
    # Use the `columns=` keyword: the positional form drop(label, 1) is
    # rejected by pandas >= 2.0.
    return with_pic50.drop(columns='standard_value_norm')

## Value normalization if IC50 is too large
def norm_value(input, cap=100000000):
    """Cap very large IC50 values and store them as ``standard_value_norm``.

    With the default cap of 1e8 (nM), the pIC50 later derived from the capped
    value, ``-log10(value * 1e-9)``, can never fall below 1.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame holding a numeric ``standard_value`` column.
        The frame itself is left unmodified.
    cap : int or float, optional
        Upper bound applied to ``standard_value``; larger values are replaced
        by ``cap``. Missing values stay missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``standard_value`` removed and the capped values
        appended as the last column, ``standard_value_norm``.
    """
    capped = input['standard_value'].clip(upper=cap)
    # Use the `columns=` keyword: the positional form drop(label, 1) is
    # rejected by pandas >= 2.0.
    return input.assign(standard_value_norm=capped).drop(columns='standard_value')

In [None]:

# ChEMBL target IDs whose IC50 activity records are downloaded by the loop below.
# The trailing comment on each entry gives the target name and, where noted,
# its protein class.
target_id = [
    'CHEMBL1957', # Insulin-like growth factor I receptor
    'CHEMBL262', # Glycogen synthase kinase-3 beta
    'CHEMBL1871', # Androgen Receptor
    'CHEMBL3717', # Hepatocyte growth factor receptor
    'CHEMBL325', # Histone deacetylase 1
    'CHEMBL3713062', # Tissue factor pathway inhibitor
    'CHEMBL2971', # Tyrosine-protein kinase JAK2 - Protein Kinase
    'CHEMBL2842', # Serine/threonine-protein kinase mTOR - Protein Kinase
    'CHEMBL1862', # Tyrosine Protein Kinase ABL - Protein Kinase
    'CHEMBL258', # Tyrosine-protein kinase LCK - Protein Kinase
    'CHEMBL4282', # Serine/threonine-protein kinase AKT - Protein Kinase
    'CHEMBL3650', # Fibroblast growth factor receptor 1 - Protein Kinase
    'CHEMBL4005', # PI3-kinase p110-alpha subunit - Transferase
    'CHEMBL3130', # PI3-kinase p110-delta subunit - Transferase
    'CHEMBL3105', # Poly [ADP-ribose] polymerase-1 - Transferase
    'CHEMBL3267', # PI3-kinase p110-gamma subunit - Transferase
    'CHEMBL3145', # PI3-kinase p110-beta subunit - Transferase
    'CHEMBL4158', # Fatty acid synthase - Transferase
    'CHEMBL220', # Acetylcholinesterase - Hydrolase
    'CHEMBL1914', # Butyrylcholinesterase - Hydrolase
    'CHEMBL2243', # Anandamide amidohydrolase - Hydrolase
    'CHEMBL4191', # Monoglyceride lipase - Hydrolase
    'CHEMBL3559', # Steryl-sulfatase - Hydrolase
    'CHEMBL5080', # Endothelial lipase - Hydrolase
    'CHEMBL217', # Dopamine D2 receptor - Family A GPCR
    'CHEMBL218', # Cannabinoid CB1 receptor - Family A GPCR
    'CHEMBL233', # Mu opioid receptor - Family A GPCR
    'CHEMBL253', # Cannabinoid CB2 receptor - Family A GPCR
    'CHEMBL224', # Serotonin 2a (5-HT2a) receptor - Family A GPCR
    'CHEMBL210', # Beta-2 adrenergic receptor - Family A GPCR
    'CHEMBL230', # Cyclooxygenase-2 - Oxidoreductase
    'CHEMBL1951', # Monoamine oxidase A - Oxidoreductase
    'CHEMBL4685', # Indoleamine 2,3-dioxygenase - Oxidoreductase
    'CHEMBL4235', # 11-beta-hydroxysteroid dehydrogenase 1 - Oxidoreductase
    'CHEMBL202', # Dihydrofolate reductase - Oxidoreductase
    'CHEMBL215', # Arachidonate 5-lipoxygenase - Oxidoreductase
    'CHEMBL204', # Thrombin - Protease
    'CHEMBL4822', # Beta-secretase 1 - Protease
    'CHEMBL244', # Coagulation factor X - Protease
    'CHEMBL248', # Leukocyte elastase - Protease
    'CHEMBL332', # Matrix metalloproteinase-1 - Protease
    'CHEMBL284', # Dipeptidyl peptidase IV - Protease
    'CHEMBL2535', # Glucose transporter - Transporter
    'CHEMBL228', # Serotonin transporter - Transporter
    'CHEMBL238', # Dopamine transporter - Transporter
    'CHEMBL3884' # Sodium/glucose cotransporter 2 - Transporter
]

In [None]:

# Handle to the ChEMBL web-resource client's `activity` resource;
# the export loop calls .filter(...) on it to select records per target.
activity = new_client.activity

In [None]:

# For every target, download its IC50 records, clean them, convert IC50 to
# pIC50, standardise the SMILES and write one CSV per target to
# smiles/<target>_smiles.csv (columns: molecule_chembl_id, pIC50, canonical_smiles).
output_dir = Path('smiles')
output_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders

for x in target_id:
    # Query every IC50 activity recorded against this target.
    results = activity.filter(target_chembl_id = x).filter(standard_type = 'IC50')
    df1 = pd.DataFrame.from_dict(results)
    if df1.empty:
        # A frame built from zero records has no columns, so the column
        # look-ups below would fail; report and move on to the next target.
        print(f'{x} returned no IC50 records, skipped')
        continue

    # Keep rows that have a value, a structure and nanomolar units.
    # The unit filter runs BEFORE de-duplication so that a compound whose first
    # record uses other units is not discarded along with its nM records.
    usable = (
        df1['standard_value'].notna()
        & df1['canonical_smiles'].notna()
        & (df1['standard_units'] == 'nM')
    )
    df_combined = df1[usable].copy()  # copy: the next line assigns a column
    df_combined['standard_value'] = df_combined['standard_value'].astype(float)
    # A non-positive IC50 has no finite pIC50 (log10 of 0 or of a negative number).
    df_combined = df_combined[df_combined['standard_value'] > 0]
    df_combined = (
        df_combined
        .drop_duplicates(['canonical_smiles'])
        .reset_index(drop=True)
    )

    # Standardise structures; the list order matches the rows of df_combined.
    smiles = [standardize_smiles(i) for i in df_combined['canonical_smiles'].values]

    # Cap extreme values, then convert to pIC50.
    df_complete = norm_value(df_combined)
    df_complete = pIC50(df_complete)
    df_complete = df_complete[['molecule_chembl_id', 'pIC50']].assign(canonical_smiles=smiles)

    df_complete.to_csv(output_dir / f'{x}_smiles.csv', index=False)
    print(f'{x} smiles completed')