In [2]:
# Import the needed libraries
import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import AllChem

In [3]:
# Define function to convert IC50 to pIC50
def pIC50(input):
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i*(10**-9) # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', 1)

    return x

# Normalized the value if its too large as to not return a negative pIC50
def norm_value(input):
    norm = []

    for i in input['standard_value']:
        if i > 100000000:
          i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', 1)

    return x

# Custom function to convert Mol from Smiles
def molsmile (smiles):
    moldata = []
    for m in smiles: # iterate through iterables in smiles
        mol = Chem.MolFromSmiles(m) # convert smiles to mols
        moldata.append(mol)
    return moldata

In [4]:
tyr_kinase_dict = {
    'prot_name' : [],
    'mol_number' : [],
    'Chembl_ID' : []
}

In [5]:
target_df = pd.read_csv('RTK.csv', sep = ';')
target_list = target_df.ChemblD.tolist()
target_list = target_list[25:] # Re run due to failure

In [6]:
for n in target_list:
    target_id = n
    activity = new_client.activity
    results = activity.filter(target_chembl_id = target_id).filter(standard_type = 'IC50')
    df1 = pd.DataFrame.from_dict(results) # Load the results to a dataframe and view
    df2 = df1[df1.standard_value.notna()]
    df2 = df2[df2.canonical_smiles.notna()]
    df2_nr = df2.drop_duplicates(['canonical_smiles'])
    df2_nr = df2_nr.loc[df2_nr['standard_units'] == 'nM']

    protein_name = df1.target_pref_name[0]

    smiles = []
    for i in df2_nr.canonical_smiles.tolist():
        smi = str(i).split('.')
        smi_longest = max(smi, key=len)
        smiles.append(smi_longest)
    df2_nr.standard_value = df2_nr.standard_value.astype(float)

    df1 = norm_value(df2_nr)
    df1 = pIC50(df1)
    smiles_series = pd.Series(smiles, name = 'canonical_smiles')
    df1 = df1[['pIC50', 'molecule_chembl_id']].reset_index()
    df1 = pd.concat([df1, smiles_series], axis=1)
    df1.replace([np.inf, -np.inf], np.nan, inplace = True)

    molecule_number = len(df1)

    smile_series = df1.canonical_smiles # Create smile series
    mol_series = molsmile(smile_series) # Create mol series
    fingerprint_ECFP6 = [AllChem.GetMorganFingerprintAsBitVect(x,3, nBits = 1024) for x in mol_series] # Create efcp6 fingerprints
    fingerprint_ECFP6_lists = [list(l) for l in fingerprint_ECFP6] # turn fingerprints to list
    ecfp6_name = [f'ecfp_bit_{i}' for i in range(len(fingerprint_ECFP6[1]))] # create bit name
    fingerprint_ECFP6_df = pd.DataFrame(fingerprint_ECFP6_lists, index=df1.index, columns=ecfp6_name) # create efcp dataframe
    model1 = pd.concat([fingerprint_ECFP6_df, df1.pIC50], axis=1) # add the labels
    model1.to_csv(f'fingerprints/{target_id}_fingerprints.csv', index=False)

    tyr_kinase_dict['prot_name'].append(protein_name)
    tyr_kinase_dict['mol_number'].append(molecule_number)
    tyr_kinase_dict['Chembl_ID'].append(target_id)

  x = input.drop('standard_value', 1)
  x = input.drop('standard_value_norm', 1)


In [8]:
df_report = pd.DataFrame.from_dict(tyr_kinase_dict)
df_report.to_csv('fingerprints/report_2.csv', index=False)