In [1]:
# import necessary modules
import os
import math
import time
import gzip
import requests
import pandas as pd
from pymol import cmd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from biopandas.pdb import PandasPdb

In [2]:
# CSV table listing the protein/ligand complexes that the notebook processes.
PATH_TO_COMPLEXES_LIST = 'expanded_Astex_count.csv'
# Folder that receives the accepted .pdb files (one sub-folder per target).
PATH_TO_PDB_FOLDER = 'pdb_files'
# Folder that receives the extracted reference ligands as .sdf files.
PATH_TO_REFERENCE_LIGANDS_FOLDER = 'reference_ligands'
# Folder that receives the per-target .smi files.
PATH_TO_SMILES_TO_GENERATE_CONFORMER = 'smiles_to_genconformer'

In [3]:
# functions:
def get_PDB(pdb_id, timeout=30):
    '''Download the .pdb file of one entry from files.rcsb.org.

    Parameters
    ----------
    pdb_id : str
        Entry identifier; it is inserted verbatim into the download URL.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the notebook indefinitely; when the limit is
        exceeded requests raises a timeout exception instead.

    Returns
    -------
    str or None
        The response body as text when the server answers with status 200,
        otherwise None.
    '''
    url = 'https://files.rcsb.org/download/{}.pdb'.format(pdb_id)
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return None
    return response.text

def get_smiles_from_PDB_chem_comp(chem_comp, program='OpenEye OEToolkits'):
    '''Look up the canonical SMILES of one chemical component via the RCSB GraphQL API.

    Parameters
    ----------
    chem_comp : str
        Chemical component identifier sent as the ``comp_id`` query variable.
    program : str, optional
        Only descriptors whose ``program`` field equals this value are kept.

    Returns
    -------
    dict or None
        ``{'SMILES_CANONICAL': descriptor}`` when a matching descriptor is
        found. None when the HTTP status is not 200, when the component is
        reported as null, or when no descriptor matches.
    '''
    graphql_query = '''query($comp_id: String!) {
            chem_comp(comp_id: $comp_id) {
               pdbx_chem_comp_descriptor {
                         type
                         program
                         descriptor
                        }
                      }
               }'''
    payload = {'query': graphql_query, 'variables': {'comp_id': chem_comp}}
    reply = requests.post('http://data.rcsb.org/graphql', json=payload)
    if reply.status_code != 200:
        return None

    component = reply.json()['data']['chem_comp']
    if component is None:
        return None

    # Later matching descriptors overwrite earlier ones because they share the same key.
    matches = {
        entry['type']: entry['descriptor']
        for entry in component['pdbx_chem_comp_descriptor']
        if entry['type'] == 'SMILES_CANONICAL' and entry['program'] == program
    }
    return matches or None

def get_smiles_from_PDB_chem_comp_list(chem_comp_list, program='OpenEye OEToolkits'):
    '''Look up the canonical SMILES of several chemical components in one GraphQL request.

    Parameters
    ----------
    chem_comp_list : iterable of str
        Chemical component identifiers to query.
    program : str, optional
        Only descriptors whose ``program`` field equals this value are kept.

    Returns
    -------
    dict or None
        Maps every requested identifier to its ``SMILES_CANONICAL`` descriptor,
        or to None when no matching descriptor was returned for it. An empty
        input gives an empty dict without contacting the server. None is
        returned when the response is not JSON or lacks the expected structure.
    '''
    comp_ids = list(chem_comp_list)
    dictionary = {chem: None for chem in comp_ids}
    if not comp_ids:
        # Nothing to look up, so do not send a query with an empty id list.
        return dictionary

    # GraphQL list literal, e.g. ["STB", "AL7"].
    chem_comps = '["' + '", "'.join(comp_ids) + '"]'

    query = '''{
      chem_comps(comp_ids: ''' + chem_comps + ''') {
        pdbx_chem_comp_descriptor {
          comp_id
          type
          program
          descriptor
        }
      }
    }'''

    response = requests.post('http://data.rcsb.org/graphql', json={'query': query})

    try:
        # Decoding happens inside the try so that a non-JSON reply also yields None.
        ligands = response.json()['data']['chem_comps']
        for ligand in ligands:
            for content in ligand['pdbx_chem_comp_descriptor']:
                if content['type'] == 'SMILES_CANONICAL' and content['program'] == program:
                    dictionary[content['comp_id']] = content['descriptor']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: missing or null fields.
        return None
    return dictionary

def get_model_ligand(pdbblock, comp_id):
    '''Extract one ligand from the text of a PDB file as an RDKit molecule.

    Parameters
    ----------
    pdbblock : str
        Full content of a PDB file.
    comp_id : str
        Residue name (chemical component id) of the ligand to extract.

    Returns
    -------
    rdkit.Chem.Mol
        The first fragment reported by ``Chem.GetMolFrags`` for the selected
        HETATM records.

    Raises
    ------
    ValueError
        If no HETATM record carries ``comp_id`` as residue name, or if RDKit
        cannot build a molecule from the selected records.
    '''
    wanted = comp_id.strip()
    # PDB records are fixed-column: the record name sits in columns 1-6 and the
    # residue name in columns 18-20. Comparing those fields avoids picking up
    # other hetero atoms whose serial number or coordinates merely contain the
    # id as a substring (e.g. ligand "438" vs. atom serial 4381).
    keep = [
        line for line in pdbblock.split('\n')
        if line.startswith('HETATM') and line[17:20].strip() == wanted
    ]
    if not keep:
        raise ValueError('no HETATM records found for ligand ' + wanted)

    mol = Chem.MolFromPDBBlock('\n'.join(keep), sanitize=True)
    if mol is None:
        # MolFromPDBBlock signals parsing/sanitization failure with None.
        raise ValueError('RDKit could not parse the HETATM records of ligand ' + wanted)

    return Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)[0]

In [4]:
# Read the CSV file named by PATH_TO_COMPLEXES_LIST (the table of all
# complexes) into a dataframe and display its first rows.
df = pd.read_csv(PATH_TO_COMPLEXES_LIST)
df.head()

Unnamed: 0,Astex_Diverse_Set,Protein_Class,Uniprot_ID,Protein_ID,Ligand_Name,Chain,resolution,mutated,count
0,1OQ5,4.2.1.1,P00918,3T85,SG7,A,2.4,0,303
1,1OQ5,4.2.1.1,P00918,1OKN,STB,A,2.4,0,303
2,1OQ5,4.2.1.1,P00918,1BNV,AL7,A,2.4,0,303
3,1OQ5,4.2.1.1,P00918,1BNQ,AL4,A,2.4,0,303
4,1OQ5,4.2.1.1,P00918,1CNY,EG3,A,2.3,0,303


In [5]:
# Unique UniProt identifiers of the protein targets, in order of first appearance.
# Note: the result is a pandas Series that keeps the original row labels,
# not a Python list.
uniprot_ids = df['Uniprot_ID'].drop_duplicates()

0       P00918
303     P29476
498     P07900
685     P24941
862     P00734
         ...  
2273    O93077
2274    P15917
2275    Q07785
2276    Q00535
2277    P9WIL5
Name: Uniprot_ID, Length: 80, dtype: object


In [6]:
# Save all .pdb files: for every target, write the comma-separated list of its
# PDB ids into the target's folder and pass that list to batch_download.sh.
for uniprot_id in uniprot_ids:
    target_folder = 'pdb_files_originals/' + uniprot_id
    os.makedirs(target_folder, exist_ok=True)

    list_of_pdb = df[df['Uniprot_ID'] == uniprot_id]['Protein_ID'].drop_duplicates().tolist()
    list_path = target_folder + '/list_of_pdb.txt'
    # The context manager closes (and flushes) the id list even if the write
    # fails, before the download script is started.
    with open(list_path, 'w') as o:
        o.write(','.join(list_of_pdb))

    # NOTE(review): the command ends with '&', so os.system returns at once and
    # the script keeps running in the background; cells that later read the
    # downloaded files presumably need the downloads to have finished — confirm
    # before running them.
    os.system('nohup' +
              ' ./batch_download.sh' +
              ' -f ' + list_path +
              ' -o ' + target_folder +
              ' -p ' +
              '  > ' + target_folder + '/download_log.txt &')

In [13]:
# get all the smiles (one GraphQL request for every distinct ligand name)
list_of_ligand_names = df['Ligand_Name'].drop_duplicates().tolist()
ligand_smiles_dictionary = get_smiles_from_PDB_chem_comp_list(chem_comp_list=list_of_ligand_names,
                                                              program='OpenEye OEToolkits')
if ligand_smiles_dictionary is None:
    # The lookup returns None when the response could not be used; stop here
    # with a clear message instead of failing on the loop below.
    raise RuntimeError('Could not retrieve the ligand SMILES from the RCSB GraphQL service')

# parse the smiles with RDKit: store the SMILES string that RDKit writes back,
# or None when RDKit cannot process the entry (including entries that are already None)
for ligand_name, raw_smiles in list(ligand_smiles_dictionary.items()):
    try:
        ligand_smiles_dictionary[ligand_name] = Chem.MolToSmiles(Chem.MolFromSmiles(raw_smiles))
    except Exception:
        ligand_smiles_dictionary[ligand_name] = None

In [17]:
# generate reference ligands and organize summary dataframe
#
# For every target, each (PDB entry, ligand) row is processed as follows:
#   1. read the downloaded .pdb.gz file and look up the ligand SMILES;
#   2. extract the ligand from the PDB text, assign bond orders from the SMILES
#      template and stereochemistry from the 3D coordinates;
#   3. keep the row only if the resulting SMILES equals the reference SMILES;
#      then write the .pdb file, the ligand .sdf file and a line in the .smi file.
# Rows that fail at any step are removed from df; every outcome is logged to log.txt.

# Output folders that are written to below but not created anywhere else.
os.makedirs(PATH_TO_SMILES_TO_GENERATE_CONFORMER, exist_ok=True)
os.makedirs('astex-diverse-set', exist_ok=True)

df['Smiles'] = ''
with open('log.txt', 'w') as log:
    for uniprot_id in uniprot_ids:
        print('Start with protein target:', uniprot_id)
        log.write('\n')
        log.write('Start with protein target: '+ uniprot_id +'\n')

        pdb_folder = PATH_TO_PDB_FOLDER + '/' + uniprot_id
        ligand_folder = PATH_TO_REFERENCE_LIGANDS_FOLDER + '/' + uniprot_id
        os.makedirs(pdb_folder, exist_ok=True)
        os.makedirs(ligand_folder, exist_ok=True)

        # Row labels rejected for this target; they are dropped in one go after the loop.
        rejected = []

        smiles_path = PATH_TO_SMILES_TO_GENERATE_CONFORMER + '/' + uniprot_id + '_smiles_to_genconformer.smi'
        with open(smiles_path, 'w') as smiles_writer:
            proteins_ligands = df.loc[df['Uniprot_ID'] == uniprot_id][['Protein_ID', 'Ligand_Name']].itertuples()
            for index, pdb_id, ligand_id in proteins_ligands:

                # A missing/unreadable archive or an unknown ligand id both
                # result in the row being rejected below.
                try:
                    with gzip.open('pdb_files_originals/' + uniprot_id + '/' + pdb_id + '.pdb.gz', 'rt') as handle:
                        pdb = handle.read()
                    smiles_canonical = ligand_smiles_dictionary[ligand_id]
                except Exception:
                    pdb = None
                    smiles_canonical = None

                if pdb is None or smiles_canonical is None:
                    rejected.append(index)
                    log.write(pdb_id+' '+ligand_id+': pdb or smiles equal to None!'+'\n')
                    continue

                try:
                    model_mol = get_model_ligand(pdbblock=pdb, comp_id=ligand_id)

                    refmol = Chem.MolFromSmiles(smiles_canonical)

                    model_mol = AllChem.AssignBondOrdersFromTemplate(refmol=refmol, mol=model_mol)
                    AllChem.AssignStereochemistryFrom3D(model_mol)

                    model_mol.SetProp('_Name', '_'.join([pdb_id, ligand_id]))

                    if Chem.MolToSmiles(refmol) != Chem.MolToSmiles(model_mol):
                        rejected.append(index)
                        log.write(pdb_id+' '+ligand_id+': Why are the smiles not equal?'+'\n')
                    else:
                        with open(pdb_folder + '/' + pdb_id + '.pdb', 'w') as file:
                            file.write(pdb)
                        sdf = Chem.SDWriter(ligand_folder + '/' + '_'.join([pdb_id, ligand_id]) + '.sdf')
                        sdf.write(model_mol)
                        sdf.close()
                        log.write(pdb_id+' '+ligand_id+': success!'+'\n')
                        df.at[index, 'Smiles'] = smiles_canonical
                        smiles_writer.write(smiles_canonical + ' ' + model_mol.GetProp('_Name') + '\n')

                except Exception as e:
                    rejected.append(index)
                    log.write(pdb_id+' '+ligand_id+': Exception: '+str(e)+'\n')

        df = df.drop(index=rejected)
        # Push this target's log lines to disk before moving on.
        log.flush()
        df[df['Uniprot_ID']==uniprot_id].to_csv('astex-diverse-set/'+uniprot_id+'.csv', index=False)

df.to_csv('expanded_Astex_with_smiles.csv', index=False)

Start with protein target: P00918
Start with protein target: P29476
Start with protein target: P07900
Start with protein target: P24941
Start with protein target: P00734
Start with protein target: P02766
Start with protein target: P00742
Start with protein target: O14757
Start with protein target: P15121
Start with protein target: Q08499
Start with protein target: P00374
Start with protein target: P28720
Start with protein target: P08709
Start with protein target: P04058
Start with protein target: P53779
Start with protein target: Q24451
Start with protein target: P04585
Start with protein target: P10275
Start with protein target: P11086
Start with protein target: P27487
Start with protein target: P26281
Start with protein target: P11473
Start with protein target: P00811
Start with protein target: P49841
Start with protein target: P50579
Start with protein target: P00749
Start with protein target: P16083
Start with protein target: P15090
Start with protein target: P35557
Start with pro

In [24]:
# Refresh the 'count' column so that every row holds the number of rows that
# currently share its Astex_Diverse_Set value.
representative_sizes = df['Astex_Diverse_Set'].value_counts()
for repre, n_rows in representative_sizes.items():
    df.loc[df['Astex_Diverse_Set'] == repre, 'count'] = n_rows

df.head()

Unnamed: 0,Astex_Diverse_Set,Protein_Class,Uniprot_ID,Protein_ID,Ligand_Name,Chain,resolution,mutated,count,Smiles
0,1OQ5,4.2.1.1,P00918,3T85,SG7,A,2.4,0,273,CC(=O)O[C@@H]1[C@H](O)[C@@H](O)O[C@H](COS(N)(=...
1,1OQ5,4.2.1.1,P00918,1OKN,STB,A,2.4,0,273,NS(=O)(=O)c1ccc(C(=O)NCCCCNCS)cc1
2,1OQ5,4.2.1.1,P00918,1BNV,AL7,A,2.4,0,273,CN[C@@H]1CN(c2cccc(OC)c2)S(=O)(=O)c2sc(S(N)(=O...
3,1OQ5,4.2.1.1,P00918,1BNQ,AL4,A,2.4,0,273,CCN[C@H]1CN(CCOC)S(=O)(=O)c2sc(S(N)(=O)=O)cc21
4,1OQ5,4.2.1.1,P00918,1CNY,EG3,A,2.3,0,273,N[C@@H](Cc1ccccc1)C(=O)NCCOCCOCCNC(=O)c1ccc(S(...


In [None]:
# Write the summary table, including the recomputed 'count' column, to CSV
# (index column omitted).
df.to_csv('expanded_Astex_with_smiles.csv', index=False)

In [30]:
# Keep the rows whose 'count' is strictly below 3.
# NOTE(review): the variable name says "less or equal to three" but the filter
# is `< 3` — confirm which one is intended.
df_less_or_equal_to_three = df.loc[df['count']<3]

In [31]:
# Write the filtered subset to CSV without the index column.
df_less_or_equal_to_three.to_csv('expanded_Astex_with_smiles_less_or_equal_to_three.csv', index=False)

In [34]:
# Number of distinct targets in the subset (missing values count as one value).
df_less_or_equal_to_three['Uniprot_ID'].nunique(dropna=False)

11

In [35]:
# Number of distinct targets in the full table (missing values count as one value).
df['Uniprot_ID'].nunique(dropna=False)

79

In [5]:
# Reload the summary table from the CSV file written earlier in this notebook.
df = pd.read_csv('expanded_Astex_with_smiles.csv')

Unnamed: 0,Astex_Diverse_Set,Protein_Class,Uniprot_ID,Protein_ID,Ligand_Name,Chain,resolution,mutated,count,Smiles
0,1OQ5,4.2.1.1,P00918,3T85,SG7,A,2.40,0,273,CC(=O)O[C@@H]1[C@H](O)[C@@H](O)O[C@H](COS(N)(=...
1,1OQ5,4.2.1.1,P00918,1OKN,STB,A,2.40,0,273,NS(=O)(=O)c1ccc(C(=O)NCCCCNCS)cc1
2,1OQ5,4.2.1.1,P00918,1BNV,AL7,A,2.40,0,273,CN[C@@H]1CN(c2cccc(OC)c2)S(=O)(=O)c2sc(S(N)(=O...
3,1OQ5,4.2.1.1,P00918,1BNQ,AL4,A,2.40,0,273,CCN[C@H]1CN(CCOC)S(=O)(=O)c2sc(S(N)(=O)=O)cc21
4,1OQ5,4.2.1.1,P00918,1CNY,EG3,A,2.30,0,273,N[C@@H](Cc1ccccc1)C(=O)NCCOCCOCCNC(=O)c1ccc(S(...
...,...,...,...,...,...,...,...,...,...,...
1987,1N46,18.-.-.-,P10828,6KKB,8HO,X,1.70,0,2,Cc1nc(C(=O)NCC(=O)O)c(O)c2ccc(Oc3ccccc3)cc12
1988,1YVF,2.7.7.48,O93077,1YVF,PH7,A,2.50,0,1,O=C(O)/C(=C/c1ccc(Oc2ccccc2Br)cc1)NC(=O)c1ccccc1
1989,1YQY,3.4.24.83,P15917,4DV8,0LX,A,1.63,0,1,CO[C@H](C[C@H](CCCCNCc1ccc(F)cc1)C(=O)NO)c1ccc...
1990,1V0P,2.7.1.23,Q07785,1V0O,INR,"A,B",1.90,0,1,O=C1Nc2ccc(S(=O)(=O)O)cc2/C1=C1/Nc2ccccc2C1=O


In [8]:
# Keep the rows whose 'count' is strictly below 20 and display them.
# NOTE(review): the variable name says "less or equal to 20" but the filter is
# `< 20` — confirm which one is intended and rename accordingly.
df_less_or_equal_to_20 = df.loc[df['count']<20]
df_less_or_equal_to_20

Unnamed: 0,Astex_Diverse_Set,Protein_Class,Uniprot_ID,Protein_ID,Ligand_Name,Chain,resolution,mutated,count,Smiles
1587,1OWE,3.4.21.73,P00749,1OWD,497,A,2.32,0,9,CC[C@H]1CNCc2ccc(NC(=O)c3ccc4cc(C(=N)N)ccc4c3)...
1588,1OWE,3.4.21.73,P00749,2VNT,QGG,"A,B,C,D,E,F",2.20,0,9,NC(N)=Nc1ncc(Cl)c2ccc(S(=O)(=O)N3CCC[C@@H]3C(=...
1589,1OWE,3.4.21.73,P00749,1SQA,UI1,A,2.00,0,9,N=C(N)c1ccc2cc(C(=O)Nc3ccc(CN)cc3)cc(Nc3ncccn3...
1590,1OWE,3.4.21.73,P00749,1SQO,UI2,A,1.84,0,9,N=C(N)c1ccc2cccc(Nc3ncccn3)c2c1
1591,1OWE,3.4.21.73,P00749,3IG6,438,"B,D",1.83,0,9,CN(C)c1ccc(C(=O)O)c(Oc2nc(Oc3cccc(-c4cccc(CN)c...
...,...,...,...,...,...,...,...,...,...,...
1987,1N46,18.-.-.-,P10828,6KKB,8HO,X,1.70,0,2,Cc1nc(C(=O)NCC(=O)O)c(O)c2ccc(Oc3ccccc3)cc12
1988,1YVF,2.7.7.48,O93077,1YVF,PH7,A,2.50,0,1,O=C(O)/C(=C/c1ccc(Oc2ccccc2Br)cc1)NC(=O)c1ccccc1
1989,1YQY,3.4.24.83,P15917,4DV8,0LX,A,1.63,0,1,CO[C@H](C[C@H](CCCCNCc1ccc(F)cc1)C(=O)NO)c1ccc...
1990,1V0P,2.7.1.23,Q07785,1V0O,INR,"A,B",1.90,0,1,O=C1Nc2ccc(S(=O)(=O)O)cc2/C1=C1/Nc2ccccc2C1=O


In [9]:
# Write the count < 20 subset to CSV without the index column.
df_less_or_equal_to_20.to_csv('less_than_20.csv', index=False)