In [5]:
import pandas as pd
import gzip 

In [2]:
# Carbohydrate component IDs, taken from the CCD list linked at
# https://www.wwpdb.org/documentation/carbohydrate-remediation
# Only rows whose "ref" column equals "REL" are kept, which drops obsolete entries.
list_of_monosaccharides = (
    pd.read_csv("CCD_carbohydrate_list.tsv", sep="\t", names=["ID", "ref"])
    .loc[lambda ccd: ccd["ref"] == "REL", "ID"]
    .tolist()
)

# Amino acid residue IDs, used to recognise peptide ligands
# https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/primary-sequences-and-the-pdb-format
standard_aa = [
    'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU', 'MET',
    'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR', 'PYL', 'SEC',
]
modified_aa = [
    'ACE', 'ALY', 'AME', 'BWB', 'CAF', 'CAS', 'CME', 'COM', 'CSD', 'CSO', 'CSS',
    'CSX', 'CXM', 'CY0', 'CYO', 'KCX', 'LGY', 'MHO', 'MK8', 'MLY', 'MSE', 'NEP',
    'NMM', 'OCS', 'OCY', 'PHD', 'PTR', 'SCS', 'SEP', 'T8L', 'TPO', 'UNK',
]

# Nucleotide ligands are deliberately not filtered out: BindingMOAD includes them
# and their identifiers are hard to tell apart from other ligands.


In [3]:
############################# From DiffSBDD repo #########################################
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import QED
from rdkit import RDLogger 
import random

# Disable RDKit logging for the 'rdApp.*' log spec (presumably so that SMILES
# parsing messages do not flood the cell output — confirm if logs are needed).
RDLogger.DisableLog('rdApp.*')

def read_label_file(csv_path):
    """
    Parse BindingMOAD's hierarchical label file into a nested dictionary.

    Every line is split on ',' and classified by its first non-empty column:
    column 0 opens a new protein class, column 2 opens a new protein inside
    the current class, and column 3 adds a ligand to the current protein.

    Args:
        csv_path: path to 'every.csv'
    Returns:
        Nested dictionary {class: {PDB ID: [ligand, ...]}} where each ligand
        is a list [ligand name, validity, SMILES string] taken from columns
        3, 4 and 9 of the line.
    """
    labels = {}

    with open(csv_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.split(',')

            if len(fields[0]) > 0:
                # Line starts a new protein class
                active_class = fields[0]
                labels[active_class] = {}
            elif len(fields[2]) > 0:
                # Line starts a new protein within the active class
                active_protein = fields[2]
                labels[active_class][active_protein] = []
            elif len(fields[3]) > 0:
                # Line describes a small molecule bound to the active protein
                ligand_record = [fields[3], fields[4], fields[9]]
                labels[active_class][active_protein].append(ligand_record)

    return labels

def compute_druglikeness(ligand_dict):
    """
    Computes RDKit's QED value and adds it to the dictionary

    Each ligand entry is a list whose third element (index 2) is a SMILES
    string. When the SMILES is non-empty and RDKit can parse it, the QED
    value is appended to that list in place. Entries with an empty or
    unparsable SMILES are left unchanged, so they keep their original length.

    Args:
        ligand_dict: nested ligand dictionary {class: {PDB ID: [ligand, ...]}}
    Returns:
        the same ligand dictionary (mutated in place) with additional QED values
    """
    print("Computing QED values...")
    for proteins in ligand_dict.values():
        for ligands in proteins.values():
            for ligand in ligands:
                smiles = ligand[2]
                # Skip empty SMILES before touching RDKit: there is nothing to parse.
                if smiles == '':
                    continue
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # RDKit could not parse the SMILES; leave the entry without a QED value.
                    continue
                ligand.append(QED.qed(mol))
    return ligand_dict


def filter_and_flatten(ligand_dict, qed_thresh, max_occurences, seed,
                       monosaccharide_ids=None, amino_acid_ids=None):
    """
    Flatten the nested ligand dictionary and keep only valid, drug-like ligands.

    A ligand passes when its validity field equals 'valid', a QED value is
    present (the entry has more than three elements) and that QED value is
    strictly greater than qed_thresh. Examples are shuffled with the given
    seed so that, for ligand names occurring more than max_occurences times,
    a random subset is kept.

    Ligand names made of several space-separated residue IDs are flagged as
    oligosaccharide and/or peptide when any residue is in the respective ID
    set. Single-residue names are never flagged.

    Args:
        ligand_dict: nested dictionary {class: {PDB ID: [ligand, ...]}} where
            each ligand is [name, validity, SMILES, QED (optional)] and name
            has the form 'residues:chain:residue number'
        qed_thresh: exclusive lower bound on the QED value
        max_occurences: maximum number of kept examples per ligand name
        seed: seed passed to random.seed before shuffling
        monosaccharide_ids: optional iterable of monosaccharide residue IDs;
            defaults to the notebook-level list_of_monosaccharides
        amino_acid_ids: optional iterable of amino acid residue IDs;
            defaults to the notebook-level standard_aa plus modified_aa
    Returns:
        Tuple of
            filtered_examples: list of (class, PDB ID, ligand) tuples
            filtered_ligand_dict: {class: [PDB ID, ...]} for every ligand that
                passed the validity/QED filter (regardless of max_occurences)
            oligosaccharide_ligands: {PDB ID: [ligand, ...]}, one entry per
                flagged ligand
            peptide_or_aa_ligands: {PDB ID: [ligand, ...]}, one entry per
                flagged ligand
            filtered_examples_without_oligo_and_peptide: the subset of
                filtered_examples that was flagged as neither
    Raises:
        ValueError: if a ligand name does not split into exactly three
            ':'-separated parts
    """
    # Fall back to the notebook-level ID lists when none are passed explicitly.
    if monosaccharide_ids is None:
        monosaccharide_ids = list_of_monosaccharides
    if amino_acid_ids is None:
        amino_acid_ids = set(standard_aa) | set(modified_aa)
    # Sets give constant-time membership tests inside the loop below.
    monosaccharide_ids = set(monosaccharide_ids)
    amino_acid_ids = set(amino_acid_ids)

    filtered_examples = []
    filtered_examples_without_oligo_and_peptide = []
    all_examples = [(c, p, m) for c in ligand_dict for p in ligand_dict[c]
                    for m in ligand_dict[c][p]]

    # shuffle to select random examples of ligands that occur more than
    # max_occurences times
    random.seed(seed)
    random.shuffle(all_examples)

    oligosaccharide_ligands = {}
    peptide_or_aa_ligands = {}

    ligand_name_counter = defaultdict(int)
    filtered_ligand_dict = {}
    print("Filtering examples...")
    for c, p, m in all_examples:
        # The three-way unpack also rejects names that are not 'name:chain:residue'.
        ligand_name, _ligand_chain, _ligand_resi = m[0].split(':')

        # QED only includes ligands that seem drug-like
        if not (m[1] == 'valid' and len(m) > 3 and m[3] > qed_thresh):
            continue

        # Check if oligosaccharide (monosaccharides can be considered) or peptide.
        # Only multi-residue names are inspected.
        residues = ligand_name.split(' ')
        oligosaccharide = False
        peptide = False
        if len(residues) > 1:
            oligosaccharide = any(r in monosaccharide_ids for r in residues)
            peptide = any(r in amino_acid_ids for r in residues)

        # Record each flagged ligand exactly once per category, however many
        # of its residues matched.
        if oligosaccharide:
            oligosaccharide_ligands.setdefault(p, []).append(m)
        if peptide:
            peptide_or_aa_ligands.setdefault(p, []).append(m)

        pdbs_in_class = filtered_ligand_dict.setdefault(c, [])
        if p not in pdbs_in_class:
            pdbs_in_class.append(p)

        if ligand_name_counter[ligand_name] < max_occurences:
            filtered_examples.append((c, p, m))
            ligand_name_counter[ligand_name] += 1

            if not oligosaccharide and not peptide:
                filtered_examples_without_oligo_and_peptide.append((c, p, m))

    return filtered_examples, filtered_ligand_dict, oligosaccharide_ligands, peptide_or_aa_ligands, filtered_examples_without_oligo_and_peptide

In [4]:
# Build the ligand dictionary and apply the same filtering as DiffSBDD.
# max_occurences is set very high so that every ligand is returned.

ligand_dict = compute_druglikeness(read_label_file("BindingMOAD/every.csv"))
(
    filtered_examples,
    filtered_ligand_dict,
    oligosaccharide_ligands,
    peptide_or_aa_ligands,
    filtered_examples_without_oligo_and_peptide,
) = filter_and_flatten(ligand_dict, qed_thresh=0.3, max_occurences=5000000, seed=42)


Computing QED values...
Filtering examples...


In [7]:
# Map PDBs to Uniprot IDs
# From https://www.ebi.ac.uk/pdbe/docs/sifts/quick.html
with gzip.open("pdb_chain_uniprot.tsv.gz") as uniprot_pdb_mapping:
    # skiprows=1 drops the first line of the file, so the second line is used as the header.
    df = pd.read_csv(uniprot_pdb_mapping, sep="\t", skiprows=1)
    # Upper-case the PDB IDs so they can be matched against the example PDB IDs below
    # (assumes those are upper-case — TODO confirm).
    df["PDB"] = df["PDB"].str.upper()

filtered_PDBs_without_oligo_peptide_ligs = [i[1] for i in filtered_examples_without_oligo_and_peptide]

# Unique (PDB, chain, UniProt accession) rows for the PDB entries that survived filtering.
filtered_PDB_Uniprot_mapping_no_oligo_peptide_ligs = df[df["PDB"].isin(filtered_PDBs_without_oligo_peptide_ligs)][["PDB", "CHAIN", "SP_PRIMARY"]].drop_duplicates()

# Index the accessions by (PDB, chain) once, so each ligand below is a dictionary
# lookup instead of a scan over the whole mapping table. Row order is preserved,
# so the joined accession string is the same as filtering the frame directly.
uniprot_ids_by_pdb_chain = {}
for pdb_id, chain_id, accession in filtered_PDB_Uniprot_mapping_no_oligo_peptide_ligs.itertuples(index=False, name=None):
    uniprot_ids_by_pdb_chain.setdefault((pdb_id, chain_id), []).append(accession)

df_final_set_data = []

for (c,p,m) in filtered_examples_without_oligo_and_peptide:
    # m[0] has the form 'ligand ID:chain:third field'; the third field is stored
    # in the "Ligand chain ID" column.
    ligand_id, ligand_chain, ligand_chain_id = m[0].split(":")
    # A chain without any mapping yields an empty string.
    uniprot_id = ','.join(uniprot_ids_by_pdb_chain.get((p, ligand_chain), []))
    df_final_set_data.append([c, p, uniprot_id, ligand_id, ligand_chain, ligand_chain_id, m[2], m[3]])

df_final_set = pd.DataFrame(df_final_set_data, columns=["Class", "PDB ID", "Uniprot ID", "Ligand ID", "Ligand chain", "Ligand chain ID", "SMILES", "QED"])

Unnamed: 0,Class,PDB ID,Uniprot ID,Ligand ID,Ligand chain,Ligand chain ID,SMILES,QED
0,4.2.1.81,2DW6,Q89FH0,TAR,C,1003,[C@H]([C@@H](C(=O)O)O)(C(=O)O)O,0.365229
1,2.7.11.1,3UOD,O14965,0C3,A,1,c1ccc(c(c1)C(F)(F)F)Nc2ccnc(n2)Nc3ccc(cc3)C(=O)O,0.601064
2,3.6.4.10,5J64,P07900,6G7,A,301,c1ccc(c(c1)N2C(=NNC2=O)c3ccc(cc3O)O)F,0.669688
3,3.6.1.-,4CD3,O35795,8E9,A,1463,Cc1cccc(c1)Nc2cc(c(c3c2C(=O)c4ccccc4C3=O)N)S(=...,0.350402
4,3.4.11.1,3KR4,Q8IL11,BES,K,1003,CC(C)C[C@@H](C(=O)O)NC(=O)[C@H]([C@@H](Cc1cccc...,0.559494
...,...,...,...,...,...,...,...,...
50735,12.-.-.-,5R0E,P32357,SY4,B,401,CCCS(=O)(=O)Nc1cc(c(cc1F)F)N,0.798740
50736,6.1.1.26,2ZIN,Q8PWY1,LBY,A,601,CC(C)(C)OC(=O)NCCCC[C@@H](C(=O)O)N,0.608878
50737,2.7.-.-,5BVC,B3VI55,4PW,A,502,C1[C@@H]2[C@H]([C@@H]([C@H]([C@H](O1)O2)O)O)O,0.377329
50738,2.1.1.43,5VAH,B9RU15,SAH,A,401,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,0.352410


In [11]:
# PDB entries held out as blind sets; they are removed from the retraining data below.
blind_set_one_class_pdbs = [
    "6CT5", "1W6K", "1W6J", "5CQD", "5CQK", "5CQH", "5CQI", "3RDE", "3HI7", "3HIG",
    "3HII", "6RYO", "6RYP", "4MK0", "4L9I", "4Q6R", "1PL6", "1PL7", "1PL8", "6R7D",
    "1ITU", "1ITQ", "2V77", "2C92", "2C94", "2C97", "2C9B", "2C9D", "5JWA", "5JWB",
    "5JWC", "6Z80", "6Z85", "6Z86", "6Z87", "6Z88", "6Z89", "7ACC", "7AL9", "7ALA",
    "7ALB", "7ALC", "5YJW", "5YJX", "5YJY", "6XI9", "6XIG", "1KQB", "1KQC", "1KQD",
    "4KX7", "4KX8", "4KX9", "4KXA", "4KXB", "4KXC", "4KXD", "4YT3", "5IKI",
]

blind_set_dockstring_and_other_pdbs = [
    "4L7S", "1FBZ", "4AF3", "5WO4", "7Q7I", "7Q7K", "7Q7L", "7Q7W", "7Q6H", "3LXN",
    "4BQS",
]

# Keep only the rows whose PDB ID is in neither blind set.
df_final_set_no_blind_pdbs = df_final_set.loc[
    ~df_final_set["PDB ID"].isin(
        list(set(blind_set_one_class_pdbs).union(blind_set_dockstring_and_other_pdbs))
    )
]
df_final_set_no_blind_pdbs

Unnamed: 0,Class,PDB ID,Uniprot ID,Ligand ID,Ligand chain,Ligand chain ID,SMILES,QED
0,4.2.1.81,2DW6,Q89FH0,TAR,C,1003,[C@H]([C@@H](C(=O)O)O)(C(=O)O)O,0.365229
1,2.7.11.1,3UOD,O14965,0C3,A,1,c1ccc(c(c1)C(F)(F)F)Nc2ccnc(n2)Nc3ccc(cc3)C(=O)O,0.601064
2,3.6.4.10,5J64,P07900,6G7,A,301,c1ccc(c(c1)N2C(=NNC2=O)c3ccc(cc3O)O)F,0.669688
3,3.6.1.-,4CD3,O35795,8E9,A,1463,Cc1cccc(c1)Nc2cc(c(c3c2C(=O)c4ccccc4C3=O)N)S(=...,0.350402
4,3.4.11.1,3KR4,Q8IL11,BES,K,1003,CC(C)C[C@@H](C(=O)O)NC(=O)[C@H]([C@@H](Cc1cccc...,0.559494
...,...,...,...,...,...,...,...,...
50735,12.-.-.-,5R0E,P32357,SY4,B,401,CCCS(=O)(=O)Nc1cc(c(cc1F)F)N,0.798740
50736,6.1.1.26,2ZIN,Q8PWY1,LBY,A,601,CC(C)(C)OC(=O)NCCCC[C@@H](C(=O)O)N,0.608878
50737,2.7.-.-,5BVC,B3VI55,4PW,A,502,C1[C@@H]2[C@H]([C@@H]([C@H]([C@H](O1)O2)O)O)O,0.377329
50738,2.1.1.43,5VAH,B9RU15,SAH,A,401,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,0.352410


In [14]:
# Save the full filtered set (blind-set PDBs included), without the index column.
df_final_set.to_csv(
    "Benchmarking_Tasks/Task1/BindingMOAD_filtered_full_set.csv",
    index=False,
)


In [13]:
# Save the retraining set (blind-set PDBs removed), without the index column.
df_final_set_no_blind_pdbs.to_csv(
    "Benchmarking_Tasks/Task1/BindingMOAD_filtered_set_for_retraining.csv",
    index=False,
)
