In [None]:
# Get all proteins with same sequence and create another dataset with the labels of these sequences combined
from collections import defaultdict
from os import listdir, makedirs

import numpy as np

from constants import TRAIN_FOLDER, TEST_FOLDER, VAL_FOLDER

# Folder order fixes the meaning of `unique`: index 0 = train, 1 = test, 2 = val.
folders = [TRAIN_FOLDER, TEST_FOLDER, VAL_FOLDER]
# sequence -> {"metadata": {key: [one value per merged file]}, "protein": protein dict}
dic = {}
# One set of distinct sequences per folder, in the same order as `folders`.
unique = [set(), set(), set()]

for split_idx, folder in enumerate(folders):
    # Sorted so that the record kept as the "first" one for a sequence (and hence
    # the first pdb_id in its metadata lists) is reproducible across runs.
    for fname in sorted(listdir(folder)):
        # NOTE(review): `folder + fname` assumes the folder constants end with a
        # path separator -- confirm against constants.py.
        # SECURITY: allow_pickle=True unpickles arbitrary objects; only load
        # files produced by our own preprocessing.
        # The `with` block closes the underlying .npz archive; without it every
        # file handle stays open until garbage collection.
        with np.load(folder + fname, allow_pickle=True) as data:
            protein = data["protein"].item()
            metadata = data["metadata"].item()

        seq = protein["sequence"]
        unique[split_idx].add(seq)

        entry = dic.get(seq)
        if entry is None:
            # First occurrence: keep this protein record and start a one-element
            # list for every metadata field.
            dic[seq] = {
                "metadata": {key: [value] for key, value in metadata.items()},
                "protein": protein,
            }
        else:
            # Repeated sequence: extend the metadata lists and accumulate the
            # labels element-wise into the stored protein record.
            for key, value in metadata.items():
                entry["metadata"][key].append(value)
            entry["protein"]["labels"] += protein["labels"]

In [None]:
# The labels of all files sharing a sequence were summed when `dic` was built, so
# (assuming each file's labels are 0/1 indicators -- TODO confirm) every position
# now holds the number of structures in which that residue was a binding residue.
# A residue is kept as a binding site only if it occurs in more than half of the
# merged structures; with a single structure this reduces to `labels > 0`.
# NOTE: this overwrites the summed labels, so run it exactly once after the merge
# cell -- re-running it would threshold the already binarised labels again.
for entry in dic.values():
    half = len(entry["metadata"]["pdb_id"]) // 2
    entry["protein"]["labels"] = (entry["protein"]["labels"] > half).astype(float)

In [None]:
# Some statistics on the common sequences in test, train and val data.
# `unique` is filled by the merge cell in folder order: 0 = train, 1 = test, 2 = val.
print(len(unique[0]))  # distinct train sequences
print(len(unique[1]))  # distinct test sequences
print(len(unique[2]))  # distinct val sequences
print(len(unique[1] - unique[0]))  # test sequences that never occur in train
print(len(unique[2] - unique[0]))  # val sequences that never occur in train

In [None]:
# Sanity check: print the first entry that was merged from more than one file
for merged in dic.values():
    n_files = len(merged["metadata"]["pdb_id"])
    if n_files <= 1:
        # Built from a single file, nothing merged to inspect.
        continue
    print(merged)
    break

In [None]:
# Save the data: one .npz per unique sequence, named
# "<first merged pdb_id>-<number of merged files>.npz".
UNIQUE_OUT_DIR = "./data/PDBbind/preprocessed/unique2/"
# np.savez does not create missing directories, so make sure the target exists.
makedirs(UNIQUE_OUT_DIR, exist_ok=True)

for entry in dic.values():
    pdb_ids = entry["metadata"]["pdb_id"]
    out_name = pdb_ids[0] + "-" + str(len(pdb_ids))
    # Same "metadata"/"protein" layout that the merge cell reads back with
    # allow_pickle=True and .item().
    np.savez(
        UNIQUE_OUT_DIR + out_name + ".npz",
        metadata=entry["metadata"],
        protein=entry["protein"],
    )

In [2]:
# Check whether a pdb has 2 or more chains surrounding a binding site
import numpy as np
from os import listdir, path
from Bio.PDB import PDBParser, PPBuilder
from rdkit import Chem
from collections import defaultdict

parser = PDBParser(QUIET=True)
ppb = PPBuilder()
raw_dir = "./data/scPDB/raw/"
# A residue counts as part of the binding site when any of its non-hydrogen atoms
# lies within this distance of a non-hydrogen ligand atom (same units as the
# structure coordinates -- presumably Angstrom).
CONTACT_CUTOFF = 4.5

# The context manager guarantees the report is flushed and closed, even if a
# structure fails to parse part-way through the loop.
with open("./multiple", "w") as write_fil:
    for fil in sorted(listdir(raw_dir)):
        print(fil[:4])  # progress: first four characters of the entry name
        pdb_prefix = path.join(raw_dir, fil)
        protein_structure = parser.get_structure(
            fil, path.join(pdb_prefix, "downloaded.pdb")
        )
        # Flatten every polypeptide returned by the builder into one residue list.
        residues = [
            res
            for peptide in ppb.build_peptides(protein_structure, aa_only=False)
            for res in peptide
        ]

        # Only the first molecule stored in the SDF file is used.
        ligand = Chem.SDMolSupplier(
            path.join(pdb_prefix, "ligand.sdf"), sanitize=False
        )[0]
        ligand_coords = ligand.GetConformer().GetPositions()
        ligand_atom_types = np.array(
            [atom.GetSymbol() for atom in ligand.GetAtoms()], dtype=str
        )
        # Drop ligand hydrogens once, instead of re-checking them per atom pair.
        heavy_ligand_coords = ligand_coords[ligand_atom_types != "H"]

        # Chain id -> True for every chain that contributes a binding-site residue.
        # Kept under its own name so the sequence dictionary `dic` built earlier
        # in the notebook is not overwritten by this cell.
        chain_ids = {}
        for residue in residues:
            for atom in residue.get_atoms():
                # Skip protein atoms whose full name has "H" as second character.
                if atom.get_fullname()[1] == "H":
                    continue
                dists = np.linalg.norm(
                    heavy_ligand_coords - atom.get_coord(), axis=1
                )
                if (dists <= CONTACT_CUTOFF).any():
                    chain_ids[residue.get_parent().get_id()] = True
                    break  # one contact is enough for this residue

        if len(chain_ids) > 1:
            # 2 chains cover 1 ligand
            write_fil.write(fil + "\n")
            write_fil.write(str(chain_ids.keys()) + "\n")

10mh
11bg
12gs
  % residue.get_resname()
13gs
17gs
19gs
1a26
1a27
1a29
  % residue.get_resname()
1a2b
1a2n
1a42
1a4i
1a4l
1a4r
1a4w
  % residue.get_resname()
1a4z
1a50
1a59
1a5b
1a5s
1a5u
