In [1]:
from Bio.PDB import *
from rdkit import Chem

# import nglview as nv
import numpy as np

# Locations of the PDBbind v2018 refined set and of its plain-text index files.
REFINED_FOLDER = "./data/PDBbind/pdbbind_v2018_refined/refined-set/"
INDEX_FOLDER = "./data/PDBbind/PDBbind_2018_plain_text_index/index/"
parser = PDBParser()
ppb = PPBuilder()
# Entry to inspect (alternative entry: "3aqt").
pdb_id = "1ezq"

# Every file of an entry is read from <REFINED_FOLDER><pdb_id>/<pdb_id>_<kind>.
entry_prefix = REFINED_FOLDER + pdb_id + "/" + pdb_id

# Protein structure
structure = parser.get_structure(pdb_id, entry_prefix + "_protein.pdb")

# Ligand structure (read with RDKit sanitization switched off)
suppl = Chem.SDMolSupplier(entry_prefix + "_ligand.sdf", sanitize=False)
assert len(suppl) == 1

# Fetch the molecule from the supplier once and reuse it. The supplier yields
# None for a record it cannot read, so fail here with a clear message instead
# of with an AttributeError further down.
ligand = suppl[0]
assert ligand is not None, "RDKit could not read the ligand of " + pdb_id
assert ligand.GetNumConformers() == 1

ligand_coords = ligand.GetConformer().GetPositions()
ligand_num_atoms = ligand.GetNumAtoms()
assert ligand_num_atoms == len(ligand_coords)
# Element symbol of every ligand atom, index-aligned with ligand_coords.
ligand_atom_types = np.array([atom.GetSymbol() for atom in ligand.GetAtoms()])

ModuleNotFoundError: No module named 'Bio'

In [0]:
# Check the distance cut-off for a protein ligand interaction:
# a residue is labelled 1 when any of its heavy atoms lies closer than
# CONTACT_CUTOFF to any ligand heavy atom, and 0 otherwise.
CONTACT_CUTOFF = 4.5  # distance threshold, in the units of the coordinates

residues = [residue for residue in structure.get_residues() if is_aa(residue)]

labels = np.zeros(len(residues))

# Hydrogens are ignored on both sides; filter the ligand once, outside the loops.
ligand_heavy_coords = ligand_coords[ligand_atom_types != "H"]

# Without any ligand heavy atom there is nothing to be in contact with.
if len(ligand_heavy_coords) > 0:
    for ind, residue in enumerate(residues):
        for atom in residue.get_atoms():
            # Use the element symbol: testing the second character of the atom
            # name misses hydrogens whose name starts in the first column
            # (e.g. "HD11"), which would then be treated as heavy atoms.
            if atom.element == "H":
                continue
            # Distances from this protein atom to every ligand heavy atom.
            distances = np.linalg.norm(
                ligand_heavy_coords - atom.get_coord(), axis=1
            )
            if distances.min() < CONTACT_CUTOFF:
                labels[ind] = 1
                # One contact is enough; move on to the next residue.
                break

# Manually check in VMD whether these amino acids are the ones close to the ligand
print((np.where(labels == 1)))

In [0]:
# Check the number of unique proteins in our dataset

# Distinct values of the fourth whitespace-separated column of the index
# (the column this notebook treats as the UniProt identifier).
uniprot_ids = set()
with open(INDEX_FOLDER + "INDEX_refined_name.2018") as f:
    for line in f:
        fields = line.split()
        # Skip blank lines (they would raise IndexError below) and "#" header lines.
        if not fields or line[0] == "#":
            continue
        uniprot_ids.add(fields[3])

print(len(uniprot_ids))

In [1]:
# Get all proteins with same sequence and create another dataset with the labels of these sequences combined
import numpy as np
from os import listdir
from constants import TRAIN_FOLDER, TEST_FOLDER, VAL_FOLDER
from collections import defaultdict

folders = [TRAIN_FOLDER, TEST_FOLDER, VAL_FOLDER]
# sequence -> {"metadata": {field: [one value per merged file]},
#              "protein": protein record of the first file, with labels summed}
dic = {}
# Sequences seen in each folder, in the same order as `folders`.
unique = [set(), set(), set()]

for i, folder in enumerate(folders):
    # listdir() returns names in arbitrary order; sort them so the merged
    # metadata lists (and which file counts as the first one) are reproducible.
    for file in sorted(listdir(folder)):
        # NOTE: allow_pickle=True unpickles objects stored in the file, which can
        # execute arbitrary code; only load files from a trusted source.
        # The context manager closes the .npz archive once both records are read.
        with np.load(folder + file, allow_pickle=True) as data:
            protein = data["protein"].item()
            metadata = data["metadata"].item()
        seq = protein["sequence"]
        unique[i].add(seq)
        if seq in dic:
            merged = dic[seq]
            for key in metadata:
                merged["metadata"][key].append(metadata[key])
            # Accumulate the per-residue labels of every file sharing this sequence.
            merged["protein"]["labels"] += protein["labels"]
        else:
            dic[seq] = {
                "metadata": {key: [metadata[key]] for key in metadata},
                "protein": protein,
            }

In [2]:
# The labels of all files sharing a sequence were added together when `dic` was
# built. Binarise the sums: a residue ends up labelled 1.0 if its summed label is
# positive (i.e. it was labelled in at least one file, assuming 0/1 labels).
# NOTE(review): this comment used to describe a majority vote ("more than half
# the ligands"), but the threshold applied is > 0 -- confirm which rule is
# intended before changing it.
for entry in dic.values():
    entry["protein"]["labels"] = (entry["protein"]["labels"] > 0).astype(float)

In [59]:
# Some statistics on the common sequences in test train and val data
# `unique` holds one set of sequences per folder, in the order train, test, val.
# (This cell used to read an undefined name `s`, which fails on a fresh kernel.)
train_seqs, test_seqs, val_seqs = unique
print(len(train_seqs))
print(len(test_seqs))
print(len(val_seqs))
# Sequences that occur in test / val but not in train.
print(len(test_seqs - train_seqs))
print(len(val_seqs - train_seqs))

2392
383
381
237
232


In [56]:
# Check whether the dictionary made is correct or not:
# print the first entry that merges more than one PDB id (nothing if there is none).
first_merged = next(
    (entry for entry in dic.values() if len(entry["metadata"]["pdb_id"]) > 1),
    None,
)
if first_merged is not None:
    print(first_merged)

{'metadata': {'pdb_id': ['2j94', '2uwl', '1nfu', '2vh0'], 'resolution': [2.1, 1.9, 2.05, 1.7], 'release_year': [2007, 2007, 2003, 2008], 'neg_log_k': [6.27, 8.4, 7.74, 8.51], 'k': [5.34e-07, 4e-09, 1.8000000000000002e-08, 3.1000000000000005e-09], 'ligand_name': ['G15', '895', 'RRP', 'GSI']}, 'protein': {'sequence': 'IVGGQECKDGECPWQALLINEENEGFCGGTILSEFYILTAAHCLYQAKRFKVRVGDRNTEQEEGGEAVHEVEVVIKHNRFTKETYDFDIAVLRLKTPITFRMNVAPACLPERDWAESTLMTQKTGIVSGFGRTHEKGRQSTRLKMLEVPYVDRNSCKLSSSFIITQNMFCAGYDTKQEDACQGDSGGPHVTRFKDTYFVTGIVSWGEGCARKGKYGIYTKVTAFLKWIDRSMKTRKLCSLDNGDCDQFCHEEQNSVVCSCARGYTLADNGKACIPTGPYPCGKQTL', 'length': 286, 'labels': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1

In [4]:
# Save the data
# One .npz file per unique sequence, named "<first pdb id>-<number of merged ids>".
for merged in dic.values():
    merged_ids = merged["metadata"]["pdb_id"]
    file_stem = merged_ids[0] + "-" + str(len(merged_ids))
    np.savez(
        "./data/PDBbind/preprocessed/unique2/" + file_stem + ".npz",
        metadata=merged["metadata"],
        protein=merged["protein"],
    )

In [5]:
# Look up the RCSB SEQRES sequence of every entry in the PDBbind refined set
# and compare it with the sequence derived from the PDBbind protein structure
from collections import defaultdict
from os import path
import numpy as np
from Bio.PDB import PDBParser, PPBuilder
from rdkit import Chem

# Root folder that all data paths below are resolved against.
PROJECT_FOLDER = "./"
# Biopython helpers: `parser` reads PDB files, `ppb` builds polypeptides from a structure.
parser = PDBParser()
ppb = PPBuilder()
# Sequence file read by get_sequences_from_rcsb().
RCSB_SEQUENCES = path.join(PROJECT_FOLDER, "data/pdb_seqres.txt")
# PDBbind folders: the refined-set structures and the plain-text index files.
data_dir = path.join(PROJECT_FOLDER, "data/PDBbind")
refined_dir = path.join(data_dir, "pdbbind_v2018_refined/refined-set")
index_dir = path.join(data_dir, "PDBbind_2018_plain_text_index/index")
# Index file read by initialize_dataset_from_index_file().
index_file = path.join(index_dir, "INDEX_refined_data.2018")

def initialize_dataset_from_index_file():
    """Read the index file into a list of whitespace-split records.

    Reads the module-level ``index_file``. Lines whose first non-blank
    character is ``#`` are treated as comments and skipped, as are blank
    lines (which would otherwise yield empty records that break callers
    indexing ``record[0]``).

    Returns:
        list[list[str]]: one list of fields per data line, in file order.
    """
    dataset = []
    with open(index_file) as f:
        for line in f:
            fields = line.split()
            if not fields or fields[0].startswith("#"):
                continue
            dataset.append(fields)
    return dataset

def get_sequences_from_rcsb(dataset):
    """Collect the sequence of every dataset entry from ``RCSB_SEQUENCES``.

    The file is read as alternating lines: a header line whose characters
    1-4 are the PDB id, followed by one sequence line. All sequence lines of
    consecutive headers with the same id (one per chain) are concatenated.

    The file is scanned once, front to back, while the dataset is visited in
    sorted order, so ids are expected to appear in the file in that same
    order. An id that is not found is printed; because the scan runs to the
    end of the file while looking for it, ids sorted after it are reported
    as missing too.

    Args:
        dataset: iterable of records whose first element is the PDB id.

    Returns:
        defaultdict(str): PDB id -> concatenated sequence of its chains.
        Ids that were not found are absent (and read back as ``""``).
    """
    sequences = defaultdict(str)
    with open(RCSB_SEQUENCES) as file:
        header = file.readline()
        for data in sorted(dataset):
            target = data[0]
            # A repeated id was already collected; the file has moved past it.
            if target in sequences:
                continue
            # Skip records of other ids. `header` is "" at end of file, which
            # stops the scan instead of looping forever on a missing id.
            while header and header[1:5] != target:
                file.readline()  # sequence line of the skipped record
                header = file.readline()
            found = False
            # Each id can have multiple chains
            while header and header[1:5] == target:
                found = True
                sequences[target] += file.readline().strip()
                header = file.readline()
            if not found:
                print(target)
    print(len(sequences))
    return sequences

def get_sequence_from_structure(protein_structure):
    """Return the sequence of a structure as one string.

    Builds the polypeptides of ``protein_structure`` with the module-level
    ``ppb`` builder (``aa_only=False``) and concatenates their sequences in
    the order the builder returns them.
    """
    peptides = ppb.build_peptides(protein_structure, aa_only=False)
    return "".join(str(peptide.get_sequence()) for peptide in peptides)


In [10]:
dataset = initialize_dataset_from_index_file()
sequences = get_sequences_from_rcsb(dataset)

# Count the entries whose structure-derived sequence differs from the RCSB one.
cnt = 0
for entry in dataset:
    entry_id = entry[0]
    protein_file = path.join(refined_dir, entry_id, entry_id) + "_protein.pdb"
    parsed = parser.get_structure(entry_id, protein_file)
    structure_sequence = get_sequence_from_structure(parsed)
    if structure_sequence != sequences[entry_id]:
        cnt += 1
        # print(sequences[entry_id], structure_sequence)
print(cnt)

  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
  "amino acid" % residue.get_resname())
