In [1]:
from Bio.PDB import *
from rdkit import Chem

# import nglview as nv
import numpy as np

REFINED_FOLDER = "./data/PDBbind/pdbbind_v2018_refined/refined-set/"
INDEX_FOLDER = "./data/PDBbind/PDBbind_2018_plain_text_index/index/"
parser = PDBParser()
ppb = PPBuilder()
# pdb_id = "3aqt"
pdb_id = "1ezq"

# Both per-complex files live at <REFINED_FOLDER>/<pdb_id>/<pdb_id>_<kind>
complex_prefix = REFINED_FOLDER + pdb_id + "/" + pdb_id

# Protein structure
structure = parser.get_structure(pdb_id, complex_prefix + "_protein.pdb")

# Ligand structure
suppl = Chem.SDMolSupplier(complex_prefix + "_ligand.sdf", sanitize=False)
assert len(suppl) == 1
# Index the supplier exactly once and keep the molecule alive in `ligand`:
# repeated `suppl[0]` lookups hand back a fresh object each time, and RDKit
# yields None for a record it cannot parse.
ligand = suppl[0]
assert ligand is not None, "could not parse ligand SDF for " + pdb_id
assert ligand.GetNumConformers() == 1

# One coordinate row and one element symbol per ligand atom
ligand_coords = ligand.GetConformer().GetPositions()
ligand_num_atoms = ligand.GetNumAtoms()
assert ligand_num_atoms == len(ligand_coords)
ligand_atom_types = np.array([atom.GetSymbol() for atom in ligand.GetAtoms()])



FileNotFoundError: [Errno 2] No such file or directory: './data/PDBbind/pdbbind_v2018_refined/refined-set/1ezq/1ezq_protein.pdb'

In [None]:
# Check the distance cut-off for a protein ligand interaction
residues = list(filter(is_aa, structure.get_residues()))


def _touches_ligand(residue):
    """Return True when a non-hydrogen atom of `residue` lies closer than 4.5
    (in coordinate units) to any non-hydrogen ligand atom."""
    for atom in residue.get_atoms():
        # Second character of the padded atom name is used as the hydrogen test
        if atom.get_fullname()[1] == 'H':
            continue
        if any(
            ligand_atom_types[i] != 'H'
            and np.linalg.norm(atom.get_coord() - ligand_coords[i]) < 4.5
            for i in range(ligand_num_atoms)
        ):
            # print(residue.get_resname(), residue.get_segid())
            return True
    return False


# labels[k] is 1 when residues[k] is in contact with the ligand, else 0
labels = np.zeros(len(residues))
for ind, residue in enumerate(residues):
    if _touches_ligand(residue):
        labels[ind] = 1

# Manually check in VMD whether these amino acids are the ones close to the ligand
print((np.where(labels == 1)))

In [None]:
# Check the number of unique proteins in our dataset

with open(INDEX_FOLDER + "INDEX_refined_name.2018") as index_handle:
    index_rows = index_handle.readlines()

# lengths=[]
# Dict keyed by the fourth whitespace-separated column of every non-comment
# row; duplicate keys collapse, so the dict size is the number of distinct values.
dic = {
    row.strip().split()[3]: 1
    for row in index_rows
    if row[0] != "#"
}

print(len(dic))

In [None]:
# Find all sequences in PDBbind refined dataset from the rcsb dataset
# And compare both
from collections import defaultdict
from os import path
import numpy as np
from Bio.PDB import PDBParser, PPBuilder
from rdkit import Chem

# Base directory that every data path below is joined onto.
PROJECT_FOLDER = "./"
# Shared Biopython parser / polypeptide builder used by the helpers in this cell.
parser = PDBParser()
ppb = PPBuilder()
# Sequence file read by get_sequences_from_rcsb.
RCSB_SEQUENCES = path.join(PROJECT_FOLDER, "data/pdb_seqres.txt")
# PDBbind layout: per-complex folders under refined_dir, index files under index_dir.
data_dir = path.join(PROJECT_FOLDER, "data/PDBbind")
refined_dir = path.join(data_dir, "pdbbind_v2018_refined/refined-set")
index_dir = path.join(data_dir, "PDBbind_2018_plain_text_index/index")
# Index file read by initialize_dataset_from_index_file.
index_file = path.join(index_dir, "INDEX_refined_data.2018")

def initialize_dataset_from_index_file(index_path=None):
    """Read an index file into a list of whitespace-split records.

    Args:
        index_path: Path of the index file to read. When omitted, the
            module-level ``index_file`` is used (the original behaviour).

    Returns:
        A list with one entry per data line; each entry is the list of
        whitespace-separated fields of that line. Lines whose first character
        is ``#`` and blank lines are skipped.
    """
    if index_path is None:
        index_path = index_file
    dataset = []
    with open(index_path) as f:
        for line in f:
            fields = line.split()
            # Blank lines would otherwise become empty records and break
            # callers that index record[0].
            if not fields or line[0] == "#":
                continue
            dataset.append(fields)
    return dataset

def get_sequences_from_rcsb(dataset, seqres_path=None):
    """Collect the concatenated chain sequences for every ID in ``dataset``.

    The sequence file is read as consecutive (header, sequence) line pairs;
    characters 1-4 of each header are taken as the entry ID. All chains that
    share an ID are concatenated in file order.

    Args:
        dataset: Iterable of records whose first field is the entry ID.
        seqres_path: Path of the sequence file. When omitted, the
            module-level ``RCSB_SEQUENCES`` is used (the original behaviour).

    Returns:
        A ``defaultdict(str)`` mapping each ID found in the file to its
        concatenated sequence. IDs absent from the file are printed and are
        not added to the mapping.
    """
    if seqres_path is None:
        seqres_path = RCSB_SEQUENCES
    wanted = {record[0] for record in dataset if record}
    sequences = defaultdict(str)
    # A single pass over the file terminates at EOF even when an ID is missing
    # or repeated in `dataset`, and does not rely on the file's sort order.
    with open(seqres_path) as file:
        for header in file:
            chain_sequence = next(file, "").strip()
            entry_id = header[1:5]
            # Each id can have multiple chains
            if entry_id in wanted:
                sequences[entry_id] += chain_sequence
    for missing_id in sorted(wanted.difference(sequences)):
        print(missing_id)
    print(len(sequences))
    return sequences

def get_sequence_from_structure(protein_structure):
    """Return the sequences of all peptides built from ``protein_structure``
    (via the shared ``ppb`` builder with ``aa_only=False``) joined into one string."""
    peptides = ppb.build_peptides(protein_structure, aa_only=False)
    return "".join(str(peptide.get_sequence()) for peptide in peptides)


In [None]:
dataset = initialize_dataset_from_index_file()
print(dataset[:10])
sequences = get_sequences_from_rcsb(dataset)

# cnt = number of entries whose structure-derived sequence differs from the
# sequence collected from the sequence file
cnt = 0
for record in dataset:
    pdb_id = record[0]
    protein_path = path.join(refined_dir, pdb_id, pdb_id + "_protein.pdb")
    extracted = get_sequence_from_structure(
        parser.get_structure(pdb_id, protein_path)
    )
    # print(sequences[pdb_id], extracted)
    cnt += sequences[pdb_id] != extracted
print(cnt)

In [None]:
# Testing preprocessing of sc-pdb using protein mol2
from os import path, listdir
from biopandas.mol2 import PandasMol2
from collections import defaultdict
from constants import THREE_TO_ONE
FOLDER = "./data/scPDB/raw"

def get_aa_location(res_name, res_id):
    """Decode a residue label into its one-letter code and an index offset.

    Args:
        res_name: Residue label whose first three characters are a key of
            ``THREE_TO_ONE`` and whose remainder parses as an integer
            (presumably mol2 ``subst_name`` values such as "ALA123" - confirm).
        res_id: Value convertible to ``int`` that is subtracted from the
            number embedded in ``res_name``.

    Returns:
        ``(aa, offset)`` where ``aa`` is ``THREE_TO_ONE[res_name[:3]]`` and
        ``offset`` is ``int(res_name[3:]) - int(res_id) + 1``. The function
        previously computed both values but discarded them.
    """
    aa = THREE_TO_ONE[res_name[:3]]
    offset = int(res_name[3:]) - int(res_id) + 1
    return aa, offset

for i, pdb_id in enumerate(sorted(listdir(FOLDER))):
    print(pdb_id)
    site_dir = path.join(FOLDER, pdb_id)
    pmol = PandasMol2().read_mol2(path.join(site_dir, "protein.mol2"))
    lmol = PandasMol2().read_mol2(path.join(site_dir, "ligand.mol2"))
    # Keep only rows whose atom_type is not 'H' on both sides
    ligand_coords = lmol.df.loc[lmol.df['atom_type'] != 'H', ['x', 'y', 'z']]
    protein_heavy = pmol.df[pmol.df['atom_type'] != 'H']
    binding_site = {}
    for atom_coord in ligand_coords.values:
        # Distance of each retained protein atom to this ligand atom
        pmol.df["distances"] = pmol.distance_df(protein_heavy, atom_coord)
        cutoff = pmol.df[pmol.df["distances"] <= 4.5]
        # Map column 7 of each row within the cutoff to column 6 of that row
        binding_site.update((row[7], row[6]) for row in cutoff.values)
        # print(cutoff['subst_name'])
    print(binding_site)
    # if i == 4:
    break

In [6]:
# Testing pre-processing of scPDB using downloaded proteins
from Bio.PDB import *
from rdkit import Chem
import os
from reindex_pdb import reindex_pdb

# import nglview as nv
import numpy as np

raw_dir = "./data/scPDB/raw"
parser = PDBParser()
pdb_id = "1fdi_2"
chain_id = "A"

pre = os.path.join(raw_dir, pdb_id)
dest = os.path.join(pre, "tmp.pdb")
# reindex_pdb returns the reindexed PDB as text; it is written to a temporary
# file so that PDBParser can read it back.
PDBtxt_reindex = reindex_pdb(
    os.path.join(pre, chain_id + ".fasta"),
    os.path.join(pre, "downloaded.pdb"),
    True,
)
with open(dest, "w") as fp:
    fp.write(PDBtxt_reindex)
# Protein structure
structure = parser.get_structure(pdb_id, dest)
for res in structure[0][chain_id]:
    # Named res_id rather than `id` so the builtin id() is not shadowed for
    # the rest of the kernel session.
    res_id = res.get_id()
    # Print only amino-acid residues whose first id field is blank
    if is_aa(res) and res_id[0] == ' ':
        print(res, end=' ')
# os.remove(dest)


ModuleNotFoundError: No module named 'NWalign'

In [None]:
# Checking statistics of the cross-validation splits
from os import path, listdir
from collections import defaultdict
FOLDER = "./data/scPDB/"

# One set of training IDs per fold file train_ids_fold0 .. train_ids_fold9
folds = []
for i in range(10):
    with open(path.join(FOLDER, "splits", "train_ids_fold" + str(i))) as f:
        folds.append({line.strip() for line in f})

# Named all_ids rather than `all` so the builtin all() is not shadowed.
all_ids = folds[0].union(folds[1])
print(len(folds[0]))
print(len(all_ids))

# Group the entries of raw/ by the first four characters of their name
available = defaultdict(set)
for file in listdir(path.join(FOLDER, "raw")):
    available[file[:4]].add(file)

print(len(available))


def _drop_listed_entries(list_file):
    """Remove every entry named in ``list_file`` from ``available``.

    A key whose set becomes empty is deleted so that ``len(available)`` only
    counts keys that still have entries. Raises ``KeyError`` if a listed entry
    is not present, as the original loops did.
    """
    with open(list_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key = line[:4]
            available[key].remove(line)
            # Truthiness test works for sets; comparing a set to [] never matches.
            if not available[key]:
                del available[key]


_drop_listed_entries(path.join(FOLDER, "splits", "scPDB_blacklist.txt"))
_drop_listed_entries(path.join(FOLDER, "splits", "scPDB_leakage.txt"))
print(len(available))

# Keep only the keys that appear in the collected fold IDs
for key in set(available.keys()) - all_ids:
    del available[key]

print(len(available))

# Total number of remaining entries across all keys
cnt = 0
for key, val in available.items():
    cnt += len(val)

print(cnt)

In [None]:
# Convert all mol2 files to pdb files in the scPDB raw data
from os import system, path, listdir
import subprocess

FOLDER = "./data/scPDB/raw"

for pdb_id in sorted(listdir(FOLDER)):
    # print(pdb_id)
    # Arguments are passed as a list (no shell), so directory names containing
    # spaces or shell metacharacters cannot alter the command.
    command = [
        "obabel",
        "-imol2",
        path.join(FOLDER, pdb_id, "protein.mol2"),
        "-opdb",
        "-O",
        path.join(FOLDER, pdb_id, "converted_protein.pdb"),
    ]
    try:
        err = subprocess.run(command, check=False).returncode
    except OSError:
        # obabel could not be launched at all; report it like any other failure
        err = -1
    # Print the IDs whose conversion did not exit cleanly
    if err != 0:
        print(pdb_id)

In [None]:
# Delete all converted_pdbs to save space
from os import remove, path, listdir

folder = "./data/scPDB/raw"

# Lazily build <folder>/<entry>/converted_protein.pdb for every entry, then
# remove the ones that exist.
converted_paths = (
    path.join(folder, entry, "converted_protein.pdb")
    for entry in sorted(listdir(folder))
)
for pdb in filter(path.exists, converted_paths):
    remove(pdb)

In [None]:
# Create a .npz file containing a dictionary of sequences of all PDBs
import numpy as np
from collections import defaultdict

# Maps the 4-character ID (header[1:5]) to the concatenation of its chain sequences
sequences = defaultdict(str)
with open("./data/pdb_seqres.txt") as f:
    # The file is consumed as (header, sequence) line pairs without loading it
    # into memory all at once.
    for header in f:
        chain_sequence = next(f, "").strip()
        # Assumes headers look like ">101m_A mol:protein length:154 ..." so
        # the second whitespace-separated field carries the molecule type.
        fields = header.split()
        if len(fields) > 1 and fields[1] == "mol:na":
            # Skip just this record; stopping here would drop every entry
            # that follows the first nucleic-acid chain in the file.
            continue
        sequences[header[1:5]] += chain_sequence

np.savez("./data/pdb_seqres.npz", **sequences)

In [None]:
# Check whether the data that has been preprocessed has the exact sequence or just the corners missing. 
# THIS IS INCORRECT because we concatenated the chains of a protein
import numpy as np
from os import listdir, path

rcsb = np.load("./data/pdb_seqres.npz")
folder = "./data/scPDB/preprocessed/"
missing_residues = []  # IDs whose preprocessed sequence is not a substring of the stored one
obseleted = []  # IDs with no entry in the archive
for file in sorted(listdir(folder)):
    pdb_id = file[:4]
    # Only a missing archive key means "no entry"; any other error should surface.
    try:
        seq = rcsb[pdb_id].item()
    except KeyError:
        obseleted.append(pdb_id)
        continue
    # Close each per-file archive promptly instead of leaving the handle open
    with np.load(path.join(folder, file)) as prot:
        preprocessed_seq = prot["sequence"].item()
    if preprocessed_seq in seq:
        continue
    missing_residues.append(pdb_id)
# print(rcsb["2pin"].item())
# print(np.load(folder + "2pin_2.npz")["sequence"].item())
# Guarded so that a run with no mismatches does not end in an IndexError
print(missing_residues[0] if missing_residues else "no mismatching entries")
# len(obseleted)
# obseleted

In [1]:
# Download sequence and PDB files from RCSB for easier matching of labels
import urllib
import urllib.request  # `import urllib` alone does not load the request submodule
from os import listdir, path, remove

folder = "./data/scPDB/raw/"


def _fetch(url, destination, label):
    """Download ``url`` to ``destination`` unless that file already exists.

    On failure, prints "Err: <label>" followed by the reason and removes any
    partially written file so that a later run retries the download.
    """
    if path.exists(destination):
        return
    try:
        urllib.request.urlretrieve(url, destination)
    except Exception as exc:
        # Exception (not a bare except) keeps the loop interruptible with Ctrl-C
        print("Err: " + label + " (" + type(exc).__name__ + ": " + str(exc) + ")")
        if path.exists(destination):
            remove(destination)


for file in sorted(listdir(folder)):
    pdb_id = file[:4]
    print(pdb_id)

    _fetch(
        'http://files.rcsb.org/download/' + pdb_id + ".pdb",
        path.join(folder, file, "downloaded.pdb"),
        "pdb " + pdb_id,
    )
    # NOTE(review): this FASTA endpoint looks like a legacy RCSB URL - confirm
    # that it still serves the expected format.
    _fetch(
        'https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=' + pdb_id + '&compressionType=uncompressed',
        path.join(folder, file, "sequence.fasta"),
        "fasta " + pdb_id,
    )


4ead
4eag
4eaj
4eak
4ean
4ear
4eaw
4eb3
4eb4
4eb4
4eb5
4eb7
4ebf
4ec0
4ec3
4ece
4ech
4ed4
4edf
4edy
4edz
4ee0
4ee3
4eep
4eer
4ees
4eet
4eeu
4eev
4ef4
4ef8
4ef9
4efk
4efk
4eft
4efu
4egb
4egh
4egi
4egk
4egn
4egr
4egu
4eh2
4eh2
4eh3
4eh3
4eh4
4eh4
4eh5
4eh5
4eh6
4eh6
4eh7
4eh7
4eh8
4eh8
4eh9
4eh9
4eht
4ehu
4ehv
4ehy
4ehz
4ei4
4ei7
4eil
4eil
4eil
4ein
4eip
4eiq
4eix
4eix
4eiy
4ej0
4ej1
4ej4
4ej7
4ejg
4ejh
4eji
4ejj
4ejm
4ejn
4ejv
4ek1
4ek6
4ek8
4ek9
4eke
4ekg
4eki
4ekk
4ekl
4el0
4el4
4el5
4el9
4elb
4elb
4ele
4elf
4elg
4elh
4em3
4em4
4em9
4ema
4emd
4emi
4emj
4emr
4emt
4emw
4emy
4en4
4enh
4enx
4eny
4eo3
4eo8
4eoi
4eok
4eol
4eom
4eon
4eoo
4eop
4eor
4eos
4eox
4ep6
4ep9
4eph
4epl
4epq
4epw
4epx
4epx
4eq4
4eqc
4eqe
4eqg
4eqk
4eql
4eqr
4eqr
4eqs
4eqs
4equ
4eqw
4eqw
4eqx
4er6
4er7
4ere
4erf
4erk
4erw
4es5
4eso
4eu4
4eu5
4eu6
4eu8
4eu9
4eua
4eub
4euc
4eud
4eue
4euf
4eut
4eux
4eux
4ev0
4ew2
4ew3
4ewh
4ewn
4ewo
4ewq
4ewv
4exg
4exs
4ey2
4ey6
4ey7
4eyb
4eyf
4eyj
4eym
4eyw
4ez3
4ez5
4ez6
4ez7
4ez7
4ez8


In [None]:
# Check whether downloaded PDB and sequence files are correct
from os import listdir, path

folder = "./data/scPDB/raw/"


def _first_line(file_path):
    """Return the first line of ``file_path``, or "" if it is missing or empty."""
    if not path.isfile(file_path):
        return ""
    with open(file_path, "r") as f:
        return f.readline()


for file in sorted(listdir(folder)):
    # A downloaded PDB is accepted when its first line begins with "HEA"
    if _first_line(path.join(folder, file, "downloaded.pdb"))[:3] != "HEA":
        print("Err: PDB " + file)
    # startswith() also copes with an empty file, where indexing [0] would raise
    if not _first_line(path.join(folder, file, "sequence.fasta")).startswith(">"):
        print("Err: FASTA " + file)

In [2]:
# Split a sequence fasta into its corresponding chains
import os
import re

raw_dir = "./data/scPDB/raw/"

# def match(strg, search=re.compile(r"[^AGTCUX]").search):
#     return not bool(search(strg))

for entry in sorted(os.listdir(raw_dir)):
    entry = entry.strip()
    pre = os.path.join(raw_dir, entry)
    with open(os.path.join(pre, "sequence.fasta"), "r") as fasta_file:
        fasta_lines = fasta_file.readlines()

    # The first line is always treated as a header; every later line that
    # starts with ">" opens a new record, and all other lines are stripped and
    # appended to the current record's sequence.
    records = []
    header = fasta_lines[0] if fasta_lines else ""
    pieces = []
    for text in fasta_lines[1:]:
        if text.startswith(">"):
            records.append((header, "".join(pieces)))
            header, pieces = text, []
        else:
            pieces.append(text.strip())
    records.append((header, "".join(pieces)))

    # One output file per record, named after the character at header[6]
    # (disabled experiment: skip/remove records for which match(sequence) holds)
    for header, sequence in records:
        chain = header[6:7]
        with open(os.path.join(pre, chain + ".fasta"), "w") as hlp:
            hlp.write(header)
            hlp.write(sequence + "\n")