In [None]:
from Bio.PDB import *
from rdkit import Chem

# import nglview as nv
import numpy as np

REFINED_FOLDER = "./data/PDBbind/pdbbind_v2018_refined/refined-set/"
INDEX_FOLDER = "./data/PDBbind/PDBbind_2018_plain_text_index/index/"
parser = PDBParser()
ppb = PPBuilder()
# pdb_id = "3aqt"
pdb_id = "1ezq"

# Protein structure
structure = parser.get_structure(
    pdb_id, REFINED_FOLDER + pdb_id + "/" + pdb_id + "_protein.pdb"
)

# Ligand structure
suppl = Chem.SDMolSupplier(
    REFINED_FOLDER + pdb_id + "/" + pdb_id + "_ligand.sdf", sanitize=False
)
assert len(suppl) == 1
assert suppl[0].GetNumConformers() == 1

ligand_coords = suppl[0].GetConformer().GetPositions()
ligand_num_atoms = suppl[0].GetNumAtoms()
assert ligand_num_atoms == len(ligand_coords)
ligand_atom_types = np.array([atom.GetSymbol() for atom in suppl[0].GetAtoms()])

In [0]:
# Check the distance cut-off for a protein ligand interaction
residues = [residue for residue in structure.get_residues() if is_aa(residue)]

labels = np.zeros(len(residues))

for ind, residue in enumerate(residues):
    for atom in residue.get_atoms():
        if atom.get_fullname()[1] == 'H':
            continue
        for i in range(ligand_num_atoms):
            if ligand_atom_types[i] == 'H':
                continue
            if np.linalg.norm(atom.get_coord() - ligand_coords[i]) < 4.5:
                labels[ind] = 1
                # print(residue.get_resname(), residue.get_segid())
                break
        if labels[ind]:
            break

# Manually check in VMD whether these amino acids are the ones close to the ligand
print((np.where(labels == 1)))

In [0]:
# Check the number of unique proteins in our dataset

with open(INDEX_FOLDER + "INDEX_refined_name.2018") as f:
    lines = f.readlines()

# lengths=[]
dic = {}
for line in lines:
    if line[0] == "#":
        continue
    line = line.strip().split()
    uniprot = line[3]
    dic[uniprot] = 1

print(len(dic.keys()))

In [16]:
# Find all sequences in PDBbind refined dataset from the rcsb dataset
# And compare both
from collections import defaultdict
from os import path
import numpy as np
from Bio.PDB import PDBParser, PPBuilder
from rdkit import Chem

PROJECT_FOLDER = "./"
parser = PDBParser()
ppb = PPBuilder()
RCSB_SEQUENCES = path.join(PROJECT_FOLDER, "data/pdb_seqres.txt")
data_dir = path.join(PROJECT_FOLDER, "data/PDBbind")
refined_dir = path.join(data_dir, "pdbbind_v2018_refined/refined-set")
index_dir = path.join(data_dir, "PDBbind_2018_plain_text_index/index")
index_file = path.join(index_dir, "INDEX_refined_data.2018")

def initialize_dataset_from_index_file():
    dataset = []
    with open(index_file) as f:
        line = f.readline()
        while line:
            if line[0] != "#":
                dataset.append(line.strip().split())
            line = f.readline()
    return dataset

def get_sequences_from_rcsb(dataset):
    sequences = defaultdict(str)
    with open(RCSB_SEQUENCES) as file:
        pdb_id = file.readline()[1:5]
        for data in sorted(dataset):
            flg = 0
            while pdb_id != data[0]:
                file.readline()
                pdb_id = file.readline()[1:5]
            # Each id can have multiple chains
            while pdb_id == data[0]:
                flg = 1
                seq = file.readline().strip()
                sequences[pdb_id] += seq
                pdb_id = file.readline()[1:5]
            if not flg:
                print(pdb_id)
    print(len(sequences))
    return sequences

def get_sequence_from_structure(protein_structure):
    sequences = [
        str(seq.get_sequence())
        for seq in ppb.build_peptides(protein_structure, aa_only=False)
    ]
    return "".join(sequences)




In [17]:
dataset = initialize_dataset_from_index_file()
print(dataset[:10])
sequences = get_sequences_from_rcsb(dataset)
cnt = 0
for element in dataset:
    pdb_id = element[0]
    pdb_prefix = path.join(refined_dir, pdb_id, pdb_id)
    protein_structure = parser.get_structure(
        pdb_id, pdb_prefix + "_protein.pdb"
    )
    sequence = get_sequence_from_structure(protein_structure)
    if sequences[pdb_id] != sequence:
        cnt += 1
        # print(sequences[pdb_id], sequence)
print(cnt)

FileNotFoundError: [Errno 2] No such file or directory: './data/PDBbind/PDBbind_2018_plain_text_index/index/INDEX_refined_data.2018'

In [26]:
# Testing preprocessing of sc-pdb using protein mol2
from os import path, listdir
from biopandas.mol2 import PandasMol2
from collections import defaultdict
from constants import THREE_TO_ONE
FOLDER = "./data/scPDB/raw"

def get_aa_location(res_name, res_id):
    aa = THREE_TO_ONE[res_name[:3]]
    offset = int(res_name[3:]) - int(res_id) + 1
    

for i, pdb_id in enumerate(sorted(listdir(FOLDER))):
    print(pdb_id)
    pmol = PandasMol2().read_mol2(path.join(FOLDER, pdb_id, "protein.mol2"))
    lmol = PandasMol2().read_mol2(path.join(FOLDER, pdb_id, "ligand.mol2"))
    ligand_coords = lmol.df[lmol.df['atom_type'] != 'H'][['x', 'y', 'z']]
    protein_heavy = pmol.df[pmol.df['atom_type'] != 'H']
    binding_site = {}
    for j, atom_coord in enumerate(ligand_coords.values):
        pmol.df["distances"] = pmol.distance_df(protein_heavy, atom_coord)
        cutoff = pmol.df[pmol.df["distances"] <= 4.5]
        for k, aa in enumerate(cutoff.values):
            binding_site[aa[7]] = aa[6]
        # print(cutoff['subst_name'])
    print(binding_site)
    # if i == 4:
    break

10mh_1
{'PHE18': 42, 'ALA19': 43, 'PHE24': 48, 'ASN304': 328, 'VAL306': 330, 'HOH340': 355, 'GLY20': 44, 'SER305': 329, 'GLY78': 102, 'LEU21': 45, 'GLY22': 46, 'GLY23': 47, 'HOH331': 353, 'TYR285': 309, 'PRO80': 104, 'GLU40': 64, 'TRP41': 65, 'ASP42': 66, 'ASP60': 84, 'ILE61': 85, 'LEU100': 124, 'ASN39': 63}


In [15]:
# Checking statistics of the cross-validation splits
from os import path, listdir
from collections import defaultdict
FOLDER = "./data/scPDB/"

folds = []
for i in range(10):
    with open(path.join(FOLDER, "splits", "train_ids_fold" + str(i))) as f:
        folds.append(set([line.strip() for line in f.readlines()]))

all = folds[0].union(folds[1])
print(len(folds[0]))
print(len(all))

available = defaultdict(set)
for file in listdir(path.join(FOLDER, "raw")):
    available[file[:4]].add(file)

print(len(available))
with open(path.join(FOLDER, "splits", "scPDB_blacklist.txt")) as f:
    for line in f.readlines():
        line = line.strip()
        available[line[:4]].remove(line)
        if available[line[:4]] == set():
            del available[line[:4]]

with open(path.join(FOLDER, "splits", "scPDB_leakage.txt")) as f:
    for line in f.readlines():
        line = line.strip()
        available[line[:4]].remove(line)
        if available[line[:4]] == []:
            del available[line[:4]]
print(len(available))

for key in set(available.keys()) - all:
    del available[key]

print(len(available))

cnt = 0
for key, val in available.items():
    cnt += len(val)

print(cnt)

14274
15860
16612
16341
15860
16776


In [1]:
# Convert all mol2 files to pdb files in the scPDB raw data
from os import system, path, listdir

FOLDER = "./data/scPDB/raw"

for pdb_id in sorted(listdir(FOLDER)):
    # print(pdb_id)
    err = system(
        "obabel -imol2 "
        + path.join(FOLDER, pdb_id, "protein.mol2")
        + " -opdb -O "
        + path.join(FOLDER, pdb_id, "converted_protein.pdb")
    )
    if err != 0:
        print(pdb_id)

1bmf_3
1bmf_4
1cer_5
1e1c_1
1e1q_1
1e1q_4
1e79_1
1e79_4
1ea0_1
1h5q_2
1h6v_1
1h6v_6
1h8h_1
1h8h_4
1ht2_4
1k5d_3
1kfl_6
1llu_8
1mx0_1
1nbm_1
1nbm_4
1nvm_3
1ofd_1
1ofe_2
1ohh_3
1on3_2
1q3s_2
1qvr_2
1qzf_2
1qzf_7
1req_1
1rfu_11
1rfu_4
1ryw_5
1s20_4
1s3s_1
1s4d_11
1sej_9
1sxj_1
1sxj_5
1tf7_4
1u9i_4
1w0k_1
1w88_1
1xjn_4
1xjn_6
1zm4_3
2buf_18
2c12_2
2c2b_1
2cfy_2
2ck3_1
2g82_7
2gbl_10
2h12_2
2j3n_1
2j3n_4
2j4l_6
2nu9_1
2ome_1
2qfx_6
2v7q_1
2v7q_4
2vig_5
2vig_6
2wbb_1
2wbd_6
2wgg_5
2x06_8
2xka_4
3dxj_2


In [5]:
# Delete all converted_pdbs to save space
from os import remove, path, listdir

folder = "./data/scPDB/raw"

for file in sorted(listdir(folder)):
    pdb = path.join(folder, file, "converted_protein.pdb")
    if path.exists(pdb):
        remove(pdb)

In [5]:
# Create a .npz file containing a dictionary of sequences of all PDBs
import numpy as np
from collections import defaultdict

sequences = defaultdict(str)
with open("./data/pdb_seqres.txt") as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    if i % 2 == 0:
        pdb_id = line[1:5]
        mol = line[12:14]
        if mol == "na":
            break
    else:
        sequences[pdb_id] += line.strip()

np.savez("./data/pdb_seqres.npz", **sequences)

In [3]:
# Check whether the data that has been preprocessed has the exact sequence or just the corners missing. This is incorrect because we concatenated the chains of a protein
import numpy as np
from os import listdir, path

rcsb = np.load("./data/pdb_seqres.npz")
folder = "./data/scPDB/preprocessed/"
missing_residues = []
obseleted = []
for file in sorted(listdir(folder)):
    pdb_id = file[:4]
    prot = np.load(path.join(folder, file))
    try:
        seq = rcsb[pdb_id].item()
    except:
        obseleted.append(pdb_id)
        continue
    if prot["sequence"].item() in seq:
        continue
    missing_residues.append(pdb_id)
# print(rcsb["2pin"].item())
# print(np.load(folder + "2pin_2.npz")["sequence"].item())
print(missing_residues[0])
# len(obseleted)
# obseleted

12gs


In [1]:
# Download sequence and PDB files from RCSB for easier matching of labels
import urllib
from os import listdir, path

folder = "./data/scPDB/raw/"

for file in sorted(listdir(folder)):
    pdb_id = file[:4]
    print(pdb_id)

    pdb_save = path.join(folder, file, "downloaded.pdb")
    if not path.exists(pdb_save):
        try:
            urllib.request.urlretrieve('http://files.rcsb.org/download/' + pdb_id + ".pdb", pdb_save)
        except:
            print("Err: pdb " + pdb_id)

    fasta_save = path.join(folder, file, "sequence.fasta")
    if not path.exists(fasta_save):
        try:
            urllib.request.urlretrieve('https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList=' + pdb_id + '&compressionType=uncompressed', fasta_save)
        except:
            print("Err: fasta " + pdb_id)


10mh
11bg
12gs
13gs
17gs
19gs
1a26
1a27
1a29
1a2b
1a2n
1a42
1a4i
1a4l
1a4r
1a4w
1a4z
1a50
1a59
1a5b
1a5s
1a5u
1a69
1a71
1a72
1a7k
1a7x
1a80
1a8g
1a8k
1a8p
1a8r
1a8t
1a94
1a9c
1a9m
1a9p
1a9q
1a9r
1a9s
1a9t
1a9x
1a9y
1a9z
1aa6
1acj
1ad3
1ad5
1adb
1adc
1adf
1ads
1ae1
1ae8
1af0
1af7
1afe
1afs
1agn
1agw
1ah0
1ah3
1ah4
1ahb
1ahg
1ahh
1ahi
1ahn
1ai0
1ai9
1aid
1aiq
1aiy
1aj0
1aj0
1aj2
1aj8
1ajv
1ajx
1aka
1akb
1akc
1ake
1akr
1aku
1akw
1aky
1al7
1al8
1am1
1am4
1amo
1amo
1amo
1amw
1an5
1ank
1ao0
1ao0
1ao8
1aob
1aoe
1aq1
1aq2
1aqb
1aqi
1aqu
1aqv
1aqx
1arg
1arh
1arz
1asb
1asc
1atl
1atn
1atr
1ats
1aux
1av5
1avd
1axe
1axg
1axw
1ay0
1ay5
1ayl
1ayp
1az1
1az2
1azl
1azt
1b11
1b14
1b15
1b16
1b2l
1b2l
1b2r
1b38
1b39
1b3d
1b3r
1b48
1b4p
1b4v
1b5d
1b5e
1b5q
1b5t
1b6k
1b6l
1b6m
1b6p
1b7a
1b7t
1b7y
1b87
1b8n
1b8o
1b8s
1b8u
1b8v
1b8y
1b9i
1bai
1bc5
1bcp
1bcu
1bd4
1bdb
1bdi
1bdl
1bdm
1bdq
1bdr
1bdu
1be4
1beu
1bfd
1bgq
1bh5
1bi9
1bid
1bif
1bil
1bim
1biw
1bjk
1bjq
1bk0
1bkf
1bkg
1bl4
1bl6
1bl7
1blz
1bmd
1bmf
1bmf


1opm
1oq5
1oqc
1oqm
1orh
1ori
1ork
1orr
1orr
1os1
1osf
1osv
1ot3
1ot7
1ot7
1oty
1ou4
1ouk
1ouy
1ov4
1ove
1ovm
1ow3
1owd
1owe
1owi
1owk
1owl
1owm
1own
1owo
1owp
1oxa
1oxo
1oxr
1oya
1oyb
1oyc
1oyn
1oyt
1oyy
1oz0
1oz1
1ozh
1ozp
1ozq
1ozv
1p0b
1p0e
1p0f
1p0h
1p0h
1p0n
1p0p
1p0y
1p16
1p1b
1p1c
1p1h
1p1i
1p2a
1p2y
1p31
1p33
1p3d
1p3d
1p3j
1p44
1p44
1p45
1p4a
1p4f
1p4m
1p4m
1p4r
1p52
1p5e
1p5r
1p6j
1p6j
1p6k
1p6x
1p72
1p7c
1p7r
1p7t
1p84
1p84
1p8d
1p91
1p93
1p9b
1p9l
1p9p
1p9w
1pag
1pax
1pd8
1pd9
1pdh
1pdh
1peo
1peq
1pf7
1pf8
1pf9
1pfy
1pg0
1pg2
1pg3
1pg8
1pgt
1phd
1phe
1phg
1phh
1phk
1php
1phq
1pi3
1piv
1piw
1pj2
1pj3
1pj4
1pj6
1pj7
1pjc
1pjk
1pjl
1pk7
1pk8
1pk9
1pkd
1pke
1pkf
1pkg
1pkv
1pl1
1pl2
1pl6
1pl6
1pl8
1pl9
1pme
1pmn
1pmu
1pmv
1pn3
1pn4
1pn9
1pnl
1pno
1pnq
1pnr
1pnv
1po7
1pow
1pow
1pox
1pox
1pp9
1ppj
1ppj
1ppk
1ppk
1ppl
1ppm
1ppr
1pq6
1pq9
1pqc
1pr0
1pr1
1pr4
1pr5
1pr6
1pr9
1prc
1pro
1ps9
1ps9
1psa
1psd
1pt5
1pt8
1pt9
1ptj
1ptj
1pu8
1pu9
1pua
1pvd
1pvg
1pvo
1pvs
1pw1
1pw6
1pw7
1pwl


2cn8
2cnd
2cns
2cnt
2cpp
2csn
2ct8
2cts
2cu0
2cul
2cv1
2cv2
2cvd
2cvj
2cvq
2cvu
2cvu
2cvv
2cvw
2cvw
2cvx
2cvy
2cvz
2cwf
2cww
2cx8
2cy0
2cy2
2cye
2czc
2czf
2d06
2d09
2d0k
2d0o
2d1c
2d1n
2d1o
2d1s
2d1t
2d1y
2d29
2d2g
2d2h
2d2i
2d32
2d3a
2d3m
2d3u
2d4e
2d4h
2d4v
2d52
2d5n
2d61
2d7d
2d7z
2d81
2d82
2d8a
2daa
2db3
2dbq
2dbr
2dbv
2dbz
2dc1
2dcm
2dcn
2ddo
2ddy
2de4
2dek
2dfd
2dft
2dfv
2dg3
2dg4
2dg9
2dgn
2dhf
2dji
2djl
2djx
2dkh
2dki
2dkk
2dkn
2dld
2doo
2dph
2dpm
2dpx
2dq7
2dqm
2dqs
2dr3
2dr8
2dra
2drc
2ds1
2dsa
2dsc
2dsg
2dsh
2dsi
2dt5
2dte
2du2
2du8
2dub
2duv
2dux
2duz
2dv0
2dv3
2dv4
2dv5
2dv7
2dvl
2dvm
2dvo
2dw0
2dw4
2dwb
2dwp
2dxd
2dxe
2dxf
2dxi
2dxs
2dxv
2dxw
2dxx
2dy9
2dza
2dzb
2e07
2e08
2e0a
2e0i
2e0n
2e15
2e16
2e17
2e1m
2e1t
2e1z
2e20
2e2b
2e2p
2e2q
2e37
2e40
2e41
2e48
2e4n
2e5a
2e5m
2e77
2e7f
2e7r
2e7z
2e82
2e83
2e8h
2e8q
2e8r
2e8s
2e8t
2e8u
2e8w
2e91
2e93
2e94
2e95
2e98
2e99
2e9a
2e9c
2e9d
2e9n
2e9o
2e9p
2e9u
2e9v
2e9z
2ea1
2ea2
2ea4
2eat
2eat
2eau
2eb3
2eba
2ec9
2eck
2ed3
2ed4
2ed4


2w6m
2w6n
2w6o
2w6p
2w6q
2w6z
2w71
2w7x
2w8f
2w8g
2w8r
2w8y
2w8y
2w93
2w98
2w9g
2w9h
2w9s
2wa2
2waj
2wat
2wb2
2wb5
2wba
2wba
2wbb
2wbd
2wbg
2wca
2wcg
2wd2
2wd4
2wd7
2wd8
2wd9
2wdq
2wdz
2we3
2wea
2web
2wec
2wed
2weg
2weh
2wej
2wek
2weo
2wep
2wer
2wes
2wet
2wey
2wez
2wf0
2wf1
2wf2
2wf3
2wf4
2wge
2wgg
2wgh
2wgj
2wgs
2wh8
2whd
2whf
2who
2whq
2whw
2whx
2wi1
2wi2
2wi4
2wi5
2wi6
2wi7
2wi9
2wih
2wip
2wjo
2wkm
2wks
2wky
2wkz
2wl0
2wl9
2wle
2wlf
2wlg
2wm3
2wm4
2wmd
2wme
2wmq
2wmr
2wms
2wmt
2wmu
2wmv
2wmw
2wmx
2wn6
2wnb
2wns
2wo7
2wo8
2wo8
2wo9
2wo9
2woa
2woa
2woe
2wog
2won
2wot
2wou
2wow
2wow
2wp5
2wp5
2wp9
2wpa
2wpw
2wpx
2wq6
2wq7
2wqb
2wqe
2wqo
2wqp
2wr8
2ws7
2wsa
2wsa
2wsb
2wsi
2wtc
2wtd
2wti
2wtj
2wtk
2wu1
2wu2
2wu4
2wu5
2wue
2wuf
2wug
2wuu
2wuz
2wvj
2wvl
2wvm
2ww4
2wwj
2wx2
2wxv
2wya
2wyj
2wyv
2wzb
2wzg
2wzm
2wzv
2wzw
2wzw
2wzy
2x06
2x0e
2x0f
2x0i
2x0q
2x0r
2x0v
2x0w
2x0y
2x19
2x1e
2x1h
2x1l
2x1n
2x1z
2x20
2x21
2x22
2x23
2x2k
2x2l
2x2m
2x2n
2x2r
2x2r
2x3f
2x3j
2x3n
2x45
2x4f
2x4z
2x5w
2x5z


3h0r
3h0s
3h0v
3h0w
3h18
3h1j
3h1q
3h1v
3h23
3h24
3h2c
3h2l
3h2n
3h2s
3h30
3h3c
3h3f
3h3f
3h3j
3h3q
3h3r
3h3s
3h3t
3h4g
3h4l
3h4v
3h4v
3h59
3h5n
3h5s
3h5u
3h65
3h6f
3h6i
3h6k
3h6v
3h7r
3h7u
3h86
3h89
3h8b
3h8c
3h8g
3h98
3h9f
3h9j
3ha3
3ha5
3ha7
3ha8
3had
3hat
3hav
3haz
3haz
3hb8
3hbb
3hbb
3hbf
3hco
3hcr
3hdh
3hdm
3hdq
3hdy
3hdy
3hdy
3he3
3hek
3hf3
3hf6
3hf8
3hfb
3hfw
3hgg
3hgm
3hgo
3hgr
3hgr
3hgs
3hgx
3hhu
3hia
3hii
3hiv
3hiy
3hj9
3hja
3hji
3hjk
3hjo
3hk1
3hko
3hku
3hkv
3hkw
3hky
3hl0
3hl4
3hl7
3hl7
3hll
3hll
3hlw
3hml
3hmm
3hmv
3hna
3hng
3hnz
3ho2
3ho9
3hp2
3hp5
3hpq
3hpr
3hpy
3hq5
3hq8
3hqd
3hr4
3hrc
3hrf
3hrr
3hsc
3hsn
3hsw
3hth
3hti
3htj
3hu1
3hu2
3hu3
3hub
3huc
3huk
3hv3
3hv4
3hv5
3hv6
3hv7
3hvc
3hvi
3hvk
3hvt
3hwr
3hwx
3hx3
3hx4
3hxb
3hxc
3hxd
3hxe
3hxf
3hy3
3hy4
3hy7
3hy9
3hyg
3hyo
3hyv
3hyw
3hyw
3hyx
3hyz
3hz1
3hz1
3hz5
3hz6
3hzc
3hzg
3hzi
3hzl
3hzt
3i0a
3i0o
3i0p
3i0r
3i0s
3i12
3i1f
3i1l
3i1y
3i25
3i28
3i2l
3i3s
3i4b
3i4d
3i4l
3i53
3i58
3i59
3i59
3i5u
3i5x
3i5z
3i60
3i64
3i68


3swq
3swr
3swz
3sx2
3sx6
3sxi
3sxj
3sxn
3sxp
3sxv
3syi
3sys
3syt
3sz0
3szc
3szf
3szg
3szw
3t01
3t02
3t0i
3t0k
3t0z
3t10
3t11
3t14
3t19
3t1a
3t1k
3t1t
3t2g
3t2k
3t2s
3t2v
3t2w
3t2y
3t2z
3t31
3t37
3t3c
3t3f
3t3i
3t3q
3t3r
3t3z
3t40
3t42
3t4e
3t4h
3t4k
3t4l
3t4n
3t4q
3t4s
3t50
3t54
3t58
3t59
3t5i
3t64
3t6i
3t6i
3t6i
3t70
3t74
3t7o
3t7p
3t7q
3t7r
3t7s
3t7t
3t7v
3t7x
3t80
3t82
3t84
3t85
3t87
3t88
3t8c
3t8d
3t8h
3t8o
3t8v
3t8w
3t8x
3t94
3t99
3t9a
3t9d
3t9e
3t9f
3t9i
3ta0
3ta1
3ta2
3tam
3tb9
3tb9
3tba
3tc5
3td7
3td8
3tda
3tdj
3tdk
3tdk
3tdl
3tdt
3tdv
3tdw
3te4
3te5
3tea
3tei
3tfj
3tfn
3tfp
3tfq
3tfu
3tfv
3tfy
3tfy
3tg4
3tgp
3tgs
3tgu
3tgy
3th5
3th8
3thr
3thy
3ti1
3tif
3tij
3tik
3tin
3tiy
3tiz
3tjl
3tjs
3tjz
3tk0
3tk3
3tkh
3tki
3tkl
3tku
3tky
3tl1
3tl9
3tlc
3tle
3tlh
3tlj
3tlk
3tlx
3tm0
3tm4
3tm5
3tmz
3tn7
3tn8
3tne
3tnf
3tnl
3tnw
3to0
3to3
3to6
3tof
3tog
3toh
3tos
3tox
3toz
3tp0
3tpr
3tpt
3tpv
3tq8
3tq9
3tqa
3tqb
3tqc
3tqh
3tqx
3tr0
3tr6
3tr9
3tri
3ts1
3ts4
3tsc
3tsk
3tso
3tti
3ttj
3ttp
3tu5


4fkk
4fkl
4fko
4fkp
4fkq
4fkr
4fks
4fkt
4fkv
4fkw
4fkx
4fkz
4fl0
4fl1
4fl2
4fl3
4flh
4fli
4flj
4flk
4fll
4fln
4flp
4fm5
4fm8
4fmq
4fmx
4fn4
4fnc
4fnd
4fny
4fnz
4fob
4foc
4fod
4fog
4foj
4fok
4for
4fou
4fox
4fox
4fp9
4fps
4fpy
4fqf
4fqs
4fqt
4fr0
4fr1
4fr4
4fr8
4fr8
4fr8
4fri
4frj
4frk
4frs
4fry
4fs4
4fsa
4fse
4fsl
4fsm
4fsn
4fsq
4fsr
4fst
4fsu
4fsw
4fsy
4fsz
4ft0
4ft3
4ft4
4ft5
4ft7
4ft8
4ft9
4fta
4ftc
4fti
4ftj
4ftk
4ftl
4ftm
4ftn
4fto
4ftq
4ftr
4ftt
4ftu
4fu0
4fu8
4fub
4fud
4fue
4fuf
4fui
4fuj
4ful
4fux
4fuy
4fv0
4fv1
4fv2
4fv3
4fv4
4fv5
4fv6
4fv7
4fv8
4fv9
4fvq
4fvr
4fvx
4fvy
4fvz
4fvz
4fw0
4fw0
4fw3
4fw4
4fw8
4fwe
4fwf
4fwj
4fwn
4fx2
4fx3
4fx9
4fx9
4fxf
4fxy
4fyh
4fyn
4fyo
4fyx
4fyx
4fz6
4fz7
4fzb
4fzv
4g01
4g09
4g0k
4g0n
4g16
4g17
4g1c
4g1d
4g1f
4g1n
4g1q
4g1w
4g1y
4g1z
4g20
4g21
4g27
4g28
4g2f
4g2g
4g2h
4g2i
4g2j
4g2l
4g2r
4g2w
4g2y
4g31
4g34
4g36
4g37
4g3e
4g3f
4g3j
4g3p
4g3q
4g3r
4g3s
4g47
4g48
4g56
4g5d
4g5q
4g5y
4g67
4g6g
4g6h
4g6h
4g6n
4g6o
4g73
4g73
4g74
4g77
4g7g
4g8b
4g8c


4qii
4qij
4qim
4qin
4qjc
4qjk
4qjl
4qjq
4qkn
4ql3
4qly
4qm6
4qml
4qmn
4qmp
4qms
4qmt
4qmu
4qmw
4qmx
4qmy
4qmz
4qna
4qnq
4qnr
4qnu
4qnv
4qnw
4qny
4qo9
4qoe
4qof
4qog
4qoh
4qoi
4qos
4qpm
4qpp
4qpz
4qq3
4qq8
4qqz
4qrc
4qt3
4qtn
4qtu
4quv
4qvb
4qvh
4qvt
4qx5
4qxa
4qxm
4qxp
4qxq
4qxr
4qyp
4qys
4qzs
4qzt
4qzu
4r07
4r08
4r09
4r1f
4r1l
4r20
4r21
4r29
4r2l
4r2m
4r38
4r39
4r3a
4r3a
4r3c
4r3k
4r43
4r4u
4r57
4r5w
4r6x
4r7i
4r7u
4r7u
4r81
4r87
4r8h
4r8q
4r9u
4ra3
4ram
4raw
4rbs
4rcv
4rd2
4rdi
4rek
4rel
4req
4rf2
4rf8
4rf9
4rfl
4rgq
4rgs
4rh1
4rhe
4rht
4ri1
4riv
4riy
4riy
4rje
4rkd
4rkf
4rlh
4rls
4rmj
4rmn
4rn6
4rnh
4rnu
4rnv
4rnv
4rnw
4rnx
4ros
4rp8
4rpg
4rpg
4rph
4rph
4rpj
4rqk
4rqu
4rqv
4rqz
4rqz
4rr6
4rr7
4rr8
4rr9
4rra
4rrb
4rrc
4rrd
4rrf
4rrg
4rrh
4rri
4rrj
4rrk
4rrl
4rrm
4rrq
4rrr
4rrv
4rrw
4rrx
4rrz
4rsl
4rsy
4rsy
4rtb
4rtj
4rtk
4rtl
4rtm
4rtn
4rto
4rtp
4rtq
4rtr
4rts
4ruj
4ruo
4rup
4rv9
4rvd
4rvf
4rvg
4rvj
4rvn
4rvo
4rvu
4rw4
4rw6
4rwn
4rwt
4rww
4rx0
4rx6
4rxp
4rxq
4rxr
4rxs
4ryv
4rz3
4rzu


In [4]:
# Check whether downloaded PDB and sequence files are correct
from os import listdir, path

folder = "./data/scPDB/raw/"

for file in sorted(listdir(folder)):
    pdb_id = file[:4]
    pdb_save = path.join(folder, file, "downloaded.pdb")
    with open(pdb_save, "r") as f:
        line = f.readline()
        if line[:3] != "HEA":
            print("Err: PDB " + file)
    fasta_save = path.join(folder, file, "sequence.fasta")
    with open(fasta_save, "r") as f:
        line = f.readline()
        if line[0] != ">":
            print("Err: FASTA " + file)

Err: FASTA 1hwz_5
Err: FASTA 1qy5_1
Err: FASTA 1u0y_1
Err: FASTA 2cmj_1
Err: FASTA 2cmv_1
Err: FASTA 2pdt_2
Err: FASTA 3g07_6
Err: FASTA 3kwn_1
Err: FASTA 3lns_4
Err: FASTA 3lv1_1
Err: FASTA 3mpe_2
Err: FASTA 3mvq_7
Err: FASTA 3mw9_6
Err: FASTA 3n3n_1
Err: FASTA 3q9k_1
Err: FASTA 3ql6_1
Err: FASTA 3tuw_1
Err: FASTA 4dgo_1
Err: FASTA 4egb_5
Err: FASTA 4gdc_2
Err: FASTA 4gdc_7
Err: FASTA 4gdd_4
Err: FASTA 4gdd_6
Err: FASTA 4ka6_4
Err: FASTA 4kg1_1
Err: FASTA 4knz_1
Err: FASTA 4n3l_1
Err: FASTA 4n7a_1
Err: FASTA 4nt3_1
Err: FASTA 4nze_1
Err: FASTA 4oa9_1
Err: FASTA 4oac_1
Err: FASTA 4otw_1
Err: FASTA 4p7p_1
Err: FASTA 4pt0_1
Err: FASTA 4pt3_2
Err: FASTA 4utd_1
Err: FASTA 4wbn_1
Err: FASTA 4y9q_1
Err: FASTA 5aaj_2
Err: FASTA 5cto_4
Err: FASTA 5li5_1
