In [None]:
# Download the scPDB dataset and extract the dataset in "data/scPDB/raw"
!aria2c -c -x 8 -s 8 -d "../data/scPDB" http://bioinfo-pharma.u-strasbg.fr/scPDB/ressources/2016/scPDB.tar.gz --out 'scPDB.tar.gz'
!tar xvzf ../data/scPDB/scPDB.tar.gz -C ../data/scPDB/raw/

In [None]:
# For 10-fold Cross Validation, we will use the splits that were generated by https://arxiv.org/abs/1904.06517
!aria2c -c -x 8 -s 8 -d "../data/scPDB" https://gitlab.com/cheminfIBB/kalasanty/-/archive/master/kalasanty-master.tar.gz?path=data --out 'kalasanty-master-data.tar.gz'
!tar xvzf ../data/scPDB/kalasanty-master-data.tar.gz -C ../data/scPDB/

In [1]:
# Some constants that will be required
# ALWAYS RUN THIS CODE CELL
import os
from glob import glob

data_dir = os.path.abspath("../data/scPDB")
raw_dir = os.path.join(data_dir, "raw")
pssm_dir = os.path.join(data_dir, "pssm")
splits_dir = os.path.join(data_dir, "splits")
preprocessed_dir = os.path.join(data_dir, "preprocessed")

In [None]:
# For sequence-based prediction, we need to use RCSB FASTA files and since scPDB has only mol2 files for the proteins, we will download the fasta file and PDB files of the given proteins ourself. This allows for proper calculation of the labels
# Note that the downloaded PDB automatically has all the structures of a particular PDB ID
# Hence, we just use the first structure instead of all
# Download sequence and PDB files from RCSB for easier matching of labels
import urllib

for file in sorted(os.listdir(raw_dir)):
    pdb_id = file[:4]
    print(pdb_id)

    pdb_save = path.join(folder, file, "downloaded.pdb")
    if not path.exists(pdb_save):
        try:
            urllib.request.urlretrieve(
                "http://files.rcsb.org/download/" + pdb_id + ".pdb", pdb_save
            )
        except:
            print("Err: pdb " + pdb_id)

    fasta_save = path.join(folder, file, "sequence.fasta")
    if not path.exists(fasta_save):
        try:
            urllib.request.urlretrieve(
                "https://www.rcsb.org/pdb/download/downloadFastaFiles.do?structureIdList="
                + pdb_id
                + "&compressionType=uncompressed",
                fasta_save,
            )
        except:
            print("Err: fasta " + pdb_id)

In [2]:
# Check whether downloaded PDB and sequence files are correct
# A few of the PDB files have been obseleted and hence will have to manually download them
# I wrote down the manual mapping of the pdb id's but seem to have lost it. So leaving it as a TODO
import subprocess


def last_line(f):
    proc = subprocess.Popen(["tail", "-n", "1", f], stdout=subprocess.PIPE)
    line = proc.stdout.readlines()[0].decode("utf-8").strip()
    return line


for file in sorted(os.listdir(raw_dir)):
    pdb_id = file[:4]
    pdb_save = os.path.join(raw_dir, file, "downloaded.pdb")
    if last_line(pdb_save) != "END":
        print("Err: PDB " + file)
    fasta_save = os.path.join(raw_dir, file, "sequence.fasta")
    with open(fasta_save, "r") as f:
        line = f.readline()
        if line[0] != ">":
            print("Err: FASTA " + file)

In [4]:
# We need to generate MSAs for the protein sequences in the dataset
# For that, we need to split the sequence.fasta file into respective chain.fasta files
# Also, we need to remove the fasta files of DNA/RNA seqeuences

for file in sorted(os.listdir(raw_dir)):
    file = file.strip()
    pre = os.path.join(raw_dir, file)

    # Read SEQRES entries in PDB file to determine whether a chain
    # has a protein sequence or not
    pdb_file = os.path.join(pre, "downloaded.pdb")
    do_not_include = set()
    with open(pdb_file, "r") as f:
        line = f.readline()
        while line[:6] != "SEQRES":
            line = f.readline()
        while line[:6] == "SEQRES":
            chain_id = line[11]
            residue = line[19:22]
            # Generally DNA/RNA have 1 or 2-letter codes
            if " " in residue:
                do_not_include.add(chain_id)
            line = f.readline()

    fasta = os.path.join(pre, "sequence.fasta")
    with open(fasta, "r") as f:
        header = f.readline()
        while 1:
            chain_id = header[6:7]
            sequence = ""
            line = f.readline()
            while line != "" and line is not None and line[0] != ">":
                sequence += line.strip()
                line = f.readline()
            if chain_id not in do_not_include:
                with open(os.path.join(pre, chain_id + ".fasta"), "w") as hlp:
                    hlp.write(header)
                    hlp.write(sequence + "\n")
            if line == "" or line is None:
                break
            header = line

In [3]:
# In case you want to delete the generated fasta files from the above cell, use this
# for file in sorted(os.listdir(raw_dir)):
#     for fasta in glob(os.path.join(raw_dir, file.strip(), "?.fasta")):
#         os.remove(fasta)

In [4]:
# Let us remove some other troublesome fasta files
# trouble = ["1m1d_1/D.fasta", # The PDB file does not contain structure of this sequence at all 
#             "2xbm_4/E.fasta", # RNA sequence that slipped past somehow
#             "2xbm_4/F.fasta"] # RNA sequence that slipped past somehow
# for chain in trouble:
#     file = os.path.join(raw_dir, chain)
#     if os.path.exists(file):
#         os.remove(file)

In [5]:
# The fasta files generated will have a lot of common sequences
# To speed up MSA generation, let us create a unique file that has common sequences
# Then we can generate the MSAs for only the first chain in every line
from collections import defaultdict

sequences = defaultdict(list)
for file in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, file.strip())
    for fasta in sorted(os.listdir(pre)):
        if fasta[2:] != "fasta":
            continue
        chain_id = fasta[0]
        with open(os.path.join(pre, fasta)) as f:
            f.readline()
            sequence = f.readline().strip()
            # This choice was made so that rsync would work much better and easier
            sequences[sequence].append(file + "/" + chain_id + "*")

keys = list(sequences.keys())

with open(os.path.join(data_dir, "unique"), "w") as f:
    for key in keys:
        line = ""
        for chain_id in sequences[key]:
            line += chain_id + " "
        f.write(line[:-1] + "\n")


In [2]:
# Run a regex search on the generated fasta files to ensure that we don't have any DNA/RNA sequences
# Will have to manually check the files to ensure that they are protein sequences
# All of them are protein sequences
import re


def match(strg, search=re.compile(r"[^ACGTURYKMSWBDHVN\-\.]").search):
    return not bool(search(strg))


with open(os.path.join(data_dir, "unique"), "r") as f:
    lines = f.readlines()

for line in lines:
    pdb_id_struct, chain_id = line.strip().split()[0].split("/")
    chain_id = chain_id[0]
    with open(os.path.join(raw_dir, pdb_id_struct, chain_id + ".fasta"), "r") as f:
        f.readline()
        seq = f.readline().strip()
    if match(seq):
        print(pdb_id_struct, chain_id)

1by5_1 B
1g5q_2 M
1kap_1 I
1nvq_1 B
2zl4_6 1
3b7t_1 B
3e4a_2 F
3h9j_6 E
3mpj_3 Y
3smk_1 P
3t64_2 F
3t70_3 F
3ut5_2 F
4lxl_1 D
4uzq_1 B
5cxv_1 C


In [None]:
# MSAs need to be generated for the fasta files
# Refer to https://github.com/crvineeth97/msa-generator

In [None]:
# MAKING OF .npz FILES FROM HERE
# Download NWalign.py, pdb2fasta.py and reindex_pdb.py
# https://zhanglab.ccmb.med.umich.edu/NW-align/NWalign.py (Make small changes for Python3 compatibility)
# https://zhanglab.ccmb.med.umich.edu/reindex_pdb/reindex_pdb.py (Make changes to allow for reindexing specific chains and also to replace celenocysteine, "U" with "X")
# https://zhanglab.ccmb.med.umich.edu/reindex_pdb/pdb2fasta.py (Almost the same)
# Let us preprocess ALL the available data and create .npz files which contain pdb_id, chain_id, sequence, length, labels
# %debug
# from IPython.core.debugger import set_trace
import numpy as np
from reindex_pdb import reindex_pdb
from time import time
from Bio.PDB import PDBParser, is_aa
from Bio import BiopythonWarning
import warnings
from rdkit import Chem

warnings.simplefilter("ignore", BiopythonWarning)
parser = PDBParser()


def initialize_protein_info(pdb_id_struct, chain_id):
    pre = os.path.join(raw_dir, pdb_id_struct)
    protein = {}
    protein["structure"] = parser.get_structure(
        pdb_id_struct, os.path.join(pre, "tmp.pdb")
    )

    protein["residues"] = []
    for res in protein["structure"][0][chain_id]:
        id = res.get_id()
        if is_aa(res, standard=False) and id[0] == " ":
            protein["residues"].append(res)

    protein["sequence"] = ""
    with open(os.path.join(pre, chain_id + ".fasta")) as f:
        line = f.readline()
        line = f.readline()
        while line != "" and line is not None:
            protein["sequence"] += line.strip()
            line = f.readline()
    return protein


def initialize_ligand_info(pdb_id_struct, chain_id):
    pre = os.path.join(raw_dir, pdb_id_struct)
    ligand = {}
    ligand["supplier"] = Chem.SDMolSupplier(
        os.path.join(pre, "ligand.sdf"), sanitize=False
    )
    assert len(ligand["supplier"]) == 1
    ligand["supplier"] = ligand["supplier"][0]
    assert ligand["supplier"].GetNumConformers() == 1
    ligand["coords"] = ligand["supplier"].GetConformer().GetPositions()
    ligand["num_atoms"] = ligand["supplier"].GetNumAtoms()
    assert ligand["num_atoms"] == len(ligand["coords"])
    ligand["atom_types"] = np.array(
        [atom.GetSymbol() for atom in ligand["supplier"].GetAtoms()]
    )
    return ligand


def find_residues_in_contact(protein, ligand):
    """
    Returns a numpy 1D array where a 1 represents that the amino acid is in
    contact with the ligand
    """
    labels = np.zeros(len(protein["sequence"]))
    for residue in protein["residues"]:
        res_ind = residue.get_id()[1] - 1
        for atom in residue.get_atoms():
            if atom.get_fullname()[1] == "H":
                continue
            for i in range(ligand["num_atoms"]):
                if ligand["atom_types"][i] == "H":
                    continue
                # We are considering the ligand to be in contact with the AA
                # if the distance between them is within 5A
                if np.linalg.norm(atom.get_coord() - ligand["coords"][i]) <= 5.0:
                    labels[res_ind] = 1
                    break
            # We know that the residue is in contact with ligand
            # So go to the next residue
            if labels[res_ind]:
                break
    return labels


def get_distance_map_true(protein, atom_type):
    length = len(protein["sequence"])
    num_residues = len(protein["residues"]) # Might be different from length because of missing residues
    distance_map = np.full((length, length), np.inf)  # Initialize to infinite distance
    for ind1 in range(num_residues):
        res1 = protein["residues"][ind1]
        if res1.has_id(atom_type) is False:
            continue
        res1_ind = res1.get_id()[1] - 1
        for ind2 in range(ind1 + 1, num_residues):
            res2 = protein["residues"][ind2]
            if res2.has_id(atom_type) is False:
                continue
            res2_ind = res2.get_id()[1] - 1
            dist = np.linalg.norm(res1[atom_type].get_coord() - res2[atom_type].get_coord())
            distance_map[res1_ind][res2_ind] = dist
            distance_map[res2_ind][res1_ind] = dist
    np.fill_diagonal(distance_map, 0.0)
    return distance_map

if not os.path.exists(preprocessed_dir):
    os.mkdir(preprocessed_dir)

process_time = 0
write_time = 0
for pdb_id_struct in sorted(os.listdir(raw_dir)):
    pre = os.path.join(raw_dir, pdb_id_struct)
    if not os.path.exists(os.path.join(pre, "downloaded.pdb")):
        print("Downloaded PDB does not exist for %s" % pdb_id_struct)
        continue
    process_time_start = time()
    for file in os.listdir(pre):
        # Get only the chain fasta sequences
        if file[2:] != "fasta":
            continue
        chain_id = file[0]

        # If our preprocessed file exists, continue
        if os.path.exists(
            os.path.join(preprocessed_dir, pdb_id_struct, chain_id + ".npz")
        ):
            continue

        print(pdb_id_struct, chain_id)
        # Reindex the chain and write to tmp.pdb
        dest = os.path.join(pre, "tmp.pdb")
        PDBtxt_reindex = reindex_pdb(
            os.path.join(pre, chain_id + ".fasta"),
            os.path.join(pre, "downloaded.pdb"),
            True,
        )
        # set_trace()
        if PDBtxt_reindex is None:
            print(pdb_id_struct, chain_id, "reindex fail")
            continue

        with open(dest, "w") as fp:
            fp.write(PDBtxt_reindex)

        # Initialize information required for the complex
        protein = initialize_protein_info(pdb_id_struct, chain_id)
        ligand = initialize_ligand_info(pdb_id_struct, chain_id)

        # Make the dictionary for storage
        try:
            data = {}
            data["pdb_id_struct"] = pdb_id_struct
            data["chain_id"] = chain_id
            data["sequence"] = protein["sequence"]
            data["length"] = len(data["sequence"])
            data["labels"] = find_residues_in_contact(protein, ligand)
            data["ca_dist_map_true"] = get_distance_map_true(protein, "CA")
            assert len(data["sequence"]) == len(data["labels"])
        except:
            print(pdb_id_struct, chain_id, "dictionary fail")
            continue

        # Remove the tmp.pdb file
        os.remove(dest)
        process_time += time() - process_time_start

        # Write the data to a numpy .npz file
        write_time_start = time()
        folder = os.path.join(preprocessed_dir, pdb_id_struct)
        if not os.path.exists(folder):
            os.mkdir(folder)
        np.savez(os.path.join(folder, chain_id + ".npz"), **data)
        write_time += time() - write_time_start

print("Processing time:", process_time)
print("Write time:", write_time)


1bdm_2 A
1bdm_2 A dictionary fail
1ctr_1 A
1ctr_1 A dictionary fail
1e94_1 F
1e94_1 F dictionary fail
1gar_1 A
1gar_1 A dictionary fail
1gar_1 B
1gar_1 B dictionary fail
1glb_1 F
1glb_1 F dictionary fail
1glb_1 G
1glb_1 G dictionary fail
1hjf_1 A
1hjf_1 A dictionary fail
1kms_1 A
1kms_1 A dictionary fail
1m1d_1 D
1m1d_1 D reindex fail
1mab_1 B
1mab_1 B dictionary fail
1mab_2 B
1mab_2 B dictionary fail
1oe5_1 A
1oe5_1 A dictionary fail
1ol5_1 B
1ol5_1 B dictionary fail
1olm_2 C
1olm_2 C dictionary fail
1olm_2 A
1olm_2 A dictionary fail
1pp9_1 B
1pp9_1 B dictionary fail
1pp9_1 T
1pp9_1 T dictionary fail
1pp9_1 G
1pp9_1 G dictionary fail
1ppj_2 W
1ppj_2 W dictionary fail
1ppj_2 O
1ppj_2 O dictionary fail
1ppj_2 J
1ppj_2 J dictionary fail
1ppj_2 T
1ppj_2 T dictionary fail
1ppj_2 G
1ppj_2 G dictionary fail
1ppj_3 W
1ppj_3 W dictionary fail
1ppj_3 O
1ppj_3 O dictionary fail
1ppj_3 J
1ppj_3 J dictionary fail
1ppj_3 T
1ppj_3 T dictionary fail
1ppj_3 G
1ppj_3 G dictionary fail
1pu8_2 A
1pu8_2 A

3mvq_7 J
3mvq_7 J reindex fail
3mvq_7 K
3mvq_7 K reindex fail
3mvq_7 H
3mvq_7 H reindex fail
3mvq_7 D
3mvq_7 I
3mvq_7 I reindex fail
3mvq_7 G
3mvq_7 G reindex fail
3mvq_7 L
3mvq_7 L reindex fail
3mvw_1 A
3mvw_1 B
3mvy_2 A
3mvy_2 B
3mvz_2 A
3mvz_2 B
3mw0_2 A
3mw0_2 B
3mw9_6 E
3mw9_6 F
3mw9_6 C
3mw9_6 A
3mw9_6 B
3mw9_6 D
3mws_1 A
3mws_1 B
3mwu_1 A
3mww_1 A
3mww_1 B
3mx2_2 C
3mx2_2 A
3mx2_2 B
3mx5_1 C
3mx5_1 A
3mx5_1 B
3mxd_1 A
3mxd_1 B
3mxe_1 A
3mxe_1 B
3my0_6 R
3my0_6 E
3my0_6 F
3my0_6 N
3my0_6 P
3my0_6 C
3my0_6 W
3my0_6 Q
3my0_6 X
3my0_6 O
3my0_6 A
3my0_6 V
3my0_6 B
3my0_6 J
3my0_6 M
3my0_6 K
3my0_6 U
3my0_6 H
3my0_6 T
3my0_6 S
3my0_6 D
3my0_6 I
3my0_6 G
3my0_6 L
3my1_1 A
3my1_1 B
3my5_2 C
3my5_2 A
3my5_2 B
3my5_2 D
3myk_1 X
3myk_2 X
3myq_1 A
3myt_1 C
3myt_1 A
3myt_1 B
3myt_1 D
3myu_1 A
3myu_1 B
3myz_1 A
3myz_1 B
3mz9_2 A
3mz9_2 B
3mzb_2 A
3mzb_2 B
3mzc_1 A
3mzh_3 A
3mzh_3 B
3mzi_6 E
3mzi_6 F
3mzi_6 C
3mzi_6 A
3mzi_6 B
3mzi_6 D
3mzs_2 C
3mzs_2 A
3mzs_2 B
3mzs_2 D
3mzt_1 E
3mzt_1 F
3mzt

3oee_3 E
3oee_3 F
3oee_3 1
3oee_3 N
3oee_3 P
3oee_3 C
3oee_3 Z
3oee_3 W
3oee_3 Q
3oee_3 X
3oee_3 O
3oee_3 Y
3oee_3 A
3oee_3 V
3oee_3 B
3oee_3 J
3oee_3 M
3oee_3 K
3oee_3 U
3oee_3 H
3oee_3 T
3oee_3 S
3oee_3 D
3oee_3 I
3oee_3 G
3oee_3 L
3oeh_2 R
3oeh_2 E
3oeh_2 F
3oeh_2 1
3oeh_2 N
3oeh_2 P
3oeh_2 C
3oeh_2 Z
3oeh_2 W
3oeh_2 Q
3oeh_2 X
3oeh_2 O
3oeh_2 Y
3oeh_2 A
3oeh_2 V
3oeh_2 B
3oeh_2 J
3oeh_2 M
3oeh_2 K
3oeh_2 U
3oeh_2 H
3oeh_2 T
3oeh_2 S
3oeh_2 D
3oeh_2 I
3oeh_2 G
3oeh_2 L
3oes_1 A
3oet_2 E
3oet_2 F
3oet_2 C
3oet_2 A
3oet_2 B
3oet_2 H
3oet_2 D
3oet_2 G
3oev_1 R
3oev_1 E
3oev_1 F
3oev_1 1
3oev_1 N
3oev_1 P
3oev_1 C
3oev_1 Z
3oev_1 W
3oev_1 Q
3oev_1 X
3oev_1 O
3oev_1 Y
3oev_1 A
3oev_1 V
3oev_1 B
3oev_1 J
3oev_1 M
3oev_1 K
3oev_1 U
3oev_1 H
3oev_1 T
3oev_1 S
3oev_1 2
3oev_1 D
3oev_1 I
3oev_1 G
3oev_1 L
3oew_1 A
3oey_1 A
3oez_1 A
3oez_1 B
3of1_1 A
3of2_1 A
3of3_9 E
3of3_9 F
3of3_9 C
3of3_9 A
3of3_9 B
3of3_9 J
3of3_9 K
3of3_9 H
3of3_9 D
3of3_9 I
3of3_9 G
3of3_9 L
3of4_2 C
3of4_2 A
3of4_2 B
3

3q0g_2 F
3q0g_2 C
3q0g_2 A
3q0g_2 B
3q0g_2 D
3q0j_2 E
3q0j_2 F
3q0j_2 C
3q0j_2 A
3q0j_2 B
3q0j_2 D
3q0u_1 A
3q0v_2 A
3q0v_2 B
3q0w_1 A
3q0z_2 A
3q0z_2 B
3q1f_1 A
3q1f_1 B
3q1k_1 C
3q1k_1 A
3q1k_1 B
3q1k_1 D
3q23_1 A
3q23_1 B


In [21]:
# Get amino acid properties from AA_INDEX database by using correlation and stuff
# Need to write the code for this
# TODO

def get_amino_acid_properties(csv_file):
    feats = {}
    with open(csv_file) as f:
        records = csv.reader(f)
        for i, row in enumerate(records):
            if i == 0:
                length = len(row) - 1
                continue
            feats[row[0]] = np.array(
                [float(el) if el != "" else 0.0 for el in row[1:]], dtype=np.float32
            )
        feats["X"] = np.zeros(length)
    feats = defaultdict(lambda: np.zeros(length), feats)
    return feats



In [32]:
# Assuming that all the PSSMs have been copied to data/scPDB/pssm
# We can have more features included as well. For now, let us consider PSSMs
# !rsync -avP --include="*/" --include="*pssm" --exclude="*" ~/Git/msa-generator/data/scPDB/ ~/Git/protein-binding-site-prediction/data/scPDB/pssm/
# Now, let us preprocess the files again to generate the features directly that can be imported into pytorch easily

# USING CONCATENATION STRATEGY

from collections import defaultdict
import numpy as np
import csv

# List of amino acids and their integer representation
AA_ID_DICT = {
    "X": 0,
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
}
AA_ID_DICT = defaultdict(lambda: 0, AA_ID_DICT)

# One-hot encoding and positional encoding
feat_vec_len = 21
feat_vec_len += 1

# We generated PSSMs for select sequences
# Mapping different sequences to the one for which we generated
common_pssms = defaultdict(str)
with open(os.path.join(data_dir, "unique"), "r") as f:
    for line in f.readlines():
        line = line.strip().split()
        pssm_generated_for = line[0][:-1]
        common_pssms[pssm_generated_for] = pssm_generated_for
        for pdb_id_struct_chain in line[1:]:
            pdb_id_struct_chain = pdb_id_struct_chain[:-1]
            common_pssms[pdb_id_struct_chain] = pssm_generated_for


def get_pssm(pdb_id_struct, chain_id, length):
    pssm_pdb_id_struct, pssm_chain_id = common_pssms[pdb_id_struct + "/" + chain_id].split("/")
    with open(os.path.join(pssm_dir, pssm_pdb_id_struct, pssm_chain_id + ".pssm"), "r") as f:
        lines = f.readlines()
    feature = np.zeros((21, length))
    for i, line in enumerate(lines):
        feature[i] = np.array(line.strip().split(), dtype=np.float32)
    return feature

# PSSM length
feat_vec_len += 21

# Amino acid physico-chemical features selected by removing highly correlated features
AA_sel_feats = get_amino_acid_properties(os.path.join(data_dir, "selected_features.csv"))
feat_vec_len += len(AA_sel_feats["X"])


def generate_input(sample):
    """
    Generate input for a single sample which is a dictionary containing required items
    """
    X = np.zeros((feat_vec_len, sample["length"]))

    # One-hot encoding
    X[:21] = np.array([np.eye(21)[AA_ID_DICT[el]] for el in sample["sequence"]]).T

    # Positional encoding
    X[21] = np.arange(1, sample["length"] + 1, dtype=np.float32) / sample["length"]

    # PSSM
    X[22:43] = sample["pssm"]

    # AA Properties
    X[43:] = np.array([AA_sel_feats[aa] for aa in sample["sequence"]]).T

    return X


for pdb_id_struct in sorted(os.listdir(preprocessed_dir)):
    flg = True
    pre = os.path.join(preprocessed_dir, pdb_id_struct)
    features_file = os.path.join(pre, "features.npy")
    labels_file = os.path.join(pre, "labels.npy")

    if os.path.exists(labels_file):
        continue
    print(pdb_id_struct)

    for file in sorted(os.listdir(pre)):
        # In case features were generated but not labels, redo it
        if file == "features.npz":
            continue
        chain_id = file[-len(".npz") - 1 : -len(".npz")]
        sample = np.load(os.path.join(pre, file))
        sample = {
            key: sample[key].item() if sample[key].shape is () else sample[key]
            for key in sample
        }
        sample["pssm"] = get_pssm(pdb_id_struct, chain_id, sample["length"])
        if flg:
            X = generate_input(sample)
            y = sample["labels"]
            flg = False
        else:
            # Using concatenation strategy
            tmp = generate_input(sample)
            X = np.concatenate((X, tmp), 1)
            y = np.concatenate((y, sample["labels"]), 0)

    np.save(features_file, X)
    np.save(labels_file, y)

# ISSUES

<input type="checkbox"> Improve downloading PDB and fasta by using something that resumes downloads

<input type="checkbox"> Not sure if checking for a space in the residue is the best way of checking. Can use the code_with_modified_residues dictionary from NWalign.py (https://zhanglab.ccmb.med.umich.edu/NW-align/NWalign.py) (Should be OK)

<input type="checkbox"> Make the pdb_id field in the preprocessed files into pdb_id_struct

