In [None]:
# delete this cell if working on Pycharm
!pip install Bio

Collecting Bio
  Downloading bio-1.3.8-py3-none-any.whl (269 kB)
[?25l[K     |█▏                              | 10 kB 20.9 MB/s eta 0:00:01[K     |██▍                             | 20 kB 23.4 MB/s eta 0:00:01[K     |███▋                            | 30 kB 15.5 MB/s eta 0:00:01[K     |████▉                           | 40 kB 13.0 MB/s eta 0:00:01[K     |██████                          | 51 kB 12.5 MB/s eta 0:00:01[K     |███████▎                        | 61 kB 14.3 MB/s eta 0:00:01[K     |████████▌                       | 71 kB 12.8 MB/s eta 0:00:01[K     |█████████▊                      | 81 kB 12.4 MB/s eta 0:00:01[K     |███████████                     | 92 kB 13.5 MB/s eta 0:00:01[K     |████████████▏                   | 102 kB 14.6 MB/s eta 0:00:01[K     |█████████████▍                  | 112 kB 14.6 MB/s eta 0:00:01[K     |██████████████▋                 | 122 kB 14.6 MB/s eta 0:00:01[K     |███████████████▉                | 133 kB 14.6 MB/s eta 0:00:01

In [None]:
import os
import pickle
import re

import numpy as np
from Bio.PDB import *
from Bio.SeqUtils import seq1, seq3
from tqdm import tqdm

In [None]:
# maximal number of residues kept per nanobody (shorter sequences are padded up to it)
NB_MAX_LENGTH = 140

# one-hot column of every sequence letter: the 20 amino acids, then "X" (unknown) and "-" (padding).
# The position of a letter in the string below is its column index.
AA_DICT = {letter: column for column, letter in enumerate("ACDEFGHIKLMNPQRSTWYVX-")}
FEATURE_NUM = len(AA_DICT)

# atoms whose coordinates are stored per residue, 3 values (x, y, z) each
BACKBONE_ATOMS = "N CA C O CB".split()
OUTPUT_SIZE = 3 * len(BACKBONE_ATOMS)

# chain id of the nanobody heavy chain in the pdb files
NB_CHAIN_ID = "H"


In [None]:
def get_seq_aa(pdb_file, chain_id):
    """
    returns the sequence (String) and a list of all the aa residue objects of the given protein chain.
    Residues that is_aa() does not accept, or that have no CA atom, are skipped. Every kept residue
    contributes exactly one letter, so seq[i] always describes aa_residues[i].
    A residue name that does not translate to one of the 20 standard letters (for example "UNK") is
    written as "X".
    :param pdb_file: path to a pdb file
    :param chain_id: chain letter (char)
    :return: sequence, [aa objects]
    """
    # a set (not a string) so that an empty translation result is not treated as a valid letter
    standard_letters = frozenset("ACDEFGHIKLMNPQRSTVWY")

    # load the first model and take the requested chain
    chain = PDBParser(QUIET=True).get_structure(pdb_file, pdb_file)[0][chain_id]

    aa_residues = []
    letters = []

    for residue in chain.get_residues():
        res_name = residue.get_resname()
        if not is_aa(res_name) or not residue.has_id('CA'):  # Not amino acid
            continue
        # seq1 replaces Polypeptide.three_to_one, which newer Biopython releases no longer provide;
        # it returns "X" for names it does not know instead of raising
        letter = seq1(res_name)
        letters.append(letter if letter in standard_letters else "X")
        aa_residues.append(residue)

    return "".join(letters), aa_residues

In [None]:
def generate_input(pdb_file):
    """
    receives a pdb file and returns its sequence as a one-hot encoding matrix: row i describes position i
    of the sequence and has a single 1, in the column that AA_DICT assigns to the letter at that position.
    Sequences shorter than NB_MAX_LENGTH are padded with "-", longer ones are cut at NB_MAX_LENGTH.
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H')
    :return: numpy array of size (NB_MAX_LENGTH * FEATURE_NUM)
    """
    sequence, _ = get_seq_aa(pdb_file, NB_CHAIN_ID)

    # bring the sequence to exactly NB_MAX_LENGTH letters
    fixed_length_seq = sequence[:NB_MAX_LENGTH].ljust(NB_MAX_LENGTH, "-")

    # column index of every position, then set all the ones in a single indexed assignment
    columns = [AA_DICT[letter] for letter in fixed_length_seq]
    encoding = np.zeros((NB_MAX_LENGTH, FEATURE_NUM))
    encoding[np.arange(NB_MAX_LENGTH), columns] = 1

    return encoding

In [None]:
def generate_label(pdb_file):
    """
    receives a pdb file and returns the xyz coordinates of its backbone + CB atoms, one row per residue.
    Row i holds the atoms of residue i in the order of BACKBONE_ATOMS, three columns (x, y, z) per atom.
    Rows after the last residue stay zero, and so do the columns of an atom the residue does not have
    (CB of glycine, or an atom that is absent from the file).
    Residues beyond NB_MAX_LENGTH are dropped, which matches the cut that generate_input applies to the
    sequence.
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H')
    :return: numpy array of size (NB_MAX_LENGTH * OUTPUT_SIZE)
    """
    # get seq and aa residues
    seq, aa_residues = get_seq_aa(pdb_file, NB_CHAIN_ID)

    # turn into backbone + CB xyz matrix
    xyz_matrix = np.zeros((NB_MAX_LENGTH, OUTPUT_SIZE))
    # slicing keeps the row index inside the matrix for chains longer than NB_MAX_LENGTH
    for i, residue in enumerate(aa_residues[:NB_MAX_LENGTH]):
        for j, atom in enumerate(BACKBONE_ATOMS):
            if atom == "CB" and seq[i] == "G":  # GLY lacks CB atom
                continue
            if not residue.has_id(atom):  # atom missing from the file, leave its columns at zero
                continue
            xyz_matrix[i][3*j:3*j+3] = residue[atom].get_coord()

    return xyz_matrix


In [None]:
def matrix_to_pdb(seq, coord_matrix, pdb_name):
    """
    Receives a sequence (String) and the output matrix of the neural network (coord_matrix, numpy array)
    and creates from them a PDB file named pdb_name.pdb.
    One ATOM line is written per atom of BACKBONE_ATOMS for every residue (CB is skipped for glycine),
    followed by a closing END line. All atoms are placed on chain "H" with occupancy 1.00 and
    B-factor 0.00. Atom serial numbers start at 1, residue numbers are the 0-based position in seq.
    The letter "X" is written as residue name "UNK".
    :param seq: protein sequence (String), with no padding
    :param coord_matrix: output np array of the nanobody neural network, shape = (NB_MAX_LENGTH, OUTPUT_SIZE)
    :param pdb_name: name of the output PDB file (String)
    """
    b_factor = 0.00
    serial = 1
    with open(f"{pdb_name}.pdb", "w") as pdb_file:
        for i, aa in enumerate(seq):
            # seq3 replaces Polypeptide.one_to_three, which newer Biopython releases no longer provide.
            # "X" is mapped by hand because seq3 would name it "Xaa" rather than "UNK".
            res_name = "UNK" if aa == "X" else seq3(aa, undef_code="Unk").upper()
            row = coord_matrix[i]
            for j, atom in enumerate(BACKBONE_ATOMS):
                if aa == "G" and atom == "CB":  # GLY lacks CB atom
                    continue
                x, y, z = row[3*j], row[3*j+1], row[3*j+2]
                # the field widths reproduce the fixed column layout of a PDB ATOM record
                pdb_file.write(
                    f"ATOM{serial:>7}  {atom:<4}{res_name} H{i:>4}"
                    f"{x:>12.3f}{y:>8.3f}{z:>8.3f}"
                    f"  1.00{b_factor:>6.2f}" + " " * 11 + f"{atom[0]}\n"
                )
                serial += 1

        pdb_file.write("END\n")

In [None]:
if __name__ == '__main__':

    # Build all the data for the network in this section and save it with np.save, so the matrices
    # do not have to be generated again each time a network is trained.
    # The saved files can be kept on your drive and loaded in your google colab file later.

    input_matrix = []
    labels_matrix = []
    bad = []  # directory entries that were skipped because they are not regular files
    data_path = "/content/drive/MyDrive/Colab Notebooks/BIO3D_EX4_2022/Ex4Data"  # TODO: change path if needed

    # os.listdir returns the entries in arbitrary order; sorting keeps the sample order the same between runs
    for pdb in tqdm(sorted(os.listdir(data_path))):
        pdb_path = os.path.join(data_path, pdb)
        if not os.path.isfile(pdb_path):  # e.g. a checkpoint sub-directory, which cannot be parsed
            bad.append(pdb)
            continue

        nb_one_hot = generate_input(pdb_path)
        nb_xyz = generate_label(pdb_path)

        input_matrix.append(nb_one_hot)
        labels_matrix.append(nb_xyz)

    save_path = "/content/drive/MyDrive/Colab Notebooks/BIO3D_EX4_2022"  # TODO: change path if needed

    np.save(f"{save_path}/train_input.npy", np.array(input_matrix))
    np.save(f"{save_path}/train_labels.npy", np.array(labels_matrix))

    print("Number of samples: {}".format(len(input_matrix)))
    if bad:
        print("Skipped entries: {}".format(bad))



100%|██████████| 1974/1974 [01:48<00:00, 18.25it/s]


Number of samples: 1974
