In [1]:
import sys

# allows importing local scripts (utils folder)
sys.path.append("..")

## Trying out ProtBERT

In [None]:
import torch
from utils.protbert import ProtBERTEmbedder

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
protbert = ProtBERTEmbedder(device=device)

In [3]:
tokenized_seq_example = list("MVTYDFGSDEMHD")

embedding = protbert.embed_sequence(tokenized_seq_example)
embedding.shape  # This should be [len(sequence), 1024]

(13, 1024)

In [4]:
embedding

array([[ 0.03060327,  0.02436657,  0.13633075, ..., -0.08681758,
        -0.11452153, -0.01299387],
       [ 0.03058379,  0.02417514,  0.13646212, ..., -0.08692887,
        -0.1145592 , -0.01281546],
       [ 0.03054104,  0.02443778,  0.13625446, ..., -0.08680265,
        -0.11454351, -0.01304578],
       ...,
       [ 0.03060327,  0.02436657,  0.13633075, ..., -0.08681758,
        -0.11452153, -0.01299387],
       [ 0.03054449,  0.02441703,  0.13626868, ..., -0.08681716,
        -0.11451912, -0.01304045],
       [ 0.03056773,  0.024401  ,  0.13626456, ..., -0.08680427,
        -0.11453433, -0.01304016]], dtype=float32)

## Creating a Mapping from a Protein's PDB ID to its Residue Sequence

In [5]:
import os

if not os.path.exists("../datasets/processed/pdb_id_to_sequence.json"):
    from Bio import PDB

    PDB_DIRECTORY = "../datasets/raw/pdbs/"
    pdb_parser = PDB.PDBParser(QUIET=True)

    sequences = {}

    for filename in os.listdir(PDB_DIRECTORY):
        pdb_file_path = os.path.join(PDB_DIRECTORY, filename)

        structure = pdb_parser.get_structure("protein", pdb_file_path)
        sequence = ""

        for model in structure:
            for chain in model:
                for residue in chain:
                    if PDB.is_aa(residue, standard=True):
                        sequence += PDB.Polypeptide.protein_letters_3to1[
                            residue.get_resname()
                        ]

        protein_name = filename[:-4]
        sequences[protein_name] = sequence
        print(f"{protein_name}'s sequence has been extracted from the PDB file")

In [12]:
import json

if not os.path.exists("../datasets/processed/pdb_id_to_sequence.json"):
    with open("../datasets/processed/pdb_id_to_sequence.json", "w") as f:
        f.write(json.dumps(sequences))

In [13]:
with open("../datasets/processed/pdb_id_to_sequence.json", "r") as f:
    pdb_id_to_sequence_str = f.read()

pdb_id_to_sequence = json.loads(pdb_id_to_sequence_str)
pdb_id_to_sequence.keys()

dict_keys(['1X41', '2D9K', '1X4P', '1SRQ', '2HZ6', '6HDU', '1H1W', '1AO3', '1H3H', '2JC6', '1X5T', '1O6U', '7K15', '1X4G', '4GNK', '1A9U', '2DM2', '4BQM', '2D89', '1X69', '2ASQ', '3G9V', '2J51', '1FEW', '1A6Z', '3PBL', '1XM9', '2XRW', '1QG3', '1JPL', '1V66', '1QFA', '2DCO', '2JNG', '1CX8', '2QZ4', '2DA1', '7U5O', '2FYT', '3LPO', '7E32', '1A4R', '4BHX', '5CQR', '1SJ6', '1QR2', '1T9Z', '1FQ3', '6COY', '1Z5V', '1Z57', '2JXJ', '1CNT', '1AV1', '1AUK', '133L', '1CMO', '2FLU', '2CQL', '2CRW', '2H4V', '1Z6Z', '3K7G', '2CSS', '4WRN', '3RLE', '4LG9', '1ZC0', '2O0O', '1D3E', '3RBN', '2V37', '1DFN', '2MP1', '2H8B', '1SG1', '2XIJ', '1D1Z', '3IUG', '2QCW', '1T64', '5V02', '1JHJ', '1DEB', '1DDF', '2ADO', '6C48', '1BHT', '3FQW', '3DI2', '2LXL', '2LYH', '4TNB', '5LXR', '5GK9', '2WO3', '6K41', '1UGV', '1E7Z', '1K9I', '1LJD', '3T6G', '1EAJ', '1GZU', '1ITF', '2BYG', '4M8G', '1B09', '2P31', '1IVY', '1LJ2', '1YQ7', '1ITQ', '2YMB', '5WC9', '7CIQ', '3O4O', '1BP3', '6KZQ', '5YY8', '1UR6', '3AN2', '1GLO', '1N3Y

In [14]:
import numpy as np

seq_lens = [len(seq) for seq in pdb_id_to_sequence.values()]
np.mean(seq_lens), np.median(seq_lens)

(1057.6200918964078, 575.0)

## Getting the ProtBERT embedding for Each Extracted Sequence

In [None]:
from utils.protbert import ProtBERTEmbedder

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
protbert = ProtBERTEmbedder(device=device)

In [None]:
if not os.path.exists("../datasets/processed/pdb_id_to_sequence.json"):
    pdb_id_to_protbert_embedding = {}

    for pdb_id, sequence in pdb_id_to_sequence.items():
        tokenized_seq = list(sequence)

        embeddings = protbert.embed_sequence(tokenized_seq)  # [L, 1024]
        pdb_id_to_protbert_embedding[protein_name] = embeddings
        print(f"Embedded {pdb_id}")

In [None]:
np.save(
    "../datasets/processed/pdb_to_protbert_dict.npy",
    np.array(pdb_id_to_protbert_embedding),
)