In [None]:
import json
import os

# Extract one amino-acid sequence per PDB file and cache the mapping
# (PDB id -> one-letter sequence string) as JSON.
#
# The guard checks the file this cell actually writes, so the (slow) parsing
# is skipped only when its own output is already on disk. `json` is imported
# unconditionally so it is available on a fresh kernel even when the guard
# is false.
if not os.path.exists("../datasets/processed/pdb_id_to_sequence.json"):
    from Bio import PDB

    PDB_DIRECTORY = "../datasets/raw/pdbs/"
    pdb_parser = PDB.PDBParser(QUIET=True)

    sequences = {}

    # Sorted so the key order of the resulting JSON is reproducible.
    for filename in sorted(os.listdir(PDB_DIRECTORY)):
        pdb_file_path = os.path.join(PDB_DIRECTORY, filename)

        # Skip sub-directories; only regular files can be parsed.
        if not os.path.isfile(pdb_file_path):
            continue

        structure = pdb_parser.get_structure("protein", pdb_file_path)

        # Keep only standard amino acids and map 3-letter to 1-letter codes.
        # NOTE(review): residues of every model and every chain are
        # concatenated into a single string; for multi-model files this would
        # repeat the sequence once per model -- confirm this is intended.
        residue_letters = [
            PDB.Polypeptide.protein_letters_3to1[residue.get_resname()]
            for model in structure
            for chain in model
            for residue in chain
            if PDB.is_aa(residue, standard=True)
        ]

        # Strip the extension without assuming it is exactly 4 characters.
        protein_name = os.path.splitext(filename)[0]
        sequences[protein_name] = "".join(residue_letters)
        print(f"{protein_name}'s sequence has been extracted from the PDB file")

    with open("../datasets/processed/pdb_id_to_sequence.json", "w") as f:
        json.dump(sequences, f)

In [None]:
# Imported in this cell so loading works on a fresh kernel even when the
# sequence-extraction step is skipped.
import json

# Read the cached PDB id -> sequence mapping back from disk.
with open("../datasets/processed/pdb_id_to_sequence.json", "r") as f:
    pdb_id_to_sequence_str = f.read()

pdb_id_to_sequence = json.loads(pdb_id_to_sequence_str)

# Display the loaded PDB ids as the cell output.
pdb_id_to_sequence.keys()

In [None]:
from allennlp.commands.elmo import ElmoEmbedder
from pathlib import Path

# Directory holding the pre-trained SeqVec model files.
model_dir = Path("../datasets/seqvec/uniref50_v2")

# The embedder needs two files from that directory: the options JSON and the
# HDF5 weights.
options = model_dir.joinpath("options.json")
weights = model_dir.joinpath("weights.hdf5")

# cuda_device=-1 is passed through to ElmoEmbedder unchanged.
seqvec_model = ElmoEmbedder(options, weights, cuda_device=-1)

In [None]:
# Embed every sequence with the SeqVec model, unless the embeddings JSON is
# already present on disk.
if not os.path.exists("../datasets/processed/pdb_to_seqvec_dict.json"):
    pdb_id_to_seqvec_embedding = {}

    for pdb_id, sequence in pdb_id_to_sequence.items():
        # The embedder takes a list of tokens; here each residue letter is
        # one token.
        tokenized_seq = list(sequence)

        # NOTE(review): per the AllenNLP ElmoEmbedder docs the result is
        # expected to be [3, L, 1024] (one slice per ELMo layer), not
        # [L, 1024] -- confirm before consuming downstream.
        embeddings = seqvec_model.embed_sentence(tokenized_seq)

        # Key by the id of the sequence being embedded in this iteration.
        pdb_id_to_seqvec_embedding[pdb_id] = embeddings
        print(f"Embedded {pdb_id}")

In [None]:
# Persist the embeddings dictionary as JSON, unless the file already exists.
if not os.path.exists("../datasets/processed/pdb_to_seqvec_dict.json"):
    import json

    def _to_jsonable(obj):
        """Convert array-like values (anything exposing ``tolist``) to lists.

        Used as the ``default`` hook of ``json.dumps``, which calls it only
        for objects the encoder cannot serialize natively.

        Raises:
            TypeError: if ``obj`` has no ``tolist`` method.
        """
        if hasattr(obj, "tolist"):
            return obj.tolist()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    # Serialize BEFORE opening the file: opening in "w" mode truncates/creates
    # it immediately, so a serialization error would otherwise leave an empty
    # file behind that makes the `os.path.exists` guards skip all work on the
    # next run.
    serialized_embeddings = json.dumps(
        pdb_id_to_seqvec_embedding, default=_to_jsonable
    )

    with open("../datasets/processed/pdb_to_seqvec_dict.json", "w") as f:
        f.write(serialized_embeddings)

In [None]:
import numpy as np

# Wrapping a dict in np.array yields a 0-d object array, which np.save stores
# via pickle. Reading it back therefore needs
# np.load(..., allow_pickle=True).item() to recover the dict.
embeddings_object_array = np.array(pdb_id_to_seqvec_embedding)
np.save("../datasets/processed/pdb_to_seqvec_dict.npy", embeddings_object_array)