In [17]:
# Package imports
import progres as pg
import pandas as pd
import numpy as np
import os
import Bio.PDB as bpdb

# Local imports
# The project root is four directory levels above the current working
# directory; it is put on sys.path so that the `src` package can be imported.
import sys
current_dir = os.path.abspath('')
project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
# Only append once, so that re-running this cell does not keep growing sys.path
if project_root not in sys.path:
    sys.path.append(project_root)
from src.helper_functions import NtermSelectPDB

In [None]:
# Input parameters for this run

# Folder with the PDB files to be embedded
folder_path = 'AF2_models/original/'

# Name of the output file
outname = 'progres'

# Chain ID of the protein
chain_id = 'A'

In [None]:
# Embed every model structure found under `folder_path` with Progres, once for
# the full structure and once for its N-terminal part, and write both sets of
# embeddings to TSV files.

# Pre-load the model once so it is not re-loaded for every structure
model = pg.load_trained_model()

# Progres outputs a 128-dimensional embedding
embedding_dim = 128

# Scratch file that receives the extracted N-terminal structure of each protein
nterm_tmp_path = 'AF2_models/temp/temp.pdb'

# Folder that receives the final TSV files
encodings_dir = '../../encodings'

# os.walk silently yields nothing for a missing folder, which would otherwise
# end in two empty TSV files being written without any error
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f'Input folder not found: {folder_path!r}')

# Make sure the folders written to below exist
os.makedirs(os.path.dirname(nterm_tmp_path), exist_ok=True)
os.makedirs(encodings_dir, exist_ok=True)

# Embeddings are collected per protein name and turned into dataframes once at
# the end; growing a dataframe row by row re-allocates it on every insert
full_embeddings = {}
nterm_embeddings = {}
skipped_folders = []

# Loop over the PDB files in the folder
for path, dirs, _ in os.walk(folder_path):

    # Sorting in place fixes the traversal order, so the row order of the
    # output does not depend on the file system
    dirs.sort()

    # Ensure we are in a model folder, as these folders do not contain any directories
    if dirs:
        continue

    # Get the protein name and the path to the PDB file
    # (normpath drops a trailing separator, which would give an empty name)
    prot_name = os.path.basename(os.path.normpath(path))
    pdb_path = os.path.join(path, f'{prot_name}_ranked_0.pdb')

    # A leaf folder without the expected model file is reported and skipped
    # instead of aborting the whole run
    if not os.path.isfile(pdb_path):
        skipped_folders.append(path)
        continue

    # Embed the full protein structure and keep the embedding
    # (.cpu() is a no-op on CPU tensors and keeps .numpy() valid otherwise)
    rep = pg.embed_structure(pdb_path, model=model)
    full_embeddings[prot_name] = rep.detach().cpu().numpy()

    # Extract the N-terminal, embed it, and keep the embedding
    selector = NtermSelectPDB(pdb_path, chain_id)
    selector.save_nterm(nterm_tmp_path)
    rep_nterm = pg.embed_structure(nterm_tmp_path, model=model)
    nterm_embeddings[prot_name] = rep_nterm.detach().cpu().numpy()

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} folder(s) without a *_ranked_0.pdb file:')
    for skipped in skipped_folders:
        print(f'  {skipped}')

# Build the dataframes; the index is named 'enzyme' to ensure compatibility
# with the evaluation scripts
embedding_columns = np.arange(embedding_dim)
rep_df = pd.DataFrame(
    list(full_embeddings.values()),
    index=pd.Index(list(full_embeddings), name='enzyme'),
    columns=embedding_columns,
)
rep_Nt_df = pd.DataFrame(
    list(nterm_embeddings.values()),
    index=pd.Index(list(nterm_embeddings), name='enzyme'),
    columns=embedding_columns,
)

# Save the embeddings to a TSV file, using the output name defined above
rep_df.to_csv(f'{encodings_dir}/{outname}.tsv', sep='\t')
rep_Nt_df.to_csv(f'{encodings_dir}/{outname}_Nterm.tsv', sep='\t')