In [8]:
from sequence_models.pretrained import load_model_and_alphabet
from sequence_models.pdb_utils import parse_PDB, process_coords
import torch
import numpy as np
import pandas as pd
import os
from Bio import AlignIO

# Local imports
import sys
# The project root is four levels above the current working directory;
# it has to be importable so that the `src` package below can be resolved.
current_dir = os.path.abspath('')
project_root = os.path.abspath(os.path.join(current_dir, '../../../../'))
# Register the root only once: without this guard every re-run of the cell
# appends another duplicate entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from src.helper_functions import alignment_to_embedding


In [2]:
# ---- Input parameters ----

# Folder that is walked for the PDB files to be embedded
folder_path = 'AF2_models/original/'

# Prefix used for the names of the output TSV files
outname = 'mif'

# Alignment (FASTA format) used for generating the mean aligned embeddings
align = AlignIO.read(
    '../alignment/seqs.afa',
    "fasta",
)

# Model and matching collater, both restored from the local checkpoint
model, collater = load_model_and_alphabet(
    'checkpoints/mif.pt'
)

  model_data = torch.load(model_name, map_location="cpu")


In [7]:
# Sanity check: display the type of the object returned by AlignIO.read
type(align)

Bio.Align.MultipleSeqAlignment

In [3]:
# Initialize dictionaries to store the raw representations and, per protein,
# the mean over the embedding axis (one value per residue), which is what
# alignment_to_embedding receives later on
raw_rep_dict = {}
mean_rep_dict = {}

# Initialize DataFrames to store mean pooling representations, the MIF representation has 256 dimensions
rep_mean_pooled_df = pd.DataFrame(columns=np.arange(256))
rep_mean_pooled_Nt_df = pd.DataFrame(columns=np.arange(256))

# Inference only: make sure the model is in evaluation mode so that train-time
# layers (e.g. dropout, if the model contains any) cannot alter the embeddings
model.eval()

# Gradients are never needed here; disabling autograd avoids building a graph
# for every structure, which saves memory and time
with torch.no_grad():

    # Loop over the PDB files in the folder
    for path, dirs, files in os.walk(folder_path):

        # Sorting in place makes os.walk visit the sub-folders in a
        # reproducible order, independent of the file system
        dirs.sort()

        # Only model folders are processed; these do not contain any directories
        if dirs:
            continue

        # Get the protein name and the path to the PDB file
        prot_name = os.path.basename(path)
        pdb_path = os.path.join(path, prot_name + '_ranked_0.pdb')

        # Parse the PDB file and keep the three backbone atoms that
        # process_coords is given (columns 0, 1, 2 -> N, CA, C)
        coords, wt, _ = parse_PDB(pdb_path)
        backbone = {
            'N': coords[:, 0],
            'CA': coords[:, 1],
            'C': coords[:, 2],
        }
        dist, omega, theta, phi = process_coords(backbone)

        # Create a batch (of size one) with the sequence and the structural features
        batch = [[
            wt,
            torch.tensor(dist, dtype=torch.float),
            torch.tensor(omega, dtype=torch.float),
            torch.tensor(theta, dtype=torch.float),
            torch.tensor(phi, dtype=torch.float),
        ]]

        # Get the representation of the protein
        src, nodes, edges, connections, edge_mask = collater(batch)
        rep = model(src, nodes, edges, connections, edge_mask)

        # Convert the single batch element to NumPy once and reuse it below
        per_residue = rep[0].detach().numpy()

        # Store the raw representation and its mean over the embedding axis
        raw_rep_dict[prot_name] = per_residue
        mean_rep_dict[prot_name] = per_residue.mean(axis=1)

        # Store the mean pooling representations for the full sequence and the N-terminal half
        n_half = len(per_residue) // 2
        rep_mean_pooled_df.loc[prot_name] = per_residue.mean(axis=0)
        rep_mean_pooled_Nt_df.loc[prot_name] = per_residue[:n_half].mean(axis=0)

In [9]:
# Build the mean aligned embedding tables (full-length and Nt variant) from the
# alignment and the per-protein mean representations
(rep_mean_aligned_df,
 rep_mean_aligned_Nt_df) = alignment_to_embedding(align, mean_rep_dict)

In [10]:
# Pair every embedding table with the TSV file it is written to; the file
# names are derived from the output name defined above
encoding_outputs = [
    (rep_mean_aligned_df, f'../../encodings/{outname}_align.tsv'),
    (rep_mean_pooled_df, f'../../encodings/{outname}_pool.tsv'),
    (rep_mean_aligned_Nt_df, f'../../encodings/{outname}_align_Nt.tsv'),
    (rep_mean_pooled_Nt_df, f'../../encodings/{outname}_pool_Nt.tsv'),
]

# Name the index of every table 'enzyme' to ensure compatibility with the evaluation scripts
for encoding_df, _ in encoding_outputs:
    encoding_df.index.names = ['enzyme']

# Save the embeddings as tab-separated files
for encoding_df, tsv_path in encoding_outputs:
    encoding_df.to_csv(tsv_path, sep='\t')