In [1]:
# Package imports
import numpy as np
import pandas as pd
import mdtraj as md
from Bio import AlignIO
import os

# Local imports
import sys

# Append the directory four levels above the current working directory to the
# module search path before importing StructureInformed (presumably that is
# where the module lives -- confirm against the repository layout).
current_dir = os.path.abspath('')
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', '..'))
sys.path.append(project_root)
import StructureInformed

In [2]:
# Define the input parameters

# Template structure for identifying important residues
template_structure = md.load_pdb("../../../../Data/6SU6.pdb")

# Name of the output file
outname = 'strucinform'

# Alignment file for matching the residues from the template protein to other proteins
align = AlignIO.read('../alignment/seqs.afa', "fasta")

# Raw encoding file used for generating the embeddings
raw_encoding_name = 'atchley'

# Name of the template protein in MSA
template_name = 'Pt_UGT1'

# Method for extracting the important residues, can be 'spherical' or 'manual'
method = 'spherical'

# Spherical extraction parameters
# List of IDs of the residues used for spherical identification
centroid_ids = [26]
# Radius for the spherical extraction in Angstroms
radius = 12

# Manual extraction parameters
# List of important residues for manual extraction
important_residues = [
    198, 399, 75, 381, 222, 296, 430, 188,
    148, 190, 86, 110, 413, 146, 388, 297,
]

In [3]:
# Identify the important residues from the template protein.
# Produces `resid` and `res`, which the matching step consumes.
if method == 'spherical':
    # radius is given in Angstroms; divide by 10 to convert to nm. The value is
    # kept as a float: truncating with int() would turn 12 A into 1 nm (10 A)
    # and silently shrink the sphere relative to the radius written into the
    # output file name.
    # NOTE(review): assumes sphere_extraction accepts a fractional radius -- confirm.
    radius_nm = radius / 10
    resid, res = StructureInformed.sphere_extraction(template_structure, radius_nm, centroid_ids)
elif method == 'manual':
    resid, res = StructureInformed.manual_selection(template_structure, important_residues)
else:
    # Fail early on an unknown method instead of leaving resid/res undefined
    raise ValueError('Method not recognized')

In [4]:
# Map the selected template residues onto the other proteins in the alignment
matched = StructureInformed.matching(align, template_name, resid, res)

# Load the raw encoding table (first CSV column is used as the index) and
# encode the matched residues with it
raw_encoding_path = f'../../encodings/raw/{raw_encoding_name}.csv'
raw_encoding = pd.read_csv(raw_encoding_path, index_col=0)
encoded = StructureInformed.encoding_matched(matched, raw_encoding)

In [None]:
# Creating the output path: a common '<outname>_<method>' prefix followed by a
# method-specific suffix that records the extraction settings and the encoding.
outpath = f'../../encodings/{outname}_{method}'
if method == 'spherical':

    # Combine the centroid ids into a string; together with the '_c' prefix
    # below, every id ends up as its own '_c<id>' segment
    ids = '_c'.join(str(x) for x in centroid_ids)
    outpath += f'_c{ids}_r{radius}_{raw_encoding_name}.csv'
elif method == 'manual':

    # Use the number of important residues as identifier (appended directly
    # after the method name, without a separator). The suffix must use
    # raw_encoding_name (a string); raw_encoding is the loaded encoding table
    # and would put its whole printed form into the file name.
    outpath += f'{len(important_residues)}_{raw_encoding_name}.csv'
else:
    raise ValueError('Method not recognized')

# Save the encoding to a file (tab-separated, despite the .csv extension)
encoded.to_csv(outpath, sep='\t')