In [14]:
import glob
import os
import warnings

import mdtraj as md
import numpy as np
import pandas as pd
from Bio import PDB
from Bio.PDB import Superimposer, PDBParser, PDBIO

warnings.filterwarnings("ignore")

In [15]:
# Lookup table: three-letter residue name -> one-letter amino-acid code
# (the twenty standard residues only; anything else is absent from the table)
three_to_one = dict(
    ALA='A', CYS='C', ASP='D', GLU='E',
    PHE='F', GLY='G', HIS='H', ILE='I',
    LYS='K', LEU='L', MET='M', ASN='N',
    PRO='P', GLN='Q', ARG='R', SER='S',
    THR='T', VAL='V', TRP='W', TYR='Y',
)


In [16]:
# Collecting the reference and AlphaFold generated files - change path for AF2 and AF3

path_AF2 = 'Data/AF2_Aligned/*.pdb'
path_AF3 = 'Data/AF3_Aligned/*.pdb'
path_reference = 'Data/TM_only_final/*'


def protein_id_from_path(path):
    """Return the lower-cased text before the first underscore of the file name in ``path``."""
    return os.path.basename(path).split('_')[0].lower()


def pair_sample_and_reference_files(sample_paths, reference_paths):
    """Pair every sample file with the reference file that shares its protein identifier.

    Parameters
    ----------
    sample_paths : iterable of str
        Paths of the generated (AlphaFold) structures.
    reference_paths : iterable of str
        Candidate reference paths; entries that do not end in ``.pdb`` are ignored.

    Returns
    -------
    (list of str, list of str)
        ``(samples, references)`` of equal length where ``samples[i]`` and
        ``references[i]`` share the same identifier. Samples without a reference
        are reported on stdout and left out, so the two lists can be zipped safely.
    """
    reference_by_id = {}
    for reference_path in reference_paths:
        if reference_path.endswith('.pdb'):
            # The first reference seen for an identifier wins; later duplicates are
            # ignored so that each sample contributes exactly one reference.
            reference_by_id.setdefault(protein_id_from_path(reference_path), reference_path)

    paired_samples = []
    paired_references = []
    for sample_path in sample_paths:
        reference_path = reference_by_id.get(protein_id_from_path(sample_path))
        if reference_path is None:
            print(f"No reference structure found for {sample_path}; skipping it")
            continue
        paired_samples.append(sample_path)
        paired_references.append(reference_path)
    return paired_samples, paired_references


# Sorting makes the pairing (and the "first reference wins" rule) deterministic,
# because glob does not guarantee any particular order.
sample_files, reference_files = pair_sample_and_reference_files(
    sorted(glob.glob(path_AF3)),
    sorted(glob.glob(path_reference)),
)

In [17]:
# Function to get the sequence of the protein
def get_seq(structure):
    """Return the one-letter amino-acid sequence of a structure as a string.

    Parameters
    ----------
    structure : iterable of models, each an iterable of chains of residues
        Every residue must expose ``id`` (whose first element is the hetero
        flag) and ``resname``.

    Returns
    -------
    str
        Sequence of the last chain visited (each chain overwrites the previous
        one, so for a single-chain structure this is simply its sequence).
        An empty string is returned when the structure has no chains.

    Notes
    -----
    Residues whose hetero flag is not ``' '`` (e.g. waters) are skipped, and
    residue names missing from ``three_to_one`` are written as ``'X'``.
    """
    # Start from a string so that the return type is always str, even when the
    # loops below never run.
    sequence = ''
    for model in structure:
        for chain in model:
            sequence = ''.join(
                three_to_one.get(residue.resname, 'X')  # 'X' for unknown or non-standard amino acids
                for residue in chain
                if residue.id[0] == ' '  # blank hetero flag: skip heteroatoms such as water
            )
    return sequence

In [18]:

from Bio.PDB import PDBParser
import numpy as np
import pandas as pd


# Function to get the top 7 subsequences (TM regions) from the reference protein
def get_top_subsequences(pdb_file, num_subsequences=7):
    """Return the longest runs of consecutively numbered residues found in a PDB file.

    The file is parsed with ``PDBParser`` and every chain of every model is
    walked in order. A run is cut whenever the residue number is not exactly
    one more than the number of the last residue that had a CA atom. Residues
    without a CA atom are not stored.

    Parameters
    ----------
    pdb_file : path or file handle accepted by ``PDBParser.get_structure``
    num_subsequences : int, default 7
        Maximum number of runs to return.

    Returns
    -------
    list of list of tuple
        Runs ordered from longest to shortest (equal lengths keep their order
        of appearance); each entry is ``(resname, CA vector, residue number)``.
    """
    structure = PDBParser().get_structure("protein", pdb_file)
    segments = []

    for model in structure:
        for chain in model:
            segment = []
            previous_number = None

            for residue in chain:
                number = residue.get_full_id()[3][1]
                continues_run = previous_number is not None and number == previous_number + 1

                if not continues_run:
                    # Numbering gap (or first residue of the chain): close the open run
                    if segment:
                        segments.append(segment)
                    segment = []

                if not residue.has_id("CA"):
                    # No CA atom: nothing to store, and previous_number stays unchanged
                    continue

                segment.append((residue.get_resname(), residue["CA"].get_vector(), number))
                previous_number = number

            # Flush the run that was still open at the end of the chain
            if segment:
                segments.append(segment)

    return sorted(segments, key=len, reverse=True)[:num_subsequences]

# Function to find the corresponding subsequence in the generated protein
def find_corresponding_gen_subsequence(ref_subseq, gen_structure):
    """Find the window of the generated structure that best matches a reference run.

    Every chain of every model is reduced to its residues that have a CA atom,
    then a window as long as ``ref_subseq`` is slid along it. A window's score
    is the number of positions whose residue name equals the reference residue
    name at the same position.

    Parameters
    ----------
    ref_subseq : sequence of tuple
        Reference entries whose first element is the residue name.
    gen_structure : iterable of models, each an iterable of chains of residues

    Returns
    -------
    list of tuple or None
        The highest-scoring window as ``(resname, CA vector, residue number)``
        entries; the earliest window wins ties. ``None`` when no chain is long
        enough to hold a single window.
    """
    window_size = len(ref_subseq)
    top_score = None
    top_window = None

    for model in gen_structure:
        for chain in model:
            ca_entries = []
            for residue in chain:
                if residue.has_id("CA"):
                    ca_entries.append(
                        (residue.get_resname(), residue["CA"].get_vector(), residue.get_full_id()[3][1])
                    )

            last_start = len(ca_entries) - window_size
            for start in range(last_start + 1):
                candidate = ca_entries[start:start + window_size]

                score = 0
                for ref_entry, gen_entry in zip(ref_subseq, candidate):
                    if ref_entry[0] == gen_entry[0]:
                        score += 1

                # Strict '>' keeps the first window among equally good ones
                if top_score is None or score > top_score:
                    top_score = score
                    top_window = candidate

    if top_score is None:
        return None
    return top_window

# Function to compare the subsequences of the reference and generated proteins and calculate the average distance between C-alpha atoms
def compare_subsequences(ref_pdb, gen_pdb, protein_name):
    """Average the CA-CA distance between matching residues of two structures.

    The longest residue runs of the reference (from ``get_top_subsequences``)
    are each located in the generated structure with
    ``find_corresponding_gen_subsequence``. For every aligned position whose
    residue names agree, the norm of the difference between the two CA
    vectors is recorded.

    Parameters
    ----------
    ref_pdb : path of the reference PDB file
    gen_pdb : path of the generated PDB file
    protein_name : label returned unchanged alongside the result

    Returns
    -------
    tuple or None
        ``(protein_name, mean distance)``, or ``None`` when no position with
        matching residue names was found.
    """
    # Only the generated structure is parsed here: get_top_subsequences parses
    # the reference file itself, so parsing it again would be wasted work.
    gen_structure = PDBParser().get_structure("gen", gen_pdb)
    ref_sequences = get_top_subsequences(ref_pdb)

    distances = []
    for ref_subseq in ref_sequences:
        gen_subseq = find_corresponding_gen_subsequence(ref_subseq, gen_structure)
        if not gen_subseq:
            continue

        for (ref_aa, ref_ca, _), (gen_aa, gen_ca, _) in zip(ref_subseq, gen_subseq):
            # Positions where the residue names differ are left out of the average
            if ref_aa == gen_aa:
                distances.append(np.linalg.norm(ref_ca - gen_ca))

    if not distances:
        return None
    return protein_name, np.mean(distances)


# zip() would silently drop the surplus entries of the longer list, so refuse to
# continue when the two file lists are not the same length.
if len(reference_files) != len(sample_files):
    raise ValueError(
        f"{len(reference_files)} reference files but {len(sample_files)} sample files: "
        "the two lists cannot be paired one-to-one"
    )

results = []

for ref_file, gen_file in zip(reference_files, sample_files):
    # Protein identifier = lower-cased file-name text before the first underscore
    protein_name = os.path.basename(ref_file).split('_')[0].lower()
    gen_protein_name = os.path.basename(gen_file).split('_')[0].lower()

    # Equal lengths do not guarantee equal order; comparing a model with another
    # protein's reference would produce a meaningless distance.
    if gen_protein_name != protein_name:
        raise ValueError(
            f"Reference {ref_file} and sample {gen_file} belong to different proteins"
        )

    result = compare_subsequences(ref_file, gen_file, protein_name)
    # compare_subsequences returns None when nothing could be compared
    if result is not None:
        results.append(result)

# Storing the results in a dataframe
df_results = pd.DataFrame(results, columns=["Protein", "Average Distance"])



In [19]:
# Order the per-protein results by protein identifier, then show them as plain text
df_results = df_results.sort_values(by='Protein')
print(df_results)


   Protein  Average Distance
7    5ht1b          1.134164
58   5ht2a          3.858857
39   5ht2b          1.061522
44   5ht2c          1.367736
14    aa1r          1.990629
..     ...               ...
67   pth1r          2.433251
37   s1pr1          2.015047
52     smo          4.263313
18    ta2r          1.099221
24     v2r          5.541794

[75 rows x 2 columns]


In [20]:
# Saving final results - change the path for AF2 and AF3

df_labels = (
    pd.read_csv('Data/labels_GPCRs.csv')
    .sort_values(by=['Protein'])
    .reset_index(drop=True)
)
df_results = df_results.reset_index(drop=True)

# Attach each distance by protein identifier rather than by row position:
# a protein missing from df_results (compare_subsequences returned None) or an
# extra row in the labels file would otherwise shift every following value
# onto the wrong protein.
distance_by_protein = (
    df_results.drop_duplicates(subset='Protein')  # Series.map needs a unique index
    .set_index('Protein')['Average Distance']
)
# df_results identifiers are lower-cased file-name prefixes, so normalise the
# label names the same way before looking them up.
label_keys = df_labels['Protein'].astype(str).str.strip().str.lower()
df_labels['Average Distance'] = label_keys.map(distance_by_protein)

# Make unmatched labels visible instead of leaving silent NaN values
unmatched_labels = df_labels.loc[df_labels['Average Distance'].isna(), 'Protein'].tolist()
if unmatched_labels:
    print(f"No average distance available for {len(unmatched_labels)} labelled protein(s): {unmatched_labels}")



In [22]:
# Write the labels table (including the 'Average Distance' column) to CSV without
# the row index. The file name is specific to the AF3 run - change it for AF2.
# NOTE(review): assumes the Results/ directory already exists - confirm.
df_labels.to_csv("Results/deform_results_AF3.csv", index=False)