In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pathlib

from scipy.spatial.distance import pdist, squareform
from scipy.linalg import svd
from Bio.PDB import PDBParser,Superimposer # we use biopython package
from Bio import pairwise2
from Bio.Seq import Seq
from Bio import PDB
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import seq1 as three_to_one

## Discrete Fréchet Distance: Dynamic Programming Recurrence

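Let $a_0, \dots, a_{m-1}$ and $b_0, \dots, b_{n-1}$ be the two point sequences, and let $\|\cdot\|$ denote the Euclidean norm. The table $D$ is indexed from zero, like the sequences:

$$
D[i][j] =
\begin{cases}
\|a_0 - b_0\| & i = 0,\ j = 0 \\
\max\left(D[i-1][0],\; \|a_i - b_0\|\right) & i > 0,\ j = 0 \\
\max\left(D[0][j-1],\; \|a_0 - b_j\|\right) & i = 0,\ j > 0 \\
\max\left(\|a_i - b_j\|,\; \min\{D[i-1][j],\, D[i-1][j-1],\, D[i][j-1]\}\right) & \text{otherwise}
\end{cases}
$$

Return $D[m-1][n-1]$ as the discrete Fréchet distance.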



In [None]:
# Straight-line distance between two points in 3D space.
def euclidean_distance(x, y):
    """
    Return the Euclidean distance between two 3D points.

    Only the components at indices 0, 1 and 2 of ``x`` and ``y`` are read,
    so both arguments must be indexable at those positions.
    """
    # Per-axis differences, named so the final expression reads like the formula.
    dx = x[0] - y[0]
    dy = x[1] - y[1]
    dz = x[2] - y[2]

    return np.sqrt(dx**2 + dy**2 + dz**2)

def align_structures(mobile_coords, target_coords):
    """
    Superimpose the mobile atoms onto the target atoms and return their new coordinates.

    Despite the parameter names, both arguments must be equal-length sequences
    of atom objects exposing ``get_coord()`` (e.g. Bio.PDB ``Atom``), because
    that is what ``Superimposer.set_atoms`` and ``Superimposer.apply`` operate on.

    Parameters
    ----------
    mobile_coords : sequence of atoms
        Atoms that are moved. They are transformed IN PLACE.
    target_coords : sequence of atoms
        Atoms that stay fixed and define the reference frame.

    Returns
    -------
    numpy.ndarray
        One row per mobile atom holding its coordinates after superposition.
    """
    sup = Superimposer()
    sup.set_atoms(target_coords, mobile_coords)  # target is fixed, mobile is moved

    # Superimposer.apply mutates the atoms and returns None, so the aligned
    # coordinates have to be read back from the atoms afterwards.
    sup.apply(mobile_coords)

    return np.array([atom.get_coord() for atom in mobile_coords])

# Parse a PDB file and gather the C-alpha coordinates of its standard residues.
def extract_ca_coordinates(pdb_file):
    """
    Return the CA coordinates of every standard amino-acid residue in a PDB file.

    All residues yielded by ``structure.get_residues()`` are considered; a
    residue is kept only if ``PDB.is_aa(..., standard=True)`` accepts it and
    it has an atom with id ``'CA'``.

    Parameters
    ----------
    pdb_file : path or file handle accepted by ``PDBParser.get_structure``.

    Returns
    -------
    numpy.ndarray
        One row per retained residue, in iteration order.
    """
    structure = PDBParser(QUIET=True).get_structure("prot", pdb_file)

    ca_coords = [
        residue['CA'].coord
        for residue in structure.get_residues()
        if PDB.is_aa(residue, standard=True) and residue.has_id('CA')
    ]

    # The number of rows depends on the file; nothing here fixes it to a constant.
    return np.array(ca_coords)


# Dynamic Time Warping distance with the Euclidean point distance as local cost.
def dtw_distance(seq_a, seq_b):
    """
    Return the DTW distance between two sequences of 3D points.

    ``acc[r, c]`` holds the cheapest accumulated cost of warping
    ``seq_a[:r + 1]`` onto ``seq_b[:c + 1]``; the answer is the bottom-right
    cell. Both sequences must contain at least one point.
    """
    rows, cols = len(seq_a), len(seq_b)

    # Start from +inf everywhere since every cell is the result of a minimisation.
    acc = np.full((rows, cols), np.inf)

    # Origin: nothing precedes it, so its cost is just the local distance.
    acc[0, 0] = euclidean_distance(seq_a[0], seq_b[0])

    # First column: the only way in is from the cell directly above.
    for r in range(1, rows):
        acc[r, 0] = euclidean_distance(seq_a[r], seq_b[0]) + acc[r - 1, 0]

    # First row: the only way in is from the cell directly to the left.
    for c in range(1, cols):
        acc[0, c] = euclidean_distance(seq_a[0], seq_b[c]) + acc[0, c - 1]

    # Interior: local cost plus the cheapest of the three predecessors.
    for r in range(1, rows):
        for c in range(1, cols):
            cheapest_predecessor = min(
                acc[r - 1, c],      # step in seq_a only
                acc[r, c - 1],      # step in seq_b only
                acc[r - 1, c - 1],  # step in both
            )
            acc[r, c] = euclidean_distance(seq_a[r], seq_b[c]) + cheapest_predecessor

    return acc[rows - 1, cols - 1]

def frechet_distance(seq_a,seq_b):
    """
    Return the discrete Fréchet distance between two sequences of 3D points.

    The dynamic-programming table is evaluated one row at a time: each cell is
    the larger of the local point distance and the smallest of its three
    predecessors (above, diagonal, left). Only the previous row is needed to
    compute the next one. Both sequences must contain at least one point.
    """
    len_a, len_b = len(seq_a), len(seq_b)

    # Row 0: each cell can only be reached from its left neighbour.
    previous_row = [euclidean_distance(seq_a[0], seq_b[0])]
    for col in range(1, len_b):
        previous_row.append(
            max(previous_row[col - 1], euclidean_distance(seq_a[0], seq_b[col]))
        )

    for row in range(1, len_a):
        # Column 0: each cell can only be reached from the cell above.
        current_row = [max(previous_row[0], euclidean_distance(seq_a[row], seq_b[0]))]

        for col in range(1, len_b):
            reachable = min(previous_row[col], previous_row[col - 1], current_row[col - 1])
            current_row.append(
                max(euclidean_distance(seq_a[row], seq_b[col]), reachable)
            )

        previous_row = current_row

    # After the loop this is the last row; its last cell is the table's corner.
    return previous_row[len_b - 1]


def extract_seq_and_atoms(chain):
    """
    Return the one-letter sequence of a chain together with its CA atoms.

    Only residues accepted by ``PDB.is_aa(..., standard=True)`` that have a
    ``"CA"`` atom contribute. The i-th letter of the returned string
    corresponds to the i-th atom of the returned list.

    Returns
    -------
    (str, list)
        The sequence string and the list of CA atom objects.
    """
    letters = []
    ca_atoms = []

    for residue in chain:
        if not (PDB.is_aa(residue, standard=True) and residue.has_id("CA")):
            continue
        try:
            letters.append(three_to_one(residue.get_resname()))
            ca_atoms.append(residue["CA"])
        except KeyError:
            # Residue name could not be converted: leave it out entirely.
            continue

    return "".join(letters), ca_atoms

def get_protein_name(protein_path):
    """
    Return the upper-cased file name of ``protein_path`` without its extension.

    The final suffix is removed with ``PurePath.stem`` rather than by slicing
    off a fixed number of characters, so the result is also correct for
    extensions that are not exactly three letters long and for names that
    have no extension at all.

    Example: ``'../data/1mbn.pdb'`` -> ``'1MBN'``.
    """
    return pathlib.PurePath(protein_path).stem.upper()

def get_first_valid_chain(structure):
    """
    Return the first chain that holds a standard residue with a CA atom.

    Models are visited in iteration order, and the chains of each model in
    iteration order; the first chain that qualifies is returned.

    Raises
    ------
    ValueError
        If no chain in any model qualifies.
    """
    def _has_standard_ca_residue(chain):
        # True as soon as one residue is a standard amino acid carrying a CA atom.
        for residue in chain:
            if PDB.is_aa(residue, standard=True) and residue.has_id("CA"):
                return True
        return False

    every_chain = (chain for model in structure for chain in model)
    for chain in every_chain:
        if _has_standard_ca_residue(chain):
            return chain

    raise ValueError("No valid chain with CA atoms found in structure.")

def compute_distances(protein_filepath_a,protein_filepath_b):
    """
    Superimpose two structures on their sequence-aligned CA atoms and print
    the DTW and discrete Fréchet distances between the matched CA traces.

    Steps:
      1. Parse both files and pick, from each, the first chain that contains
         a standard residue with a CA atom.
      2. Globally align the two one-letter sequences and keep the CA atoms of
         every alignment column where neither sequence has a gap.
      3. Superimpose the second structure's matched atoms onto the first's.
      4. Print the number of matched residues and both distances.

    Parameters
    ----------
    protein_filepath_a : str
        Path of the reference (fixed) structure.
    protein_filepath_b : str
        Path of the structure that is moved onto the reference.

    Returns
    -------
    None. All results are printed.

    Raises
    ------
    ValueError
        If a structure has no usable chain, or if the alignment pairs no residues.
    """
    # Load structures
    parser = PDB.PDBParser(QUIET=True)
    s1 = parser.get_structure("ref", protein_filepath_a)
    s2 = parser.get_structure("mov", protein_filepath_b)

    # First chain (searching every model) that has a standard residue with CA
    chain1 = get_first_valid_chain(s1)
    chain2 = get_first_valid_chain(s2)

    # Get corresponding CA atoms
    seq1, atoms1 = extract_seq_and_atoms(chain1)
    seq2, atoms2 = extract_seq_and_atoms(chain2)

    # Align sequences
    alignment = pairwise2.align.globalxx(seq1, seq2,one_alignment_only=True)[0]
    aligned_seq1, aligned_seq2 = alignment.seqA, alignment.seqB

    # Map aligned residues to atoms: i and j walk the ungapped sequences while
    # the loop walks the gapped alignment columns.
    matched_atoms1 = []
    matched_atoms2 = []
    i, j = 0, 0
    for a1, a2 in zip(aligned_seq1, aligned_seq2):
        if a1 != '-' and a2 != '-':
            matched_atoms1.append(atoms1[i])
            matched_atoms2.append(atoms2[j])
        if a1 != '-': i += 1
        if a2 != '-': j += 1

    if not matched_atoms1:
        # Without at least one residue pair there is nothing to superimpose or compare.
        raise ValueError("Sequence alignment produced no matched residues to superimpose.")

    # Align structures. The transform is applied to the matched atoms themselves
    # (the only atoms read below), so it takes effect whichever model the
    # selected chain belongs to.
    super_imposer = PDB.Superimposer()
    super_imposer.set_atoms(matched_atoms1, matched_atoms2)
    super_imposer.apply(matched_atoms2)

    protein_a = np.array([atom.get_coord() for atom in matched_atoms1])  # Reference structure CA coords
    protein_b = np.array([atom.get_coord() for atom in matched_atoms2])  # Aligned moving structure CA coords

    protein_name_a=get_protein_name(protein_filepath_a)
    protein_name_b=get_protein_name(protein_filepath_b)

    print(f"Length of protein {protein_name_a} WITH superimposing is {len(protein_a)}")
    print(f"Length of protein {protein_name_b} WITH superimposing is {len(protein_b)}")

    dtw_d=dtw_distance(protein_a,protein_b)
    print(f"DTW distance of {protein_name_a} and {protein_name_b} is {dtw_d} Å")

    frechet_d=frechet_distance(protein_a,protein_b)
    print(f"Frechet distance of {protein_name_a} and {protein_name_b} is {frechet_d} Å\n")   

def compute_distances_raw(protein_filepath_a,protein_filepath_b):
    """
    Print the DTW and discrete Fréchet distances between the CA traces of two
    PDB files, using the coordinates exactly as stored in the files.

    No sequence alignment and no superposition is performed: every CA
    coordinate returned by ``extract_ca_coordinates`` is used for each file.

    Returns None. All results are printed.
    """
    coords_a = extract_ca_coordinates(protein_filepath_a)
    coords_b = extract_ca_coordinates(protein_filepath_b)

    label_a = get_protein_name(protein_filepath_a)
    label_b = get_protein_name(protein_filepath_b)

    # Report how many CA atoms each trace contains before comparing them.
    for label, coords in ((label_a, coords_a), (label_b, coords_b)):
        print(f"Length of protein {label} WITHOUT superimposing is {len(coords)}")

    dtw_value = dtw_distance(coords_a, coords_b)
    print(f"DTW distance of {label_a} and {label_b} is {dtw_value} Å")

    frechet_value = frechet_distance(coords_a, coords_b)
    print(f"Frechet distance of {label_a} and {label_b} is {frechet_value} Å\n")

In [114]:

print("This is the start of the DTW and Frechet section")

# Each pair is (reference structure, structure moved onto it); the pairs are
# processed in the order listed.
for reference_path, moving_path in [
    ('../data/1mbn.pdb', '../data/1mbo.pdb'),
    ('../data/1cll.pdb', '../data/1cfd.pdb'),
    ('../data/1hho.pdb', '../data/1mbn.pdb'),
    ('../data/1avw.pdb', '../data/4cha.pdb'),
    ('../data/1tup.pdb', '../data/2ocj.pdb'),
    ('../data/1hhp.pdb', '../data/3phv.pdb'),
    ('../data/1tim.pdb', '../data/7tim.pdb'),
    ('../data/1enh.pdb', '../data/1igd.pdb'),
]:
    compute_distances(reference_path, moving_path)


This is the start of the DTW and Frechet section
Length of protein 1MBN WITH superimposing is 153
Length of protein 1MBO WITH superimposing is 153
DTW distance of 1MBN and 1MBO is 64.3685112516126 Å
Frechet distance of 1MBN and 1MBO is 4.865909240024418 Å

Length of protein 1CLL WITH superimposing is 144
Length of protein 1CFD WITH superimposing is 144
DTW distance of 1CLL and 1CFD is 1235.2252083438043 Å
Frechet distance of 1CLL and 1CFD is 20.807443335974337 Å

Length of protein 1HHO WITH superimposing is 56
Length of protein 1MBN WITH superimposing is 56
DTW distance of 1HHO and 1MBN is 269.724537603559 Å
Frechet distance of 1HHO and 1MBN is 8.487340373917021 Å

Length of protein 1AVW WITH superimposing is 11
Length of protein 4CHA WITH superimposing is 11
DTW distance of 1AVW and 4CHA is 103.17706725894257 Å
Frechet distance of 1AVW and 4CHA is 16.310095864545485 Å

Length of protein 1TUP WITH superimposing is 194
Length of protein 2OCJ WITH superimposing is 194
DTW distance of 1TU

In [115]:
# Same structure pairs as the superimposed run, compared on the coordinates
# exactly as stored in the files.
for first_path, second_path in [
    ('../data/1mbn.pdb', '../data/1mbo.pdb'),
    ('../data/1cll.pdb', '../data/1cfd.pdb'),
    ('../data/1hho.pdb', '../data/1mbn.pdb'),
    ('../data/1avw.pdb', '../data/4cha.pdb'),
    ('../data/1tup.pdb', '../data/2ocj.pdb'),
    ('../data/1hhp.pdb', '../data/3phv.pdb'),
    ('../data/1tim.pdb', '../data/7tim.pdb'),
    ('../data/1enh.pdb', '../data/1igd.pdb'),
]:
    compute_distances_raw(first_path, second_path)

Length of protein 1MBN WITHOUT superimposing is 153
Length of protein 1MBO WITHOUT superimposing is 153
DTW distance of 1MBN and 1MBO is 71.6786393404175 Å
Frechet distance of 1MBN and 1MBO is 4.86286369640323 Å

Length of protein 1CLL WITHOUT superimposing is 144
Length of protein 1CFD WITHOUT superimposing is 148
DTW distance of 1CLL and 1CFD is 4846.789124327055 Å
Frechet distance of 1CLL and 1CFD is 48.87602256692837 Å

Length of protein 1HHO WITHOUT superimposing is 287
Length of protein 1MBN WITHOUT superimposing is 153
DTW distance of 1HHO and 1MBN is 5164.794411438227 Å
Frechet distance of 1HHO and 1MBN is 53.16436908968474 Å

Length of protein 1AVW WITHOUT superimposing is 394
Length of protein 4CHA WITHOUT superimposing is 477
DTW distance of 1AVW and 4CHA is 12921.625989072956 Å
Frechet distance of 1AVW and 4CHA is 54.00856895014883 Å

Length of protein 1TUP WITHOUT superimposing is 585
Length of protein 2OCJ WITHOUT superimposing is 776
DTW distance of 1TUP and 2OCJ is 5040