In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pathlib




from scipy.spatial.distance import pdist, squareform
from scipy.linalg import svd
from Bio.PDB import PDBParser # we use biopython package

## Frechet Dynamic Programming Formula

$$
D[i][j] =
\begin{cases}
\|a_1 - b_1\| & i = 0, j = 0 \\
\max(D[i-1][0], \|a_i - b_1\|) & j = 0 \\
\max(D[0][j-1], \|a_1 - b_j\|) & i = 0 \\
\max\left(\|a_i - b_j\|,\; \min\{D[i-1][j], D[i-1][j-1], D[i][j-1]\}\right) & \text{otherwise}
\end{cases}
$$

Return $D[m-1][n-1]$ as the discrete Fréchet distance.



In [9]:
# Compute the Euclidean distance between two points.
def euclidean_distance(x, y):
    """
    The euclidean distance between two 3D points.
    """

    return np.sqrt((x[0]-y[0])**2 + (x[1]-y[1])**2+(x[2]-y[2])**2)

# Load PDB and extract coordinates
def extract_ca_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_file)
    coords = []
    
    for residue in structure.get_residues():
        if residue.has_id('CA'):
            coords.append(residue['CA'].coord)
    
    return np.array(coords)  # shape (51, 3)

# Compute DTW distance using Euclidean distance as cost function.
def dtw_distance(seq_a, seq_b):

    n, m = len(seq_a), len(seq_b)
    # we create the dtw_matrix, initializing every entry to infinity (we are looking for the min)
    dtw_matrix = np.full((n, m), np.inf)
    # the first cell has as cost only the euclidean_distance of the respective first values in the two time series
    dtw_matrix[0, 0] = euclidean_distance(seq_a[0],seq_b[0])

    # When j==0, aka the first column
    for i in range(1,n):
        cost=euclidean_distance(seq_a[i],seq_b[0])
        dtw_matrix[i,0]= cost + dtw_matrix[i - 1, 0]

    # When i==0, aka the first row
    for j in range(1,m):
        cost=euclidean_distance(seq_a[0],seq_b[j])
        dtw_matrix[0,j]= cost + dtw_matrix[0, j - 1]   

    # Compute DTW cost matrix based on the given formula
    for i in range(1, n):
        for j in range(1, m):
            cost = euclidean_distance(seq_a[i], seq_b[j]) # Base cost
            dtw_matrix[i, j] = cost + min(
                dtw_matrix[i - 1, j],    # Insertion
                dtw_matrix[i, j - 1],    # Deletion
                dtw_matrix[i - 1, j - 1] # Match
            )

    return dtw_matrix[n-1, m-1]  # Return the final DTW distance

def frechet_distance(seq_a,seq_b):
    m, n = len(seq_a), len(seq_b)
    D = [[0] * n for _ in range(m)]
    
    
    D[0][0] = euclidean_distance(seq_a[0], seq_b[0])
    
    for i in range(1, m):
        D[i][0] = max(D[i-1][0], euclidean_distance(seq_a[i], seq_b[0]))
    for j in range(1, n):
        D[0][j] = max(D[0][j-1], euclidean_distance(seq_a[0], seq_b[j]))
        
    for i in range(1, m):
        for j in range(1, n):
            D[i][j] = max(
                euclidean_distance(seq_a[i], seq_b[j]),
                min(D[i-1][j], D[i-1][j-1], D[i][j-1])
            )
    
    return D[m-1][n-1]


def get_protein_name(protein_path):
    path = pathlib.PurePath(protein_path)
    return path.name[:-4].upper()

def compute_distances(protein_filepath_a,protein_filepath_b):
    protein_a = extract_ca_coordinates(protein_filepath_a)
    protein_b = extract_ca_coordinates(protein_filepath_b) 

    protein_name_a=get_protein_name(protein_filepath_a)
    protein_name_b=get_protein_name(protein_filepath_b)

    dtw_d=dtw_distance(protein_a,protein_b)
    print(f"DTW distance of {protein_name_a} and {protein_name_b} is {dtw_d}")

    frechet_d=frechet_distance(protein_a,protein_b)
    print(f"Frechet distance of {protein_name_a} and {protein_name_b} is {frechet_d}")   



In [None]:

print("This is the start of the DTW and Frechet section")

compute_distances('../data/1mbn.pdb','../data/1mbo.pdb')
compute_distances('../data/1cll.pdb','../data/1cfd.pdb')
compute_distances('../data/1hho.pdb','../data/1mbn.pdb')
compute_distances('../data/1avw.pdb','../data/4cha.pdb')
compute_distances('../data/1tup.pdb','../data/2ocj.pdb')
compute_distances('../data/1hhp.pdb','../data/3phv.pdb')
compute_distances('../data/1tim.pdb','../data/7tim.pdb')
compute_distances('../data/1enh.pdb','../data/1igd.pdb')


This is the start of the DTW and Frechet section
DTW distance of 1MBN and 1MBO is 71.6786393404175
Frechet distance of 1MBN and 1MBO is 4.86286369640323
DTW distance of 1CLL and 1CFD is 4944.635998145854
Frechet distance of 1CLL and 1CFD is 48.87602256692837
DTW distance of 1HHO and 1MBN is 5164.794411438227
Frechet distance of 1HHO and 1MBN is 53.16436908968474
DTW distance of 1AVW and 4CHA is 12945.406895476233
Frechet distance of 1AVW and 4CHA is 54.00856895014883
DTW distance of 1TUP and 2OCJ is 50402.645810149894
Frechet distance of 1TUP and 2OCJ is 107.70832197398352
DTW distance of 1HHP and 3PHV is 5907.5062018514855
Frechet distance of 1HHP and 3PHV is 58.94576269460757
DTW distance of 1TIM and 7TIM is 33494.53058654935
Frechet distance of 1TIM and 7TIM is 99.67512867396763
DTW distance of 1ENH and 1IGD is 1866.6927576666303
Frechet distance of 1ENH and 1IGD is 62.142434048182984
