In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from scipy.spatial.distance import pdist, squareform
from scipy.linalg import svd
from Bio.PDB import PDBParser # we use biopython package

In [15]:
# Compute the Euclidean distance between two points.
def euclidean_distance(x, y):
    """
    The euclidean distance between two 3D points.
    """

    return np.sqrt((x[0]-y[0])**2 + (x[1]-y[1])**2+(x[2]-y[2])**2)

# Load PDB and extract coordinates
def extract_ca_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_file)
    coords = []
    
    for residue in structure.get_residues():
        if residue.has_id('CA'):
            coords.append(residue['CA'].coord)
    
    return np.array(coords)  # shape (51, 3)

# Compute DTW distance using Euclidean distance as cost function.
def dtw_distance(seq_a, seq_b):

    n, m = len(seq_a), len(seq_b)
    # we create the dtw_matrix, initializing every entry to infinity (we are looking for the min)
    dtw_matrix = np.full((n, m), np.inf)
    # the first cell has as cost only the euclidean_distance of the respective first values in the two time series
    dtw_matrix[0, 0] = euclidean_distance(seq_a[0],seq_b[0])

    # When j==0, aka the first column
    for i in range(1,n):
        cost=euclidean_distance(seq_a[i],seq_b[0])
        dtw_matrix[i,0]= cost + dtw_matrix[i - 1, 0]

    # When i==0, aka the first row
    for j in range(1,m):
        cost=euclidean_distance(seq_a[0],seq_b[j])
        dtw_matrix[0,j]= cost + dtw_matrix[0, j - 1]   

    # Compute DTW cost matrix based on the given formula
    for i in range(1, n):
        for j in range(1, m):
            cost = euclidean_distance(seq_a[i], seq_b[j]) # Base cost
            dtw_matrix[i, j] = cost + min(
                dtw_matrix[i - 1, j],    # Insertion
                dtw_matrix[i, j - 1],    # Deletion
                dtw_matrix[i - 1, j - 1] # Match
            )

    return dtw_matrix[n-1, m-1]  # Return the final DTW distance

In [16]:



print("This is the start of the DTW section")
# start_time = time.time() # we start the clock
# get the coordinates of the proteins
mbn = extract_ca_coordinates('../data/1mbn.pdb') 
mbo = extract_ca_coordinates('../data/1mbo.pdb')
dtw_d=dtw_distance(mbn,mbo)
print(f"DTW distance of 1MBN and 1MBO is {dtw_d}")


cll = extract_ca_coordinates('../data/1cll.pdb') 
cfd = extract_ca_coordinates('../data/1cfd.pdb')
dtw_d=dtw_distance(cll,cfd)
print(f"DTW distance of 1CLL and 1CFD is {dtw_d}")

hho = extract_ca_coordinates('../data/1hho.pdb') 
mbn = extract_ca_coordinates('../data/1mbn.pdb')
dtw_d=dtw_distance(hho,mbn)
print(f"DTW distance of 1HHO and 1MBN is {dtw_d}")

avw = extract_ca_coordinates('../data/1avw.pdb') 
cha = extract_ca_coordinates('../data/4cha.pdb')
dtw_d=dtw_distance(avw,cha)
print(f"DTW distance of 1AVW and 4CHA is {dtw_d}")

tup = extract_ca_coordinates('../data/1tup.pdb') 
ocj = extract_ca_coordinates('../data/2ocj.pdb')
dtw_d=dtw_distance(tup,ocj)
print(f"DTW distance of 1TUP and 2OCJ is {dtw_d}")

hhp = extract_ca_coordinates('../data/1hhp.pdb') 
phv = extract_ca_coordinates('../data/3phv.pdb')
dtw_d=dtw_distance(hhp,phv)
print(f"DTW distance of 1HHP and 2PHV is {dtw_d}")

tim1 = extract_ca_coordinates('../data/1tim.pdb') 
tim7 = extract_ca_coordinates('../data/7tim.pdb')
dtw_d=dtw_distance(tim1,tim7)
print(f"DTW distance of 1TIM and 7TIM is {dtw_d}")

enh = extract_ca_coordinates('../data/1enh.pdb') 
igd = extract_ca_coordinates('../data/1igd.pdb')
dtw_d=dtw_distance(enh,igd)
print(f"DTW distance of 1ENH and 1IGD is {dtw_d}")

# # Process each pair of time series
# results = []


# for idx, row in dtw_data.iterrows():
#     seq_a = np.array(eval(row['series_a']))  # Convert string to list
#     seq_b = np.array(eval(row['series_b']))
#     distance = dtw_distance(seq_a, seq_b) # Compute the dtw_distance for each sequence
#     results.append([idx, distance]) 

# # Measure execution time
# end_time = time.time() # we stop the clock, we have calculated all the required distances
# total_time = end_time - start_time

# # Save results to CSV
# df_results = pd.DataFrame(results, columns=["id", "DTW distance"])
# df_results.to_csv("dtw.csv", index=False) # output file

# print(f"Total time taken: {total_time:.4f} seconds")


This is the start of the DTW section
DTW distance of 1MBN and 1MBO is 71.6786393404175
DTW distance of 1CLL and 1CFD is 4944.635998145854
DTW distance of 1HHO and 1MBN is 5164.794411438227
DTW distance of 1AVW and 4CHA is 12945.406895476233
DTW distance of 1TUP and 2OCJ is 50402.645810149894
DTW distance of 1HHP and 2PHV is 5907.5062018514855
DTW distance of 1TIM and 7TIM is 33494.53058654935
DTW distance of 1ENH and 1IGD is 1866.6927576666303
