In [1]:
import ast
import time

import numpy as np
import pandas as pd

# Compute the Euclidean distance between two points.
def euclidean_distance(x, y):
    """
    Return the Euclidean distance between two scalars.

    In one dimension the Euclidean distance reduces to the absolute
    difference ``|x - y|``.
    """
    difference = x - y
    return abs(difference)

# Compute DTW distance using Euclidean distance as cost function.
def dtw_distance(seq_a, seq_b, cost_fn=None):
    """
    Compute the Dynamic Time Warping (DTW) distance between two sequences.

    The cumulative cost follows the classic recurrence

        D[i, j] = cost(a_i, b_j) + min(D[i-1, j], D[i, j-1], D[i-1, j-1])

    Parameters
    ----------
    seq_a, seq_b : sequence
        Indexable sequences supporting ``len()`` (lists, 1-D arrays, ...).
    cost_fn : callable, optional
        Local cost ``cost_fn(a_i, b_j)`` between two elements. Defaults to
        ``euclidean_distance`` (the absolute difference of two scalars).

    Returns
    -------
    float
        The accumulated cost of the optimal warping path. Two empty
        sequences have distance 0; if exactly one sequence is empty no
        alignment exists and the result is ``inf``.
    """
    if cost_fn is None:
        cost_fn = euclidean_distance

    n, m = len(seq_a), len(seq_b)

    # Cumulative-cost matrix padded with one extra row and column.
    # D[0, 0] = 0 and every other border cell stays at infinity, so the
    # first row/column of the real data need no special-case loops and
    # empty inputs are handled without indexing into them.
    dtw_matrix = np.full((n + 1, m + 1), np.inf)
    dtw_matrix[0, 0] = 0.0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Matrix indices are shifted by one relative to the sequences.
            cost = cost_fn(seq_a[i - 1], seq_b[j - 1])  # Base cost
            dtw_matrix[i, j] = cost + min(
                dtw_matrix[i - 1, j],      # Insertion
                dtw_matrix[i, j - 1],      # Deletion
                dtw_matrix[i - 1, j - 1],  # Match
            )

    return dtw_matrix[n, m]  # Final DTW distance

print("This is the start of question 3")
# Load dataset
dtw_data = pd.read_csv("dtw_test.csv")  # dtw_test.csv must be in the same directory as the part3.ipynb file

# Process each pair of time series
results = []
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for idx, row in dtw_data.iterrows():
    # The series are stored in the CSV as string representations of lists.
    # NOTE: the original used eval(), which would execute arbitrary code
    # embedded in the file; ast.literal_eval only parses Python literals.
    seq_a = np.array(ast.literal_eval(row['series_a']))
    seq_b = np.array(ast.literal_eval(row['series_b']))
    distance = dtw_distance(seq_a, seq_b)  # DTW distance for this pair
    results.append([idx, distance])

# Measure execution time: all required distances have been computed
end_time = time.perf_counter()
total_time = end_time - start_time

# Save results to CSV
df_results = pd.DataFrame(results, columns=["id", "DTW distance"])
df_results.to_csv("dtw.csv", index=False)  # output file

print(f"Total time taken: {total_time:.4f} seconds")

This is the start of question 3


FileNotFoundError: [Errno 2] No such file or directory: 'dtw_test.csv'