In [None]:
import os
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic
from fastdtw import fastdtw

# --- Config ---
content_dir = '/content'
# os.listdir() returns entries in arbitrary order; sort them so the file
# indices used in the printed output and the heatmap labels are reproducible
# from one run to the next.
parquet_files = sorted(f for f in os.listdir(content_dir) if f.endswith('.parquet'))
median_threshold_m = 5000  # only compare files whose medians are within this distance (metres)

# --- Helper functions ---
def extract_coords(geometry):
    """Return the vertices of a geometry as a list of (lat, lon) tuples.

    Each vertex is emitted as (c[1], c[0]), i.e. the first two ordinates are
    swapped; any further ordinates (e.g. z) are dropped.
    NOTE(review): this assumes the stored geometries are (x=lon, y=lat) - confirm.

    Multi-part geometries (objects exposing ``geoms``) are flattened part by
    part, in order, because they do not provide a single coordinate sequence.

    Returns an empty list for ``None`` or empty geometries.
    """
    if geometry is None or geometry.is_empty:
        return []

    # Multi-part geometry: recurse into each part and concatenate the points.
    parts = getattr(geometry, 'geoms', None)
    if parts is not None:
        return [point for part in parts for point in extract_coords(part)]

    return [(c[1], c[0]) for c in geometry.coords]

def compute_file_median(coords_list):
    """Compute the median latitude and longitude over every point in a file.

    ``coords_list`` is a list of routes, each route being a list of
    (lat, lon) pairs. All points of all routes are pooled before the two
    medians are taken independently.

    Returns a ``(median_lat, median_lon)`` tuple, or ``None`` when the file
    contains no points at all.
    """
    # Pool the points of every route into one flat list.
    points = [point for route in coords_list for point in route]
    if not points:
        return None

    lat_values = [lat for lat, _ in points]
    lon_values = [lon for _, lon in points]
    return (np.median(lat_values), np.median(lon_values))

# --- Load files and compute medians ---
# file_coords[k] and file_medians[k] both describe parquet_files[k]; a file
# with no usable geometry contributes an empty route list and a None median so
# that list positions keep matching the file index.
file_coords = []
file_medians = []
valid_file_indices = []

for idx, parquet_file in enumerate(parquet_files):
    file_path = os.path.join(content_dir, parquet_file)
    gdf = gpd.read_parquet(file_path)

    # Keep only rows that carry a non-null, non-empty geometry.
    has_shape = gdf['geometry'].notnull() & (~gdf['geometry'].is_empty)
    gdf = gdf[has_shape].copy()

    if gdf.empty:
        coords_list, median = [], None
    else:
        gdf['coords'] = gdf['geometry'].apply(extract_coords)
        coords_list = gdf['coords'].tolist()
        median = compute_file_median(coords_list)

    file_coords.append(coords_list)
    file_medians.append(median)

# --- Find files with at least one similar median ---
# A file is selected when the geodesic distance from its median point to the
# median point of at least one *other* file is within median_threshold_m.
selected_indices = []
for i, median_i in enumerate(file_medians):
    if median_i is None:
        continue

    # any() stops at the first neighbour found, so one match is enough.
    has_neighbour = any(
        geodesic(median_i, median_j).meters <= median_threshold_m
        for j, median_j in enumerate(file_medians)
        if j != i and median_j is not None
    )
    if has_neighbour:
        selected_indices.append(i)

selected_indices = sorted(set(selected_indices))
print(f"Files with similar medians: {selected_indices}")

# --- Compute simplified similarity matrix for selected files ---
# Entry [a, b] compares file selected_indices[a] with file selected_indices[b].
# For each pair of files, every route of one file is DTW-aligned against every
# route of the other (point distance = geodesic metres) and the score is
#     1 - min(distances) / max(distances).
# NOTE(review): this score reflects the spread between the best and worst
# route pairing; when only one route pair exists min == max and the score is 0.
# Confirm this is the intended similarity measure.
n = len(selected_indices)
similarity_matrix = np.zeros((n, n))


def _geodesic_m(p, q):
    """Geodesic distance in metres between two (lat, lon) points."""
    return geodesic(p, q).meters


for a, i in enumerate(selected_indices):
    similarity_matrix[a, a] = 1.0

    # Only the upper triangle is computed; the result is mirrored so the
    # matrix is symmetric and each expensive file pair is evaluated once.
    for b in range(a + 1, n):
        j = selected_indices[b]

        distances = [
            fastdtw(coords1, coords2, dist=_geodesic_m)[0]
            for coords1 in file_coords[i]
            for coords2 in file_coords[j]
        ]

        if not distances:
            score = 0
        else:
            largest = max(distances)
            # All distances zero means every route pair coincides; treat that
            # as full similarity instead of dividing by zero.
            score = 1.0 if largest == 0 else 1 - min(distances) / largest

        similarity_matrix[a, b] = score
        similarity_matrix[b, a] = score

# --- Plot simplified heatmap ---
# Axis tick labels are the file indices (positions in parquet_files).
if similarity_matrix.size == 0:
    # No file was selected, so there is no matrix to draw.
    print("No files with similar medians; nothing to plot.")
else:
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        similarity_matrix,
        annot=True,
        cmap='YlGnBu',
        xticklabels=selected_indices,
        yticklabels=selected_indices,
        cbar=True,
        ax=ax,
    )
    # \u2264 is the "less than or equal" sign; written as an escape so the
    # title survives encoding round-trips of the notebook file.
    ax.set_title(f'Route Similarity (Files with Median \u2264 {median_threshold_m} m)')
    ax.set_xlabel('File index')
    ax.set_ylabel('File index')
    plt.show()


Files with similar medians: [0, 49]


In [16]:
!pip install fastdtw

