In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
from geopy.distance import geodesic

# --- Configuration ---
content_dir = '/content'  # Folder with .parquet files

# os.listdir() returns entries in arbitrary order; sort the names so the files
# are processed (and their plots appear) in the same order on every run.
parquet_files = sorted(
    f for f in os.listdir(content_dir) if f.endswith('.parquet')
)

# Histogram color (blue)
hist_color = '#1f77b4'

# --- Helper functions ---
def extract_start_coordinate(geometry):
    """Extract the starting coordinate from a geometry as (lat, lon).

    Args:
        geometry: A geometry-like object exposing ``is_empty`` and either
            ``coords`` (an iterable of ``(x, y[, z])`` tuples) or ``geoms``
            (an iterable of sub-geometries, as multi-part geometries do).
            ``None`` is accepted.

    Returns:
        A ``(lat, lon)`` tuple built from the first coordinate's ``y`` and
        ``x`` values, or ``None`` when the geometry is ``None``, empty, or
        has no coordinates.
    """
    if geometry is None or geometry.is_empty:
        return None

    # Multi-part geometries do not expose ``coords`` directly; use the start
    # of the first non-empty part instead of failing on them.
    parts = getattr(geometry, 'geoms', None)
    if parts is not None:
        for part in parts:
            start = extract_start_coordinate(part)
            if start is not None:
                return start
        return None

    # Only the first coordinate is needed, so avoid copying the whole sequence.
    first = next(iter(geometry.coords), None)
    if first is None:
        return None
    return (first[1], first[0])  # (x, y) -> (lat, lon)

def compute_start_distances_meters(start_points):
    """Compute distances in meters between consecutive start points.

    Args:
        start_points: Iterable of ``(lat, lon)`` tuples; entries may be
            ``None``. Any iterable is accepted (list, tuple, generator, ...).

    Returns:
        A list of geodesic distances in meters, one per adjacent pair in
        which both points are present. Pairs containing ``None`` are skipped,
        so the result may be shorter than ``len(start_points) - 1``.
    """
    # Materialize once so generators and other non-indexable iterables work.
    points = list(start_points)
    return [
        geodesic(current, following).meters
        for current, following in zip(points, points[1:])
        if current is not None and following is not None
    ]

# --- Process each parquet file separately ---
for parquet_file in parquet_files:
    print(f"\nProcessing: {parquet_file}")
    file_path = os.path.join(content_dir, parquet_file)

    gdf = gpd.read_parquet(file_path)

    # geodesic() expects latitude/longitude in degrees. If the file declares a
    # projected CRS, its coordinates are not lat/lon, so reproject to WGS 84
    # first. Files without CRS metadata are used as-is (assumed lon/lat).
    if gdf.crs is not None and not gdf.crs.is_geographic:
        gdf = gdf.to_crs(epsg=4326)

    # Extract starting points; rows with missing or empty geometry yield None
    # and are dropped here.
    start_points = gdf['geometry'].apply(extract_start_coordinate).dropna()

    # Compute distances between start points
    start_distances = compute_start_distances_meters(start_points.tolist())

    # Fewer than two usable start points means there is nothing to plot.
    if not start_distances:
        print(f"No consecutive start points found in {parquet_file}; skipping plot.")
        continue

    # --- Plot distribution for this parquet file ---
    plt.figure(figsize=(10, 6))
    plt.hist(start_distances, bins=50, color=hist_color, edgecolor='black')
    plt.title(f'Distance Between Activity Start Points - {parquet_file}')
    plt.xlabel('Distance (meters)')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()
