In [1]:
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from math import radians, sin, cos, sqrt, atan2

import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    All four coordinates are decimal degrees. The haversine formula is
    evaluated on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371

    # Convert every coordinate from degrees to radians
    phi1 = radians(lat1)
    lam1 = radians(lon1)
    phi2 = radians(lat2)
    lam2 = radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term, then the central angle between the two points
    hav = sin(half_dphi)**2 + cos(phi1) * cos(phi2) * sin(half_dlam)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

def _parse_gpx_time(time_str):
    """Convert the text of a GPX <time> element to a datetime (None if empty)."""
    if time_str is None:
        return None
    time_str = time_str.strip()
    if not time_str:
        return None

    try:
        # Spell a trailing 'Z' out as an explicit UTC offset, which
        # fromisoformat() accepts on every Python version that has it.
        return datetime.fromisoformat(time_str.replace('Z', '+00:00'))
    except ValueError:
        # Fallback for strings fromisoformat() rejects (e.g. unusual
        # fractional seconds): keep only the part before the first '.'.
        parsed = datetime.strptime(time_str.split('.')[0], '%Y-%m-%dT%H:%M:%S')
        if time_str.endswith('Z'):
            # Keep the result timezone-aware like the primary path, so
            # timestamps from both paths can be subtracted from each other.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed


def parse_gpx_file(filepath):
    """Parse a GPX file and extract its track points.

    Track points are searched for under the GPX 1.1 namespace, then the
    GPX 1.0 namespace, then with no namespace; the first variant that
    matches any ``trkpt`` element is used for the whole file.

    Args:
        filepath: Source handed to ``ET.parse``.

    Returns:
        A list with one dict per track point, holding ``'lat'`` and
        ``'lon'`` as floats and ``'time'`` as a datetime, or None when the
        point has no ``time`` child or that child is empty.

    Raises:
        TypeError: If a track point lacks a ``lat`` or ``lon`` attribute.
        ValueError: If a coordinate is not numeric or a timestamp cannot
            be parsed by either time format.
    """
    root = ET.parse(filepath).getroot()

    # Try each known namespace in turn; None means un-namespaced tags.
    tag_prefix = ''
    points = []
    for namespace_uri in ('http://www.topografix.com/GPX/1/1',
                          'http://www.topografix.com/GPX/1/0',
                          None):
        tag_prefix = f'{{{namespace_uri}}}' if namespace_uri else ''
        points = root.findall(f'.//{tag_prefix}trkpt')
        if points:
            break

    trackpoints = []
    for trkpt in points:
        time_elem = trkpt.find(f'{tag_prefix}time')
        timestamp = _parse_gpx_time(time_elem.text) if time_elem is not None else None
        trackpoints.append({
            'lat': float(trkpt.get('lat')),
            'lon': float(trkpt.get('lon')),
            'time': timestamp,
        })

    return trackpoints

def analyze_5km_increments(trackpoints, increment_km=5):
    """
    Measure pace each time the run passes another `increment_km` of distance.

    `trackpoints` is a sequence of dicts with 'lat', 'lon' and 'time' keys.
    Returns None when there are fewer than two points or the first point has
    no time. Otherwise returns `(increments, total_distance)`, where
    `increments` holds one dict per distance mark reached ('distance_km',
    'segment_pace', 'cumulative_pace', 'segment_duration_min',
    'cumulative_time_min'; paces in min/km) and `total_distance` is the
    summed point-to-point distance in km.
    """
    if len(trackpoints) < 2 or not trackpoints[0]['time']:
        return None

    # Distance covered from the start up to each track point (index-aligned)
    distance_so_far = [0]
    for prev_pt, next_pt in zip(trackpoints, trackpoints[1:]):
        hop = haversine_distance(
            prev_pt['lat'], prev_pt['lon'],
            next_pt['lat'], next_pt['lon']
        )
        distance_so_far.append(distance_so_far[-1] + hop)

    total_distance = distance_so_far[-1]
    run_start_time = trackpoints[0]['time']

    results = []
    target_km = increment_km      # next distance mark to look for
    seg_start_pos = 0             # point index where the current segment began
    seg_start_km = 0              # distance already covered at that point

    for pos in range(1, len(trackpoints)):
        reached_km = distance_so_far[pos]
        if reached_km >= target_km:
            # First point at or beyond the mark closes the current segment
            t_begin = trackpoints[seg_start_pos]['time']
            t_end = trackpoints[pos]['time']
            covered_km = reached_km - seg_start_km

            if t_begin and t_end and covered_km > 0:
                seg_minutes = (t_end - t_begin).total_seconds() / 60
                elapsed_minutes = (t_end - run_start_time).total_seconds() / 60

                results.append({
                    'distance_km': target_km,
                    'segment_pace': seg_minutes / covered_km,
                    'cumulative_pace': elapsed_minutes / reached_km,
                    'segment_duration_min': seg_minutes,
                    'cumulative_time_min': elapsed_minutes
                })

            # The next segment starts here, whether or not this one was recorded
            seg_start_km = reached_km
            seg_start_pos = pos
            target_km += increment_km

            if target_km > total_distance + increment_km:
                break

    return results, total_distance

def calculate_degradation_threshold(increments, threshold_percent=5):
    """
    Find the leading distance range whose pace degradation stays within
    the threshold.

    The first increment's segment pace is the baseline; each increment's
    degradation is the percentage change of its segment pace from that
    baseline. The range starts at the first increment and ends at the last
    increment before degradation first exceeds `threshold_percent`, or at
    the final increment if it never does.

    Args:
        increments: Sequence of dicts with 'distance_km' and 'segment_pace'.
        threshold_percent: Largest acceptable degradation, in percent.

    Returns:
        None when there are fewer than two increments or the baseline pace
        is not positive (the percentage change is then undefined).
        Otherwise a list holding at most one dict with 'start_km', 'end_km'
        and 'max_degradation', the largest degradation among the increments
        inside the range. The list is empty if the very first increment
        already exceeds the threshold.
    """
    if not increments or len(increments) < 2:
        return None

    baseline_pace = increments[0]['segment_pace']
    if baseline_pace <= 0:
        return None

    degradations = [
        ((inc['segment_pace'] - baseline_pace) / baseline_pace) * 100
        for inc in increments
    ]

    # Index of the first increment that exceeds the threshold; everything
    # before it is the optimal range. Defaults to "past the end" when the
    # threshold is never exceeded.
    first_exceeding = next(
        (i for i, pct in enumerate(degradations) if pct > threshold_percent),
        len(increments),
    )
    if first_exceeding == 0:
        return []

    return [{
        'start_km': increments[0]['distance_km'],
        'end_km': increments[first_exceeding - 1]['distance_km'],
        # Report the worst degradation *inside* the range, not the value
        # of the increment that broke the threshold.
        'max_degradation': max(degradations[:first_exceeding]),
    }]

def find_stable_pace_window(increments, window_size=3, max_std_dev=0.3):
    """
    Slide a window of `window_size` consecutive increments over the run and
    collect every window whose segment-pace standard deviation is at most
    `max_std_dev`.

    Returns None when `increments` is empty or shorter than the window.
    Otherwise returns a list (possibly empty) of dicts with 'start_km',
    'end_km', 'avg_pace', 'std_dev' and 'consistency_score'.
    """
    if not increments:
        return None
    if len(increments) < window_size:
        return None

    qualifying = []
    window_count = len(increments) - window_size + 1

    for first in range(window_count):
        chunk = increments[first:first + window_size]
        chunk_paces = [entry['segment_pace'] for entry in chunk]

        spread = np.std(chunk_paces)
        centre = np.mean(chunk_paces)

        if not (spread <= max_std_dev):
            continue

        qualifying.append({
            'start_km': chunk[0]['distance_km'],
            'end_km': chunk[-1]['distance_km'],
            'avg_pace': centre,
            'std_dev': spread,
            # The 0.01 offset keeps the score finite when the spread is zero
            'consistency_score': 1 / (spread + 0.01)
        })

    return qualifying

# Main analysis: parse each GPX file, print its per-increment pace table and
# degradation summary, then compare the runs and print a recommendation.
print("=" * 80)
print("5KM INCREMENT PACE DEGRADATION ANALYSIS")
print("=" * 80)

# Replace these with your actual GPX file paths
gpx_files = [
    '/content/Hoa_Binh_Park_Run.gpx',  # Replace with your first GPX file path
    '/content/VPBank_Hanoi_Marathon.gpx'   # Replace with your second GPX file path
]

# One entry per run that produced at least one increment
all_runs_data = []

for idx, filepath in enumerate(gpx_files):
    print(f"\n{'='*80}")
    print(f"RUN {idx + 1}: {filepath}")
    print('='*80)

    try:
        trackpoints = parse_gpx_file(filepath)

        # analyze_5km_increments returns None (not a tuple) when there are
        # too few points or the first point has no time, so only unpack a
        # real result; otherwise fall through to the "Unable to analyze"
        # message below instead of failing on the unpacking.
        analysis = analyze_5km_increments(trackpoints, increment_km=5)
        if analysis is None:
            increments, total_distance = None, 0
        else:
            increments, total_distance = analysis

        if increments:
            print(f"\nTotal Distance: {total_distance:.2f} km")
            print("\n5km Increment Analysis:")
            print(f"{'Distance':<12}{'Segment Pace':<18}{'Cumulative Pace':<18}{'Degradation':<15}")
            print(f"{'(km)':<12}{'(min/km)':<18}{'(min/km)':<18}{'from Start':<15}")
            print("-" * 80)

            # Degradation is measured against the first increment's pace
            baseline_pace = increments[0]['segment_pace']

            for inc in increments:
                pace_change = ((inc['segment_pace'] - baseline_pace) / baseline_pace) * 100
                print(f"{inc['distance_km']:<12.0f}{inc['segment_pace']:<18.2f}"
                      f"{inc['cumulative_pace']:<18.2f}{pace_change:>+13.1f}%")

            # Calculate optimal range (where degradation < 5%)
            optimal_ranges = calculate_degradation_threshold(increments, threshold_percent=5)

            # Find stable pace windows
            stable_windows = find_stable_pace_window(increments, window_size=3, max_std_dev=0.3)

            all_runs_data.append({
                'run_num': idx + 1,
                'filepath': filepath,
                'increments': increments,
                'total_distance': total_distance,
                'optimal_ranges': optimal_ranges,
                'stable_windows': stable_windows
            })

            print("\n" + "-" * 80)
            print(f"Pace Degradation Analysis for Run {idx + 1}:")
            print("-" * 80)

            if optimal_ranges:
                for opt_range in optimal_ranges:
                    print("\n✓ Optimal Distance Range (< 5% degradation):")
                    print(f"  {opt_range['start_km']:.0f} km - {opt_range['end_km']:.0f} km")
                    print(f"  Maximum degradation in this range: {opt_range['max_degradation']:.1f}%")

            if stable_windows:
                best_window = max(stable_windows, key=lambda x: x['consistency_score'])
                print("\n✓ Most Stable Pace Window:")
                print(f"  {best_window['start_km']:.0f} km - {best_window['end_km']:.0f} km")
                print(f"  Average Pace: {best_window['avg_pace']:.2f} min/km")
                print(f"  Pace Consistency: ±{best_window['std_dev']:.2f} min/km")

        else:
            print("Unable to analyze run (insufficient data or missing time information)")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found")
        print("Please update the gpx_files list with your actual file paths")
    except Exception as e:
        # Top-level boundary: report the failure and continue with the next file
        print(f"Error parsing file: {e}")

# Cross-run comparative analysis
if len(all_runs_data) >= 2:
    print("\n" + "=" * 80)
    print("CROSS-RUN COMPARATIVE ANALYSIS")
    print("=" * 80)

    print("\nPace Degradation Comparison at Key Distances:")
    print("-" * 80)

    # Collect every distance mark reached by any run (a union, so runs that
    # did not reach a given mark show N/A in the table)
    all_distances = set()
    for run in all_runs_data:
        all_distances.update([inc['distance_km'] for inc in run['increments']])
    common_distances = sorted(all_distances)

    print(f"\n{'Distance':<12}", end='')
    for run in all_runs_data:
        print(f"Run {run['run_num']} Pace{'':<8}", end='')
    print("Best Run")
    print("-" * 80)

    for dist in common_distances:
        print(f"{dist:<12.0f}", end='')
        paces_at_distance = []

        for run in all_runs_data:
            matching_inc = next((inc for inc in run['increments'] if inc['distance_km'] == dist), None)
            if matching_inc:
                pace = matching_inc['segment_pace']
                paces_at_distance.append((run['run_num'], pace))
                print(f"{pace:<18.2f}", end='')
            else:
                print(f"{'N/A':<18}", end='')

        if paces_at_distance:
            # Lowest min/km is the fastest run at this distance
            best = min(paces_at_distance, key=lambda x: x[1])
            print(f"Run {best[0]}")
        else:
            print()

    # Aggregate optimal distance recommendation
    print("\n" + "=" * 80)
    print("OPTIMAL DISTANCE RECOMMENDATION")
    print("=" * 80)

    all_optimal_ends = []
    all_stable_ranges = []

    for run in all_runs_data:
        if run['optimal_ranges']:
            for opt_range in run['optimal_ranges']:
                all_optimal_ends.append(opt_range['end_km'])

        if run['stable_windows']:
            best_stable = max(run['stable_windows'], key=lambda x: x['consistency_score'])
            all_stable_ranges.append((best_stable['start_km'], best_stable['end_km']))

    if all_optimal_ends:
        avg_optimal_distance = np.mean(all_optimal_ends)
        min_optimal_distance = min(all_optimal_ends)
        max_optimal_distance = max(all_optimal_ends)

        print(f"\nBased on pace degradation analysis across {len(all_runs_data)} runs:")
        print(f"\n✓ OPTIMAL DISTANCE RANGE: {min_optimal_distance:.0f} - {max_optimal_distance:.0f} km")
        print(f"  (Average: {avg_optimal_distance:.0f} km)")
        print("\n  This range maintains pace degradation below 5%")
        print("  and maximizes distance while maintaining consistent speed.")

        if all_stable_ranges:
            avg_stable_start = np.mean([r[0] for r in all_stable_ranges])
            avg_stable_end = np.mean([r[1] for r in all_stable_ranges])

            print(f"\n✓ MOST CONSISTENT PACE ZONE: {avg_stable_start:.0f} - {avg_stable_end:.0f} km")
            print("  This is your 'sweet spot' with most stable pacing.")

        # Time estimate: average optimal distance run at the average opening pace
        if all_runs_data[0]['increments']:
            avg_baseline_pace = np.mean([run['increments'][0]['segment_pace'] for run in all_runs_data])
            estimated_time_hours = (avg_optimal_distance * avg_baseline_pace) / 60

            print("\n✓ RECOMMENDED RUN DURATION:")
            print(f"  Distance: {avg_optimal_distance:.0f} km")
            print(f"  Estimated Time: {estimated_time_hours:.1f} hours ({estimated_time_hours * 60:.0f} minutes)")
            print(f"  Expected Pace: {avg_baseline_pace:.2f} min/km")

        print("\nWhy this recommendation:")
        print("  • Pace remains consistent (< 5% degradation)")
        print("  • Maximizes distance without significant slowdown")
        print("  • Based on actual data from your marathon-distance runs")

        # NOTE(review): any average below 25 km triggers this message,
        # including values far from 20 km — confirm this is the intent.
        if avg_optimal_distance < 25:
            print("  • Your hypothesis of ~20km appears CORRECT based on your data!")

print("\n" + "=" * 80)

5KM INCREMENT PACE DEGRADATION ANALYSIS

RUN 1: /content/Hoa_Binh_Park_Run.gpx

Total Distance: 40.08 km

5km Increment Analysis:
Distance    Segment Pace      Cumulative Pace   Degradation    
(km)        (min/km)          (min/km)          from Start     
--------------------------------------------------------------------------------
5           5.37              5.37                       +0.0%
10          5.66              5.52                       +5.4%
15          5.94              5.66                      +10.5%
20          5.82              5.70                       +8.3%
25          6.46              5.85                      +20.3%
30          6.56              5.97                      +22.0%
35          7.20              6.14                      +33.9%
40          7.31              6.29                      +36.1%

--------------------------------------------------------------------------------
Pace Degradation Analysis for Run 1:
--------------------------------------