In [19]:
!pip install pandas numpy scikit-learn gpxpy haversine

Collecting haversine
  Downloading haversine-2.8.1-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading haversine-2.8.1-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.1


In [53]:
import gdown
import gpxpy

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from haversine import haversine, Unit

In [82]:
# Replace with your own GPX file in Google Drive
GPX_URL = 'https://drive.google.com/file/d/1LXlrNI0e7Jk3YGD7zVfhjZJllGVrBlmy/view?usp=sharing'

# How many hours do you have to run
CUTOFF_TIME_HOURS = [4, 5, 7, 9, 12]

In [83]:
REFERENCE_RUN_GPX = 'run.gpx'

gdown.download(GPX_URL, REFERENCE_RUN_GPX, quiet=True, fuzzy=True)

'run.gpx'

In [84]:
def parse_gpx(filepath):
    gpx_file = open(filepath, 'r')
    gpx = gpxpy.parse(gpx_file)

    data = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                time, lat, lng = point.time, point.latitude, point.longitude
                data.append({
                  'time': time,
                  'lat': lat,
                  'lon': lng,
                })

    df = pd.DataFrame(data)
    return df

df = parse_gpx(REFERENCE_RUN_GPX)
df.head(3)

Unnamed: 0,time,lat,lon
0,2024-09-21 21:00:25+00:00,21.031885,105.852055
1,2024-09-21 21:00:26+00:00,21.03189,105.852048
2,2024-09-21 21:00:27+00:00,21.031893,105.852042


In [85]:
def get_pace(df):
  # Convert the 'time' column to datetime
  df['time'] = pd.to_datetime(df['time'])

  # Sort the DataFrame by time to ensure consecutive rows are in correct order
  df = df.sort_values(by='time')

  # Calculate the distance between consecutive points using the haversine formula
  df['distance'] = df.apply(lambda row: haversine(
                              (row['lat'], row['lon']),
                              (df.iloc[row.name - 1]['lat'], df.iloc[row.name - 1]['lon']),
                              unit=Unit.KILOMETERS) if row.name > 0 else 0, axis=1)

  # Calculate the time difference between consecutive points (in seconds)
  df['time_diff'] = df['time'].diff().dt.total_seconds()

  # Avoid division by zero by filling NaNs in 'time_diff' with small values
  df['time_diff'].fillna(1, inplace=True)

  # Calculate pace (in minutes per kilometer)
  df['pace'] = (df['time_diff'] / 60) / df['distance']

  # Handle NaNs or infinite pace values (e.g., first row)
  df['pace'] = df['pace'].replace([float('inf'), -float('inf')], 0).fillna(0)

  # Get cumulative distance
  df['cumulative_distance'] = df['distance'].cumsum()

  df = df.drop(columns=['distance'])
  df.rename(columns={'cumulative_distance': 'distance'}, inplace=True)

  return df

df = get_pace(df)

In [86]:
df[['distance', 'pace']].tail(3)

Unnamed: 0,distance,pace
13419,43.091174,8.699673
13420,43.092792,10.302108
13421,43.094271,11.271314


In [87]:
# Predict pace for given distance
X = df['distance'].values.reshape(-1, 1)
y = df['pace']

# Split to test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit linear regression
model = LinearRegression()
model.fit(X_train, y_train)

In [90]:
# Get a sense of the error by fitting known distances
X_long = np.array([[10], [21], [42], [50], [100], [123], [200], [322]])
y_long_pred = model.predict(X_long)

times = X_long.flatten() * y_long_pred
for distance, time in zip(X_long.flatten(), times):
    print(f'{distance} (km): {time / 60.0:.1f} (hours)')

10 (km): 0.8 (hours)
21 (km): 1.9 (hours)
42 (km): 4.1 (hours)
50 (km): 5.1 (hours)
100 (km): 12.4 (hours)
123 (km): 16.5 (hours)
200 (km): 33.6 (hours)
322 (km): 71.4 (hours)


In [89]:
def predict_cutoff_distance(time_cutoff, model, step=1, max_iterations=10000):
    distance = 0
    cumulative_time = 0

    for _ in range(max_iterations):
        # Predict time for the next unit of distance
        predicted_time = model.predict([[distance + step]])[0]

        # Check if adding this step would exceed the time cutoff
        if cumulative_time + predicted_time > time_cutoff * 60:  # Convert time_cutoff to minutes
            break

        # Add the step to our distance and time
        distance += step
        cumulative_time += predicted_time

    return distance, cumulative_time / 60  # Convert back to hours

for cutoff_time in CUTOFF_TIME_HOURS:
  max_distance, time_taken = predict_cutoff_distance(cutoff_time, model)
  print(f"{max_distance:.2f} (km): {time_taken:.2f} (hours)")

44.00 (km): 3.95 (hours)
54.00 (km): 4.97 (hours)
72.00 (km): 6.91 (hours)
90.00 (km): 8.99 (hours)
114.00 (km): 11.99 (hours)
