# Explore your runkeeper GPS tracks with python
Jupyter notebook by Florian Neukirchen. This Jupyter notebook accompanies my blog posts:

- https://www.riannek.de/2022/runkeeper-gps-tracks-python-part-1/ 
- https://www.riannek.de/2022/runkeeper-gps-tracks-python-part-2/

See my blog for further details.

## Read the files and prepare GeoDataFrame

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import LineString

folder = "gpx/"

In [None]:
def prepare_dataframes(gdf, track_name):
    """Calculate distance, speed etc. from raw data of gpx trackpoints.

    Every trackpoint is compared with the following one (``shift(-1)``) to
    get time, distance and elevation deltas. Distances are measured in an
    estimated local UTM CRS, i.e. in meters.

    Args:
        gdf: GeoDataFrame of trackpoints with a CRS and the columns
            ``time``, ``ele``, ``track_fid``, ``track_seg_id`` and
            ``track_seg_point_id``. The frame passed in is not modified.
        track_name: Label used as first level of the returned MultiIndexes.

    Returns:
        Two GeoDataFrames ``(points, lines)``. ``points`` is indexed by
        ``(track_name, point_id)`` with point geometry in EPSG:4326.
        ``lines`` is indexed by ``(track_name, segment_id)``; each row holds
        the values of a point and the line segment to the next point.
    """
    # Work on a copy so the caller's frame is left untouched
    gdf = gdf.copy()
    gdf.index.names = ['point_id']
    gdf['time'] = pd.to_datetime(gdf['time'])
    gdf.dropna(axis=1, inplace=True)
    gdf.drop(columns=['track_fid', 'track_seg_id', 'track_seg_point_id'], inplace=True)

    # Use local UTM to get geometry in meters
    gdf = gdf.to_crs(gdf.estimate_utm_crs())

    # shifted gdf gives us the next point with the same index
    # allows calculations without the need of a loop
    shifted_gdf = gdf.shift(-1)

    gdf['time_delta'] = shifted_gdf['time'] - gdf['time']
    # total_seconds() instead of .seconds: .seconds is only the seconds
    # component of the timedelta (days are dropped, negative deltas wrap)
    gdf['time_delta_s'] = gdf['time_delta'].dt.total_seconds()
    gdf['dist_delta'] = gdf.distance(shifted_gdf)

    # In one track, after making a pause, I had a weird outlier 1.6 km away of my real position.
    # Therefore I replace dist_delta > 100 m with NAN.
    # This should be counted as pause.
    gdf['dist_delta'] = np.where(gdf['dist_delta'] > 100, np.nan, gdf['dist_delta'])

    # speed in various formats
    gdf['m_per_s'] = gdf['dist_delta'] / gdf['time_delta_s']
    gdf['km_per_h'] = gdf['m_per_s'] * 3.6
    gdf['min_per_km'] = 60 / gdf['km_per_h']

    # We now might have speeds with NAN (pauses, see above; also the last
    # point, which has no successor).
    # Fill NAN with 0 for easy filtering of pauses. fillna() returns a new
    # Series, so the result has to be assigned back.
    gdf['km_per_h'] = gdf['km_per_h'].fillna(0)

    # covered distance (meters) and time passed
    gdf['distance'] = gdf['dist_delta'].cumsum()
    gdf['time_passed'] = gdf['time_delta'].cumsum()

    # Minutes instead of a timedelta might be useful
    gdf['minutes'] = gdf['time_passed'].dt.total_seconds() / 60

    # Splits (in km) might be useful for grouping
    gdf['splits'] = gdf['distance'] // 1000

    # ascent is = elevation delta, but only positive values
    gdf['ele_delta'] = shifted_gdf['ele'] - gdf['ele']
    gdf['ascent'] = gdf['ele_delta']
    gdf.loc[gdf.ascent < 0, ['ascent']] = 0

    # Slope in %
    gdf['slope'] = 100 * gdf['ele_delta'] / gdf['dist_delta']

    # slope and min_per_km can be infinite if 0 km/h
    # Replace inf with nan for better plotting
    gdf.replace(np.inf, np.nan, inplace=True)
    gdf.replace(-np.inf, np.nan, inplace=True)

    # Ele normalized: Startpoint as 0
    # (positional access, so it does not depend on an index label 0)
    gdf['ele_normalized'] = gdf['ele'] - gdf['ele'].iloc[0]

    # Back to WGS84 (we might have tracks from different places)
    gdf = gdf.to_crs(epsg=4326)
    shifted_gdf = shifted_gdf.to_crs(epsg=4326)

    # Create another geodataframe with lines instead of points as geometry.
    lines = gdf.iloc[:-1].copy()  # Drop the last row (it has no next point)
    lines['next_point'] = shifted_gdf['geometry']
    lines['line_segment'] = lines.apply(lambda row: LineString([row['geometry'], row['next_point']]), axis=1)
    lines.set_geometry('line_segment', inplace=True, drop=True)
    lines.drop(columns='next_point', inplace=True)
    lines.index.names = ['segment_id']

    # Add track name and use it for multiindex
    gdf['track_name'] = track_name
    lines['track_name'] = track_name
    gdf.reset_index(inplace=True)
    gdf.set_index(['track_name', 'point_id'], inplace=True)
    lines.reset_index(inplace=True)
    lines.set_index(['track_name', 'segment_id'], inplace=True)
    return gdf, lines

In [None]:
# Collect the frames of every track first and concatenate once at the end;
# concatenating inside the loop copies all previously loaded data again
# for each file.
point_frames = []
line_frames = []

# os.listdir() returns the entries in arbitrary order. Sorting makes the row
# order reproducible (a later cell uses the last row as the youngest run;
# presumably the file names start with the date -- verify for your export).
for file in sorted(os.listdir(folder)):
    if not file.endswith('.gpx'):
        continue
    try:
        rawdata = gpd.read_file(os.path.join(folder, file), layer='track_points')
        track_points, track_lines = prepare_dataframes(rawdata, file)
    except Exception as err:
        # Best effort: skip tracks that cannot be read or processed,
        # but report the reason instead of hiding it
        print("Error", file, repr(err))
        continue
    point_frames.append(track_points)
    line_frames.append(track_lines)

# pd.concat() raises on an empty list, so fall back to empty GeoDataFrames
points = pd.concat(point_frames) if point_frames else gpd.GeoDataFrame()
lines = pd.concat(line_frames) if line_frames else gpd.GeoDataFrame()


In [None]:
# Show the first rows of the line-segment GeoDataFrame
lines.head()

In [None]:
# Show the first rows of the trackpoint GeoDataFrame
points.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the line segments
lines.info()

## Some general statistics

In [None]:
# Ascent in meters: total ascent of each track (sum over its segments),
# followed by summary statistics across all tracks
lines.groupby('track_name')['ascent'].sum().describe()

In [None]:
# Total ascent of each individual track
lines.groupby('track_name')['ascent'].sum()

In [None]:
# Distance in meters: total length of each track, followed by summary
# statistics across all tracks.
# 'distance' is already the cumulative distance within a track, so summing
# it would add up running totals; sum the per-segment lengths instead.
lines.groupby('track_name')['dist_delta'].sum().describe()

In [None]:
# Summary statistics of the segment speeds (km/h), separately for each track
lines.groupby('track_name')['km_per_h'].describe()

## Extract some useful information about each run

In [None]:
# Group once and reuse the groupers for every column
lines_by_track = lines.groupby('track_name')
points_by_track = points.groupby('track_name')

# One row per track
runs = pd.DataFrame({
              'distance': lines_by_track['dist_delta'].sum(),
              'ascent': lines_by_track['ascent'].sum(),
              'start_time': points_by_track['time'].min(),
              'end_time': points_by_track['time'].max(),
              'median_km_h' : lines_by_track['km_per_h'].median(),
              'max_km_h' : lines_by_track['km_per_h'].max(),
              })

runs['total_duration'] = runs['end_time'] - runs['start_time']

# Pauses (speed <1.5 km/h)
# Segments without a valid speed (NaN, e.g. the discarded GPS outliers)
# count as pauses as well; NaN < 1.5 would be False, hence the fillna.
is_pause = lines['km_per_h'].fillna(0) < 1.5
runs['pause'] = lines[is_pause].groupby('track_name')['time_delta'].sum()
# Tracks without any pause are missing in the grouped result
runs['pause'] = runs['pause'].fillna(pd.Timedelta(0))

# Duration without pauses
runs['duration'] = runs['total_duration'] - runs['pause']
# total_seconds() instead of .seconds: .seconds is only the seconds
# component and would wrap around for durations of a day or more
duration_s = runs['duration'].dt.total_seconds()
runs['minutes'] = duration_s / 60

# Speed
runs['m_per_s'] = runs['distance'] / duration_s
runs['km_per_h'] = runs['m_per_s'] * 3.6
runs['min_per_km'] = 60 / runs['km_per_h']

# Distance in km
runs['distance'] = runs['distance'] / 1000

runs

In [None]:
# Add Geometry of the (complete) runs:
# dissolve() merges all line segments sharing a track_name into one geometry
# per track, which is then used as geometry column of `runs`
runs = gpd.GeoDataFrame(runs, geometry=lines.dissolve(by='track_name')['geometry'])

In [None]:
# Summary statistics over all runs
runs.describe()

## Some Queries

The 5 longest runs

In [None]:
# Sort by distance (km), longest first, and keep the top five
runs.sort_values(by='distance', ascending=False).head(5)

The 5 fastest runs

In [None]:
# Sort by km/h (computed without pauses), fastest first, and keep the top five
runs.sort_values(by='km_per_h', ascending=False).head(5)

Fastest in the range from 8 to 12 km

In [None]:
# Keep runs of 8 to 12 km (both bounds included), then sort by km/h,
# fastest first, and keep the top five
runs[(runs['distance']>=8) & (runs['distance']<=12)].sort_values(by='km_per_h', ascending=False).head(5)

## Reports per month and per year

In [None]:
# Group once by the year of the start time and reuse the grouper
runs_by_year = runs.groupby(runs.start_time.dt.year)

# One row per year with counts and distance / ascent / speed statistics
per_year = pd.DataFrame({
              'count': runs_by_year['distance'].count(),
              'total_distance': runs_by_year['distance'].sum(),
              'distance_median': runs_by_year['distance'].median(),
              'distance_mean': runs_by_year['distance'].mean(),
              'distance_max': runs_by_year['distance'].max(),
              'total_ascent': runs_by_year['ascent'].sum(),
              # median, not sum: sum() would just duplicate 'total_ascent'
              'ascent_median': runs_by_year['ascent'].median(),
              'ascent_max': runs_by_year['ascent'].max(),
              'median_km_h' : runs_by_year['km_per_h'].median(),
              'mean_km_h' : runs_by_year['km_per_h'].mean(),
              })

per_year.index.name = 'year'

per_year

In [None]:
# Group once by (year, month) of the start time and reuse the grouper
runs_by_month = runs.groupby([runs.start_time.dt.year, runs.start_time.dt.month])

# One row per (year, month) in which at least one run took place
per_month = pd.DataFrame({
              'count': runs_by_month['distance'].count(),
              'total_distance': runs_by_month['distance'].sum(),
              'distance_median': runs_by_month['distance'].median(),
              'distance_mean': runs_by_month['distance'].mean(),
              'distance_max': runs_by_month['distance'].max(),
              'total_ascent': runs_by_month['ascent'].sum(),
              # median, not sum: sum() would just duplicate 'total_ascent'
              'ascent_median': runs_by_month['ascent'].median(),
              'ascent_max': runs_by_month['ascent'].max(),
              'median_km_h' : runs_by_month['km_per_h'].median(),
              'mean_km_h' : runs_by_month['km_per_h'].mean(),
              })

per_month.index.names = ['year', 'month']

per_month

### Alternative way 
... including all months with 0 runs and using a datetime index for each month. This is useful for some plots.

In [None]:
# Group the runs into calendar months by their start time
# NOTE(review): newer pandas releases prefer the alias "ME" for month-end
# frequency -- confirm against the installed pandas version
freq = runs.set_index('start_time').groupby(pd.Grouper(freq="M"))

In [None]:
# Same statistics as in per_month, but based on the monthly Grouper `freq`
freq_month = pd.DataFrame({
              'count': freq['distance'].count(),
              'total_distance': freq['distance'].sum(),
              'distance_median': freq['distance'].median(),
              'distance_mean': freq['distance'].mean(),
              'distance_max': freq['distance'].max(),
              'total_ascent': freq['ascent'].sum(),
              # median, not sum: sum() would just duplicate 'total_ascent'
              'ascent_median': freq['ascent'].median(),
              'ascent_max': freq['ascent'].max(),
              'median_km_h' : freq['km_per_h'].median(),
              'mean_km_h' : freq['km_per_h'].mean(),
              })

# Keep the datetime index and add year / month as plain columns
freq_month.index.name = 'month_dt'
freq_month['year'] = freq_month.index.year
freq_month['month'] = freq_month.index.month

freq_month

In [None]:
# Bar chart of the number of runs per month
freq_month['count'].plot(kind='bar')

## Plots with Seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Plots of all Trackpoints or line segments
Note: when checking points with very high speed on the map, I found that they result from inaccurate GPS data, notably below bridges and after waiting (!) at a traffic light. Runkeeper seems to smooth out these outliers in their statistics.

In [None]:
# Speed against slope for every segment slower than 30 km/h,
# coloured by track
sns.relplot(x="slope", y="km_per_h", data=lines[lines['km_per_h']<30], hue="track_name");

In [None]:
# Speed over the time passed since the start, one line per track (unfiltered)
sns.relplot(x="time_passed", y="km_per_h", data=lines, kind="line", hue="track_name");

In [None]:
# Filter out the outliers: same plot, restricted to segments slower than 30 km/h
sns.relplot(x="time_passed", y="km_per_h", data=lines[lines['km_per_h']<30], kind="line", hue="track_name");

Profile:

In [None]:
# Elevation against covered distance, one line per track
sns.relplot(x="distance", y="ele", data=lines, kind="line", hue="track_name");

In [None]:
# Same profile with the elevation relative to the start point of each track
sns.relplot(x="distance", y="ele_normalized", data=lines, kind="line", hue="track_name");

In [None]:
# Elevation change per segment against covered distance, one line per track
sns.relplot(x="distance", y="ele_delta", data=lines, kind="line", hue="track_name");

### Heatmap of Speed

In [None]:
# Wide table for the heatmap: one row per track, one column per segment id,
# cells hold the speed in km/h. Missing values become 0.
heatmapdata1 = (
    lines.reset_index()
    .pivot(index='track_name', columns='segment_id', values='km_per_h')
    .fillna(value=0)
)

In [None]:
# Preview the pivoted speed table
heatmapdata1.head()

In [None]:
# Set vmax to filter out unrealistic values: the colour scale is limited
# to 0-20 km/h; the per-segment x tick labels are hidden
sns.heatmap(heatmapdata1, vmin=0, vmax=20, xticklabels=False)

### Heatmap of ele_delta

In [None]:
# Wide table for the heatmap: one row per track, one column per segment id,
# cells hold the elevation change of the segment. Missing values become 0.
heatmapdata2 = (
    lines.reset_index()
    .pivot(index='track_name', columns='segment_id', values='ele_delta')
    .fillna(value=0)
)

In [None]:
# Colour scale centred on 0 (no elevation change)
sns.heatmap(heatmapdata2, xticklabels=False, center=0)

In [None]:
# Set min and max values: same heatmap with the colour scale limited to -2 ... 2
sns.heatmap(heatmapdata2, xticklabels=False, center=0, vmin=-2, vmax=2)

### Heatmap of ele_normalized

In [None]:
# Wide table for the heatmap: one row per track, one column per segment id,
# cells hold the elevation relative to the start point. Missing values become 0.
heatmapdata3 = (
    lines.reset_index()
    .pivot(index='track_name', columns='segment_id', values='ele_normalized')
    .fillna(value=0)
)

In [None]:
# Colour scale centred on 0 (elevation of the start point)
sns.heatmap(heatmapdata3, xticklabels=False, center=0)

### Plots of the runs

In [None]:
# Histogram of the run distances with a bin width of 1 (km)
sns.displot(x="distance", data=runs, binwidth=1);

In [None]:
# Histogram of the run speeds in 10 bins
sns.displot(x="km_per_h", data=runs, bins=10);

In [None]:
# Kernel density estimate of the ascent per run
sns.displot(x="ascent", data=runs, kind="kde");

In [None]:
# Speed against distance, one colour per run
sns.relplot(x="distance", y="km_per_h", data=runs, hue="track_name");

In [None]:
# Speed over start time; the marker size encodes the distance
sns.relplot(x="start_time", y="km_per_h", data=runs, size="distance");

In [None]:
# Same plot as before, but with matplotlib's ConciseDateFormatter for
# the date labels on the x axis
import matplotlib.dates as mdates
g = sns.relplot(x="start_time", y="km_per_h", data=runs, size="distance")
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

In [None]:
# Speed over start time with an explicit range of marker sizes (2 to 300),
# readable axis labels and concise date labels
g = sns.relplot(x="start_time", y="km_per_h", data=runs, size="distance", sizes=(2,300))
g.ax.set_xlabel("date")
g.ax.set_ylabel("km/h")
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

In [None]:
# Distance over start time; the marker size encodes the speed
g = sns.relplot(x="start_time", y="distance", data=runs, size="km_per_h")
g.ax.set_xlabel("date")
g.ax.set_ylabel("distance")
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

In [None]:
# Distance over start time as a line plot with concise date labels
g = sns.relplot(x="start_time", y="distance", data=runs, kind="line")
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

In [None]:
# Duration in minutes (without pauses) against distance, drawn as
# regression plot (kind="reg")
sns.jointplot(x="distance", y="minutes", data=runs, kind="reg");

### Plots of months

In [None]:
# Display the per-month report again
per_month

In [None]:
# For nicer plots: turn the (year, month) index into a single datetime
# index holding the first day of each month
per_month_dt = per_month.reset_index()
per_month_dt['month'] = pd.to_datetime({
    'year': per_month_dt['year'],
    'month': per_month_dt['month'],
    'day': 1,
})
per_month_dt = per_month_dt.drop(columns='year').set_index('month')

In [None]:
# Display the per-month report with its datetime index
per_month_dt

In [None]:
# Total distance per month with concise date labels on the x axis
g = sns.relplot(x="month", y="total_distance", data=per_month_dt)
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

In [None]:
# Median speed per month with concise date labels on the x axis
g = sns.relplot(x="month", y="median_km_h", data=per_month_dt)
g.ax.xaxis.set_major_formatter(
    mdates.ConciseDateFormatter(g.ax.xaxis.get_major_locator()))

## Save Geodataframes
- GeoJSON does support datetime, but not timedelta. 
- Shapefile does not support datetime/timedelta. 

Good reason to save as GeoJSON. We could either drop those timedelta columns ...

In [None]:
# data_to_save = lines.drop(columns=['time_delta', 'time_passed'])

... or better turn the timedelta to string:

In [None]:
# Save lines

# Convert the timedelta columns to strings on a copy only: `lines` itself
# keeps real timedeltas, which I might want to use below
data_to_save = lines.copy()
data_to_save[['time_delta', 'time_passed']] = (
    data_to_save[['time_delta', 'time_passed']].astype(str)
)

data_to_save.to_file(folder + "tracks-as-lines.geojson", driver="GeoJSON")

In [None]:
# Save points
# Same approach as for the lines: stringify the timedelta columns on a copy
data_to_save = points.copy()
data_to_save[['time_delta', 'time_passed']] = (
    data_to_save[['time_delta', 'time_passed']].astype(str)
)

data_to_save.to_file(folder + "tracks-as-points.geojson", driver="GeoJSON")

In [None]:
# Save runs: write the per-run table as CSV into the data folder
runs.to_csv(folder + "runs.csv")

## Folium

In [None]:
import folium

For meaningful tooltips, I have to plot the actual line segments instead of the complete runs.

In [None]:
# Keep only the columns used on the map and drop every segment that has
# a missing value in one of them
lines_condensed = lines[[
    'ele_delta', 'dist_delta', 'geometry', 'distance', 'km_per_h',
    'min_per_km', 'minutes', 'slope', 'time_delta_s',
]].dropna().copy()

# Date information, taken from the timestamp stored with each segment
lines_condensed['date'] = lines['time'].dt.strftime("%d %B %Y")
lines_condensed['year'] = lines['time'].dt.year
lines_condensed['month'] = lines['time'].dt.month

# Keep only track_name as index, so the totals of each run align on it
lines_condensed = lines_condensed.reset_index(level=1)
lines_condensed['total_distance'] = runs['distance']
lines_condensed['total_minutes'] = runs['minutes']

# Covered distance in km instead of meters; round all values shown in tooltips
lines_condensed['distance'] = (lines_condensed['distance'] / 1000).round(2)
lines_condensed[['total_distance', 'total_minutes', 'minutes', 'min_per_km', 'km_per_h']] = (
    lines_condensed[['total_distance', 'total_minutes', 'minutes', 'min_per_km', 'km_per_h']].round(2)
)
lines_condensed['slope'] = lines_condensed['slope'].round(3)

In [None]:
# style function
def style(feature):
    """Return the folium style of one GeoJSON feature.

    The line colour is read from the ``color`` property of the feature;
    the line weight is fixed to 3.
    """
    return {
        # 'fillColor': feature['properties']['color'],
        'color': feature['properties']['color'],
        'weight': 3,
    }


# One random colour per track.
# The index of lines_condensed holds the track name of every segment, i.e.
# each name is repeated many times. Draw the colours for the unique names
# only, instead of drawing and assigning a new colour for every segment.
# Channel values start at 16, so each one is a two-digit hex number.
track_colors = {
    track: '#' + ''.join(
        f'{channel:02X}'
        for channel in np.random.randint(16, 256, size=3).tolist()
    )
    for track in lines_condensed.index.unique()
}
lines_condensed['color'] = [track_colors[track] for track in lines_condensed.index]

lines_condensed.head()

Use start point of the last (youngest) run as map location

In [None]:
# x and y of the first vertex of the segment in the last row
location_x, location_y = (
    axis[0] for axis in lines_condensed.iloc[-1]['geometry'].coords.xy
)

In [None]:
# Or the first one (oldest) 

# location_x = lines_condensed.iloc[1]['geometry'].coords.xy[0][0]
# location_y = lines_condensed.iloc[1]['geometry'].coords.xy[1][0]

In [None]:
# Show the chosen map location as (x, y)
(location_x, location_y)

Startpoints

In [None]:
# First entry of every track, reduced to the geometry column
startpoints = points.groupby('track_name').first()[['geometry']] 

startpoints

### Plot the map 

In [None]:
# Group the segments by year; each group becomes one layer of the map below
grouped = lines_condensed.groupby('year')

In [None]:
# Base map centred on the location chosen above (folium expects lat, lon)
m4 = folium.Map(location=[location_y, location_x], zoom_start=13, tiles='cartodbpositron')
# Second base layer
# NOTE(review): the availability of the 'Stamen Terrain' tiles may depend on
# the installed folium version -- confirm
folium.TileLayer('Stamen Terrain').add_to(m4)

# Iterate through the grouped dataframe
# Populate a list of feature groups
# Add the tracks to the feature groups
# And add the feature groups to the map

f_groups = []

for group_name, group_data in grouped:
    # One feature group (= one map layer) per year
    f_groups.append(folium.FeatureGroup(group_name))
    # The segments of that year, drawn in the colour of their track
    track_geojson = folium.GeoJson(data=group_data, style_function=style).add_to(f_groups[-1])
    # Tooltip with the values of the segment and the totals of its run
    track_geojson.add_child(
          folium.features.GeoJsonTooltip(fields=['date', 'distance', 'total_distance', 'minutes', 'total_minutes', 'min_per_km', 'km_per_h' ], 
                                   aliases=['Date', 'Kilometers', 'Total km', 'Minutes', 'Total min', 'min/km', 'km/h'])
        )
    f_groups[-1].add_to(m4)


# Add one layer with clustered start points 
# (disabled: the code below is wrapped in a string literal and does not run)
"""
from folium.plugins import MarkerCluster

clusterlayer=folium.FeatureGroup("All start points")
marker_cluster = MarkerCluster().add_to(clusterlayer)

for index, row in startpoints.iterrows():
    folium.CircleMarker(
        location = [row['geometry'].y, row['geometry'].x],
        color = "blue", 
        fill_color = "blue",
        radius = 1,
    ).add_to(marker_cluster)

clusterlayer.add_to(m4)
"""



# The layer control is added after all layers have been added to the map
folium.LayerControl().add_to(m4)

m4

### Folium Heatmap

In [None]:
# HeatMap lives in the subpackage folium.plugins, which is not loaded by a
# plain `import folium`; import it explicitly so this cell also works in a
# fresh kernel
from folium.plugins import HeatMap

# Create a list of the (lat, lon) locations from points
locations = list(zip(points['geometry'].y, points['geometry'].x))

hm = folium.Map(tiles='cartodbdark_matter')

# Add heatmap to map instance
# Parameters such as min_opacity, radius, blur and gradient can be used to
# tune the look; see the folium documentation of the installed version
HeatMap(locations).add_to(hm)

hm