In [40]:
from dotenv import load_dotenv
from urllib.parse import urlencode
import requests
import os
import datetime

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import pandas as pd
import requests
import urllib3
import seaborn as sns
from pandas import json_normalize

## Setup for Accessing Strava Data

When making a [Strava App](https://www.strava.com/settings/api), the access token scope is not sufficient for actually getting activity data. This is because the "/athlete/activities" API endpoint requires either a token with scope "activity:read" or "activity:read_all". Since a Strava API application only gets you a read token (which funnily enough is not sufficient to pull activity data on its own), the easiest way to overcome this is the following.

1) Make a request using your clientID through a browser to the http://www.strava.com/oauth/authorize. Make sure to set scope=activity:read or your desired scope in the request. redirect_url=http://localhost is sufficient in this request since it is whitelisted by default and does not need to actually successfully redirect
2) After making the request, you will be redirected to an OAuth page for your created Strava API application. Grant it permission to login with the requested scope.
3) view the URL returned (the actual browser probably just shows nothing since it tried to redirect to localhost) and save the code that was returned into the response.
4) Using this code, make a new request to https://www.strava.com/oauth/token, making sure to use your normal clientID, clientSecret, and code=your_code in the request and grant_type=authorization_code. This can be done in a Python script now.
5) If successful, the request should return a new access_token and a refresh_token, which can be saved into your preferred way for storing secrets (this project uses a .env file that is .gitignored).

The provided access_refresh tokens supposedly do not expire, so this process only needs to be done once. While roundabout, this appears to be the easiest way for handling this sort of issues when working with Strava's API in an isolated environment, rather than an active application with a server and browser to handle OAuth request.

In [12]:
# Set Strava URL for accessing API
activities_url = 'https://www.strava.com/api/v3/athlete/activities'

In [9]:
# Load .env file contents
load_dotenv()

True

## Request all my Strava Activity Data

Using the access token with the appropriate scope for reading my activities from Strava, I create request headers to use

In [None]:
header = {'Authorization': 'Bearer ' + os.getenv("STRAVA_ACCESS_TOKEN")}

This function loops through the Strava response data and adds it to an output array, if any were returned. This is because when hitting the 'athlete/activity' GET API, the items are returned in a list without an associated key (i.e. 'data') so the JSON can be directly iterated over or appended to a list. When all activites are parsed, the next page request will return an empty JSON item back, so iteration will stop there.

The maximum results per page that can be configured is 200, which is used to reduce the chance of hitting Strava's API limits.

Small functionality is added to check for an error of hitting the Rate Limit, since the boolean condition does not account for getting a response that is not actually containing data (since there isn't a key to check for when data is returned).

Strava has a 100 read requests / 15 minute limit on top of a 1000 read limit per day, which isn't an issue when dealing with just my activites but could be difficult if parsing for multiple athletes.

In [6]:
# Function to request activities data
def loop_through_pages(page):
    # start at page ...
    page = page
    # set new_results to True initially
    new_results = True
    # create an empty array to store our combined pages of data in
    data = []
    while new_results:
        # Give some feedback
        print(f'You are requesting page {page} of your activities data ...')
        # request a page + 200 results
        get_strava = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': f'{page}'}).json()

        if 'message' in get_strava:
            if get_strava['message'] == "Rate Limit Exceeded":
                print("Rate Limited Exceeded, please wait before retrying")
                break

        # save the response to new_results to check if its empty or not and close the loop
        new_results = get_strava
        # add our responses to the data array
        data.extend(get_strava)
        # increment the page
        page += 1
    # return the combine results of our get requests
    return data

# call the function to loop through our strava pages and set the starting page at 1
my_dataset = loop_through_pages(1)

You are requesting page 1 of your activities data ...
You are requesting page 2 of your activities data ...


In [13]:
print(f'Found {len(my_dataset)} activites!')

Found 106 activites!


## Converting Activities into a proper dataset

In [None]:
# JSON list is flattened into a data frame
df = json_normalize(my_dataset).reset_index()

In [19]:
df.head()

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,...,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,athlete.id,athlete.resource_state,map.id,map.summary_polyline,map.resource_state
0,2,Saturday long run,16150.5,5725,6036,33.7,Run,Run,2.0,16034518965,...,4B217AF1-1BDD-4B7F-AAAE-A1CE9EE70476-activity.fit,False,7,0,False,152953032,1,a16034518965,o~maGxo~pLKi@Yk@e@?ILGOMKFaDVaCZeA_@yAc@eDMYGs...,2
1,2,Happy Friday,9116.8,3490,3672,31.9,Run,Run,0.0,16024185362,...,A12C2EF1-BE4A-4DA1-AA35-E028DD6E0985-activity.fit,False,0,0,False,152953032,1,a16024185362,g~maGho~pL@KCJI@BEGGAQQY@CG?F?CBOMo@UKM@_AEGCO...,2
2,2,Warm up/cool down to and from gym,3467.2,1342,2864,16.1,Run,Run,0.0,16014778959,...,2D0DEE1F-3E6D-4716-B013-C2BFE6EDF58D-activity.fit,False,0,0,False,152953032,1,a16014778959,w}maG~p~pLA?CPORDRANBLP@PZ?JEZMBe@^GBCAQHIJAND...,2
3,2,Gym speed,4345.2,1380,1380,0.0,Run,Run,3.0,16014794615,...,,False,0,0,False,152953032,1,a16014794615,,2
4,2,No run club :(,8103.0,2940,3132,35.5,Run,Run,3.0,16003878242,...,315B6DEB-C192-4459-85DA-61908A525528-activity.fit,False,0,0,False,152953032,1,a16003878242,u{maGpn~pLYHMEGKCUDWCWQkASgBI[OoA?CZUJMK]Ea@M]...,2


In [28]:
print(f'Shape: {df.shape}, Column Names: {df.columns}')

Shape: (106, 51), Column Names: Index(['index', 'resource_state', 'name', 'distance', 'moving_time',
       'elapsed_time', 'total_elevation_gain', 'type', 'sport_type',
       'workout_type', 'id', 'start_date', 'start_date_local', 'timezone',
       'utc_offset', 'location_city', 'location_state', 'location_country',
       'achievement_count', 'kudos_count', 'comment_count', 'athlete_count',
       'photo_count', 'trainer', 'commute', 'manual', 'private', 'visibility',
       'flagged', 'gear_id', 'start_latlng', 'end_latlng', 'average_speed',
       'max_speed', 'has_heartrate', 'heartrate_opt_out',
       'display_hide_heartrate_option', 'elev_high', 'elev_low', 'upload_id',
       'upload_id_str', 'external_id', 'from_accepted_tag', 'pr_count',
       'total_photo_count', 'has_kudoed', 'athlete.id',
       'athlete.resource_state', 'map.id', 'map.summary_polyline',
       'map.resource_state'],
      dtype='object')


Before cleaning the dataset more, we want to reduce down to some of the more relevant columns.

I am choosing to keep:

* name
* distance
* moving_time
* elapsed_time (When a run has a big difference between elapsed_time and moving_time, does it affect outputs? Running in the city often results in lots of waiting at red lights)
* total_elevation_gain
* sport_type (for filtering, even though I know all of these are or should be runs with a few activites I miscategorized as hikes)
* workout_type (distinguishing runs by recovery, long, etc.)
* start_date_local
* average_speed
* max_speed
* elev_high
* elev_low
* pr_count

Unfortunately, I have not saved much in terms of geographical data to Strava, which would have been nice to compare my performance pre/post moving (due to a large elevation change and environment difference) but that will have to wait for another time.

In [271]:
# Define columns to keep
keep_columns = ["name","distance","moving_time","elapsed_time","total_elevation_gain","type","sport_type", "workout_type", "start_date_local", "average_speed","max_speed","elev_high","elev_low", "pr_count"]

# Drop columns and save into a reduced df
df_reduce_cols = df[keep_columns]

I know I have some biking and strength workouts saved in here so lets filter down to just my runs. Additionally, I am chosing to drop walks as well since they may skew analysis of runs. Virtual Runs is the category I used to classify treadmill running so those should stay.

In [272]:
df_reduce_cols.type.unique()

array(['Run', 'Walk', 'VirtualRun', 'Workout', 'Ride'], dtype=object)

In [273]:
# Save length of df to compare
len_before = len(df_reduce_cols)

# Filter to only running activities
df_reduce_cols = df_reduce_cols[df_reduce_cols['type'].isin(["Run", "VirtualRun"])]

df_reduce_cols.head()

Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,start_date_local,average_speed,max_speed,elev_high,elev_low,pr_count
0,Saturday long run,16150.5,5725,6036,33.7,Run,Run,2.0,2025-10-04T15:22:14Z,2.821,4.82,11.0,1.4,7
1,Happy Friday,9116.8,3490,3672,31.9,Run,Run,0.0,2025-10-03T17:13:26Z,2.612,5.42,12.7,1.5,0
2,Warm up/cool down to and from gym,3467.2,1342,2864,16.1,Run,Run,0.0,2025-10-02T18:08:42Z,2.584,6.72,14.8,4.0,0
3,Gym speed,4345.2,1380,1380,0.0,Run,Run,3.0,2025-10-02T16:35:37Z,3.149,0.0,,,0
4,No run club :(,8103.0,2940,3132,35.5,Run,Run,3.0,2025-10-01T18:06:35Z,2.756,7.6,23.9,3.0,0


In [274]:
print(f'Length before filter: {len_before} vs. length after filter: {len(df_reduce_cols)}')

Length before filter: 106 vs. length after filter: 99


Now that the data is reduced down to the columns I'm interested in, I will start with some clean up tasks.

1) Distance is given in meters. This is okay to keep for now but I want to add a column with the distance in miles.
2) moving_time is in seconds. This is also fine, but I'd like to get an hh:mm:ss timestamp that would be useful for a visualization
3) Same as #2 but for the moving time as well.
4) Adding a new column that indicates whether a run took place in the gym. This is done by checking for NaN values in the elevation fields, since those activites are manually entered by me and I know are consistent since this is my data. If possible I would further distinguish between gym vs. treadmill vs. traditional run but there isn't a very good way to do it without parsing my activity descriptions.
5) Adding 0 instead of NaN for elevation columns.
6) Convert all elevation columns (low, high, total) to feet from meters
7) Convert average_speed and max_speed to mph (from m/s)
8) Convert workout_type to string where 0: None (default), 1: Race, 2: Long Run,3: Workout
9) Clean up time stamp by splitting date/time components

In [275]:
# Define some simple helper functions to apply to columns
def time_in_hhmmss(time):
    return str(datetime.timedelta(seconds=time))

def convert_speed(speed):
    return speed * 2.23

def meters_to_miles(distance):
    return distance * 0.000621371192

def meters_to_feet(distance):
    return distance * 3.28084

def clean_date_time(timestamp):
    return timestamp.replace('T', ' ')[:-1]

In [276]:
# 1) Add column for distance in miles
print("Converting meters to miles")
df_reduce_cols.loc[:, ['distance_miles']] = df_reduce_cols.distance.apply(meters_to_miles)

# 2) Convert moving_time to hh:mm:ss
df_reduce_cols.loc[:, ["moving_time_str"]] = df_reduce_cols.moving_time.apply(time_in_hhmmss)

# 3) Convert ellapsed_time to hh:mm:ss
df_reduce_cols.loc[:, ["elapsed_time_str"]] = df_reduce_cols.elapsed_time.apply(time_in_hhmmss)

# 4) Add new column is_gym_run to check if run took place on treadmill or at gym
df_reduce_cols.loc[:, ["is_gym_run"]] = df_reduce_cols.elev_high.where(df_reduce_cols.elev_high.isna(), False).fillna(True)

# 5) Replace NaNs in elev_low and elev_high with 0
df_reduce_cols.loc[:, ["elev_low", "elev_high"]] = df_reduce_cols[["elev_low", "elev_high"]].fillna(0)

# 6) Convert elevations from meters to feet
df_reduce_cols.loc[:, ["elev_low"]] = df_reduce_cols.elev_low.apply(meters_to_feet)
df_reduce_cols.loc[:, ["elev_high"]] = df_reduce_cols.elev_high.apply(meters_to_feet)
df_reduce_cols.loc[:, ["total_elevation_gain"]] = df_reduce_cols.total_elevation_gain.apply(meters_to_feet)


# 7) Convert average_speed and max_speed from m/s to mph
df_reduce_cols.loc[:, ["average_speed"]] = df_reduce_cols.average_speed.apply(meters_to_miles)
df_reduce_cols.loc[:, ["max_speed"]] = df_reduce_cols.max_speed.apply(meters_to_miles)

# 8) Convert workout_type to categorical string
df_reduce_cols.loc[:, ["workout_type"]] = df_reduce_cols.workout_type.fillna(0) # Set any NaN workout types to a none workout
df_reduce_cols.loc[:, ["workout_type_str"]] = df_reduce_cols.workout_type.map({0:"None", 1:"Race", 2:"Long Run", 3:"Workout"})

# 9) Break start_date_local into two separate columns
df_reduce_cols = df_reduce_cols.rename(columns={"start_date_local":"start_date_full"})
df_reduce_cols.loc[:, ["start_date_full"]] = df_reduce_cols.start_date_full.apply(clean_date_time)
df_reduce_cols.loc[:, ["start_date_local"]] = pd.to_datetime(df_reduce_cols.start_date_full).dt.date
df_reduce_cols.loc[:, ["start_time_local"]] = pd.to_datetime(df_reduce_cols.start_date_full).dt.time

# 10) Drop columns not needed anymore after cleaning
cols_to_drop = ["moving_time", "elapsed_time", "start_date_full", "workout_type", "type", "sport_type"]
df_reduce_cols.drop(columns=cols_to_drop, inplace=True)

# 11) Rename columns now that old ones are dropped and save to run_df_cleaned
run_df_cleaned = df_reduce_cols.rename(columns={"moving_time_str":"moving_time", "elapsed_time_str":"elapsed_time", "workout_type_str":"workout_type"})

Converting meters to miles


  df_reduce_cols.loc[:, ["is_gym_run"]] = df_reduce_cols.elev_high.where(df_reduce_cols.elev_high.isna(), False).fillna(True)


In [278]:
run_df_cleaned.head()

Unnamed: 0,name,distance,total_elevation_gain,average_speed,max_speed,elev_high,elev_low,pr_count,distance_miles,moving_time,elapsed_time,is_gym_run,workout_type,start_date_local,start_time_local
0,Saturday long run,16150.5,110.564308,0.001753,0.002995,36.08924,4.593176,7,10.035455,1:35:25,1:40:36,False,Long Run,2025-10-04,15:22:14
1,Happy Friday,9116.8,104.658796,0.001623,0.003368,41.666668,4.92126,0,5.664917,0:58:10,1:01:12,False,,2025-10-03,17:13:26
2,Warm up/cool down to and from gym,3467.2,52.821524,0.001606,0.004176,48.556432,13.12336,0,2.154418,0:22:22,0:47:44,False,,2025-10-02,18:08:42
3,Gym speed,4345.2,0.0,0.001957,0.0,0.0,0.0,0,2.699982,0:23:00,0:23:00,True,Workout,2025-10-02,16:35:37
4,No run club :(,8103.0,116.46982,0.001712,0.004722,78.412076,9.84252,0,5.034971,0:49:00,0:52:12,False,Workout,2025-10-01,18:06:35


In [277]:
# Confirm that we don't have any NA values hiding
run_df_cleaned.isna().sum()

name                    0
distance                0
total_elevation_gain    0
average_speed           0
max_speed               0
elev_high               0
elev_low                0
pr_count                0
distance_miles          0
moving_time             0
elapsed_time            0
is_gym_run              0
workout_type            0
start_date_local        0
start_time_local        0
dtype: int64

In [279]:
# Save the output as a csv for re-reading in the future
run_df_cleaned.to_csv('data/greg_runs.csv', index=False)