In [47]:
#STRAVA V3

#This is the third version of my attempts to access the Strava API and export my running data to visualise in Tableau
#The first two attempts show my thoughts and working out, this is more of a final piece. 

#Credit to the following resources which greatly helped me get started:
#https://mixedanalytics.com/blog/list-actually-free-open-no-auth-needed-apis/
#https://developers.strava.com/docs/authentication/#:~:text=OAuth%20Overview,-When%20OAuth%20is&text=After%20the%20user%20accepts%20or,scope%20accepted%20by%20the%20user.
#https://developers.strava.com/docs/reference/#api-Activities-getActivityById
#https://towardsdatascience.com/using-the-strava-api-and-pandas-to-explore-your-activity-data-d94901d9bfde

In [29]:
#IMPORTS

#The packages required throughout this project:
import requests
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime

In [None]:
#CREDENTIALS

#You will need credentials in order to access the Strava API.
#If you complete an application, you will get the following credentials:
#Client ID
#Client Secret
#Access Token
#Refresh Token
#You then need to do some more work to get a read all access token and refresh token (we only use the refresh one).

#Use this guide to help you through the process:
#https://towardsdatascience.com/using-the-strava-api-and-pandas-to-explore-your-activity-data-d94901d9bfde

In [46]:
#MY CREDENTIALS
#This section won't work when running from GitHub. Sorry! Can't give out all my credentials!
#It's reading in my credentials from a file that I didn't include in my Git repository

credentials = open("Desktop/Strava_API_credentials.txt", "r").read()
print("Reading in credentials...")
client_id = credentials.partition("client id: ")[2][:6]
print("client id:")
print(client_id)
client_secret = credentials.partition("client secret: ")[2][:40]
print("client secret:")
print(client_secret)
read_all_refresh_token = credentials.partition("read all refresh token: ")[2][:40]
print("Read all refresh token:")
print(read_all_refresh_token)
print("Yay!")

Reading in credentials...
client id:
119533
client secret:
2ca614b67e005176d59d60c0267ddfa6c3f7cf7c
Read all refresh token:
cf1f8cfd4774654a409d713ad16e74da49dc829b
Yay!


In [45]:
#YOUR CREDENTIALS
#This section should work when running from GitHub as I included a template credential file in the Git repository.
#The file is Strava_API_credentials_TEMPLATE.txt
#You will need to update the template credential file with some real credentials for the rest of the code to work
#You can use this guide to help you get your own credentials:
#https://towardsdatascience.com/using-the-strava-api-and-pandas-to-explore-your-activity-data-d94901d9bfde

#CHANGE THIS TO A RELATIVE FILEPATH IF POSSIBLE

credentials = open("Documents/Strava/Strava/Strava_API_credentials_TEMPLATE.txt", "r").read()
print("Reading in credentials...")
client_id = credentials.partition("client id: ")[2][:6]
print("client id:")
print(client_id)
client_secret = credentials.partition("client secret: ")[2][:40]
print("client secret:")
print(client_secret)
read_all_refresh_token = credentials.partition("read all refresh token: ")[2][:40]
print("Read all refresh token:")
print(read_all_refresh_token)

if client_id == "123456" or client_secret == "1a2b3c4d5e6f7g8h9j0k1l2m3n4o5p6q7r8s9t0u" or read_all_refresh_token == "1a2b3c4d5e6f7g8h9j0k1l2m3n4o5p6q7r8s9t0u":
    print("It looks like you are using the template credentials.")
    print("Remember to go into the template and update it to your real credentials.")
    print("The template file is Strava_API_credentials_TEMPLATE.txt")
    print("Use the first half of the following guide to help:")
    print("https://towardsdatascience.com/using-the-strava-api-and-pandas-to-explore-your-activity-data-d94901d9bfde")
else:
    print("Yay!")

Reading in credentials...
client id:
123456
client secret:
1a2b3c4d5e6f7g8h9j0k1l2m3n4o5p6q7r8s9t0u
Read all refresh token:
1a2b3c4d5e6f7g8h9j0k1l2m3n4o5p6q7r8s9t0u
It looks like you are using the template credentials.
Remember to go into the template and update it to your real credentials.
The template file is Strava_API_credentials_TEMPLATE.txt
Use the first half of the following guide to help:
https://towardsdatascience.com/using-the-strava-api-and-pandas-to-explore-your-activity-data-d94901d9bfde


In [32]:
#AUTHENTICATION AND ACCESS TOKENS

#You need the following to make any Strava API requests:
#Client ID
#Client Secret
#Access Token (read all)

#The access token only lasts for ?6? hours so we can use the read all refresh token to get a new read all access token.
#I used this documentation as a guide, see refreshing an expired access token section:
#https://developers.strava.com/docs/authentication/#:~:text=OAuth%20Overview,-When%20OAuth%20is&text=After%20the%20user%20accepts%20or,scope%20accepted%20by%20the%20user.

#Do a POST request to this url:
refresh_read_all_access_token_url = "https://www.strava.com/oauth/token"

#With these parameters:
refresh_read_all_access_token_params = {
    "client_id": client_id,
    "client_secret": client_secret,
    "grant_type": "refresh_token",
    "refresh_token": read_all_refresh_token
}

#The actual POST request:
refresh_read_all_access_token = requests.post(refresh_read_all_access_token_url, refresh_read_all_access_token_params)

#See what that did:
print(refresh_read_all_access_token)
# print(refresh_read_all_access_token.json()) # prints all if interested
print("current read all access token: ")
read_all_access_token = refresh_read_all_access_token.json()["access_token"]
print(read_all_access_token)

#IMPORTANT
#This wil not work if you have not run the above section with REAL credentials
#Now we have a valid read all access token but it only lasts 6 hours (I think)
#Need to re-run this section if more than 6 hrs has elapsed since last run.

<Response [200]>
current read all access token: 
c44df7d30c2a08f730e595441b3e5d1299144ab4


In [34]:
#GET MAIN DATA

#Using this documentation as a guide:
#https://developers.strava.com/docs/reference/#api-Activities-getLoggedInAthleteActivities

#Only have 1 page worth of activities currently. Will have to change this once I have more than 200 activities!
page = 1

#Do a GET request with this url:
all_activities_url = "https://www.strava.com/api/v3/athlete/activities?pag=" + str(page) + "&per_page=200"

#The actual GET request:
all_activities_raw = requests.get(all_activities_url, headers={"Authorization": "Bearer " + read_all_access_token})

print(all_activities_raw)

all_activities = pd.json_normalize(all_activities_raw.json())

#See dimensions of table and top 5 rows to do a rough check
print(all_activities.shape)
all_activities.head()

print("Found " + str(len(all_activities.index)) + " activities.")

<Response [200]>
(169, 50)
Found169 activities


In [35]:
#CLEAN
#Create new dataframe with only columns I care about
cols = ['name', 'upload_id', 'id', 'type', 'distance', 'moving_time',   
         'average_speed', 'max_speed','total_elevation_gain',
         'start_date_local'
       ]
all_activities_df = all_activities[cols]


#Break date into start time and date
all_activities_df['start_date_local'] = pd.to_datetime(all_activities_df['start_date_local'])
all_activities_df['start_time'] = all_activities_df['start_date_local'].dt.time
all_activities_df['start_date_local'] = all_activities_df['start_date_local'].dt.date
all_activities_df.head()

#Not sure what the errors/warnings are that come up here
#But doesn't seem to be breaking anything so I'm going to choose to ignore them for now...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_activities_df['start_date_local'] = pd.to_datetime(all_activities_df['start_date_local'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_activities_df['start_time'] = all_activities_df['start_date_local'].dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_activities_df['start_date_l

Unnamed: 0,name,upload_id,id,type,distance,moving_time,average_speed,max_speed,total_elevation_gain,start_date_local,start_time
0,Blister :(,12086552421,11315100511,Run,13012.1,5013,2.596,4.419,67.7,2024-05-02,12:45:38
1,Geese cheering me on,12062716942,11291925681,Run,5015.9,1634,3.07,5.65,10.0,2024-04-29,17:05:02
2,Bit rainy,12030680885,11261037171,Run,8538.2,2999,2.847,4.066,12.6,2024-04-25,13:16:39
3,Love the river Trent,11999810434,11231043549,Run,10047.3,3599,2.792,3.658,65.0,2024-04-21,11:51:06
4,Back in Nottingham!,11977620693,11209485919,Run,5002.0,1640,3.05,4.446,9.5,2024-04-18,13:08:54


In [36]:
#EXPORT .csv
#I plan to add more sections below to practise using Pandas to explore this data,
#But for now I want to export as a .csv to read into Tableau.

#Get today's date
date_now = str(datetime.now())[:10]
print(date_now)

#Save all activities to file for use in Tableau (today's date appended for versioning)
all_activities_df.to_csv("all_activities_df.csv")
print("Data saved as 'all_activities_df.csv'")
all_activities_df.to_csv("all_activities_df_"+date_now+".csv")
print("Data saved as 'all_activities_df_"+date_now+".csv'")

2024-05-03
Data saved as 'all_activities_df.csv'
Data saved as 'all_activities_df_2024-05-03.csv'


In [37]:
#PICTURES
#Want to add pictures to the workbook which requires another API call using the ids from all my activities as a parameter

#First step, get a big list of all the ids of my activities:
all_activity_ids = all_activities["id"]
all_activity_ids_list = all_activity_ids.values.tolist()

#Then do the API call with each id.
#There is a problem with this as there is a limit on requests (100 every 15 minutes)
#When setting this code up I requested all my pictures waiting 15 mins after each batch of 100 and saved them all to a master file
#Now I can read in this master file, match up the activity ids and only run the request for ids not in the master file*
#Then add the new pictures to the master file so I don't need to re-request them next time.

#*If I have any activities without pictures, they constantly get requested.
#Need to created a "no_pics_list" and add the activity ids for activities with no pictures to it.
#Then can filter out anything on this list after doing the join and if there's anything left, then request the pics.

master = pd.read_csv("activity_photos_master.csv")
first_loop = all_activity_ids#[80] #delete first hash if re-building master file (see final section)
activity_photos = master

#Check how many photos and activites master file currently has
print("There are " + str(len(master.index)) + " photos for " + str(master["activity_id"].nunique()) + " activities currently in master photo file.")

#Join data to see which activity ids are already in the master file of pictures
joined_data = pd.merge(first_loop, master[["activity_id"]], left_on = "id" , right_on = "activity_id", how = "left")

#Filter to rows with null for activity_id
#activity_id field came from the master file so will be null for any and only activities not in the master file
new_ids = joined_data[(joined_data["activity_id"].isnull())]
new_ids = new_ids["id"]

#Make the dataframe of new ids into a list
#If list is empty then don't need to do anything
#If list is not empty then request the pictures for the new ids

new_ids = new_ids.values.tolist()

if len(new_ids) == 0:
    print("No new activities with pictures")
    
else:
    print("Requesting images for new ids: ")
    print(new_ids)
    for activity_id in new_ids:
        get_activity_photos_url = "https://www.strava.com/api/v3/activities/" + str(activity_id) + "/photos?size=500"
        activity_photos_raw = requests.get(get_activity_photos_url, headers={"Authorization": "Bearer " + read_all_access_token})
        activity_photos = activity_photos.append(pd.json_normalize(activity_photos_raw.json()))
    
    activity_photos.to_csv("activity_photos_master.csv")
    print("activity_photos_master file has been updated.")
    print("There are now " + str(len(activity_photos.index)) + " photos for " + str(activity_photos["activity_id"].nunique()) + " activities in master photo file.")

activity_photos.to_csv("activity_photos_"+date_now+".csv")

There are 235 photos for 166 activities currently in master photo file.
Requesting images for new ids: 
[10068201662, 8154716149, 7692238385]
activity_photos_master file has been updated.
There are now 235 photos for 166 activities in master photo file.


In [39]:
#JOIN THE PHOTOS TO THE MAIN DATA

#I was doing this in Tableau but for some reason lots of rows weren't joining (even though 165 rows were joining).
#Instead of messing around more with Tableau relationships to try and figure it out I thought I'd just do the join here.

photos = pd.read_csv("activity_photos_master.csv")
activities = pd.read_csv("all_activities_df.csv")

all_activities_with_photos = pd.merge(activities, photos, left_on = "id" , right_on = "activity_id", how = "left")

print("Number of activities in activities file:")
print(str(len(activities.index)))
print("Number of activities in photos file:")
print(str(photos["activity_id"].nunique()))
print("Number of activities in all activities with photos:")
print(str(all_activities_with_photos["id"].nunique()))

all_activities_with_photos.to_csv("all_activities_with_photos.csv")
print("Data saved as 'all_activities_with_photos.csv'")

Number of activities in activities file:
169
Number of activities in photos file:
166
Number of activities in all activities with photos:
169
Data saved as 'all_activities_with_photos.csv'


In [None]:
#Need to fix these urls.1800 and sized.1800 fields
#WHERE DID THEY COME FROM???

#Could also do with doing some cleaning, making sure I have all the fields I want and getting rid of extra fields.

#Need to created a "no_pics_list" and add the activity ids for activities with no pictures to it.
#Then can filter out anything on this list after doing the join and if there's anything left, then request the pics.
#Currently activities with no pics get a pic API request sent every time (not a problem but not neat)

#Change credentials template to a relative file path so will work from GitHub.

#Then ready to do other things:
#Tableau
#Save work using Git
#Publish work to GitHub
#Make a README file
#Explore Panadas and using Python to explore data (V4?)

In [64]:
#IF MASTER PHOTOS FILE GETS CURRUPTED THEN READ THIS SECTION

#***DO NOT RUN OTHERWISE***


#This is the code for setting up the master file above in case it ever becomes currupted and needs doing again.

#Take one activity id
activity_id_eg = "11291925681"

#Do one API call to get the picture info for it
get_activity_photos_url = "https://www.strava.com/api/v3/activities/" + str(activity_id_eg) + "/photos?size=500"
activity_photos_raw = requests.get(get_activity_photos_url, headers={"Authorization": "Bearer " + read_all_access_token})
activity_photos = pd.json_normalize(activity_photos_raw.json())
print(activity_photos_raw)
print(activity_photos)
print(activity_photos["urls.500"][0])

#Save the repsonse as the master file.
activity_photos.to_csv("activity_photos_master.csv")
print("activity_photos_master file has been created.")
print("Remember to adapt and run the PICTURES section to fully populate the master file.")

#Now can run the above section although will need to adapt the loop to limit <100 calls and run it every 15 minutes.

<Response [200]>
                              unique_id  athlete_id  activity_id  \
0  1C041F06-A582-4E51-9232-0BF91237A50E   107351005  11291925681   

          activity_name post_id  resource_state caption  type  source  status  \
0  Geese cheering me on    None               2             1       1       3   

            uploaded_at            created_at      created_at_local  \
0  2024-04-29T16:34:10Z  2024-04-29T16:24:46Z  2024-04-29T17:24:46Z   

   default_photo                        location  \
0          False  [52.933166666666665, -1.14745]   

                                            urls.500   sizes.500  
0  https://dgtzuqphqg23d.cloudfront.net/iyIuR8KfR...  [384, 512]  
https://dgtzuqphqg23d.cloudfront.net/iyIuR8KfRoDt7QzdA52qMiWJmexS95ob1Iu48dFbmm0-384x512.jpg
activity_photos_master file has been created.
Remember to adapt and run the PICTURES section to fully populate the master file.
