## Machine Learning Commute Model
### Created by Chris Brunet

In [52]:
import requests
import urllib3
import pandas as pd
import numpy as np
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

### Step 1: Loading and Processing Strava Commute Data

In [53]:
def request_access_token(client_id, client_secret, refresh_token):
    """
    Post request to refresh and get new API access token

    Parameters:
        client_id: string
        client_secret: string
        refresh_token: string
    
    Returns:
        access_token: string
    """
    auth_url = "https://www.strava.com/oauth/token"
    payload = {
        'client_id': client_id,
        'client_secret': client_secret,
        'refresh_token': refresh_token,
        'grant_type': 'refresh_token',
        'f': 'json'
    }
    print("\nRequesting Access Token...")
    res = requests.post(auth_url, data=payload, verify=False)
    access_token = res.json()['access_token']
    print(f"\nAccess Token = {access_token}")
    return access_token

In [54]:
def get_activity_data(access_token):
    """
    Get request for Strava user activity data 

    Parameters:
        client_id: string
        client_secret: string
        refresh_token: string
    
    Returns:
        all_activities_df: DataFrame
        all_activities_list: list
    """
    print("\nGetting Activity Data...")
    activities_url = "https://www.strava.com/api/v3/athlete/activities"
    header = {'Authorization': 'Bearer ' + access_token}
    request_page_num = 1
    all_activities_list = []
    
    while True: # since max 200 activities can be accessed per request, while loop runs until all activities are loaded
        param = {'per_page': 200, 'page': request_page_num}
        get_activities = requests.get(activities_url, headers=header,params=param).json()
        if len(get_activities) == 0: # exit condition
            break
        all_activities_list.extend(get_activities)
        print(f'\t- Activities: {len(all_activities_list) - len(get_activities)} to {len(all_activities_list)}')
        request_page_num += 1
    
    all_activities_df = pd.DataFrame(all_activities_list)
    return all_activities_df

In [55]:
# Constant variables  
client_id = '111595'
client_secret = '8e8f246270159ece4b0eb3c75e494241bad86027'
refresh_token = '8285947a1614c22ebf0a7308cafb267ed4d9426f'

# API requests, getting and formatting Activity data and Segment data from Strava API
access_token = request_access_token(client_id, client_secret, refresh_token) # int
all_activities = get_activity_data(access_token) # DataFrame


Requesting Access Token...

Access Token = 4c83d08f110e117ee788606f658654b5b3f76b2f

Getting Activity Data...
	- Activities: 0 to 200
	- Activities: 200 to 325


In [56]:
all_commutes = all_activities[all_activities['commute'] == True]
all_commutes

Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id,...,elev_low,upload_id,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,workout_type,average_temp
1,2,"{'id': 54754532, 'resource_state': 1}",Afternoon Ride,6036.1,1016,1016,92.0,Ride,Ride,10196468104,...,1053.8,10920260140,10920260140,garmin_ping_304165618299,False,0,0,False,10.0,
2,2,"{'id': 54754532, 'resource_state': 1}",Lunch Ride,6046.9,1074,1074,63.0,Ride,Ride,10195492833,...,1051.2,10919253970,10919253970,garmin_ping_304135706924,False,0,0,False,10.0,
3,2,"{'id': 54754532, 'resource_state': 1}",Afternoon Ride,6096.4,1100,1502,77.0,Ride,Ride,10191591684,...,1053.6,10915199739,10915199739,garmin_ping_304008538649,False,0,0,False,10.0,
4,2,"{'id': 54754532, 'resource_state': 1}",Morning Ride #commutemarker.com,6032.8,1053,1053,62.0,Ride,Ride,10189012200,...,1052.0,10912550727,10912550727,garmin_ping_303940892189,False,0,0,False,10.0,
5,2,"{'id': 54754532, 'resource_state': 1}",Afternoon Ride #commutemarker.com,6079.7,1164,1302,79.0,Ride,Ride,10179536062,...,1053.6,10902744686,10902744686,garmin_ping_303660586498,False,0,0,False,10.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,2,"{'id': 54754532, 'resource_state': 1}",Afternoon Ride,6208.0,1135,1160,88.0,Ride,Ride,9011977219,...,1058.4,9668642605,9668642605,garmin_ping_272582649270,False,3,0,False,10.0,
181,2,"{'id': 54754532, 'resource_state': 1}",Lunch Ride,6153.0,1050,1099,67.0,Ride,Ride,9010531477,...,1052.4,9667118367,9667118367,garmin_ping_272549041941,False,2,0,False,10.0,
182,2,"{'id': 54754532, 'resource_state': 1}",Afternoon Ride,6251.7,1208,1291,88.0,Ride,Ride,9005702275,...,1057.0,9662020260,9662020260,garmin_ping_272414779651,False,1,0,False,10.0,
183,2,"{'id': 54754532, 'resource_state': 1}",Morning Ride,6071.8,1152,1698,67.0,Ride,Ride,9003038263,...,1052.2,9659225433,9659225433,garmin_ping_272357926443,False,5,0,False,10.0,


In [57]:
all_commutes.isna().sum()

resource_state                     0
athlete                            0
name                               0
distance                           0
moving_time                        0
elapsed_time                       0
total_elevation_gain               0
type                               0
sport_type                         0
id                                 0
start_date                         0
start_date_local                   0
timezone                           0
utc_offset                         0
location_city                    121
location_state                   121
location_country                 121
achievement_count                  0
kudos_count                        0
comment_count                      0
athlete_count                      0
photo_count                        0
map                                0
trainer                            0
commute                            0
manual                             0
private                            0
v

In [58]:
columns_to_keep = ['distance', 'moving_time', 'total_elevation_gain', 'start_date_local', 'start_latlng', 'end_latlng']
columns_to_drop = [col for col in all_commutes.columns if col not in columns_to_keep] 
all_commutes = all_commutes.drop(columns=columns_to_drop)

In [59]:
min_dist = 5000
max_dist = 7000
all_commutes = all_commutes[(all_commutes['distance'] > min_dist) & (all_commutes['distance'] < max_dist)]

In [60]:
all_commutes['start_datetime'] = pd.to_datetime(all_commutes['start_date_local'])
all_commutes['start_date'] = all_commutes['start_datetime'].dt.date
all_commutes['start_time'] = all_commutes['start_datetime'].dt.time

In [61]:
all_commutes = all_commutes.drop(columns=['start_datetime', 'start_date_local'])

Unnamed: 0,distance,moving_time,total_elevation_gain,start_latlng,end_latlng,start_date,start_time
1,6036.1,1016,92.0,"[51.07540332712233, -114.1291778627783]","[51.03907272219658, -114.12874057888985]",2023-11-10,17:00:41
2,6046.9,1074,63.0,"[51.03911463171244, -114.12876421585679]","[51.07541145756841, -114.12921222858131]",2023-11-10,12:37:41
3,6096.4,1100,77.0,"[51.07547448948026, -114.12921968847513]","[51.03933264501393, -114.12946234457195]",2023-11-09,17:19:17
4,6032.8,1053,62.0,"[51.039096526801586, -114.12877150811255]","[51.07542377896607, -114.1292613465339]",2023-11-09,08:29:29
5,6079.7,1164,79.0,"[51.07527969405055, -114.12891358137131]","[51.03909091092646, -114.12880302406847]",2023-11-07,15:34:23
...,...,...,...,...,...,...,...
178,6355.9,1289,66.0,"[51.03915276937187, -114.12948220968246]","[51.075256979092956, -114.13054017350078]",2023-05-08,08:23:53
180,6208.0,1135,88.0,"[51.07541372068226, -114.12917266599834]","[51.03915335610509, -114.12912396714091]",2023-05-04,14:17:18
181,6153.0,1050,67.0,"[51.03918612934649, -114.12924617528915]","[51.07535244897008, -114.12916101515293]",2023-05-04,11:06:52
182,6251.7,1208,88.0,"[51.075480775907636, -114.12934122607112]","[51.039138017222285, -114.12918104790151]",2023-05-03,15:26:02


In [62]:
def get_direction(row):
    if row['start_lat'] < row['end_lat']:
        return 'northbound'
    else:
        return 'southbound'

In [64]:
all_commutes[['start_lat', 'start_lng']] = pd.DataFrame(all_commutes['start_latlng'].tolist(), index=all_commutes.index)
all_commutes[['end_lat', 'end_lng']] = pd.DataFrame(all_commutes['end_latlng'].tolist(), index=all_commutes.index)
all_commutes['direction'] = all_commutes.apply(get_direction, axis=1)
all_commutes = all_commutes.drop(columns=['start_latlng', 'end_latlng', 'start_lat', 'start_lng', 'end_lat', 'end_lng'])
all_commutes

Unnamed: 0,distance,moving_time,total_elevation_gain,start_date_local,start_datetime,start_date,start_time,direction
1,6036.1,1016,92.0,2023-11-10T17:00:41Z,2023-11-10 17:00:41+00:00,2023-11-10,17:00:41,southbound
2,6046.9,1074,63.0,2023-11-10T12:37:41Z,2023-11-10 12:37:41+00:00,2023-11-10,12:37:41,northbound
3,6096.4,1100,77.0,2023-11-09T17:19:17Z,2023-11-09 17:19:17+00:00,2023-11-09,17:19:17,southbound
4,6032.8,1053,62.0,2023-11-09T08:29:29Z,2023-11-09 08:29:29+00:00,2023-11-09,08:29:29,northbound
5,6079.7,1164,79.0,2023-11-07T15:34:23Z,2023-11-07 15:34:23+00:00,2023-11-07,15:34:23,southbound
...,...,...,...,...,...,...,...,...
178,6355.9,1289,66.0,2023-05-08T08:23:53Z,2023-05-08 08:23:53+00:00,2023-05-08,08:23:53,northbound
180,6208.0,1135,88.0,2023-05-04T14:17:18Z,2023-05-04 14:17:18+00:00,2023-05-04,14:17:18,southbound
181,6153.0,1050,67.0,2023-05-04T11:06:52Z,2023-05-04 11:06:52+00:00,2023-05-04,11:06:52,northbound
182,6251.7,1208,88.0,2023-05-03T15:26:02Z,2023-05-03 15:26:02+00:00,2023-05-03,15:26:02,southbound
