## Machine Learning Cycling Ride Classifier

In [39]:
import pandas as pd
import altair as alt
import numpy as np
from stravalib import unithelper
from stravalib import Client
from conf.credentials import *
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff
import pandas as pd

In [20]:
client_id = client_id
client_secret = client_secret
client = Client()
url = client.authorization_url(
    client_id=client_id,
    redirect_uri='http://localhost/'
)
print(url)

https://www.strava.com/oauth/authorize?client_id=24067&redirect_uri=http%3A%2F%2Flocalhost%2F&approval_prompt=auto&response_type=code&scope=read%2Cactivity%3Aread


In [27]:
code = '770e873e7c4d33d3cd11947985e514fa1879db2a' 
access_token = client.exchange_code_for_token(
    client_id=client_id,
    client_secret=client_secret,
    code=code
)
refresh_token = access_token['access_token']
client = Client(access_token=refresh_token)
# Test the connection
athlete = client.get_athlete()
print(f'Hello, {athlete.firstname}, I know you.')

Hello, Chase, I know you.


In [28]:
rides = pd.DataFrame(
    columns=[
        'date',
        'moving_time',
        'activity_id',
        'name',
        'distance',
        'elevation gain',
        'type',
        'trainer',
        'average_speed',
        'average_watts',
        'suffer_score',
        'average_heartrate',
        'average_cadence',
        'kilojoules',
        'gear_id',
        'average_temp',
        'start_longitude',
        'start_latitude'
    ]
)
for activity in client.get_activities(
    after="2013-12-31T00:00:00Z",
    before="2021-01-01T00:00:00Z"):
    if activity.type == "Ride":
        rides = rides.append(
            {
                'date': activity.start_date_local.date(),
                'activity_id': activity.id, 
                'moving_time': activity.moving_time,
                'name': activity.name, 
                'distance': round(float(unithelper.miles(activity.distance)), 2),
                'elevation gain': float(unithelper.feet(activity.total_elevation_gain)),
                'type': activity.type,
                'trainer': activity.trainer,
                'average_speed': float(unithelper.miles_per_hour(activity.average_speed)),
                'average_watts': activity.average_watts,
                'suffer_score': activity.suffer_score,
                'average_heartrate': activity.average_heartrate,
                'average_cadence': activity.average_cadence,
                'kilojoules': activity.kilojoules,
                'gear_id': activity.gear_id,
                'average_temp': activity.average_temp,
                'start_longitude': activity.start_longitude,
                'start_latitude': activity.start_latitude
            }, 
            ignore_index=True
        )

In [29]:
rides.head()

Unnamed: 0,date,moving_time,activity_id,name,distance,elevation gain,type,trainer,average_speed,average_watts,suffer_score,average_heartrate,average_cadence,kilojoules,gear_id,average_temp,start_longitude,start_latitude
0,2020-06-21,00:55:34,3650328303,Morning Ride,13.17,219.816273,Ride,False,14.217967,92.3,19,126.8,76.9,307.8,b4933861,26.0,-104.97,39.74
1,2020-06-16,00:20:31,3626733948,Pettit,5.97,0.0,Ride,True,17.452577,153.7,12,137.2,85.7,189.3,b4933861,,,
2,2020-06-13,01:30:00,3609777201,Junction -1: Every now and then I think to mys...,25.48,0.0,Ride,True,16.987294,161.2,157,159.5,88.8,870.7,b4933861,,,
3,2020-06-12,00:45:11,3604385266,Mokelumne: fasted ride,15.6,0.0,Ride,True,20.718504,153.8,65,153.9,90.7,417.0,b4933861,,,
4,2020-06-11,00:47:40,3600545290,Avalanche Spire +1: Something is better than n...,15.28,0.0,Ride,True,19.235415,180.6,75,157.2,86.5,516.4,b4933861,,,


## Data Preprocessing

In [30]:
rides = rides.apply(lambda x: x.astype(str).str.lower())
rides = rides.replace('true',1)
rides = rides.replace('false',0)

In [31]:
rides

Unnamed: 0,date,moving_time,activity_id,name,distance,elevation gain,type,trainer,average_speed,average_watts,suffer_score,average_heartrate,average_cadence,kilojoules,gear_id,average_temp,start_longitude,start_latitude
0,2020-06-21,0 days 00:55:34.000000000,3650328303,morning ride,13.17,219.81627296587925,ride,0,14.217967072297782,92.3,19,126.8,76.9,307.8,b4933861,26,-104.97,39.74
1,2020-06-16,0 days 00:20:31.000000000,3626733948,pettit,5.97,0.0,ride,1,17.452576950608446,153.7,12,137.2,85.7,189.3,b4933861,none,none,none
2,2020-06-13,0 days 01:30:00.000000000,3609777201,junction -1: every now and then i think to mys...,25.48,0.0,ride,1,16.987294201861133,161.2,157,159.5,88.8,870.7,b4933861,none,none,none
3,2020-06-12,0 days 00:45:11.000000000,3604385266,mokelumne: fasted ride,15.6,0.0,ride,1,20.718503937007874,153.8,65,153.9,90.7,417.0,b4933861,none,none,none
4,2020-06-11,0 days 00:47:40.000000000,3600545290,avalanche spire +1: something is better than n...,15.28,0.0,ride,1,19.235415175375806,180.6,75,157.2,86.5,516.4,b4933861,none,none,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,2014-03-05,0 days 01:09:29.000000000,118082842,back in the saddle,11.0,298.2283464566929,ride,0,9.498031496062993,59.3,none,none,none,247.2,b1111681,none,-105.27,40.01
718,2014-01-08,0 days 03:38:42.000000000,104930901,madera canyon,45.55,2792.9790026246715,ride,0,12.495526127415893,102.2,none,none,none,1341.6,none,none,-110.77,31.96
719,2014-01-07,0 days 02:21:12.000000000,104704896,morning ride,28.2,2025.9186351706035,ride,0,11.983267716535433,117.8,none,none,none,998.0,none,none,-111.02,32.23
720,2014-01-06,0 days 02:06:05.000000000,104474814,morning ride,20.24,2527.8871391076113,ride,0,9.634484609878312,139.7,none,none,none,1056.7,none,none,-110.74,32.31


In [145]:
rides = rides[rides['average_heartrate'] != 'none']
rides = rides[rides['kilojoules'] != 'none']
rides = rides[rides['suffer_score'] != 'none']

In [146]:
rides.average_speed = rides.average_speed.astype(float)
rides.average_heartrate = rides.average_heartrate.astype(float)
rides.kilojoules = rides.kilojoules.astype(float)

In [147]:
#xVar = list(['distance','average_speed','average_heartrate'])
xVar = list(['distance','average_speed','average_heartrate','kilojoules','elevation gain','suffer_score'])
yVar = rides['trainer']
df2 = rides[xVar]

In [148]:
df2

Unnamed: 0,distance,average_speed,average_heartrate,kilojoules,elevation gain,suffer_score
0,13.17,14.217967,126.8,307.8,219.81627296587925,19
1,5.97,17.452577,137.2,189.3,0.0,12
2,25.48,16.987294,159.5,870.7,0.0,157
3,15.6,20.718504,153.9,417.0,0.0,65
4,15.28,19.235415,157.2,516.4,0.0,75
...,...,...,...,...,...,...
707,15.04,13.833214,161.5,489.4,872.7034120734908,70
708,27.15,14.580351,163.6,837.1,1364.8293963254594,124
711,43.49,13.862294,147.2,1303.6,2398.293963254593,127
712,12.13,13.596099,168.0,366.2,646.3254593175852,72


In [149]:
df2.dtypes

distance              object
average_speed        float64
average_heartrate    float64
kilojoules           float64
elevation gain        object
suffer_score          object
dtype: object

In [165]:
X_train, X_test, y_train, y_test = train_test_split(df2, yVar, test_size=0.25)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(452, 6) (452,)
(151, 6) (151,)


In [166]:
clf = RandomForestClassifier(n_jobs=2, random_state=0)

clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=1e-07,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [167]:
y_pred = clf.predict(X_test)

In [168]:
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,113,2
1,0,36


In [169]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[113   2]
 [  0  36]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       115
           1       0.95      1.00      0.97        36

    accuracy                           0.99       151
   macro avg       0.97      0.99      0.98       151
weighted avg       0.99      0.99      0.99       151

0.9867549668874173


In [170]:
list(zip(X_train, clf.feature_importances_))

[('distance', 0.03474913144659668),
 ('average_speed', 0.16767346458410495),
 ('average_heartrate', 0.03026311452466061),
 ('kilojoules', 0.04406090679016659),
 ('elevation gain', 0.7123478769853913),
 ('suffer_score', 0.010905505669079966)]

In [172]:
rides.gear_id.value_counts()

b1703581    215
b4933861    186
b5499491     99
b3092328     41
b1315248     32
b1111681     30
Name: gear_id, dtype: int64

In [180]:
mtb_rides = rides[rides['gear_id'] == 'b3092328']

In [182]:
rides['mtb?'] = rides['gear_id'].apply(lambda x: 1 if x == 'b3092328' else 0)