In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt  
from sklearn import metrics

In [2]:
df = pd.read_csv('workouts_cleaned.csv', index_col=0)

# choose relevant columns 
df.columns

Index(['type', 'date', 'moving_time', 'activity_id', 'name', 'distance',
       'elevation gain', 'trainer', 'average_speed', 'max_speed',
       'average_watts', 'suffer_score', 'average_heartrate', 'average_cadence',
       'kilojoules', 'gear_id', 'average_temp', 'start_longitude',
       'start_latitude', 'timezone', 'location_city', 'location_state',
       'location_country', 'year', 'month', 'mnth_yr', 'day', 'dow',
       'week_number', 'hour', 'moving_time (minutes)'],
      dtype='object')

In [3]:
#https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
df['type'] = df['type'].astype('category')
df['type_Cat'] = df['type'].cat.codes

In [4]:
df_model = df[['type','type_Cat','distance',
       'elevation gain', 'trainer', 'average_speed', 'max_speed',
       'average_watts', 'suffer_score', 'average_heartrate', 'average_cadence',
       'kilojoules', 'gear_id', 'average_temp', 'timezone', 'location_city', 'location_state',
       'location_country', 'year', 'month', 'mnth_yr', 'day', 'dow',
       'week_number', 'hour', 'moving_time (minutes)']]

In [5]:
# get dummy data 
df_dum = pd.get_dummies(df_model)
df_dum.columns

Index(['type_Cat', 'distance', 'elevation gain', 'trainer', 'average_speed',
       'max_speed', 'average_watts', 'suffer_score', 'average_heartrate',
       'average_cadence',
       ...
       'mnth_yr_2020-11', 'mnth_yr_2020-12', 'mnth_yr_2021-01', 'dow_Friday',
       'dow_Monday', 'dow_Saturday', 'dow_Sunday', 'dow_Thursday',
       'dow_Tuesday', 'dow_Wednesday'],
      dtype='object', length=175)

In [6]:
df_dum.head()

Unnamed: 0,type_Cat,distance,elevation gain,trainer,average_speed,max_speed,average_watts,suffer_score,average_heartrate,average_cadence,...,mnth_yr_2020-11,mnth_yr_2020-12,mnth_yr_2021-01,dow_Friday,dow_Monday,dow_Saturday,dow_Sunday,dow_Thursday,dow_Tuesday,dow_Wednesday
0,2,2.01,26,0,5,5,0.0,43.0,162.5,0.0,...,0,0,1,0,1,0,0,0,0,0
1,2,1.17,26,0,5,5,0.0,35.0,169.5,0.0,...,0,0,1,0,0,0,1,0,0,0
2,5,0.0,0,1,0,0,0.0,11.0,136.3,0.0,...,0,0,1,0,0,0,1,0,0,0
3,0,3.71,740,0,3,3,0.0,40.0,133.3,0.0,...,0,0,1,0,0,1,0,0,0,0
4,4,0.75,27,0,3,3,0.0,3.0,124.6,0.0,...,0,0,1,1,0,0,0,0,0,0


In [7]:
#making sure none of my training data has any NaNs
nan_values = df_dum.isna()
nan_columns = nan_values.any()

columns_with_nan = df_dum.columns[nan_columns].tolist()
print(columns_with_nan)


[]


In [8]:
# train test split 
from sklearn.model_selection import train_test_split

X = df_dum.drop('type_Cat', axis =1)
y = df_dum.type_Cat.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0).fit(X_train, y_train)

In [10]:
y_pred = lr.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[  0   0   2   0   0   0   0]
 [  0 163   4   0   0   0   0]
 [  1   0  61   0   0   1   0]
 [  0   0   2   0   0   0   0]
 [  0   0   1   0  12   1   0]
 [  0   0   4   0   0   2   0]
 [  0   0   2   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      0.98      0.99       167
           2       0.80      0.97      0.88        63
           3       0.00      0.00      0.00         2
           4       1.00      0.86      0.92        14
           5       0.50      0.33      0.40         6
           6       0.00      0.00      0.00         2

    accuracy                           0.93       256
   macro avg       0.47      0.45      0.46       256
weighted avg       0.92      0.93      0.92       256

0.9296875


### Random Forrest with Grid Search For Model Optimization

In [11]:
# random forrest with grid search

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
#create a new random forest classifier
rf = RandomForestClassifier()
#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}
#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)
#fit model to training data
rf_gs.fit(X_train, y_train)
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)
print('rf: {}'.format(rf_best.score(X_test, y_test)))

y_pred = rf_best.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

{'n_estimators': 200}
rf: 0.984375
[[  1   0   1   0   0   0   0]
 [  0 167   0   0   0   0   0]
 [  0   0  63   0   0   0   0]
 [  0   0   0   1   0   1   0]
 [  0   0   0   0  14   0   0]
 [  0   0   0   0   0   6   0]
 [  0   0   0   0   0   2   0]]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       1.00      1.00      1.00       167
           2       0.98      1.00      0.99        63
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00        14
           5       0.67      1.00      0.80         6
           6       0.00      0.00      0.00         2

    accuracy                           0.98       256
   macro avg       0.81      0.71      0.73       256
weighted avg       0.98      0.98      0.98       256

0.984375


### XGBoost

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
XG = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
XG.score(X_test, y_test)

y_pred = XG.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[  2   0   0   0   0   0   0]
 [  0 167   0   0   0   0   0]
 [  0   0  63   0   0   0   0]
 [  0   0   0   2   0   0   0]
 [  0   0   0   0  14   0   0]
 [  0   0   0   0   0   6   0]
 [  0   0   0   0   0   0   2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       167
           2       1.00      1.00      1.00        63
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         2

    accuracy                           1.00       256
   macro avg       1.00      1.00      1.00       256
weighted avg       1.00      1.00      1.00       256

1.0


In [13]:
# Theoretically could do ensemble models but XGBoost has an accuracy of 100%. Starting to understand why this type of model wins Kaggle competions all the time. 
# I could theretically plot an ROC curve but I am very confident in my XGBoost model