In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw export; the first CSV column is used as the row index.
workouts = pd.read_csv(filepath_or_buffer='workouts.csv', index_col=0)
# Preview the first five rows.
workouts.head(n=5)

Unnamed: 0,type,date,moving_time,activity_id,name,distance,elevation gain,trainer,average_speed,max_speed,...,average_cadence,kilojoules,gear_id,average_temp,start_longitude,start_latitude,timezone,location_city,location_state,location_country
0,Run,2021-01-25,0 days 00:23:15,4681776003,Afternoon Run,2.01,26,False,5,5,...,,,g5384532,,-105.0,39.75,America/Denver,,,United States
1,Run,2021-01-24,0 days 00:13:49,4676373084,Afternoon Run,1.17,26,False,5,5,...,,,g5384532,,-105.0,39.75,America/Denver,,,United States
2,WeightTraining,2021-01-24,0 days 00:21:03,4676271709,Afternoon Activity,0.0,0,True,0,0,...,,,,,,,America/Denver,,,United States
3,Hike,2021-01-23,0 days 01:18:58,4669702889,Green mountain w Abby,3.71,740,False,3,3,...,,,g5384532,,-105.19,39.7,America/Denver,,,United States
4,Walk,2021-01-22,0 days 00:14:31,4663042410,Morning Walk,0.75,27,False,3,3,...,,,g5384532,,-105.0,39.75,America/Denver,,,United States


### What are the column types?

In [3]:
# Show the dtype pandas inferred for each column on load.
workouts.dtypes

type                  object
date                  object
moving_time           object
activity_id            int64
name                  object
distance             float64
elevation gain         int64
trainer                 bool
average_speed          int64
max_speed              int64
average_watts        float64
suffer_score         float64
average_heartrate    float64
average_cadence      float64
kilojoules           float64
gear_id               object
average_temp         float64
start_longitude      float64
start_latitude       float64
timezone              object
location_city         object
location_state        object
location_country      object
dtype: object

### Missing Data

In [4]:
# Count missing values per column before any imputation.
workouts.isna().sum()

type                    0
date                    0
moving_time             0
activity_id             0
name                    0
distance                0
elevation gain          0
trainer                 0
average_speed           0
max_speed               0
average_watts         487
suffer_score          284
average_heartrate     282
average_cadence       527
kilojoules            493
gear_id                63
average_temp          639
start_longitude       395
start_latitude        395
timezone                0
location_city        1002
location_state       1002
location_country        0
dtype: int64

If any of the columns below is missing a value, fill it with that column's average. We don't want to lose those observations, given that we don't have much data.

In [5]:
# Replace missing values in the three effort metrics with each column's own mean.
# Columns not named in the mapping are left untouched by fillna.
workouts = workouts.fillna({
    metric: workouts[metric].mean()
    for metric in ('average_heartrate', 'kilojoules', 'suffer_score')
})

In [6]:
# Re-count missing values per column to confirm the imputation took effect.
workouts.isna().sum()

type                    0
date                    0
moving_time             0
activity_id             0
name                    0
distance                0
elevation gain          0
trainer                 0
average_speed           0
max_speed               0
average_watts         487
suffer_score            0
average_heartrate       0
average_cadence       527
kilojoules              0
gear_id                63
average_temp          639
start_longitude       395
start_latitude        395
timezone                0
location_city        1002
location_state       1002
location_country        0
dtype: int64

I am comfortable with the remaining missing data.

In [7]:
# Normalise text columns to lower case and encode boolean flags as 0/1.
# Only string-typed columns are lower-cased, so numeric columns keep their
# dtypes and missing values stay NaN instead of becoming the literal 'nan'.
text_columns = [
    name for name, dtype in workouts.dtypes.items()
    if pd.api.types.is_string_dtype(dtype)
]
bool_columns = [
    name for name, dtype in workouts.dtypes.items()
    if pd.api.types.is_bool_dtype(dtype)
]

# .str.lower() leaves NaN entries as NaN.
workouts = workouts.assign(
    **{name: workouts[name].str.lower() for name in text_columns}
)

# True/False -> 1/0, applied to boolean columns only so that a text value that
# happens to read "true"/"false" is never rewritten.
workouts = workouts.astype({name: int for name in bool_columns})

### Feature Engineering

In [8]:
# Derive calendar features from the activity date.
# The column is parsed to datetime for the .dt accessors and converted back to
# a 'YYYY-MM-DD' string at the end of the cell.
workouts['date'] = pd.to_datetime(workouts['date'])
workouts['year'] = workouts['date'].dt.year
workouts['month'] = workouts['date'].dt.month
workouts['mnth_yr'] = workouts['date'].dt.strftime('%Y-%m')
workouts['day'] = workouts['date'].dt.day
workouts['dow'] = workouts['date'].dt.day_name()
# Series.dt.week is deprecated and removed in pandas 2.x; isocalendar().week is
# the replacement. It returns a nullable UInt32, so cast back to a plain int.
# NOTE(review): the cast raises if any date failed to parse (NaT) — the null
# counts earlier in the notebook show no missing dates.
workouts['week_number'] = workouts['date'].dt.isocalendar().week.astype(int)
# NOTE(review): the displayed dates carry no time-of-day, so 'hour' appears to
# be 0 for every row — confirm whether this column is actually useful.
workouts['hour'] = workouts['date'].dt.hour
workouts['date'] = workouts['date'].dt.strftime('%Y-%m-%d')

In [9]:
# Parse the duration text into timedeltas, then derive the duration in minutes.
workouts['moving_time'] = workouts['moving_time'].pipe(pd.to_timedelta)
# The int cast drops the fractional minute rather than rounding.
workouts['moving_time (minutes)'] = (workouts['moving_time'].dt.total_seconds() / 60).astype(int)

In [10]:
# Preview the first five rows, including the newly engineered columns.
workouts.head(n=5)

Unnamed: 0,type,date,moving_time,activity_id,name,distance,elevation gain,trainer,average_speed,max_speed,...,location_state,location_country,year,month,mnth_yr,day,dow,week_number,hour,moving_time (minutes)
0,run,2021-01-25,0 days 00:23:15,4681776003,afternoon run,2.01,26,0,5,5,...,,united states,2021,1,2021-01,25,Monday,4,0,23
1,run,2021-01-24,0 days 00:13:49,4676373084,afternoon run,1.17,26,0,5,5,...,,united states,2021,1,2021-01,24,Sunday,3,0,13
2,weighttraining,2021-01-24,0 days 00:21:03,4676271709,afternoon activity,0.0,0,1,0,0,...,,united states,2021,1,2021-01,24,Sunday,3,0,21
3,hike,2021-01-23,0 days 01:18:58,4669702889,green mountain w abby,3.71,740,0,3,3,...,,united states,2021,1,2021-01,23,Saturday,3,0,78
4,walk,2021-01-22,0 days 00:14:31,4663042410,morning walk,0.75,27,0,3,3,...,,united states,2021,1,2021-01,22,Friday,3,0,14


### Identifying and Removing Any Outliers

In [11]:
# Number of activities per workout type, most frequent first.
workouts['type'].value_counts()

ride              777
run               315
virtualride        66
walk               60
weighttraining     26
workout            19
hike                9
swim                4
yoga                4
crossfit            1
rowing              1
Name: type, dtype: int64

In [12]:
# Drop workout types we are not interested in; their sample sizes are too small
# to train on even if we wanted to.
# .copy() gives an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning on a filtered slice.
workouts = workouts.loc[~workouts['type'].isin(['crossfit', 'rowing'])].copy()

# Fold near-duplicate categories into their parent type:
#   weighttraining -> workout
#   virtualride    -> ride
workouts['type'] = workouts['type'].replace({
    'weighttraining': 'workout',
    'virtualride': 'ride',
})


In [13]:
# Re-count activities per type after dropping and merging categories.
workouts['type'].value_counts()

ride       843
run        315
walk        60
workout     45
hike         9
swim         4
yoga         4
Name: type, dtype: int64

In [14]:
# Persist the cleaned frame. The row index is written as the first CSV column,
# which matches how the raw file was loaded (index_col=0).
workouts.to_csv(path_or_buf='workouts_cleaned.csv', index=True)