In [2]:
import pandas as pd
pd.set_option("display.max_columns", 100)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

# Input/output folders for this pipeline stage, located two directory levels
# above the notebook's current working directory.
IN_CSV_DATA = Path.cwd().parent.parent / "data/2_loaded"
OUT_CSV_DATA = Path.cwd().parent.parent / "data/3_feature_engineered"

In [3]:
# Calendar month names mapped to their month number; the key order defines the
# ordering of the `month_name` categorical built below.
month_order = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}

# Load the per-ride summary and derive the calendar/time columns in one pass:
#   - start_date           -> datetime
#   - year / month         -> integer parts of start_date
#   - month_name           -> ordered categorical (January < ... < December)
#   - start_time, end_time -> timedeltas (offsets parsed from the CSV text)
df_summary = (
    pd.read_csv(IN_CSV_DATA / 'summary/ride_summary_good.csv')
    .assign(
        start_date=lambda frame: pd.to_datetime(frame['start_date']),
        year=lambda frame: frame['start_date'].dt.year,
        month=lambda frame: frame['start_date'].dt.month,
        month_name=lambda frame: pd.Categorical(
            frame['start_date'].dt.month_name(),
            categories=list(month_order),
            ordered=True,
        ),
        start_time=lambda frame: pd.to_timedelta(frame['start_time']),
        end_time=lambda frame: pd.to_timedelta(frame['end_time']),
    )
)

In [4]:
# List every column available in the loaded summary table.
df_summary.columns

Index(['ride_id', 'start_date', 'start_time', 'end_time', 'biker_weight_lbs',
       'bike_weight', 'bag_weight', 'avg_speed', 'avg_cruising_speed',
       'total_ride_time_sec', 'total_moving_time_sec', 'total_distance_mi',
       'total_ascent_ft', 'total_descent_ft', 'avg_heart_rate', 'avg_power',
       'avg_cadence', 'avg_ambient_temp_F', 'best_power_4s', 'best_power_5s',
       'best_power_10s', 'best_power_20s', 'best_power_30s', 'best_power_1m',
       'best_power_2m', 'best_power_3m', 'best_power_4m', 'best_power_5m',
       'best_power_6m', 'best_power_10m', 'best_power_20m', 'best_power_30m',
       'best_power_40m', 'best_power_1h', 'best_power_2h', 'year', 'month',
       'month_name'],
      dtype='object')

In [7]:
# Best-effort power columns, one per averaging window (4 s up to 2 h).
power_cols = [
    'best_power_4s', 'best_power_5s', 'best_power_10s', 'best_power_20s', 'best_power_30s',
    'best_power_1m', 'best_power_2m', 'best_power_3m', 'best_power_4m', 'best_power_5m',
    'best_power_6m', 'best_power_10m', 'best_power_20m', 'best_power_30m', 'best_power_40m',
    'best_power_1h', 'best_power_2h',
]

# Ride identity, timing, distance, speed and elevation columns, followed by the power columns.
cols_of_interest = [
    'ride_id', 'start_date', 'start_time', 'end_time',
    'total_distance_mi', 'avg_speed', 'avg_cruising_speed',
    'total_ride_time_sec', 'total_moving_time_sec',
    'total_ascent_ft', 'total_descent_ft',
    'year', 'month_name',
] + power_cols

# Keep only the columns of interest, in the order listed above.
df_summary = df_summary.loc[:, cols_of_interest]

# A missing best-power value is replaced with 0.0 in every power column.
df_summary[power_cols] = df_summary[power_cols].fillna(0.0)

In [6]:
# Preview the first rows of the summary table.
df_summary.head()

Unnamed: 0,ride_id,start_date,start_time,end_time,total_distance_mi,avg_speed,avg_cruising_speed,total_ride_time_sec,total_moving_time_sec,total_ascent_ft,total_descent_ft,year,month_name,best_power_4s,best_power_5s,best_power_10s,best_power_20s,best_power_30s,best_power_1m,best_power_2m,best_power_3m,best_power_4m,best_power_5m,best_power_6m,best_power_10m,best_power_20m,best_power_30m,best_power_40m,best_power_1h,best_power_2h
0,0x5b086853,2018-05-25,0 days 19:47:31,0 days 20:27:31,6.991217,13.720594,14.267175,2401.0,1736.0,220.1551,237.8725,2018,May,672.602587,667.914386,629.65666,514.425348,410.461476,303.697449,224.933753,198.493162,179.981622,180.681178,183.201559,153.455322,143.166693,121.926138,0.0,0.0,0.0
1,0x5b3c1e5a,2018-07-04,0 days 01:09:46,0 days 01:42:05,6.994353,13.615799,14.215438,1940.0,1737.0,152.5665,180.1269,2018,July,528.072743,521.385666,483.86017,376.2232,296.104667,242.541335,194.185269,168.695252,166.227917,154.582374,149.646974,142.407923,119.830925,110.8224,0.0,0.0,0.0
2,0x5ccb413c,2019-05-02,0 days 19:13:00,0 days 19:53:00,8.865922,13.784195,14.574504,2401.0,2143.0,300.5396,300.5396,2019,May,485.282271,481.991827,462.283537,412.144319,391.244346,314.816769,209.581472,186.631718,179.636006,157.569641,155.009664,142.368486,134.44285,130.264839,0.0,0.0,0.0
3,0x5cccc63c,2019-05-03,0 days 22:52:44,0 days 23:53:54,9.516836,11.228317,12.081964,3671.0,2725.0,333.3496,280.1974,2019,May,432.549324,425.735809,389.502414,330.2679,299.166732,225.907473,143.7939,147.597113,129.794118,118.283785,115.819729,107.256435,98.405537,97.000798,94.003892,0.0,0.0
4,0x5cdbff9c,2019-05-15,0 days 12:01:32,0 days 12:37:36,8.02963,14.256866,14.701154,2165.0,1942.0,255.2618,211.2964,2019,May,422.177957,419.814683,393.709515,363.263188,336.841737,271.435198,215.164384,187.689124,188.538262,164.696769,157.267499,154.66866,139.481512,138.60614,0.0,0.0,0.0


# Making New Features
## 1. Time Since Last Ride
How long has it been since the last ride?

In [9]:
# Turn the time-of-day offsets into absolute timestamps by adding them to the ride date.
# The dtype guard makes this cell safe to re-run: once a column has been promoted to a
# datetime it is left alone, instead of raising TypeError on datetime + datetime.
# NOTE(review): this assumes both offsets are measured from `start_date`; a ride that
# crosses midnight would need an end offset greater than 24h to stay after its start - confirm.
for time_col in ('start_time', 'end_time'):
    if pd.api.types.is_timedelta64_dtype(df_summary[time_col]):
        df_summary[time_col] = df_summary['start_date'] + df_summary[time_col]

In [None]:
# Ensure the rides are in chronological order so that "previous row" means "previous ride".
df_summary = df_summary.set_index('start_time').sort_index().reset_index()

sec2hour = 1.0/3600 # convert seconds to hours

# Sentinel gap (in hours) used when there is no previous ride to measure from. It is
# deliberately huge so that any contribution to fatigue/fitness is treated as zero.
NO_PREVIOUS_RIDE_HOURS = 10000

# Rest gap = this ride's start minus the previous ride's end. Subtracting the datetime
# columns directly is vectorized, so no per-row timestamp conversion or temporary
# column is needed.
rest_gap = df_summary['start_time'] - df_summary['end_time'].shift()

# The first ride has no predecessor (and a missing timestamp also yields a missing gap);
# those rows receive the sentinel value.
df_summary['hours_since_last_ride'] = (
    sec2hour * rest_gap.dt.total_seconds()
).fillna(NO_PREVIOUS_RIDE_HOURS)

## 2. Active Ride Time Ratio
What fraction of the total ride time was I actually moving rather than resting?

In [12]:
# Share of the total ride time that was spent moving (moving seconds / total seconds).
df_summary['active_time_ratio'] = df_summary['total_moving_time_sec'].div(
    df_summary['total_ride_time_sec']
)