In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: uncomment to also raise the row limit when inspecting long frames.
#pd.set_option("display.max_rows", 1000)
# Allow up to 100 columns to be shown so wide frames are not truncated on display.
pd.set_option("display.max_columns", 100)

In [2]:
# Location of the prepared training activity log, relative to this notebook.
file_paths = {'in': '../../../data/prepared/activity_log_train.csv'}

df = pd.read_csv(file_paths['in'])

# read_csv is called without date parsing, so convert the timestamp
# columns to datetime explicitly, one column at a time.
time_columns = ['start_time', 'end_time']
df[time_columns] = df[time_columns].apply(pd.to_datetime)

### Replace Peak Power Nulls

In [14]:
# Some rows have no peak_20min_power. Estimate those rows as
# intensity * FTP; rows that already have a value are left untouched.
filt_nan = df['peak_20min_power'].isna()

estimated_peak = df['ftp_power'].mul(df['intensity'])
df.loc[filt_nan, 'peak_20min_power'] = estimated_peak[filt_nan]

In [3]:
# Columns used downstream: the target, the numeric features and the
# categorical training-window identifier.
cols_to_keep = [
    'simple_exertion',
    'elapsed_distance',
    'moving_time',
    'avg_speed',
    'ride_cruise_speed',
    'ride_avg_power',
    'peak_20min_power',
    'training_window_id',
]

### Replace Simple Exertion with Enumerated Categories

In [7]:
# Ordinal codes for the exertion labels (EASY < NORMAL < HARD).
exertion_codes = {'EASY': 1, 'NORMAL': 2, 'HARD': 3}

# replace() only touches values that appear as keys, so re-running this
# cell on already-encoded data leaves the column unchanged.
df['simple_exertion'] = df['simple_exertion'].replace(exertion_codes)
df['simple_exertion']

0      2
1      3
2      3
3      3
4      2
      ..
121    2
122    1
123    2
124    2
125    2
Name: simple_exertion, Length: 126, dtype: int64

In [8]:
# Numeric features (standardised by the numeric pipeline).
num_cols = [
    'elapsed_distance',
    'moving_time',
    'avg_speed',
    'ride_cruise_speed',
    'ride_avg_power',
    'peak_20min_power',
]
# Nominal feature (one-hot encoded by the categorical pipeline).
cat_cols = ['training_window_id']
# Prediction target; kept as a one-element list so it can be concatenated
# with the other column lists.
target_col = ['simple_exertion']

# 2. Pipeline Preparation

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of DataFrame columns as an array.

    Intended as the first step of a Pipeline so that the following steps
    receive a plain array instead of a DataFrame.
    """

    def __init__(self, attribute_names):
        """Remember which columns to select.

        attribute_names: column label(s) used to index the incoming frame.
        """
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return self so the step can be chained."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of X."""
        selected = X[self.attribute_names]
        return selected.values

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: pick the numeric columns, then standardise them.
numeric_steps = [
    ('selector', DataFrameSelector(num_cols)),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: pick the nominal columns, then one-hot encode them into
# a dense array so the result can be combined with the scaled numeric block.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name, so `sparse=False` raises a
# TypeError on current releases. Prefer the new keyword and fall back to the
# old one only when the installed version does not know it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

nominal_cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_cols)),
                                 ('one_hot', one_hot_encoder)
                                ])

In [12]:
from sklearn.pipeline import FeatureUnion

# Run both branches on the same input and place their outputs side by side:
# numeric columns first, one-hot columns second.
feature_branches = [
    ('numerical_pipeline', num_pipeline),
    ('categorical_pipeline', nominal_cat_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=feature_branches)

### Check Nullity 

In [15]:
# Summarise the non-null count and dtype of every column the pipeline will use.
df.loc[:, cols_to_keep].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   simple_exertion     126 non-null    int64  
 1   elapsed_distance    126 non-null    float64
 2   moving_time         126 non-null    int64  
 3   avg_speed           126 non-null    float64
 4   ride_cruise_speed   126 non-null    float64
 5   ride_avg_power      126 non-null    float64
 6   peak_20min_power    126 non-null    float64
 7   training_window_id  126 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 8.0 KB


### Apply Pipeline to Prepare Data for ML

In [19]:
# Fit every branch of the FeatureUnion on df and build the combined feature array.
data_prepared = full_pipeline.fit_transform(df)
# (rows, feature columns) of the prepared array.
data_prepared.shape

(126, 19)

In [20]:
# Show the prepared array; NumPy abbreviates the repr of large arrays with "...".
data_prepared

array([[ 0.30194675,  0.17918467,  0.50259987, ...,  0.        ,
         0.        ,  0.        ],
       [-0.5583539 , -0.7691198 ,  0.86486012, ...,  0.        ,
         0.        ,  0.        ],
       [-0.5957914 , -0.80185087,  0.88804778, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.332341  ,  0.13524432, -1.55925874, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05286141,  0.17021725, -0.61943897, ...,  1.        ,
         0.        ,  0.        ],
       [-0.67087782, -0.4561569 , -1.32563783, ...,  0.        ,
         0.        ,  0.        ]])

# 3. Save Prepared Values

In [24]:
# Build one "training_window_<id>" label per category learned by the fitted
# encoder, in the encoder's own category order.
fitted_one_hot = nominal_cat_pipeline.named_steps['one_hot']
training_id_one_hot = [f'training_window_{window_id}'
                       for window_id in fitted_one_hot.categories_[0]]
print(training_id_one_hot)

['training_window_0', 'training_window_1', 'training_window_2', 'training_window_3', 'training_window_4', 'training_window_5', 'training_window_6', 'training_window_7', 'training_window_8', 'training_window_9', 'training_window_10', 'training_window_11', 'training_window_12']


In [26]:
# target_col is a list, so this selects a one-column frame and yields a 2-D array.
targets = df.loc[:, target_col].values

In [29]:
# Final column order: numeric features, one-hot window labels, then the target.
column_names = [*num_cols, *training_id_one_hot, *target_col]
print(column_names)

['elapsed_distance', 'moving_time', 'avg_speed', 'ride_cruise_speed', 'ride_avg_power', 'peak_20min_power', 'training_window_0', 'training_window_1', 'training_window_2', 'training_window_3', 'training_window_4', 'training_window_5', 'training_window_6', 'training_window_7', 'training_window_8', 'training_window_9', 'training_window_10', 'training_window_11', 'training_window_12', 'simple_exertion']
