In [15]:
import pandas as pd
pd.set_option("display.max_columns", 100)
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Any, Union

In [16]:
IN_CSV_DATA = Path().cwd().parent.parent / "data/4_data_split"
OUT_CSV_DATA = Path().cwd().parent.parent / "data/4_data_split"

# 1. Loading Test/Train Data

In [17]:
df_train = pd.read_csv(IN_CSV_DATA/'ride_summary_train.csv')
df_test = pd.read_csv(IN_CSV_DATA/'ride_summary_test.csv')
df_train['start_time'] = pd.to_datetime(df_train['start_time'])
df_test['start_time'] = pd.to_datetime(df_test['start_time'])
# Sort the summary by ascending order of ride start time
df_train = df_train.set_index('start_time').sort_index(ascending=True).reset_index()
df_test = df_train.set_index('start_time').sort_index(ascending=True).reset_index()

In [18]:
ride_cols = ['ride_id','start_time']
numerical_feature_cols = ['total_distance_mi','total_weight_lbs','avg_cruising_speed', 'log_hours_since_last_ride',
                            'active_time_ratio', 'avg_climb_rate', 'distance_training_intensity','prior_training_load']
categorical_feature_cols = ['year']

target_cols = ['best_power_4s', 'best_power_5s',
                'best_power_10s', 'best_power_20s', 'best_power_30s', 'best_power_1m',
                'best_power_2m', 'best_power_3m', 'best_power_4m', 'best_power_5m',
                'best_power_6m', 'best_power_10m', 'best_power_20m', 'best_power_30m',
                'best_power_40m', 'best_power_1h', 'best_power_2h']

In [19]:
df_train.head()

Unnamed: 0,start_time,ride_id,total_distance_mi,total_weight_lbs,avg_cruising_speed,log_hours_since_last_ride,active_time_ratio,avg_climb_rate,distance_training_intensity,prior_training_load,year,best_power_4s,best_power_5s,best_power_10s,best_power_20s,best_power_30s,best_power_1m,best_power_2m,best_power_3m,best_power_4m,best_power_5m,best_power_6m,best_power_10m,best_power_20m,best_power_30m,best_power_40m,best_power_1h,best_power_2h
0,2019-05-02 19:13:00,0x5ccb413c,8.865922,255,14.574504,1.659221,0.892545,8.414548,1.0,0.0,2019,485.282271,481.991827,462.283537,412.144319,391.244346,314.816769,209.581472,186.631718,179.636006,157.569641,155.009664,142.368486,134.44285,130.264839,0.0,0.0,0.0
1,2019-05-03 22:52:44,0x5cccc63c,9.516836,255,12.081964,1.431292,0.742305,7.339808,1.0,0.617508,2019,432.549324,425.735809,389.502414,330.2679,299.166732,225.907473,143.7939,147.597113,129.794118,118.283785,115.819729,107.256435,98.405537,97.000798,94.003892,0.0,0.0
2,2019-05-15 12:01:32,0x5cdbff9c,8.02963,255,14.701154,2.441109,0.896998,7.886564,0.843729,0.011599,2019,422.177957,419.814683,393.709515,363.263188,336.841737,271.435198,215.164384,187.689124,188.538262,164.696769,157.267499,154.66866,139.481512,138.60614,0.0,0.0,0.0
3,2019-05-15 21:24:33,0x5cdc8391,8.452656,255,14.453954,0.943618,0.87127,6.895879,0.888179,0.73107,2019,416.729572,414.718966,393.124198,327.432493,295.389433,231.954659,191.952644,182.999122,175.94251,169.921718,170.186598,156.173244,143.046556,137.236242,0.0,0.0,0.0
4,2019-05-24 12:13:37,0x5ce7dff1,8.018602,255,15.17962,2.314218,0.918417,8.303722,0.84257,0.040566,2019,419.659872,414.650607,402.052528,376.548577,344.272237,262.329945,225.472466,193.519868,187.969739,181.348731,173.976554,167.162898,150.946458,151.020599,0.0,0.0,0.0


# 2. Pipeline Preparation

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

num_pipeline = Pipeline([('selector', DataFrameSelector(numerical_feature_cols)),
                         ('std_scaler', StandardScaler())
                        ])

ordinal_cat_pipeline = Pipeline([('selector', DataFrameSelector(categorical_feature_cols)),
                                 ('ordinal', OrdinalEncoder())
                                ])

In [22]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[('numerical_pipeline', num_pipeline), 
                                               ('categorical_pipeline', ordinal_cat_pipeline)
                                              ])

In [23]:
type(full_pipeline)

sklearn.pipeline.FeatureUnion

# 3. Pipeline Application
## 3a. Train Dataset

In [24]:
def get_prepared_X_y_dataframe(pipeline:FeatureUnion, df:pd.DataFrame, 
                                feature_cols:list[str], target_cols:list[str]) -> pd.DataFrame:
    X_data_prepared = full_pipeline.fit_transform(df)
    print(f'Shape of X = {X_data_prepared.shape}')
    y_targets = df_train[target_cols].values
    print(f'Shape of y = {y_targets.shape}')
    column_names = feature_cols + target_cols
    df_prepared = pd.DataFrame(np.c_[X_data_prepared, y_targets], columns=column_names)
    return df_prepared


In [25]:
feature_cols = numerical_feature_cols + categorical_feature_cols 
df_train_prepped = get_prepared_X_y_dataframe(full_pipeline, df_train, feature_cols, target_cols)

Shape of X = (330, 9)
Shape of y = (330, 17)


In [26]:
df_train_prepped.to_csv(OUT_CSV_DATA / "prepared_train.csv", index=False)

In [27]:
df_test_prepped = get_prepared_X_y_dataframe(full_pipeline, df_test, feature_cols, target_cols)

Shape of X = (330, 9)
Shape of y = (330, 17)


In [28]:
df_test_prepped.to_csv(OUT_CSV_DATA / "prepared_test.csv", index=False)