In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 100)

In [36]:
file_paths = {'in':'../../../data/prepared/activity_log_train.csv',
              'out':'../../../data/prepared/activity_log_train.csv'
             }

df = pd.read_csv(file_paths['in'])

# guarantee the timestamps are datetime objects
time_columns = ['start_time','end_time']

for time_col in time_columns:
    df[time_col] = pd.to_datetime(df[time_col])

In [8]:
df.head()

Unnamed: 0,start_time,ride_id,row_count,segment_count,end_time,elapsed_time,moving_time,elapsed_distance,elapsed_ascent,elapsed_descent,max_speed,avg_speed,ride_cruise_speed,avg_power,ride_avg_power,ride_max_power,peak_20min_power,training_window_id,perceived_exertion,simple_exertion,perceived_relative_effort,ride_work,hours_since_last_ride,active_time_ratio,ftp_power,intensity,time_intensity,start_TOD,start_DOW
0,2020-09-02 21:21:17+00:00,4003175803,4224,15,2020-09-02 22:46:42+00:00,5151,4173,14.666919,611.2503,612.5627,28.632832,12.522739,14.541428,95.573578,112.72417,1058.440085,146.082425,4,5.0,NORMAL,75.0,580.6422,75.300833,0.810134,146.082425,0.771648,46.488211,21,2
1,2020-10-24 20:44:22+00:00,4238817655,2097,5,2020-10-24 21:22:49+00:00,2308,2058,7.747065,192.2666,146.9888,22.145705,13.312794,15.123428,103.136627,118.232006,606.592362,144.19097,5,7.0,HARD,65.0,272.879469,0.018333,0.891681,144.4606,0.818438,0.014446,20,5
2,2021-09-20 11:41:28+00:00,5990296028,2013,4,2021-09-20 12:17:01+00:00,2134,1985,7.445936,201.7815,148.6293,25.053728,13.363364,15.723231,113.92363,129.781023,1067.198334,136.550865,11,8.0,HARD,85.0,276.952703,59.486389,0.930178,169.020795,0.767841,35.729477,11,0
3,2020-10-16 19:04:26+00:00,4203579387,3169,8,2020-10-16 20:06:37+00:00,3786,3114,11.225301,318.9132,320.8818,20.579848,12.767843,14.524287,98.027222,104.373331,522.192986,128.441399,5,7.0,HARD,99.0,395.15743,459.581944,0.822504,144.4606,0.722504,292.237704,19,4
4,2020-08-06 22:28:05+00:00,3875911617,1159,11,2020-08-06 23:46:19+00:00,4720,1141,3.171579,78.4159,58.0737,29.751302,9.914663,15.002448,110.379005,94.070193,615.164105,,3,5.0,NORMAL,20.0,444.011312,4.995833,0.241737,161.323024,0.583117,3.202162,22,3


In [9]:
df.shape

(126, 29)

# 1. Feature Selection

In [12]:
df.columns

Index(['start_time', 'ride_id', 'row_count', 'segment_count', 'end_time',
       'elapsed_time', 'moving_time', 'elapsed_distance', 'elapsed_ascent',
       'elapsed_descent', 'max_speed', 'avg_speed', 'ride_cruise_speed',
       'avg_power', 'ride_avg_power', 'ride_max_power', 'peak_20min_power',
       'training_window_id', 'perceived_exertion', 'simple_exertion',
       'perceived_relative_effort', 'ride_work', 'hours_since_last_ride',
       'active_time_ratio', 'ftp_power', 'intensity', 'time_intensity',
       'start_TOD', 'start_DOW'],
      dtype='object')

In [16]:
numerical_columns = ['row_count', 'segment_count','elapsed_time','moving_time', 'active_time_ratio',
                     'elapsed_distance','elapsed_ascent','elapsed_descent','max_speed','avg_speed',
                     'ride_cruise_speed','avg_power','ride_avg_power','peak_20min_power','ride_work',
                     'hours_since_last_ride','intensity','time_intensity']
categorical_columns = ['start_TOD', 'start_DOW','training_window_id']
target_column = ['simple_exertion']

### Numerical Features

In [32]:
from sklearn.feature_selection import f_classif

def perform_anova_test(X,y):
    result = f_classif(X,y)
    score = result[0][0]
    p_value = result[1][0]
    return score, p_value

def evaluate_numerical_features(df, p_threshold=0.05):
    data = df.drop('simple_exertion', axis=1)
    targets = df['simple_exertion']
    
    # create the anova results dataframe
    anova_results = pd.DataFrame(np.zeros([len(data.columns),2]),
                                index=list(data), columns=['anova_f_score','p_value'])
    
    for feature in list(data):
        # find the instances for which this feature is NaN
        filt_nan = data.loc[:,feature].isna()
        #define the existing data's X and y for sklearn
        X_feature, y_feature = data.loc[~filt_nan, feature].values, targets.loc[~filt_nan].values
        X_feature = X_feature.reshape(-1,1)
        # store the test results
        score, p_value = perform_anova_test(X_feature, y_feature)
        anova_results.loc[feature, 'anova_f_score'] = score
        anova_results.loc[feature, 'p_value'] = p_value
        
    anova_results.sort_values(by='anova_f_score', ascending=False, inplace=True)
    filt_p_value = anova_results.loc[:, 'p_value'] <= p_threshold
    
    best_features = list(anova_results.loc[filt_p_value, :].index)
    
    return anova_results, best_features
        

In [33]:
results, best_features = evaluate_numerical_features(df[numerical_columns+target_column])

In [34]:
results

Unnamed: 0,anova_f_score,p_value
elapsed_distance,11.943107,1.8e-05
ride_work,9.236968,0.000183
moving_time,8.29258,0.000418
row_count,8.218895,0.000446
avg_speed,7.911778,0.000586
elapsed_ascent,6.751688,0.001651
elapsed_descent,6.540173,0.001999
peak_20min_power,5.95657,0.003586
ride_avg_power,4.696023,0.010831
ride_cruise_speed,4.571222,0.012164


In [35]:
best_features

['elapsed_distance',
 'ride_work',
 'moving_time',
 'row_count',
 'avg_speed',
 'elapsed_ascent',
 'elapsed_descent',
 'peak_20min_power',
 'ride_avg_power',
 'ride_cruise_speed',
 'elapsed_time',
 'intensity']

### Categorical Features

In [37]:
from sklearn.feature_selection import chi2

def perform_chi2_test(X,y):
    result = chi2(X,y)
    score = result[0][0]
    p_value = result[1][0]
    return score, p_value

def evaluate_categorical_features(df, p_threshold=0.05):
    data = df.drop('simple_exertion', axis=1)
    targets = df['simple_exertion']
    
    # create the anova results dataframe
    chi2_results = pd.DataFrame(np.zeros([len(data.columns),2]),
                                index=list(data), columns=['chi2_score','p_value'])
    
    for feature in list(data):
        # find the instances for which this feature is NaN
        filt_nan = data.loc[:,feature].isna()
        #define the existing data's X and y for sklearn
        X_feature, y_feature = data.loc[~filt_nan, feature].values, targets.loc[~filt_nan].values
        X_feature = X_feature.reshape(-1,1)
        # store the test results
        score, p_value = perform_chi2_test(X_feature, y_feature)
        chi2_results.loc[feature, 'chi2_score'] = score
        chi2_results.loc[feature, 'p_value'] = p_value
        
    chi2_results.sort_values(by='chi2_score', ascending=False, inplace=True)
    filt_p_value = chi2_results.loc[:, 'p_value'] <= p_threshold
    
    best_features = list(chi2_results.loc[filt_p_value, :].index)
    
    return chi2_results, best_features
        

In [38]:
results, best_features = evaluate_categorical_features(df[categorical_columns+target_column])

In [39]:
results

Unnamed: 0,chi2_score,p_value
training_window_id,11.614369,0.003006
start_DOW,3.83092,0.147274
start_TOD,0.074279,0.963542
