In [428]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [429]:
# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/dennissmith0/Running_Performance/main/prep/training_data.csv')

In [430]:
# # Display the first few rows of the dataframe
# print(df.head())

# # Get a summary of the dataframe
# print(df.info())

# # Generate descriptive statistics of the dataframe
# print(df.describe())

# # Check for missing values
# print(df.isnull().sum())

In [431]:
# Convert the "Activity Date" column to datetime
df['Activity Date'] = pd.to_datetime(df['Activity Date'])

# Sort chronologically so that each row's predecessor is the previous activity
df = df.sort_values('Activity Date')

# Whole days elapsed since the previous activity; serves as a measure of 'Rest Days'.
# NOTE: this is computed over ALL activity types, i.e. before the later cell that
# removes every activity that is not a 'Run'.
df['Days Between Activity'] = df['Activity Date'].diff().dt.days

# The chronologically first activity has no predecessor, so diff() leaves NaN there.
# Address that row by POSITION: sort_values keeps the original index labels, so the
# row labelled 0 is not guaranteed to be the first row after sorting.
gap_col = df.columns.get_loc('Days Between Activity')
if len(df) > 0 and pd.isna(df.iloc[0, gap_col]):
    df.iloc[0, gap_col] = 0.0

# Now set as datetime index
df.set_index('Activity Date', inplace=True)

In [432]:
# For now, keep only the 'Run' activities. Once filtered, 'Activity Type' holds a
# single value, so the column is dropped as well.
# Note: reset_index(drop=True) discards the current index and renumbers rows from 0.
df = (
    df.loc[df['Activity Type'] == 'Run']
    .drop(columns='Activity Type')
    .reset_index(drop=True)
)

In [433]:
# Replace NaN values in the 'Average Speed' column with the calculated speed using the 'Distance.1' and 'Moving Time' columns
def replace_nan_average_speed(df):
    """Fill missing 'Average Speed' values with 'Distance.1' / 'Moving Time'.

    Rows that already have an 'Average Speed' are left untouched. The ratio is
    computed for the whole column at once instead of row by row, which is
    faster and also works on a frame with zero rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Average Speed', 'Distance.1' and
        'Moving Time'. The 'Average Speed' column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Candidate value for every row; only used where 'Average Speed' is NaN.
    fallback_speed = df['Distance.1'] / df['Moving Time']
    df['Average Speed'] = df['Average Speed'].fillna(fallback_speed)
    return df

# Back-fill missing 'Average Speed' values. This has to run while 'Distance.1' is
# still in df: a later cell drops every column whose name ends in '.1'.
df = replace_nan_average_speed(df)

In [434]:
# Columns that are excluded from the model outright.
unused_columns = [
    'Elapsed Time', 'Activity ID', 'Activity Name', 'Media', 'Commute',
    'From Upload', 'Filename', 'Athlete Weight', 'Activity Gear',
    'Number of Runs', 'Prefer Perceived Exertion', 'Average Temperature',
    'Elevation Loss', 'Gear',
]

# Potentially important features that currently have too many NaN values.
# Dropped for now, until there is a meaningful way to fill them in.
sparse_columns = ['Grade Adjusted Distance']

df = df.drop(columns=unused_columns + sparse_columns)

In [435]:
def remove_columns_with_prefix(df, prefix):
    """Drop every column whose name starts with ``prefix``.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or on the right-hand side of an assignment.
    """
    doomed = []
    for name in df.columns:
        if name.startswith(prefix):
            doomed.append(name)

    df.drop(columns=doomed, inplace=True)
    return df

# Drop every column whose name begins with the literal text '<span'
# (presumably HTML markup that leaked into the exported header row; TODO confirm).
df = remove_columns_with_prefix(df, '<span')


In [436]:
# Some columns appear twice, the second copy carrying more decimals. Keep the
# rounded columns and remove the extra copies, which are identified by a name suffix.
def drop_columns_with_extra_decimals(df, suffix):
    """Drop every column whose name ends with ``suffix``.

    The frame is modified in place and the same object is returned.
    """
    duplicates = list(filter(lambda name: name.endswith(suffix), df.columns))
    df.drop(columns=duplicates, inplace=True)
    return df

# Drop every column whose name ends in '.1'
# (presumably the second occurrence of a repeated CSV header; verify against the raw file).
df = drop_columns_with_extra_decimals(df, '.1')

In [437]:
# Drop columns that are empty or whose share of NaN values reaches a threshold (50% by default)
def drop_columns_with_null(df, threshold=.50):
    """Drop columns in which at least ``threshold`` of the rows are null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. Modified in place.
    threshold : float, default 0.50
        Fraction of rows, between 0 and 1 inclusive. A column is dropped when
        its null count is greater than or equal to ``threshold * len(df)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``threshold`` is not within [0, 1].
    """
    if not 0 <= threshold <= 1:
        raise ValueError(f'threshold must be between 0 and 1, got {threshold!r}')

    # With zero rows every null count is 0 and 0 >= 0 would drop every column,
    # although there is no evidence that any of them is sparse.
    if len(df) == 0:
        return df

    null_counts = df.isnull().sum()
    columns_with_null = df.columns[null_counts >= (threshold * len(df))]
    df.drop(columns=columns_with_null, inplace=True)
    return df

# Prune the columns in which at least half of the rows are null.
df = drop_columns_with_null(df)

In [438]:
def remove_columns_with_few_values(dataframe, threshold):
    """Drop columns that hold fewer than ``threshold`` non-null values.

    ``Series.count()`` is used for the tally, so NaN entries are not counted.
    The frame is modified in place and the same object is returned.
    """
    sparse = []
    for name in dataframe.columns:
        populated = dataframe[name].count()
        if populated < threshold:
            sparse.append(name)

    dataframe.drop(columns=sparse, inplace=True)
    return dataframe

# Remove columns with fewer than 50 non-null values.
# The stricter threshold of 1000 is left commented out below for reference.
# df = remove_columns_with_few_values(df, 1000)
df = remove_columns_with_few_values(df, 50)

In [439]:
# Count the missing values left in each remaining column.
df.isnull().sum()

Distance                   0
Max Heart Rate           163
Relative Effort          163
Moving Time                0
Max Speed                  5
Average Speed              0
Elevation Gain             2
Elevation Low             23
Elevation High            23
Max Grade                  5
Average Grade              0
Max Cadence               44
Average Cadence           20
Average Heart Rate       305
Calories                   0
Days Between Activity      0
dtype: int64

In [440]:
# Inspect the last (up to 20) rows that have no 'Average Heart Rate' value.
df[df['Average Heart Rate'].isna()].tail(20)

Unnamed: 0,Distance,Max Heart Rate,Relative Effort,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Low,Elevation High,Max Grade,Average Grade,Max Cadence,Average Cadence,Average Heart Rate,Calories,Days Between Activity
372,5.62,,,3060.0,4.9,1.838301,478.0,254.0,540.0,49.099998,-0.03911,124.0,68.241158,,392.0,0.0
373,40.67,,,23386.0,5.3,1.739246,1681.0,206.399994,944.400024,42.700001,0.994739,124.0,71.098877,,2730.0,1.0
374,7.35,,,2406.0,5.8,3.055736,60.0,252.800003,313.399994,8.3,-0.038084,99.0,84.01857,,578.0,0.0
375,27.42,,,16500.0,4.8,1.662358,1533.0,656.599976,1156.199951,43.599998,0.382079,123.0,66.662773,,1826.0,0.0
376,3.55,,,2500.0,4.8,1.41984,152.0,266.799988,346.0,29.700001,0.078882,91.0,74.296761,,251.0,1.0
377,8.25,,,3889.0,4.7,2.122139,397.0,269.399994,557.0,37.200001,0.189022,116.0,74.842094,,596.0,0.0
378,22.38,,,12689.0,5.4,1.763732,1227.0,242.199997,579.599976,48.099998,0.138517,120.0,72.77327,,1599.0,1.0
379,14.85,,,7048.0,6.6,2.10708,675.0,468.0,946.200012,48.0,0.106392,115.0,73.636322,,1041.0,0.0
380,20.92,,,11518.0,7.7,1.816314,1196.0,469.200012,1234.400024,44.200001,1.228472,122.0,72.806862,,1415.0,1.0
381,13.5,,,4953.0,6.8,2.726933,141.0,-6.4,34.200001,13.5,-0.068115,113.0,82.054459,,1049.0,1.0


In [441]:
# Show the last 10 rows of the cleaned frame.
df.tail(10)

Unnamed: 0,Distance,Max Heart Rate,Relative Effort,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Low,Elevation High,Max Grade,Average Grade,Max Cadence,Average Cadence,Average Heart Rate,Calories,Days Between Activity
617,9.4,171.0,64.0,3531.0,5.267969,2.663234,289.0,1324.199951,1454.199951,47.197281,-0.085071,90.0,81.420311,147.338272,750.0,0.0
618,17.73,167.0,73.0,11388.0,5.634082,1.556976,1148.0,1707.199951,2534.399902,48.636837,-0.171453,118.0,67.808128,123.561058,1510.0,0.0
619,3.63,168.0,21.0,1257.0,5.42998,2.893882,122.0,1571.0,1676.0,33.101528,-0.291399,88.0,79.991745,147.546112,270.0,1.0
620,9.78,174.0,74.0,5362.0,6.736133,1.825179,720.0,1724.199951,2549.600098,49.164585,-0.128748,112.0,75.091339,142.909683,1036.0,0.0
621,4.49,162.0,18.0,1917.0,5.833984,2.345144,89.0,1319.400024,1347.400024,29.65811,-0.320311,118.0,78.520119,140.439209,380.0,0.0
622,5.64,173.0,37.0,2030.0,7.63775,2.781005,53.0,1294.599976,1343.199951,35.065315,-0.170048,110.0,82.424431,148.805817,430.0,1.0
623,20.06,177.0,154.0,6998.0,5.051595,2.867208,134.0,1772.199951,2570.0,30.338728,-3.919317,119.0,82.241585,153.711639,1513.0,1.0
624,9.62,170.0,34.0,7086.0,5.224023,1.357962,542.0,1759.0,2336.199951,49.868637,0.301376,122.0,62.268711,118.441292,893.0,2.0
625,10.41,155.0,27.0,7866.0,4.337891,1.324174,743.0,1908.199951,2707.600098,44.702003,0.011521,125.0,58.039753,112.808022,1045.0,0.0
626,24.99,160.0,71.0,19299.0,4.787988,1.295079,1489.0,1460.0,2512.800049,49.834286,-0.350488,124.0,64.065208,117.229675,1965.0,0.0


In [442]:
# SAVE FOR LATER MODEL. This model will only be modeled on "Run"

# # To adjust the "Activity Type" column to eliminate activities with less than 10% of the max value and convert it into
# # separate indicator columns for each activity, you can use the following function:
# def adjust_and_convert_activity_type_column(df):
#     activity_counts = df['Activity Type'].value_counts()
#     max_value = activity_counts.max()
#     threshold = max_value * 0.1

#     filtered_activities = activity_counts[activity_counts >= threshold].index.tolist()

#     # If the activity does not make the threshold, remove the activity (we are considering these activities as "outliers", with little effect on training)
#     df.loc[~df['Activity Type'].isin(filtered_activities), 'Activity Type'] = None
#     df.dropna(subset=['Activity Type'], inplace=True)


#     for activity in filtered_activities:
#         df[activity] = df['Activity Type'].apply(lambda x: 1 if x == activity else 0)

#     df.drop(columns=['Activity Type'], inplace=True)

#     return df

# # Adjust and convert the "Activity Type" column. This will allow the model to capture the influence of different activities on running performance.
# df = adjust_and_convert_activity_type_column(df)

In [443]:
# Impute missing values in the "Average Heart Rate" column with random values drawn from a range around the column average
def impute_average_heart_rate(df, spread=10, random_state=None):
    """Fill missing 'Average Heart Rate' values with uniform random draws.

    Each missing entry is replaced by an independent draw from the uniform
    distribution over ``[mean - spread, mean + spread)``, where ``mean`` is the
    average of the values that are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Average Heart Rate' column. Modified in place.
    spread : float, default 10
        Half-width of the sampling interval around the column mean.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for reproducible imputation. With ``None`` the
        global ``np.random`` state is used, exactly as before, so a preceding
        ``np.random.seed(...)`` call still takes effect.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    missing = df['Average Heart Rate'].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return df

    average_hr = df['Average Heart Rate'].mean()
    # If every value is missing there is no average to sample around; leave the
    # column untouched rather than asking the sampler for a NaN-bounded range.
    if pd.isna(average_hr):
        return df

    if random_state is None:
        draw = np.random.uniform
    else:
        draw = np.random.default_rng(random_state).uniform

    # Generate random values within the range of average_hr ± spread
    random_values = draw(average_hr - spread, average_hr + spread, size=n_missing)

    # Replace missing values with the generated random values
    df.loc[missing, 'Average Heart Rate'] = random_values

    return df

# Impute missing values in "Average Heart Rate" column
# NOTE(review): the fill values are random draws and no seed is set in this
# notebook, so the imputed heart rates differ from one run to the next.
df = impute_average_heart_rate(df)

In [444]:
# Feature Engineering: Training Stress
# Two heart-rate-weighted loads are built, one over distance and one over time,
# and the stored feature is their ratio (time-based load / distance-based load).
# NOTE(review): 'Average Heart Rate' multiplies both the numerator and the
# denominator, so for any row with a non-null, non-zero heart rate it cancels and
# the ratio reduces to (Moving Time / 60) / Distance. That is a pace, presumably
# minutes per kilometre given the unit comments elsewhere in this notebook
# (TODO confirm), not the "distance * average heart rate" stress originally described.

# Convert 'Distance' and 'Average Heart Rate' columns to numeric types
# (errors='coerce' turns values that cannot be parsed into NaN)
df['Distance'] = pd.to_numeric(df['Distance'], errors='coerce')
df['Average Heart Rate'] = pd.to_numeric(df['Average Heart Rate'], errors='coerce')

Training_Stress_Space = df['Distance'] * df['Average Heart Rate']
Training_Stress_Time = (df['Moving Time'] / 60) * df['Average Heart Rate']
#df['Training Stress'] = Training_Stress_Time / Training_Stress_Space
# If the distance is zero, use the Training Stress Time value as is; otherwise use the ratio.
# np.where evaluates both branches for every row, so the division is still
# computed (and then discarded) for zero-distance rows.
df['Training Stress'] = np.where(df['Distance'] == 0, Training_Stress_Time, Training_Stress_Time / Training_Stress_Space) # but now it's a ridiculously high score...

# But note that for rows that are workouts, i.e. not a run, the stress is much higher. Is this true?

In [445]:
# FEATURE:
# If a "days between activities" measure is added, compute it before removing the activity types that fall below the threshold percentage of the most frequent activity.

# TODO: when imputing the average heart rate, look at the activity type first, then use the average for that type.

In [446]:
# Number of rows with no 'Max Heart Rate' value.
df['Max Heart Rate'].isnull().sum()

163

In [447]:
# Number of rows with no 'Relative Effort' value.
df['Relative Effort'].isnull().sum()

163

In [448]:
# List the columns currently in the frame, including the new 'Training Stress'.
df.columns

Index(['Distance', 'Max Heart Rate', 'Relative Effort', 'Moving Time',
       'Max Speed', 'Average Speed', 'Elevation Gain', 'Elevation Low',
       'Elevation High', 'Max Grade', 'Average Grade', 'Max Cadence',
       'Average Cadence', 'Average Heart Rate', 'Calories',
       'Days Between Activity', 'Training Stress'],
      dtype='object')

In [449]:
# # Removing for now, given that it leads to 'ValueError: Input X contains infinity or a value too large for dtype('float64').' when making predictions
# # Do we need? Given that 'Average Speed' = meters/second

# def calculate_average_pace(df):
#     # Convert moving time to minutes
#     moving_time_minutes = df['Moving Time'] / 60

#     # Convert distance from kilometers to miles
#     distance_miles = df['Distance'] * 0.621371

#     # # Reset the index of the dataframe
#     # df.reset_index(drop=True, inplace=True)
#     # # # Calculate average pace in minutes per mile
#     # # if df['Run'] == 1:
#     # #   df['Average Pace (min/mile)'] = moving_time_minutes / distance_miles
#     #     # Calculate average pace in minutes per mile only for 'run' activities
#     # df.loc[df['Run'] == 1, 'Average Pace (min/mile)'] = moving_time_minutes / distance_miles
#     # Calculate average pace in minutes per mile
#     df['Average Pace (min/mile)'] = moving_time_minutes / distance_miles


#     return df

# # Calculate the average pace
# df = calculate_average_pace(df)

In [450]:
# Show the last 10 rows, now including the 'Training Stress' column.
df.tail(10)

Unnamed: 0,Distance,Max Heart Rate,Relative Effort,Moving Time,Max Speed,Average Speed,Elevation Gain,Elevation Low,Elevation High,Max Grade,Average Grade,Max Cadence,Average Cadence,Average Heart Rate,Calories,Days Between Activity,Training Stress
617,9.4,171.0,64.0,3531.0,5.267969,2.663234,289.0,1324.199951,1454.199951,47.197281,-0.085071,90.0,81.420311,147.338272,750.0,0.0,6.260638
618,17.73,167.0,73.0,11388.0,5.634082,1.556976,1148.0,1707.199951,2534.399902,48.636837,-0.171453,118.0,67.808128,123.561058,1510.0,0.0,10.70502
619,3.63,168.0,21.0,1257.0,5.42998,2.893882,122.0,1571.0,1676.0,33.101528,-0.291399,88.0,79.991745,147.546112,270.0,1.0,5.77135
620,9.78,174.0,74.0,5362.0,6.736133,1.825179,720.0,1724.199951,2549.600098,49.164585,-0.128748,112.0,75.091339,142.909683,1036.0,0.0,9.137696
621,4.49,162.0,18.0,1917.0,5.833984,2.345144,89.0,1319.400024,1347.400024,29.65811,-0.320311,118.0,78.520119,140.439209,380.0,0.0,7.115813
622,5.64,173.0,37.0,2030.0,7.63775,2.781005,53.0,1294.599976,1343.199951,35.065315,-0.170048,110.0,82.424431,148.805817,430.0,1.0,5.998818
623,20.06,177.0,154.0,6998.0,5.051595,2.867208,134.0,1772.199951,2570.0,30.338728,-3.919317,119.0,82.241585,153.711639,1513.0,1.0,5.814224
624,9.62,170.0,34.0,7086.0,5.224023,1.357962,542.0,1759.0,2336.199951,49.868637,0.301376,122.0,62.268711,118.441292,893.0,2.0,12.276507
625,10.41,155.0,27.0,7866.0,4.337891,1.324174,743.0,1908.199951,2707.600098,44.702003,0.011521,125.0,58.039753,112.808022,1045.0,0.0,12.59366
626,24.99,160.0,71.0,19299.0,4.787988,1.295079,1489.0,1460.0,2512.800049,49.834286,-0.350488,124.0,64.065208,117.229675,1965.0,0.0,12.871148


For now, in lieu of adding conditions that adjust the training stress score for activities such as yoga and rock climbing, let's remove the activities/rows with really high values.

---



In [451]:
# # Don't need in Version 1.0 of model as we only have 'Run' activity type
# df['Training Stress'].max()
# # This code filters the dataframe to keep only the rows where the 'Training Stress' value is less than or equal to 500. The resulting filtered dataframe is assigned back to the variable df, effectively removing the rows with high 'Training Stress' scores.
# df = df[df['Training Stress'] <= 500]


In [452]:
# Check every column of the dataframe for NaN values and fill them with that column's average
def impute_nan_with_average(df):
    """Replace the NaN values in each column with that column's mean.

    Only columns that actually contain a NaN are touched. The filled column is
    assigned back to ``df`` explicitly: calling ``fillna(..., inplace=True)``
    on ``df[column]`` is chained assignment, which pandas warns about and
    which no longer updates ``df`` once Copy-on-Write is enabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. Modified in place. Columns containing NaN must support
        ``.mean()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in df.columns:
        if df[column].isnull().any():
            average = df[column].mean()
            df[column] = df[column].fillna(average)
    return df

# Fill every remaining NaN with the mean of its own column.
df = impute_nan_with_average(df)

In [453]:
# Re-count the missing values per column after imputation.
df.isnull().sum()

Distance                 0
Max Heart Rate           0
Relative Effort          0
Moving Time              0
Max Speed                0
Average Speed            0
Elevation Gain           0
Elevation Low            0
Elevation High           0
Max Grade                0
Average Grade            0
Max Cadence              0
Average Cadence          0
Average Heart Rate       0
Calories                 0
Days Between Activity    0
Training Stress          0
dtype: int64



---

Baseline Model

---



In [454]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [455]:
# Separate the target y from the feature matrix X.
# 'Training Stress' is assumed to be the target; every other column is a feature.
# NOTE(review): 'Training Stress' is computed from 'Moving Time' and 'Distance',
# and both columns remain in X. Confirm that this is intended.
y = df['Training Stress']
X = df.drop(columns=['Training Stress'])

In [456]:
# Split the data into training and test sets: 20% of the rows are held out for
# testing, and random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [457]:
# Create a baseline model using Linear Regression with scikit-learn's default settings
lr = LinearRegression()

# Fit the model to the training data
lr.fit(X_train, y_train)

In [None]:
# Make predictions on the test data.
# The bare name on the last line displays the predictions as the cell output.
y_pred = lr.predict(X_test)
y_pred

In [459]:
# Evaluate the model
def calculate_mse(actual, predicted):
    """Return the mean squared error between ``actual`` and ``predicted``.

    Thin wrapper that forwards both arguments to ``mean_squared_error``.
    """
    return mean_squared_error(actual, predicted)

# Mean squared error of the baseline predictions on the held-out test set.
mse = calculate_mse(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 6.855243132677554


In [460]:
def calculate_rmse(actual, predicted):
    """Return the root mean squared error between ``actual`` and ``predicted``.

    This is the square root of ``mean_squared_error(actual, predicted)``.
    """
    return np.sqrt(mean_squared_error(actual, predicted))

# Calculate RMSE on the held-out test set and report it.
# RMSE is the square root of the MSE, so it is in the same units as the target.
rmse = calculate_rmse(y_test, y_pred)
print(f'RMSE: {rmse}')

RMSE: 2.6182519230733994
