In [144]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [145]:
# Load the training data CSV from the project's GitHub repository (needs network access)
TRAINING_DATA_URL = 'https://raw.githubusercontent.com/dennissmith0/Running_Performance/main/prep/training_data.csv'
df = pd.read_csv(TRAINING_DATA_URL)

In [146]:
# # Display the first few rows of the dataframe
# print(df.head())

# # Get a summary of the dataframe
# print(df.info())

# # Generate descriptive statistics of the dataframe
# print(df.describe())

# # Check for missing values
# print(df.isnull().sum())

In [147]:
# Parse "Activity Date" into datetimes, then promote that column to the row index
df['Activity Date'] = pd.to_datetime(df['Activity Date'])
df = df.set_index('Activity Date')

In [148]:
# Columns discarded up front; they are not carried into the rest of the notebook
UNUSED_COLUMNS = [
    'Activity ID',
    'Activity Name',
    'Media',
    'Commute',
    'From Upload',
    'Filename',
    'Athlete Weight',
    'Activity Gear',
    'Number of Runs',
    'Prefer Perceived Exertion',
    'Average Temperature',
    'Elevation Loss',
    'Average Speed',
]
df = df.drop(columns=UNUSED_COLUMNS)

In [149]:
def remove_columns_with_prefix(df, prefix):
    """Drop every column whose name starts with ``prefix``.

    The frame is modified in place and the same object is returned.
    Column names must be strings, because ``str.startswith`` is called on each.
    """
    doomed = []
    for name in df.columns:
        if name.startswith(prefix):
            doomed.append(name)

    df.drop(columns=doomed, inplace=True)
    return df

# Drop every column whose header starts with '<span'
# (presumably HTML residue from the export -- verify against the raw CSV)
df = remove_columns_with_prefix(df, prefix='<span')


In [150]:
# Some columns are repeated with more decimal places; the repeats are identified by a
# suffix on their name and removed, keeping the rounded versions.
def drop_columns_with_extra_decimals(df, suffix):
    """Drop every column whose name ends with ``suffix``.

    The frame is modified in place and the same object is returned.
    Column names must be strings, because ``str.endswith`` is called on each.
    """
    repeats = list(filter(lambda name: name.endswith(suffix), df.columns))
    df.drop(columns=repeats, inplace=True)
    return df

# Drop the columns whose name ends in '.1'
# (presumably the suffix pandas appends to a repeated CSV header -- verify)
df = drop_columns_with_extra_decimals(df, suffix='.1')

In [151]:
# Remove the columns that hold no data at all
def drop_columns_with_null(df):
    """Drop every column in which all values are null.

    The frame is modified in place and the same object is returned.
    """
    entirely_null = df.isna().all(axis=0)
    empty_columns = df.columns[entirely_null]
    df.drop(columns=empty_columns, inplace=True)
    return df

# Discard every column that contains no data at all (every value is null)
df = drop_columns_with_null(df)

In [152]:
def remove_columns_with_few_values(dataframe, threshold):
    """Drop every column that has fewer than ``threshold`` non-null values.

    The frame is modified in place and the same object is returned.
    """
    sparse_columns = []
    for name in dataframe.columns:
        # Series.count() counts the non-null entries of the column
        if dataframe[name].count() < threshold:
            sparse_columns.append(name)

    dataframe.drop(columns=sparse_columns, inplace=True)
    return dataframe

# Discard every column that has fewer than 1000 non-null values
df = remove_columns_with_few_values(df, threshold=1000)

In [153]:
# Filter out rarely logged activity types, then one-hot encode the remaining ones.
def adjust_and_convert_activity_type_column(df, min_fraction=0.1):
    """Drop rows of rare activities and replace "Activity Type" with indicator columns.

    An activity is kept when its row count is at least ``min_fraction`` times the
    row count of the most frequent activity. Rows belonging to any other activity,
    and rows whose "Activity Type" is missing, are removed (they are treated as
    outliers with little effect on training). Each kept activity then gets its own
    0/1 integer column named after the activity, and "Activity Type" is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Activity Type" column. It is modified in place.
    min_fraction : float, default 0.1
        Minimum share of the most frequent activity's count an activity needs
        in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    activity_counts = df['Activity Type'].value_counts()
    threshold = activity_counts.max() * min_fraction
    kept_activities = activity_counts[activity_counts >= threshold].index.tolist()

    # Blank out the rare activities and drop those rows through dropna, which removes
    # rows by position and therefore stays correct when index labels repeat.
    df.loc[~df['Activity Type'].isin(kept_activities), 'Activity Type'] = None
    df.dropna(subset=['Activity Type'], inplace=True)

    # Vectorised comparison instead of a row-by-row apply
    for activity in kept_activities:
        df[activity] = df['Activity Type'].eq(activity).astype('int64')

    df.drop(columns=['Activity Type'], inplace=True)

    return df

# Adjust and convert the "Activity Type" column: rows of rarely logged activities are dropped and
# each remaining activity becomes its own 0/1 indicator column. This will allow the model to
# capture the influence of different activities on running performance.
df = adjust_and_convert_activity_type_column(df)

In [154]:
# Impute missing "Average Heart Rate" values with random values close to the column mean.
def impute_average_heart_rate(df, spread=10, rng=None):
    """Fill missing "Average Heart Rate" values with uniform draws around the mean.

    Each missing value is replaced by a draw from the uniform distribution on
    ``[mean - spread, mean + spread)``, where ``mean`` is the mean of the
    non-missing values of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Average Heart Rate" column. It is modified in place.
    spread : float, default 10
        Half-width of the interval around the mean that values are drawn from.
    rng : None, int or numpy.random.Generator, default None
        Source of randomness. ``None`` keeps the previous behaviour of drawing
        from NumPy's global random state. An int seed or a ``Generator`` makes
        the imputation reproducible.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after modification.
    """
    missing = df['Average Heart Rate'].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute
        return df

    average_hr = df['Average Heart Rate'].mean()

    # default_rng returns a Generator unchanged and builds a seeded one from an int
    generator = np.random if rng is None else np.random.default_rng(rng)
    random_values = generator.uniform(average_hr - spread, average_hr + spread, size=n_missing)

    # Replace missing values with the generated random values
    df.loc[missing, 'Average Heart Rate'] = random_values

    return df

# Impute missing values in the "Average Heart Rate" column with random values near the column mean.
# NOTE: the draws come from NumPy's global random state and no seed is set in the visible cells,
# so the imputed values (and everything computed from them) change from run to run.
df = impute_average_heart_rate(df)

In [155]:
# Feature Engineering: Training Stress
# Two heart-rate-weighted terms are built below:
#   space term = Distance * Average Heart Rate
#   time term  = (Elapsed Time / 60) * Average Heart Rate
# 'Training Stress' is the time term divided by the space term, except for rows with
# zero distance, which receive the time term on its own.

# Convert 'Distance' and 'Average Heart Rate' columns to numeric types
# (errors='coerce' turns entries that cannot be parsed into NaN)
df['Distance'] = pd.to_numeric(df['Distance'], errors='coerce')
df['Average Heart Rate'] = pd.to_numeric(df['Average Heart Rate'], errors='coerce')

# NOTE(review): 'Average Heart Rate' is a factor of both terms, so for non-zero distance the
# ratio reduces to (Elapsed Time / 60) / Distance wherever the heart rate is a non-zero number.
# Confirm that this is the intended definition.
Training_Stress_Space = df['Distance'] * df['Average Heart Rate']
Training_Stress_Time = (df['Elapsed Time'] / 60) * df['Average Heart Rate']
# Earlier version without the zero-distance handling:
#df['Training Stress'] = Training_Stress_Time / Training_Stress_Space
# If the distance is zero, assign the time term alone; otherwise use the ratio.
# Author's observation: the zero-distance rows now get a ridiculously high score.
df['Training Stress'] = np.where(df['Distance'] == 0, Training_Stress_Time, Training_Stress_Time / Training_Stress_Space)

# Open question: for rows that are workouts (i.e. not a run) the stress comes out much higher. Is that realistic?

In [156]:
# FEATURE:
# If a "days between activities" measure is added, compute it before removing the activities that fall below the threshold percentage of the most frequent activity.

# TODO: when imputing the average heart rate, look at the activity type first, then use the average for that activity type.

In [157]:
# Number of rows with no 'Max Heart Rate' value
df['Max Heart Rate'].isna().sum()

462

In [158]:
# Number of rows with no 'Relative Effort' value
df['Relative Effort'].isna().sum()

462

In [159]:
# List the columns that remain after cleaning and feature engineering
df.columns

Index(['Elapsed Time', 'Distance', 'Max Heart Rate', 'Relative Effort',
       'Moving Time', 'Max Speed', 'Elevation Gain', 'Elevation Low',
       'Elevation High', 'Max Grade', 'Average Grade', 'Max Cadence',
       'Average Cadence', 'Average Heart Rate', 'Calories', 'Run', 'Ride',
       'Hike', 'Yoga', 'Rock Climb', 'Walk', 'Training Stress'],
      dtype='object')

For now, in lieu of adding conditions that adjust the training stress score for activities such as yoga and rock climbing, let's simply remove the rows (activities) that have really high Training Stress values.

---



In [166]:
# Keep only the rows whose 'Training Stress' is at most MAX_TRAINING_STRESS. Rows whose
# score is NaN fail the comparison and are removed as well.
# .copy() makes the result an independent frame, so the in-place imputation that follows
# does not operate on a slice of the unfiltered data (avoids SettingWithCopyWarning).
MAX_TRAINING_STRESS = 500
df = df[df['Training Stress'] <= MAX_TRAINING_STRESS].copy()


In [173]:
# Fill every remaining NaN with the mean of its column.
def impute_nan_with_average(df):
    """Replace the NaN values of each column with that column's mean.

    Columns without missing values are left untouched. The frame is modified
    in place and the same object is returned.
    """
    for column in df.columns:
        if df[column].isnull().any():
            average = df[column].mean()
            # Assign the filled column back instead of calling fillna(inplace=True) on
            # df[column]: that chained form acts on a temporary object, triggers a
            # warning on current pandas and leaves df unchanged under Copy-on-Write.
            df[column] = df[column].fillna(average)
    return df

# Replace the remaining NaN values in every column with that column's mean
df = impute_nan_with_average(df)



---

Baseline Model

---



In [169]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [174]:
# Separate the target (y) from the predictors (X).
# The target is the engineered 'Training Stress' column; every other column is a feature.
# NOTE(review): 'Training Stress' was computed from 'Elapsed Time', 'Distance' and
# 'Average Heart Rate', all of which remain in X, so the model is asked to reproduce a
# formula of its own inputs. Confirm that this is the intended baseline.
y = df['Training Stress']
X = df.drop(columns='Training Stress')

In [175]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [176]:
# Create a baseline model using Linear Regression with its default settings
lr = LinearRegression()

# Fit the model to the training data (X_train / y_train from the split above)
lr.fit(X_train, y_train)

In [None]:
# Make predictions on the held-out test data
y_pred = lr.predict(X_test)
# Show the predicted values as the cell output
y_pred

In [179]:
# Score the test-set predictions with mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 55.43209281548262
