In [571]:
import pandas as pd
import importlib
import activity_data_importer

# Enable pandas copy-on-write mode; background on views versus copies:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.set_option("mode.copy_on_write", True)

# Re-import the helper module on every run so edits to it are picked up
# without restarting the kernel.
importlib.reload(activity_data_importer)

# Load the activities, then discard the rows the importer flags as erroneous.
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
dropped_count = len(df) - len(filtered_df)
print(f"Dropped {dropped_count} records due to erroneous measurements.")
df = filtered_df

Dropped 4 records due to erroneous measurements.


In [572]:
# creating maps for converting string types to numbers and back

def _build_label_maps(labels):
    """Build the two lookup tables for one categorical column.

    Parameters
    ----------
    labels : iterable
        The distinct values of the column.

    Returns
    -------
    tuple of (dict, dict)
        ``(int_to_label, label_to_int)``. Codes are assigned in sorted order
        of the labels' string form, so the same labels always receive the
        same codes. Enumerating a ``set`` of strings directly would tie the
        codes to Python's per-process string hash randomisation and let them
        change after a kernel restart.
    """
    # key=str keeps the sort well-defined even if a non-string value
    # (e.g. NaN) is present among the labels
    ordered_labels = sorted(set(labels), key=str)
    int_to_label = dict(enumerate(ordered_labels))
    label_to_int = {label: code for code, label in int_to_label.items()}
    return int_to_label, label_to_int


# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# create and fill the maps (deterministic codes, see _build_label_maps)
int_to_activity_type, activity_type_to_int = _build_label_maps(activity_types)
int_to_event_type, event_type_to_int = _build_label_maps(event_types)

In [573]:
from datetime import timezone, datetime

# Turn every column the model will consume into plain numbers.

def _duration_in_seconds(span):
    """Return the length of a duration value in seconds."""
    return span.total_seconds()


def _utc_epoch_seconds(moment):
    """Stamp the value as UTC and return its POSIX timestamp."""
    return moment.replace(tzinfo=timezone.utc).timestamp()


# durations become second counts; the column name is updated to match the unit
df['Duration (h:m:s)'] = df['Duration (h:m:s)'].apply(_duration_in_seconds)
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# categorical strings are replaced by their integer codes
df['Activity Type'] = df['Activity Type'].apply(lambda label: activity_type_to_int[label])
df['Event Type'] = df['Event Type'].apply(lambda label: event_type_to_int[label])

# start times become UTC epoch seconds
df['Start Time'] = df['Start Time'].apply(_utc_epoch_seconds)

# chronological order, oldest activity first
df = df.sort_values('Start Time')

# missing measurements are replaced by 0
df.fillna(0, inplace=True)

# these activity types are not interesting for training, so their rows are removed
excluded_labels = ('Other', 'Transition', 'Walking', 'Strength Training')
activity_types_to_drop = [activity_type_to_int[label] for label in excluded_labels]
keep_mask = ~df['Activity Type'].isin(activity_types_to_drop)
df = df[keep_mask]

df

Unnamed: 0,Start Time,End Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),...,Anaerobic Training Effect,Avg. Run Cadence,Max. Run Cadence,Stride Length,Steps,Avg. Cadence (rpm),Max. Cadence (rpm),Avg. Temp (°C),Min. Temp (°C),Max. Temp (°C)
1100,1637166705.00000,2021-11-17 17:14:05+00:00,2537.00000,5,1,7.80808,11.08080,11.75504,13.70520,40.00000,...,0.40000,159.86000,197.00000,115.65000,6840.00000,0.00000,0.00000,18.58025,17.00000,24.00000
1099,1637391420.00000,2021-11-20 07:41:48+00:00,2688.00000,9,1,8.14423,10.90440,10.96432,17.26560,180.00000,...,0.40000,171.11000,200.00000,106.29000,7708.00000,0.00000,0.00000,20.72532,17.00000,26.00000
1098,1637493441.00000,2021-11-21 13:22:33+00:00,7486.00000,9,1,20.10268,9.66600,10.21263,17.70120,415.00000,...,2.00000,156.19000,246.00000,102.37000,19746.00000,0.00000,0.00000,17.49492,12.00000,23.00000
1097,1637687915.00000,2021-11-23 18:43:56+00:00,5121.00000,5,1,14.81816,10.41480,11.86394,20.18880,59.00000,...,0.50000,141.97000,249.00000,120.77000,12340.00000,0.00000,0.00000,19.25532,16.00000,27.00000
1096,1637765343.00000,2021-11-24 15:34:30+00:00,2727.00000,9,1,8.05562,10.63440,10.64606,16.83000,167.00000,...,0.90000,170.20000,191.00000,104.19000,7734.00000,0.00000,0.00000,21.05870,18.00000,28.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,1732775741.00000,2024-11-28 07:02:48+00:00,1627.00000,5,1,5.66210,12.52800,12.54540,13.43520,54.83000,...,0.00000,180.80000,188.00000,115.48000,4882.00000,0.00000,0.00000,16.18451,12.00000,29.00000
4,1732892652.00000,2024-11-29 16:09:38+00:00,3526.00000,11,1,20.12749,20.54880,20.70256,43.76880,374.48000,...,0.30000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,13.00000,10.00000,27.00000
3,1732953144.00000,2024-11-30 09:50:49+00:00,6933.00000,5,1,23.37705,12.13920,12.17715,15.68520,472.01000,...,0.00000,175.20000,201.00000,115.46000,20238.00000,0.00000,0.00000,17.63128,15.00000,29.00000
2,1733044326.00000,2024-12-01 09:23:11+00:00,665.00000,5,1,1.57320,8.51760,9.48663,13.43520,8.68000,...,0.00000,145.23000,239.00000,95.74000,1760.00000,0.00000,0.00000,22.90281,19.00000,28.00000


In [574]:
# 10 km races: running activities recorded as a race whose distance lies
# between 9 km and 11 km (both bounds included)
is_running = df['Activity Type'].eq(activity_type_to_int['Running'])
is_race = df['Event Type'].eq(event_type_to_int['Race'])
is_about_ten_km = df['Distance (km)'].between(9, 11)

ten_k_races = df[is_running & is_race & is_about_ten_km]

In [575]:
from dateutil.relativedelta import relativedelta

# For every race, work out where its 3 month training block begins.

def _three_months_before(epoch_seconds):
    """Return the UTC epoch timestamp three calendar months before the given one."""
    race_moment = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    block_start = race_moment - relativedelta(months=3)
    return block_start.replace(tzinfo=timezone.utc).timestamp()


race_start_times = ten_k_races['Start Time']
ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': race_start_times.apply(_three_months_before),
    'Race Date': race_start_times,
})

# getting all activities within each 3 month block
def get_training_block_activities(activities=None, training_blocks=None):
    """Collect, for every race, the activities inside its training block.

    Parameters
    ----------
    activities : pandas.DataFrame, optional
        Activities with a numeric 'Start Time' column. Defaults to the
        notebook-level ``df``.
    training_blocks : pandas.DataFrame, optional
        One row per race with 'Training Block Start' and 'Race Date' columns,
        expressed on the same scale as 'Start Time'. Defaults to the
        notebook-level ``ten_k_race_training_blocks``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``training_blocks``, in the same order, holding
        the activities with ``block start <= 'Start Time' < race date``.
        The race itself is therefore excluded.
    """
    if activities is None:
        activities = df
    if training_blocks is None:
        training_blocks = ten_k_race_training_blocks

    # looked up once; the same column is compared against every block
    start_times = activities['Start Time']

    training_block_activities_list = []
    block_bounds = zip(training_blocks['Training Block Start'], training_blocks['Race Date'])
    for block_start, race_date in block_bounds:
        # half-open interval: the block start is included, the race is not
        in_block = (start_times >= block_start) & (start_times < race_date)
        training_block_activities_list.append(activities[in_block])
    return training_block_activities_list

# Wrap the per-race activity frames returned by get_training_block_activities()
# in a Series, one element per training block: [block1_activities, block2_activities, ...]
ten_k_race_training_block_activities = pd.Series(get_training_block_activities())

# further reduce data by summing up activities on a per-week basis
def generate_training_weeks(training_blocks=None):
    """Aggregate every training block into weekly totals.

    Parameters
    ----------
    training_blocks : iterable of pandas.DataFrame, optional
        One frame per training block, each with 'Start Time' (UTC epoch
        seconds), 'Duration (s)' and 'Distance (km)' columns. Defaults to the
        notebook-level ``ten_k_race_training_block_activities``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per block with the columns 'Start Time', 'Duration (s)' and
        'Distance (km)'. Each row is one weekly bin (pandas frequency 'W'):
        'Start Time' is the bin label as UTC epoch seconds, the other two
        columns are sums over that week.

    Notes
    -----
    The input frames are left untouched. Overwriting their 'Start Time'
    column in place would change the blocks stored by the caller and make a
    second call fail, because the column would no longer hold numbers.
    """
    if training_blocks is None:
        training_blocks = ten_k_race_training_block_activities

    columns_to_keep = ['Start Time', 'Duration (s)', 'Distance (km)']
    training_block_weeks_list = []

    for training_block in training_blocks:
        # assign() returns a new frame, so the caller's block keeps its
        # numeric 'Start Time' column
        weekly_input = training_block[columns_to_keep].assign(**{
            'Start Time': lambda frame: pd.to_datetime(frame['Start Time'], unit='s', utc=True),
        })

        training_block_weeks = (
            weekly_input
            .groupby(pd.Grouper(key='Start Time', freq='W'))
            .sum()
            .reset_index()
        )

        # back to epoch seconds so the column is numeric again
        training_block_weeks['Start Time'] = training_block_weeks['Start Time'].map(
            lambda week_label: week_label.timestamp()
        )

        training_block_weeks_list.append(training_block_weeks)

    return training_block_weeks_list

# Creating a series of training block weeks: one weekly-aggregated frame
# (as returned by generate_training_weeks()) per training block
ten_k_race_training_block_weeks = pd.Series(generate_training_weeks())

In [576]:
from sklearn import preprocessing

# Standardise the weekly features so that columns with very different
# magnitudes become comparable: Start Time is an epoch value above one
# billion, while the other columns are far smaller.
# NOTE(review): the scaler is refit on every block, so each block is
# standardised against its own mean and spread - confirm this is intended.

scaler = preprocessing.StandardScaler()


def _standardise_block(block):
    """Fit the shared scaler on one block and return the scaled values as a frame."""
    return pd.DataFrame(scaler.fit_transform(block))


ten_k_race_training_block_weeks = ten_k_race_training_block_weeks.apply(_standardise_block)

ten_k_race_training_block_weeks.iloc[0]

Unnamed: 0,0,1,2
0,-1.60357,-0.3649,0.02048
1,-1.33631,1.15363,1.77144
2,-1.06904,0.09113,-0.15392
3,-0.80178,0.31374,-0.01719
4,-0.53452,-0.74406,-0.63822
5,-0.26726,1.32617,1.19368
6,0.0,1.37224,0.93257
7,0.26726,0.86121,0.62073
8,0.53452,-0.35714,-0.28791
9,0.80178,-1.04212,-0.44552


In [577]:
import numpy as np

# Build the model inputs: one flattened, zero-padded (weeks x features) matrix
# per race, plus the race durations as regression targets.

# one 2-D array per training block, in the order of the blocks (and thus of ten_k_races)
week_matrices = [np.asarray(block) for block in ten_k_race_training_block_weeks]

# the longest training block decides how many week rows every input gets
max_input_size = max(matrix.shape[0] for matrix in week_matrices)

# create reference shape, second dimension is constant
input_shape = (max_input_size, week_matrices[0].shape[1])

# pad each input with trailing all-zero rows so that every input has input_shape
# (np.pad defaults to constant padding with 0)
X = np.stack([
    np.pad(matrix, ((0, max_input_size - matrix.shape[0]), (0, 0)))
    for matrix in week_matrices
])

# flatten each (weeks x features) matrix into a single feature vector per race
X = X.reshape(X.shape[0], -1)
y = np.array(ten_k_races['Duration (s)'])  # solve for the duration

In [578]:
# Hold out the last two races for evaluation; all earlier races are used for training.
train_part, test_part = slice(None, -2), slice(-2, None)

train_training_data, test_training_data = X[train_part], X[test_part]
train_racing_data, test_racing_data = y[train_part], y[test_part]

# show the (train, test) shapes, first for the inputs and then for the targets
for train_split, test_split in (
    (train_training_data, test_training_data),
    (train_racing_data, test_racing_data),
):
    print(train_split.shape, test_split.shape)

(8, 42) (2, 42)
(8,) (2,)


In [579]:
from sklearn.linear_model import LinearRegression

# Linear regression (with intercept) from the flattened weekly training
# features to the race duration.
# NOTE(review): the shapes printed for the split are 8 training rows with 42
# features each, i.e. fewer samples than features - consider whether a
# regularised or lower-dimensional model is more appropriate.
model_lr = LinearRegression(fit_intercept=True)

model_lr.fit(X=train_training_data, y=train_racing_data)

In [580]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Run the fitted model on the held-out races and compare with the real results
predictions = model_lr.predict(test_training_data)

print(f"predictions: {predictions}, actual race times: {test_racing_data}")

# Evaluation metrics on the held-out races
mae = mean_absolute_error(test_racing_data, predictions)
mse = mean_squared_error(test_racing_data, predictions)
r2 = r2_score(test_racing_data, predictions)

# report the metrics, one line per metric
metric_report = (
    ("Mean Squared Error (MSE)", mse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R2)", r2),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value}")

predictions: [2189.2741198  2300.25536502], actual race times: [2113. 2142.]
Mean Squared Error (MSE): 15431.25095401568
Mean Absolute Error (MAE): 117.26474240961852
R-squared (R2): -72.39477267070478
