In [680]:
import pandas as pd
import importlib
import activity_data_importer

# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.options.mode.copy_on_write = True

# reload my module each time I execute to get new changes without restarting kernel
importlib.reload(activity_data_importer)

# import data and filter out erroneous rows
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
print(f"Dropped {len(df.index) - len(filtered_df.index)} records due to erroneous measurements.")
df = filtered_df

df

Dropped 4 records due to erroneous measurements.


Unnamed: 0,Start Time,End Time,Duration (h:m:s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),...,Anaerobic Training Effect,Avg. Run Cadence,Max. Run Cadence,Stride Length,Steps,Avg. Cadence (rpm),Max. Cadence (rpm),Avg. Temp (°C),Min. Temp (°C),Max. Temp (°C)
0,2024-12-02 16:20:29+00:00,2024-12-02 16:55:37+00:00,0 days 00:35:08,Strength Training,Uncategorized,,,,,,...,,,,,256.00000,,,29.97059,29.00000,31.00000
1,2024-12-01 09:30:08+00:00,2024-12-01 10:05:50+00:00,0 days 00:35:42,Running,Race,9.95481,16.72920,16.76225,23.85000,79.45000,...,1.50000,181.44000,193.00000,153.67000,6426.00000,,,11.49034,9.00000,18.00000
2,2024-12-01 09:12:06+00:00,2024-12-01 09:23:11+00:00,0 days 00:11:05,Running,Uncategorized,1.57320,8.51760,9.48663,13.43520,8.68000,...,,145.23000,239.00000,95.74000,1760.00000,,,22.90281,19.00000,28.00000
3,2024-11-30 07:52:24+00:00,2024-11-30 09:50:49+00:00,0 days 01:55:33,Running,Uncategorized,23.37705,12.13920,12.17715,15.68520,472.01000,...,,175.20000,201.00000,115.46000,20238.00000,,,17.63128,15.00000,29.00000
4,2024-11-29 15:04:12+00:00,2024-11-29 16:09:38+00:00,0 days 00:58:46,Mountain Biking,Uncategorized,20.12749,20.54880,20.70256,43.76880,374.48000,...,0.30000,,,,,,,13.00000,10.00000,27.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,2021-11-24 14:49:03+00:00,2021-11-24 15:34:30+00:00,0 days 00:45:27,Trail Running,Uncategorized,8.05562,10.63440,10.64606,16.83000,167.00000,...,0.90000,170.20000,191.00000,104.19000,7734.00000,,,21.05870,18.00000,28.00000
1097,2021-11-23 17:18:35+00:00,2021-11-23 18:43:56+00:00,0 days 01:25:21,Running,Uncategorized,14.81816,10.41480,11.86394,20.18880,59.00000,...,0.50000,141.97000,249.00000,120.77000,12340.00000,,,19.25532,16.00000,27.00000
1098,2021-11-21 11:17:21+00:00,2021-11-21 13:22:33+00:00,0 days 02:04:46,Trail Running,Uncategorized,20.10268,9.66600,10.21263,17.70120,415.00000,...,2.00000,156.19000,246.00000,102.37000,19746.00000,,,17.49492,12.00000,23.00000
1099,2021-11-20 06:57:00+00:00,2021-11-20 07:41:48+00:00,0 days 00:44:48,Trail Running,Uncategorized,8.14423,10.90440,10.96432,17.26560,180.00000,...,0.40000,171.11000,200.00000,106.29000,7708.00000,,,20.72532,17.00000,26.00000


In [681]:
# creating maps for converting string types to numbers and back

# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# create the empty maps
int_to_activity_type = {}
activity_type_to_int = {}

int_to_event_type = {}
event_type_to_int = {}

# fill the maps
for index, activity_type in enumerate(activity_types):
    int_to_activity_type[index] = activity_type
    activity_type_to_int[activity_type] = index

for index, event_type in enumerate(event_types):
    int_to_event_type[index] = event_type
    event_type_to_int[event_type] = index

In [682]:
from datetime import timezone, datetime

# do any conversions that are required for feeding the data into our model

# convert string columns into numbers
df['Activity Type'] = df['Activity Type'].apply(lambda act: activity_type_to_int[act])
df['Event Type'] = df['Event Type'].apply(lambda evt: event_type_to_int[evt])

# convert duration column to total seconds
df['Duration (h:m:s)'] = df['Duration (h:m:s)'].apply(lambda td: td.total_seconds())
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# convert Start Time to UTC timestamp
df['Start Time'] = df['Start Time'].apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())
df['End Time'] = df['End Time'].apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())

# Sort by 'Start Time' in ascending order (oldest to newest)
df = df.sort_values(by='Start Time', ascending=True)

# fill all NaN values with a 0
df.fillna(0, inplace=True)

# dropping rows with specific activity types since they are not interesting for training
activity_types_to_drop = [activity_type_to_int['Other'], activity_type_to_int['Transition'],
                          activity_type_to_int['Walking'], activity_type_to_int['Strength Training']]
df = df[~df['Activity Type'].isin(activity_types_to_drop)]

df

Unnamed: 0,Start Time,End Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),...,Anaerobic Training Effect,Avg. Run Cadence,Max. Run Cadence,Stride Length,Steps,Avg. Cadence (rpm),Max. Cadence (rpm),Avg. Temp (°C),Min. Temp (°C),Max. Temp (°C)
1100,1637166705.00000,1637169245.00000,2537.00000,2,2,7.80808,11.08080,11.75504,13.70520,40.00000,...,0.40000,159.86000,197.00000,115.65000,6840.00000,0.00000,0.00000,18.58025,17.00000,24.00000
1099,1637391420.00000,1637394108.00000,2688.00000,5,2,8.14423,10.90440,10.96432,17.26560,180.00000,...,0.40000,171.11000,200.00000,106.29000,7708.00000,0.00000,0.00000,20.72532,17.00000,26.00000
1098,1637493441.00000,1637500953.00000,7486.00000,5,2,20.10268,9.66600,10.21263,17.70120,415.00000,...,2.00000,156.19000,246.00000,102.37000,19746.00000,0.00000,0.00000,17.49492,12.00000,23.00000
1097,1637687915.00000,1637693036.00000,5121.00000,2,2,14.81816,10.41480,11.86394,20.18880,59.00000,...,0.50000,141.97000,249.00000,120.77000,12340.00000,0.00000,0.00000,19.25532,16.00000,27.00000
1096,1637765343.00000,1637768070.00000,2727.00000,5,2,8.05562,10.63440,10.64606,16.83000,167.00000,...,0.90000,170.20000,191.00000,104.19000,7734.00000,0.00000,0.00000,21.05870,18.00000,28.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,1732775741.00000,1732777368.00000,1627.00000,2,2,5.66210,12.52800,12.54540,13.43520,54.83000,...,0.00000,180.80000,188.00000,115.48000,4882.00000,0.00000,0.00000,16.18451,12.00000,29.00000
4,1732892652.00000,1732896578.00000,3526.00000,7,2,20.12749,20.54880,20.70256,43.76880,374.48000,...,0.30000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,13.00000,10.00000,27.00000
3,1732953144.00000,1732960249.00000,6933.00000,2,2,23.37705,12.13920,12.17715,15.68520,472.01000,...,0.00000,175.20000,201.00000,115.46000,20238.00000,0.00000,0.00000,17.63128,15.00000,29.00000
2,1733044326.00000,1733044991.00000,665.00000,2,2,1.57320,8.51760,9.48663,13.43520,8.68000,...,0.00000,145.23000,239.00000,95.74000,1760.00000,0.00000,0.00000,22.90281,19.00000,28.00000


In [683]:
# getting all 10km races
ten_k_races = df.loc[
    (df['Activity Type'] == activity_type_to_int['Running']) &
    (df['Event Type'] == event_type_to_int['Race']) &
    (df['Distance (km)'] >= 9) &
    (df['Distance (km)'] <= 11)
]

In [684]:
from dateutil.relativedelta import relativedelta

# creating a list of 3 month blocks before each race
ten_k_race_training_blocks = ten_k_races['Start Time'].apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc))
ten_k_race_training_blocks = ten_k_race_training_blocks.apply(lambda dt: dt - relativedelta(months=3))
ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': ten_k_race_training_blocks.apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()),
    'Race Date': ten_k_races['Start Time'],
})

# getting all activities within each 3 month block
def get_training_block_activities():
    training_block_activities_list = []
    for index, row in ten_k_race_training_blocks.iterrows():
        # getting all activities within a single training block excluding the race itself
        training_block_activities = df[
            (df['Start Time'] >= row['Training Block Start']) &
            (df['Start Time'] < row['Race Date'])
        ]
        training_block_activities_list.append(training_block_activities)
    return training_block_activities_list

# creating a series of training block activities: [[block1_activities], [block2_activities], ...]
ten_k_race_training_block_activities = pd.Series(get_training_block_activities())

# further reduce data by summing up activities on a per-week basis
def generate_training_weeks():
    training_block_weeks_list = []
    columns_to_keep = ['Start Time', 'Duration (s)', 'Distance (km)', 'Elevation Gain (m)',
                       'Elevation Loss (m)', 'Calories', 'Aerobic Training Effect', 'Anaerobic Training Effect']

    for training_block_tuple in ten_k_race_training_block_activities.items():
        training_block = training_block_tuple[1]
        training_block['Start Time'] = (training_block['Start Time']
                                        .apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc)))
        training_block = training_block[columns_to_keep]

        training_block_weeks = training_block.groupby(pd.Grouper(key='Start Time', freq='W')).sum()
        training_block_weeks.reset_index(inplace=True)
        training_block_weeks['Start Time'] = (training_block_weeks['Start Time']
                                              .apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()))

        training_block_weeks_list.append(training_block_weeks)

    return training_block_weeks_list

# Creating a series of training block weeks
ten_k_race_training_block_weeks = pd.Series(generate_training_weeks())
ten_k_race_training_block_weeks.iloc[0]


Unnamed: 0,Start Time,Duration (s),Distance (km),Elevation Gain (m),Elevation Loss (m),Calories,Aerobic Training Effect,Anaerobic Training Effect
0,1640476800.0,19028.0,61.74289,431.0,345.0,3398,17.1,0.6
1,1641081600.0,25488.0,84.68369,328.0,311.0,5160,34.5,0.1
2,1641686400.0,20968.0,59.45793,741.0,696.0,3995,18.3,1.0
3,1642291200.0,21915.0,61.24936,683.0,647.0,3672,15.5,0.6
4,1642896000.0,17415.0,53.11269,561.0,520.0,3289,14.2,0.9
5,1643500800.0,26222.0,77.11399,759.0,660.0,4424,17.0,0.6
6,1644105600.0,26418.0,73.69302,685.0,614.0,4587,16.3,1.0
7,1644710400.0,24244.0,69.60733,389.0,345.0,4085,14.0,1.0
8,1645315200.0,19061.0,57.70232,527.0,498.0,3264,14.3,0.4
9,1645920000.0,16147.0,55.63739,236.0,200.0,2803,13.4,0.3


In [685]:
from sklearn import preprocessing

# normalize and scale input data to resolve big differences in feature magnitudes
# i.e. Start Time is a big number (> 1 billion), event type is a very small number (0..2)

scaler = preprocessing.StandardScaler()

ten_k_race_training_block_weeks = ten_k_race_training_block_weeks.apply(
    lambda block: pd.DataFrame(scaler.fit_transform(block))
)

ten_k_race_training_block_weeks.iloc[0]

Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.60357,-0.3649,0.02048,-0.39477,-0.56388,-0.28859,0.10835,-0.20429
1,-1.33631,1.15363,1.77144,-0.96193,-0.75694,1.83421,3.21059,-1.53221
2,-1.06904,0.09113,-0.15392,1.31223,1.42914,0.43066,0.32229,0.85804
3,-0.80178,0.31374,-0.01719,0.99285,1.15091,0.04152,-0.17692,-0.20429
4,-0.53452,-0.74406,-0.63822,0.32107,0.42979,-0.41991,-0.40869,0.59245
5,-0.26726,1.32617,1.19368,1.41134,1.22473,0.9475,0.09052,-0.20429
6,0.0,1.37224,0.93257,1.00387,0.96353,1.14388,-0.03429,0.85804
7,0.26726,0.86121,0.62073,-0.62604,-0.56388,0.53909,-0.44435,0.85804
8,0.53452,-0.35714,-0.28791,0.13385,0.30487,-0.45003,-0.39087,-0.73546
9,0.80178,-1.04212,-0.44552,-1.46853,-1.38721,-1.00543,-0.55133,-1.00104


In [686]:
import numpy as np

X = []
y = []

max_input_size = 0

for index, row in ten_k_races.reset_index().iterrows():
    x = np.array(ten_k_race_training_block_weeks.iloc[index])
    if max_input_size < x.shape[0]:
        max_input_size = x.shape[0]
    X.append(np.array(ten_k_race_training_block_weeks.iloc[index]))

# create reference shape, second dimension is constant
input_shape = (max_input_size, X[0].shape[1])

# pad each input with 0 rows to ensure each input has a consistent shape
for i, x in enumerate(X):
    result = np.zeros(input_shape)
    result[:x.shape[0],:x.shape[1]] = x
    X[i] = result

X = np.array(X)
y = np.array(ten_k_races['Duration (s)'])

print(X.shape, y.shape)

(10, 14, 8) (10,)


In [None]:
import sys

# create comparison variable to check for best found model
best_r2 = -sys.maxsize - 1

In [758]:
from keras import Input
from keras.src.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam

number_of_lstm_units = 128
dropout_rate = 0.2
learning_rate = 0.015

model = Sequential()
model.add(Input(shape=input_shape))
model.add(LSTM(units=number_of_lstm_units))
model.add(Dropout(dropout_rate))
model.add(Dense(units=1))

optimizer = Adam(learning_rate=learning_rate)

model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])

model.summary()

In [759]:
from keras.callbacks import EarlyStopping

train_training_data, val_training_data, test_training_data  = X[:-4], X[-4:-2], X[-2:]
train_racing_data, val_racing_data, test_racing_data = y[:-4], y[-4:-2], y[-2:]

early_stopping = EarlyStopping(
    monitor='val_loss', # Monitor validation loss
    patience=10, # Stop training after 10 epochs of no improvement
    restore_best_weights=True # Restore the weights of the best epoch
)

model.fit(
    train_training_data,
    train_racing_data,
    validation_data=(val_training_data, val_racing_data),
    epochs=1000,
    batch_size=2,
    callbacks=[early_stopping],
    shuffle=False
)

Epoch 1/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - loss: 5211885.5000 - mean_absolute_error: 2281.5483 - val_loss: 4501762.5000 - val_mean_absolute_error: 2121.7141
Epoch 2/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 5166755.5000 - mean_absolute_error: 2271.6157 - val_loss: 4462261.0000 - val_mean_absolute_error: 2112.3828
Epoch 3/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 5129001.5000 - mean_absolute_error: 2263.2847 - val_loss: 4439250.5000 - val_mean_absolute_error: 2106.9292
Epoch 4/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 5104472.5000 - mean_absolute_error: 2257.8799 - val_loss: 4417088.0000 - val_mean_absolute_error: 2101.6631
Epoch 5/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 5078958.0000 - mean_absolute_error: 2252.2327 - val_loss: 4394167.5000 - val_mean_absolute_error: 2096.

<keras.src.callbacks.history.History at 0x3bce66de0>

In [760]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on test data
predictions = model.predict(test_training_data)

print(f"predictions: {predictions}\nactual race times: {test_racing_data}")

# Calculate evaluation metrics
mae = mean_absolute_error(test_racing_data, predictions)
mse = mean_squared_error(test_racing_data, predictions)
r2 = r2_score(test_racing_data, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

if r2 > best_r2:
    best_r2 = r2
    print(f"Saving model with R2 score of {r2}")
    model.save('running_time_predictor.keras')
else:
    print("Not the best model so far. Keep trying ;)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
predictions: [[2129.921]
 [2129.921]]
actual race times: [2113. 2142.]
Mean Squared Error (MSE): 216.11074924468994
Mean Absolute Error (MAE): 14.5
R-squared (R2): -0.027875145040142435
Not the best model so far. Keep trying ;)


In [762]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('running_time_predictor.keras')

# Perform predictions
predictions = model.predict(test_training_data)
print(f"predictions: {predictions}\nactual race times: {test_racing_data}")

# Calculate evaluation metrics
mae = mean_absolute_error(test_racing_data, predictions)
mse = mean_squared_error(test_racing_data, predictions)
r2 = r2_score(test_racing_data, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
predictions: [[2129.8865]
 [2129.8865]]
actual race times: [2113. 2142.]
Mean Squared Error (MSE): 215.94526106119156
Mean Absolute Error (MAE): 14.5
R-squared (R2): -0.027088043097225123
