In [600]:
import pandas as pd
import importlib
import activity_data_importer

# Opt in to pandas copy-on-write semantics, see
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.set_option("mode.copy_on_write", True)

# Reload the local helper module on every execution of this cell so that edits to it
# are picked up without restarting the kernel.
importlib.reload(activity_data_importer)

# Load the activities, remove the rows that drop_erroneous_rows() rejects and report how many went.
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
dropped_record_count = len(df.index) - len(filtered_df.index)
print(f"Dropped {dropped_record_count} records due to erroneous measurements.")
df = filtered_df

Dropped 4 records due to erroneous measurements.


In [601]:
# creating maps for converting string types to numbers and back

def _build_label_maps(labels):
    """Build the two lookup tables for one categorical column.

    Parameters
    ----------
    labels : iterable
        The distinct values of the column.

    Returns
    -------
    tuple of (dict, dict)
        ``(int_to_label, label_to_int)``; codes are 0..n-1 in sorted label order.
    """
    # Sort before numbering: iterating a set of strings yields a different order on every
    # kernel start (string hashes are randomised per process), which would give every label
    # a different integer code from run to run. key=str keeps the sort working even if the
    # column contains a non-string value such as NaN.
    ordered_labels = sorted(labels, key=str)
    int_to_label = dict(enumerate(ordered_labels))
    label_to_int = {label: code for code, label in int_to_label.items()}
    return int_to_label, label_to_int

# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# create and fill the maps (code -> label and label -> code)
int_to_activity_type, activity_type_to_int = _build_label_maps(activity_types)
int_to_event_type, event_type_to_int = _build_label_maps(event_types)

In [602]:
from datetime import timezone, datetime

# do any conversions that are required for feeding the data into our model
# NOTE: this cell rewrites columns of `df` (e.g. it renames the duration column it reads),
# so it can only run once per import; re-run the import cell before re-running this one.

# convert duration column to total seconds
df['Duration (h:m:s)'] = df['Duration (h:m:s)'].apply(lambda td: td.total_seconds())
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# convert string columns into numbers
df['Activity Type'] = df['Activity Type'].apply(lambda act: activity_type_to_int[act])
df['Event Type'] = df['Event Type'].apply(lambda evt: event_type_to_int[evt])

# convert Start Time to an epoch timestamp; replace() labels the existing wall-clock value
# as UTC, it does not convert from another time zone
df['Start Time'] = df['Start Time'].apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())

# Sort by 'Start Time' in ascending order (oldest to newest)
df = df.sort_values(by='Start Time', ascending=True)

# fill all NaN values with a 0
df.fillna(0, inplace=True)

# dropping columns irrelevant to training the model
columns_to_drop = ['Min. Temp (°C)', 'Max. Temp (°C)', 'Stride Length', 'Steps',
                   'Avg. Cadence (rpm)', 'Max. Cadence (rpm)', 'Avg. Run Cadence', 'Max. Run Cadence',
                   'VO2max', 'Aerobic Training Effect', 'Anaerobic Training Effect', 'End Time']
df.drop(columns=columns_to_drop, inplace=True)

# dropping rows with specific activity types since they are not interesting for training
activity_type_names_to_drop = ['Other', 'Transition', 'Walking', 'Strength Training']
# Only look up names that actually occur in this export: indexing activity_type_to_int
# directly raises KeyError as soon as one of these types is absent from the data.
activity_types_to_drop = [activity_type_to_int[name] for name in activity_type_names_to_drop
                          if name in activity_type_to_int]
df = df[~df['Activity Type'].isin(activity_types_to_drop)]

df

Unnamed: 0,Start Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),Elevation Loss (m),Elevation Min. (m),Elevation Max. (m),Max. Heart Rate (bpm),Average Heart Rate (bpm),Calories,Avg. Temp (°C)
1100,1637166705.00000,2537.00000,2,2,7.80808,11.08080,11.75504,13.70520,40.00000,36.00000,127.40000,152.40000,166.00000,151.00000,488,18.58025
1099,1637391420.00000,2688.00000,5,2,8.14423,10.90440,10.96432,17.26560,180.00000,171.00000,396.20000,498.80000,164.00000,146.00000,486,20.72532
1098,1637493441.00000,7486.00000,5,2,20.10268,9.66600,10.21263,17.70120,415.00000,400.00000,172.60000,321.40000,169.00000,141.00000,1082,17.49492
1097,1637687915.00000,5121.00000,2,2,14.81816,10.41480,11.86394,20.18880,59.00000,52.00000,153.80000,187.20000,164.00000,144.00000,838,19.25532
1096,1637765343.00000,2727.00000,5,2,8.05562,10.63440,10.64606,16.83000,167.00000,157.00000,222.60000,329.60000,163.00000,147.00000,500,21.05870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,1732775741.00000,1627.00000,2,2,5.66210,12.52800,12.54540,13.43520,54.83000,54.98000,176.60000,204.60000,165.00000,150.00000,360,16.18451
4,1732892652.00000,3526.00000,7,2,20.12749,20.54880,20.70256,43.76880,374.48000,328.59000,174.20000,360.80000,166.00000,128.00000,516,13.00000
3,1732953144.00000,6933.00000,2,2,23.37705,12.13920,12.17715,15.68520,472.01000,439.17000,141.40000,370.60000,166.00000,148.00000,1500,17.63128
2,1733044326.00000,665.00000,2,2,1.57320,8.51760,9.48663,13.43520,8.68000,54.26000,211.00000,216.80000,125.00000,106.00000,106,22.90281


In [603]:
# Select the 10 km races: running activities flagged as a race whose distance
# lies between 9 km and 11 km (both bounds included).
is_running = df['Activity Type'].eq(activity_type_to_int['Running'])
is_race = df['Event Type'].eq(event_type_to_int['Race'])
is_ten_k_distance = df['Distance (km)'].between(9, 11)

ten_k_races = df.loc[is_running & is_race & is_ten_k_distance]

In [604]:
from dateutil.relativedelta import relativedelta

# For every race, derive the start of its training block: the race start moved back
# by three calendar months. Both columns hold UTC epoch seconds.
race_start_times = ten_k_races['Start Time']
training_block_start_times = (
    race_start_times
    .apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc))   # epoch seconds -> aware datetime
    .apply(lambda dt: dt - relativedelta(months=3))                  # step back three months
    .apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())   # aware datetime -> epoch seconds
)

ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': training_block_start_times,
    'Race Date': race_start_times,
})

# getting all activities within each 3 month block
def get_training_block_activities():
    """Collect the activities of every training block.

    Reads the notebook-level ``ten_k_race_training_blocks`` and ``df``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``ten_k_race_training_blocks``, holding the rows of ``df`` whose
        'Start Time' is at or after 'Training Block Start' and strictly before 'Race Date'
        (so the race itself is excluded).
    """
    activity_start_times = df['Start Time']
    return [
        df[
            (activity_start_times >= block['Training Block Start']) &
            (activity_start_times < block['Race Date'])
        ]
        for _, block in ten_k_race_training_blocks.iterrows()
    ]

# Wrap the per-block activity frames in a Series, one element per training block:
# [block1_activities, block2_activities, ...]
activities_per_training_block = get_training_block_activities()
ten_k_race_training_block_activities = pd.Series(activities_per_training_block)

# further reduce data by summing up activities on a per-week basis
def generate_training_weeks(training_blocks=None):
    """Aggregate the activities of each training block into weekly sums.

    Parameters
    ----------
    training_blocks : iterable of pandas.DataFrame, optional
        One frame of activities per training block. Each frame needs a 'Start Time' column
        holding UTC epoch seconds plus the columns listed in ``columns_to_drop`` below.
        Defaults to the notebook-level ``ten_k_race_training_block_activities``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per training block with one row per calendar week (pandas 'W' frequency).
        'Start Time' is the week label as UTC epoch seconds; every other remaining column is
        the sum over that week.

    The input frames are left untouched, so the function can be called repeatedly.
    """
    columns_to_drop = ['Activity Type', 'Event Type', 'Average Speed (km/h)', 'Max. Speed (km/h)',
                       'Max. Heart Rate (bpm)', 'Average Heart Rate (bpm)', 'Avg. Temp (°C)']

    if training_blocks is None:
        training_blocks = ten_k_race_training_block_activities

    training_block_weeks_list = []

    for training_block in training_blocks:
        # drop() returns a new frame; working on it instead of mutating `training_block` keeps
        # the caller's frames intact (in-place edits made a second call of this function fail).
        weekly_input = training_block.drop(columns=columns_to_drop)
        # the weekly Grouper needs real datetimes, not epoch seconds
        weekly_input['Start Time'] = pd.to_datetime(weekly_input['Start Time'], unit='s', utc=True)

        training_block_weeks = weekly_input.groupby(pd.Grouper(key='Start Time', freq='W')).sum()
        training_block_weeks = training_block_weeks.reset_index()
        # back to epoch seconds so the frame is purely numeric again
        training_block_weeks['Start Time'] = training_block_weeks['Start Time'].apply(
            lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()
        )

        training_block_weeks_list.append(training_block_weeks)

    return training_block_weeks_list

# Wrap the weekly aggregates in a Series, one element (a frame of weeks) per training block
weeks_per_training_block = generate_training_weeks()
ten_k_race_training_block_weeks = pd.Series(weeks_per_training_block)

In [605]:
from sklearn import preprocessing

# Standardize the inputs so that features of very different magnitude become comparable,
# e.g. Start Time is a big number (> 1 billion) compared to the other weekly sums.

scaler = preprocessing.StandardScaler()


def _standardize_block(block):
    """Return `block` standardized column by column as a new DataFrame.

    fit_transform() refits `scaler` on the given block, so every training block is scaled
    with its own statistics; the result carries default integer column labels.
    """
    return pd.DataFrame(scaler.fit_transform(block))


ten_k_race_training_block_weeks = ten_k_race_training_block_weeks.apply(_standardize_block)

ten_k_race_training_block_weeks.iloc[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.60357,-0.3649,0.02048,0.13858,-0.39477,-0.56388,-0.51279,-0.57779,-0.28859
1,-1.33631,1.15363,1.77144,3.29665,-0.96193,-0.75694,2.90049,2.68265,1.83421
2,-1.06904,0.09113,-0.15392,0.11419,1.31223,1.42914,-0.29002,0.05007,0.43066
3,-0.80178,0.31374,-0.01719,-0.00934,0.99285,1.15091,-0.51958,-0.34204,0.04152
4,-0.53452,-0.74406,-0.63822,-0.46879,0.32107,0.42979,-0.52148,-0.39244,-0.41991
5,-0.26726,1.32617,1.19368,0.08881,1.41134,1.22473,0.15253,0.41562,0.9475
6,0.0,1.37224,0.93257,-0.41547,1.00387,0.96353,0.62769,0.75435,1.14388
7,0.26726,0.86121,0.62073,-0.41798,-0.62604,-0.56388,-0.22998,-0.27999,0.53909
8,0.53452,-0.35714,-0.28791,-0.37376,0.13385,0.30487,-0.40412,-0.40328,-0.45003
9,0.80178,-1.04212,-0.44552,-0.30914,-1.46853,-1.38721,-1.25011,-1.50753,-1.00543


In [606]:
import numpy as np

# One (weeks x features) array per race, in the same order as the rows of ten_k_races
X = [
    np.array(ten_k_race_training_block_weeks.iloc[position])
    for position in range(len(ten_k_races))
]
y = []

# The longest block defines the sequence length
max_input_size = max((block.shape[0] for block in X), default=0)

# create reference shape, second dimension is constant
input_shape = (max_input_size, X[0].shape[1])

# Append all-zero rows to shorter blocks so that every input has the reference shape
for position, block in enumerate(X):
    padded_block = np.zeros(input_shape)
    padded_block[:block.shape[0], :block.shape[1]] = block
    X[position] = padded_block

# Inputs: (races, weeks, features); targets: race duration in seconds
X = np.array(X)
y = np.array(ten_k_races['Duration (s)'])

print(X.shape, y.shape)

(10, 14, 9) (10,)


In [None]:
import sys

# create comparison variable to check for best found model
# R2 has no lower bound, so start from negative infinity: every real score compares greater
best_r2 = float('-inf')

In [609]:
# Dropout and Input come from the public layers namespace; keras.src is a private package path
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# hyperparameters
number_of_lstm_units = 128
dropout_rate = 0.2
learning_rate = 0.015

# Four stacked LSTM layers, each followed by dropout, feeding a single linear output unit
# (the predicted race duration). The input shape is declared with an explicit Input layer
# rather than input_shape= on the first LSTM, which Keras warns about for Sequential models.
# NOTE(review): the zero rows used for padding are fed to the LSTMs like real weeks;
# consider a Masking layer if that turns out to matter.
model = Sequential([
    Input(shape=input_shape),
    LSTM(units=number_of_lstm_units, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(units=number_of_lstm_units, return_sequences=True),
    Dropout(dropout_rate),
    LSTM(units=number_of_lstm_units, return_sequences=True),
    Dropout(dropout_rate),
    # the last recurrent layer returns only its final output, which feeds the Dense head
    LSTM(units=number_of_lstm_units, return_sequences=False),
    Dropout(dropout_rate),
    Dense(units=1),
])

optimizer = Adam(learning_rate=learning_rate)

model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])

model.summary()

  super().__init__(**kwargs)


In [610]:
from keras.callbacks import EarlyStopping

# Positional hold-out split: the last four samples are withheld from training,
# the first two of them as the test set and the final two as the validation set.
train_training_data, train_racing_data = X[:-4], y[:-4]
test_training_data, test_racing_data = X[-4:-2], y[-4:-2]
val_training_data, val_racing_data = X[-2:], y[-2:]

print(train_training_data.shape, test_training_data.shape, val_training_data.shape)
print(train_racing_data.shape, test_racing_data.shape, val_racing_data.shape)

# Stop once the validation loss has not improved for 10 consecutive epochs
# and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Fit on the training blocks in their given order (no shuffling), two samples per batch
model.fit(
    x=train_training_data,
    y=train_racing_data,
    validation_data=(val_training_data, val_racing_data),
    epochs=1000,
    batch_size=2,
    verbose=1,
    callbacks=[early_stopping],
    shuffle=False,
)

(6, 14, 9) (2, 14, 9) (2, 14, 9)
(6,) (2,) (2,)
Epoch 1/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 140ms/step - loss: 5207693.0000 - mean_absolute_error: 2280.6294 - val_loss: 4479481.0000 - val_mean_absolute_error: 2116.4287
Epoch 2/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 5159258.0000 - mean_absolute_error: 2269.9746 - val_loss: 4450772.0000 - val_mean_absolute_error: 2109.6355
Epoch 3/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 5132320.0000 - mean_absolute_error: 2264.0205 - val_loss: 4430671.0000 - val_mean_absolute_error: 2104.8660
Epoch 4/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 5106236.0000 - mean_absolute_error: 2258.2622 - val_loss: 4410372.0000 - val_mean_absolute_error: 2100.0386
Epoch 5/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 5084167.5000 - mean_absolute_error: 2253.3826 - val_lo

<keras.src.callbacks.history.History at 0x39003f710>

In [599]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Run the trained model on the held-out test blocks
predictions = model.predict(test_training_data)

print(f"predictions: {predictions}\nactual race times: {test_racing_data}")

# Score the predictions against the actual race durations
mse = mean_squared_error(test_racing_data, predictions)
mae = mean_absolute_error(test_racing_data, predictions)
r2 = r2_score(test_racing_data, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

# Persist the model only when it beats the best R2 seen so far in this kernel session
is_new_best = r2 > best_r2
if not is_new_best:
    print("Not the best model so far. Keep trying ;)")
else:
    best_r2 = r2
    print(f"Saving model with R2 score of {r2}")
    model.save('running_time_predictor.keras')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257ms/step
predictions: [[2130.233]
 [2130.233]]
actual race times: [2140. 2120.]
Mean Squared Error (MSE): 100.0542471408844
Mean Absolute Error (MAE): 10.0
R-squared (R2): -0.000542471408844003
Saving model with R2 score of -0.000542471408844003
