In [208]:
import pandas as pd
import importlib
import activity_data_importer

# Opt in to pandas copy-on-write behaviour, see
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.options.mode.copy_on_write = True

# Reload the local importer module on every run so that edits to it are picked
# up without restarting the kernel.
importlib.reload(activity_data_importer)

# Load the activity data, pass it through the importer's erroneous-row filter
# and report how many records were removed by that step.
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
dropped_record_count = len(df) - len(filtered_df)
print(f"Dropped {dropped_record_count} records due to erroneous measurements.")

# From here on `df` refers to the filtered data.
df = filtered_df

Dropped 4 records due to erroneous measurements.


In [209]:
# creating maps for converting string types to numbers and back

# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# Assign the integer codes in sorted label order rather than in set iteration
# order. String hashing is randomised per interpreter process, so iterating the
# sets directly would give every label a different code after each kernel
# restart. key=str keeps the sort well-defined even if a column contains a
# non-string value such as NaN.
int_to_activity_type = dict(enumerate(sorted(activity_types, key=str)))
activity_type_to_int = {label: code for code, label in int_to_activity_type.items()}

int_to_event_type = dict(enumerate(sorted(event_types, key=str)))
event_type_to_int = {label: code for code, label in int_to_event_type.items()}

# Each pair of maps must be an exact inverse of the other.
assert len(int_to_activity_type) == len(activity_type_to_int) == len(activity_types)
assert len(int_to_event_type) == len(event_type_to_int) == len(event_types)

In [210]:
from datetime import timezone, datetime

# Prepare the activity table for the model: every step below rewrites `df`
# so that the remaining columns hold plain numbers.

# Duration: replace each value by its total_seconds() and rename the column
# so the header reflects the new unit.
duration_in_seconds = df['Duration (h:m:s)'].apply(lambda duration: duration.total_seconds())
df['Duration (h:m:s)'] = duration_in_seconds
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# Label columns: swap every string for its integer code from the lookup maps.
activity_type_codes = df['Activity Type'].apply(lambda label: activity_type_to_int[label])
event_type_codes = df['Event Type'].apply(lambda label: event_type_to_int[label])
df['Activity Type'] = activity_type_codes
df['Event Type'] = event_type_codes

# Start Time: tag the value as UTC and store the resulting POSIX timestamp.
df['Start Time'] = df['Start Time'].apply(
    lambda start: start.replace(tzinfo=timezone.utc).timestamp()
)

# Missing values become 0.
df.fillna(value=0, inplace=True)

# Columns that are not used for training the model.
columns_to_drop = [
    'Min. Temp (°C)', 'Max. Temp (°C)',
    'Stride Length', 'Steps',
    'Avg. Cadence (rpm)', 'Max. Cadence (rpm)',
    'Avg. Run Cadence', 'Max. Run Cadence',
    'VO2max',
    'Aerobic Training Effect', 'Anaerobic Training Effect',
    'End Time',
]
df.drop(columns_to_drop, axis='columns', inplace=True)

# Rows of these activity types are not interesting for training and are removed.
activity_types_to_drop = [
    activity_type_to_int[label]
    for label in ('Other', 'Transition', 'Walking', 'Strength Training')
]
has_dropped_activity_type = df['Activity Type'].isin(activity_types_to_drop)
df = df.loc[~has_dropped_activity_type]

df

Unnamed: 0,Start Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),Elevation Loss (m),Elevation Min. (m),Elevation Max. (m),Max. Heart Rate (bpm),Average Heart Rate (bpm),Calories,Avg. Temp (°C)
1,1.733045e+09,2142.0,2,1,9.95481,16.729199,16.762247,23.850000,79.45,79.33,221.2,275.8,186.0,163.0,615,11.490340
2,1.733044e+09,665.0,2,2,1.57320,8.517600,9.486633,13.435200,8.68,54.26,211.0,216.8,125.0,106.0,106,22.902810
3,1.732953e+09,6933.0,2,2,23.37705,12.139200,12.177149,15.685200,472.01,439.17,141.4,370.6,166.0,148.0,1500,17.631282
4,1.732893e+09,3526.0,7,2,20.12749,20.548801,20.702561,43.768800,374.48,328.59,174.2,360.8,166.0,128.0,516,13.000000
6,1.732776e+09,1627.0,2,2,5.66210,12.528000,12.545405,13.435200,54.83,54.98,176.6,204.6,165.0,150.0,360,16.184515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,1.637765e+09,2727.0,5,2,8.05562,10.634400,10.646061,16.830001,167.00,157.00,222.6,329.6,163.0,147.0,500,21.058698
1097,1.637688e+09,5121.0,2,2,14.81816,10.414800,11.863943,20.188799,59.00,52.00,153.8,187.2,164.0,144.0,838,19.255316
1098,1.637493e+09,7486.0,5,2,20.10268,9.666000,10.212630,17.701199,415.00,400.00,172.6,321.4,169.0,141.0,1082,17.494922
1099,1.637391e+09,2688.0,5,2,8.14423,10.904400,10.964320,17.265600,180.00,171.00,396.2,498.8,164.0,146.0,486,20.725323


In [211]:
# Select the 10 km races: running activities whose event type is "Race" and
# whose distance lies between 9 km and 11 km (both bounds included).
is_running = df['Activity Type'] == activity_type_to_int['Running']
is_race = df['Event Type'] == event_type_to_int['Race']
is_ten_k_distance = df['Distance (km)'].between(9, 11)

ten_k_races = df.loc[is_running & is_race & is_ten_k_distance]

In [212]:
from dateutil.relativedelta import relativedelta

# Build one training block per race: the block starts three calendar months
# before the race start and the race start itself is kept as 'Race Date'.
race_start_timestamps = ten_k_races['Start Time']

# POSIX timestamp -> timezone-aware (UTC) datetime
race_start_datetimes = race_start_timestamps.apply(
    lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc)
)
# step back three calendar months
block_start_datetimes = race_start_datetimes.apply(
    lambda dt: dt - relativedelta(months=3)
)
# datetime -> POSIX timestamp again, so it can be compared with 'Start Time'
block_start_timestamps = block_start_datetimes.apply(
    lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()
)

ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': block_start_timestamps,
    'Race Date': race_start_timestamps,
})

# getting all activities within each 3 month block
def get_training_block_activities(activities=None, training_blocks=None):
    """Collect, for every training block, the activities that fall inside it.

    Parameters
    ----------
    activities : pandas.DataFrame, optional
        Table with a 'Start Time' column. Defaults to the notebook-level `df`.
    training_blocks : pandas.DataFrame, optional
        Table with 'Training Block Start' and 'Race Date' columns holding
        values comparable to `activities['Start Time']`. Defaults to the
        notebook-level `ten_k_race_training_blocks`.

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of `training_blocks`, in the same order. Each frame
        holds the rows of `activities` whose 'Start Time' is greater than or
        equal to the block start and strictly less than the race date, so the
        race itself is excluded. The list is empty when there are no blocks.
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if activities is None:
        activities = df
    if training_blocks is None:
        training_blocks = ten_k_race_training_blocks

    start_times = activities['Start Time']
    block_bounds = zip(training_blocks['Training Block Start'], training_blocks['Race Date'])
    return [
        activities[(start_times >= block_start) & (start_times < race_date)]
        for block_start, race_date in block_bounds
    ]

# Wrap the per-race activity frames in a Series, one element per training
# block: [[block1_activities], [block2_activities], ...]
training_block_frames = get_training_block_activities()
ten_k_race_training_block_activities = pd.Series(training_block_frames)

In [213]:
from sklearn import preprocessing

# Feature magnitudes differ a lot (Start Time is > 1 billion, event type is
# 0..2), so every training block is standardised before it is fed to the model.

scaler = preprocessing.StandardScaler()

# Step 1: remove the columns that are not used as model input.
columns_excluded_from_features = ['Start Time', 'Event Type', 'Duration (s)']
feature_blocks = ten_k_race_training_block_activities.apply(
    lambda block: block.drop(columns=columns_excluded_from_features)
)

# Step 2: fit the scaler on each block separately and keep the transformed
# values as a new DataFrame (column labels become positional integers).
scaled_blocks = feature_blocks.apply(
    lambda features: pd.DataFrame(scaler.fit_transform(features))
)
ten_k_race_training_block_activities = scaled_blocks

ten_k_race_training_block_activities.iloc[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.839964,-1.035303,-1.424867,-1.206281,-0.931412,-1.160686,-0.845714,1.149955,-0.263091,-3.747945,-2.973062,-1.301316,0.499821
1,-0.839964,0.315744,-0.763913,-0.735546,-0.803828,1.864899,1.922126,-0.118067,1.146375,-0.096482,0.510293,1.448957,-0.524620
2,0.878144,0.114389,0.770867,0.756068,0.788626,1.228020,1.126959,0.479506,1.056565,-0.096482,-1.148448,-0.492412,-1.424639
3,-0.839964,-0.781939,-0.692956,-0.671115,-0.931412,-0.859322,-0.840536,0.523231,-0.374895,-0.185542,0.676167,-0.800190,-0.805777
4,-0.839964,-0.294146,0.116483,0.121818,-0.876092,-0.818248,-0.726633,0.840237,-0.114630,-0.274602,0.427356,-0.220147,-0.319589
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,-0.839964,1.514541,-0.582578,-0.566231,-0.540903,-0.206705,-0.098941,-1.739532,-1.487439,1.862840,1.920223,3.928939,1.712396
68,-0.839964,-0.853447,-0.541186,-0.526326,-0.891402,-1.152392,-1.170454,-2.191356,-1.896166,-0.452722,0.759104,-0.948160,1.101452
69,-0.839964,-0.820648,-0.459060,-0.447998,-0.813218,-1.153176,-1.163694,-2.143988,-1.910828,0.705059,1.007915,-0.877135,1.995037
70,-0.839964,-0.818842,-0.474171,-0.455062,-0.807502,-1.142728,-1.184620,-1.910788,-1.784361,0.615999,0.676167,-0.902783,1.029377


In [214]:
import numpy as np

# One 2-D array per race with shape (activities_in_block, n_features), in the
# same order as the rows of `ten_k_races`.
X = [np.array(block) for block in ten_k_race_training_block_activities]

if not X:
    # Without at least one block the feature dimension below cannot be derived.
    raise ValueError("No training blocks available: no 10 km races were found.")

# The longest training block defines the common sequence length.
max_input_size = max(block.shape[0] for block in X)

# create reference shape, second dimension is constant
input_shape = (max_input_size, X[0].shape[1])

# pad each input with 0 rows (appended after the real rows) to ensure each
# input has a consistent shape
padded_inputs = np.zeros((len(X), *input_shape))
for i, block in enumerate(X):
    padded_inputs[i, :block.shape[0], :block.shape[1]] = block
X = padded_inputs

# Target: the duration of each race in seconds.
y = np.array(ten_k_races['Duration (s)'])

# Every race needs exactly one input sequence.
assert len(X) == len(y), "number of training blocks does not match number of races"

print(X.shape, y.shape)

(10, 88, 13) (10,)


In [215]:
# All layers come from the public tensorflow.keras API; `keras.src` is an
# internal package path and not a stable import location.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM

# Sequence regressor: one LSTM layer followed by dropout and a single linear
# output unit that predicts the race duration.
model = Sequential()

# Declare the input shape with an explicit Input layer instead of passing
# `input_shape=` to the first layer, which Keras discourages for Sequential
# models and reports with a warning.
model.add(Input(shape=input_shape))
model.add(LSTM(units=100))
model.add(Dropout(0.2))

model.add(Dense(units=1))

# Mean squared error as the training loss, mean absolute error as an extra metric.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

  super().__init__(**kwargs)


In [216]:
from sklearn.model_selection import train_test_split

# Keep the last two samples aside for testing; everything before them is used
# for training.
holdout_count = 2

train_training_data = X[:-holdout_count]
test_training_data = X[-holdout_count:]

train_racing_data = y[:-holdout_count]
test_racing_data = y[-holdout_count:]

# Show the shapes of the input split first, then of the target split.
for train_part, test_part in (
    (train_training_data, test_training_data),
    (train_racing_data, test_racing_data),
):
    print(train_part.shape, test_part.shape)

model.fit(x=train_training_data, y=train_racing_data, epochs=200)

(8, 88, 13) (2, 88, 13)
(8,) (2,)
Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637ms/step - loss: 4772673.5000 - mae: 2183.1309
Epoch 2/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 4772571.0000 - mae: 2183.1069
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - loss: 4772493.0000 - mae: 2183.0891
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 4772397.0000 - mae: 2183.0669
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 4772240.5000 - mae: 2183.0310
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 4772150.0000 - mae: 2183.0098
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 4771917.0000 - mae: 2182.9563
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 4771602.5000 - mae:

<keras.src.callbacks.history.History at 0x16dec8770>

In [217]:
# Print an overview of the model's layers.
model.summary()

In [218]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Run the model on the held-out inputs and show the predictions next to the
# actual race times.
predictions = model.predict(test_training_data)

print(f"predictions: {predictions}\nactual race times: {test_racing_data}")

# Score the predictions against the actual race times with each metric.
mae, mse, r2 = [
    metric(test_racing_data, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
]

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
predictions: [[32.984356]
 [32.98105 ]]
actual race times: [2339. 2328.]
Mean Squared Error (MSE): 5292410.069035161
Mean Absolute Error (MAE): 2300.517297744751
R-squared (R2): -174954.70476149293
