In [691]:
import pandas as pd
import importlib
import activity_data_importer

# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.options.mode.copy_on_write = True

# reload my module each time I execute to get new changes without restarting kernel
importlib.reload(activity_data_importer)

# import data and filter out erroneous rows
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
print(f"Dropped {len(df.index) - len(filtered_df.index)} records due to erroneous measurements.")
df = filtered_df

Dropped 4 records due to erroneous measurements.


In [692]:
# creating maps for converting string types to numbers and back

# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# create the empty maps
int_to_activity_type = {}
activity_type_to_int = {}

int_to_event_type = {}
event_type_to_int = {}

# fill the maps
for index, activity_type in enumerate(activity_types):
    int_to_activity_type[index] = activity_type
    activity_type_to_int[activity_type] = index

for index, event_type in enumerate(event_types):
    int_to_event_type[index] = event_type
    event_type_to_int[event_type] = index

In [693]:
from datetime import timezone, datetime
from sklearn import preprocessing

# do any conversions that are required for feeding the data into our model

# convert duration column to total seconds
df['Duration (h:m:s)'] = df['Duration (h:m:s)'].apply(lambda td: td.total_seconds())
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# convert string columns into numbers
df['Activity Type'] = df['Activity Type'].apply(lambda act: activity_type_to_int[act])
df['Event Type'] = df['Event Type'].apply(lambda evt: event_type_to_int[evt])

# convert Start Time to UTC timestamp
df['Start Time'] = df['Start Time'].apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())

# fill all NaN values with a 0
df.fillna(0, inplace=True)

# dropping End Time column since it is not relevant for training the model
df.drop(columns='End Time', inplace=True)

# TODO: scale / normalize data
# .pct_change()?
# use sklearn preprocessing for this

for column in df.columns:
    if column == 'Duration (s)':
        continue
    #df[column] = df[column].pct_change()
    #df.dropna(inplace=True)
    #df[column] = preprocessing.scale(df[column].values)

# menge an dimensionen reduzieren
# spalten herausnehmen, die oft 0 sind?
# ausprobieren und schauen

# aktuellen stand festhalten + dokumentieren
# für die präsentation

df

Unnamed: 0,Start Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Average Moving Speed (km/h),Max. Speed (km/h),Elevation Gain (m),Elevation Loss (m),...,Anaerobic Training Effect,Avg. Run Cadence,Max. Run Cadence,Stride Length,Steps,Avg. Cadence (rpm),Max. Cadence (rpm),Avg. Temp (°C),Min. Temp (°C),Max. Temp (°C)
0,1733156429.00000,2108.00000,5,1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,256.00000,0.00000,0.00000,29.97059,29.00000,31.00000
1,1733045408.00000,2142.00000,2,0,9.95481,16.72920,16.76225,23.85000,79.45000,79.33000,...,1.50000,181.44000,193.00000,153.67000,6426.00000,0.00000,0.00000,11.49034,9.00000,18.00000
2,1733044326.00000,665.00000,2,1,1.57320,8.51760,9.48663,13.43520,8.68000,54.26000,...,0.00000,145.23000,239.00000,95.74000,1760.00000,0.00000,0.00000,22.90281,19.00000,28.00000
3,1732953144.00000,6933.00000,2,1,23.37705,12.13920,12.17715,15.68520,472.01000,439.17000,...,0.00000,175.20000,201.00000,115.46000,20238.00000,0.00000,0.00000,17.63128,15.00000,29.00000
4,1732892652.00000,3526.00000,4,1,20.12749,20.54880,20.70256,43.76880,374.48000,328.59000,...,0.30000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,13.00000,10.00000,27.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,1637765343.00000,2727.00000,6,1,8.05562,10.63440,10.64606,16.83000,167.00000,157.00000,...,0.90000,170.20000,191.00000,104.19000,7734.00000,0.00000,0.00000,21.05870,18.00000,28.00000
1097,1637687915.00000,5121.00000,2,1,14.81816,10.41480,11.86394,20.18880,59.00000,52.00000,...,0.50000,141.97000,249.00000,120.77000,12340.00000,0.00000,0.00000,19.25532,16.00000,27.00000
1098,1637493441.00000,7486.00000,6,1,20.10268,9.66600,10.21263,17.70120,415.00000,400.00000,...,2.00000,156.19000,246.00000,102.37000,19746.00000,0.00000,0.00000,17.49492,12.00000,23.00000
1099,1637391420.00000,2688.00000,6,1,8.14423,10.90440,10.96432,17.26560,180.00000,171.00000,...,0.40000,171.11000,200.00000,106.29000,7708.00000,0.00000,0.00000,20.72532,17.00000,26.00000


In [694]:
# getting all 10km races
ten_k_races = df.loc[
    (df['Activity Type'] == activity_type_to_int['Running']) &
    (df['Event Type'] == event_type_to_int['Race']) &
    (df['Distance (km)'] >= 9) &
    (df['Distance (km)'] <= 11)
]

In [695]:
from dateutil.relativedelta import relativedelta

# creating a list of 3 month blocks before each race
ten_k_race_training_blocks = ten_k_races['Start Time'].apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc))
ten_k_race_training_blocks = ten_k_race_training_blocks.apply(lambda dt: dt - relativedelta(months=3))
ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': ten_k_race_training_blocks.apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()),
    'Race Date': ten_k_races['Start Time'],
})

# getting all activities within each 3 month block
def get_training_block_activities():
    training_block_activities_list = []
    for index, row in ten_k_race_training_blocks.iterrows():
        # getting all activities within a single training block excluding the race itself
        training_block_activities = df[
            (df['Start Time'] >= row['Training Block Start']) &
            (df['Start Time'] < row['Race Date'])
        ]
        training_block_activities_list.append(training_block_activities)
    return training_block_activities_list

# creating a series of training block activities: [[block1_activities], [block2_activities], ...]
ten_k_race_training_block_activities = pd.Series(get_training_block_activities())

In [696]:
import numpy as np

X = []
y = []

max_input_size = 0

for index, row in ten_k_races.reset_index().iterrows():
    x = np.array(ten_k_race_training_block_activities.iloc[index])
    if max_input_size < x.shape[0]:
        max_input_size = x.shape[0]
    X.append(np.array(ten_k_race_training_block_activities.iloc[index]))

# create reference shape, second dimension is constant
input_shape = (max_input_size, X[0].shape[1])

# pad each input with 0 rows to ensure each input has a consistent shape
for i, x in enumerate(X):
    result = np.zeros(input_shape)
    result[:x.shape[0],:x.shape[1]] = x
    X[i] = result

X = np.array(X)
y = np.array(ten_k_races['Duration (s)']) # TODO: should this just be the duration of the races?

print(X.shape, y.shape)

(10, 103, 27) (10,)


In [697]:
from keras.src.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

model = Sequential()

model.add(LSTM(units=128, input_shape=input_shape))
model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

  super().__init__(**kwargs)


In [698]:
from sklearn.model_selection import train_test_split

# TODO: just use the last two races of the data set for testing instead of a random split since my data is temporal
train_training_data, test_training_data, train_racing_data, test_racing_data = train_test_split(X, y, test_size=0.2)

model.fit(train_training_data, train_racing_data, epochs=1000)

Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 756ms/step - accuracy: 0.0000e+00 - loss: 4945922.0000
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.0000e+00 - loss: 4945523.5000
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0000e+00 - loss: 4944346.5000
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 4943394.0000
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.0000e+00 - loss: 4942017.0000
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0000e+00 - loss: 4941202.0000
Epoch 7/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.0000e+00 - loss: 4939155.0000
Epoch 8/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy:

<keras.src.callbacks.history.History at 0x17ecfd190>

In [699]:
model.summary()

In [700]:
prediction = model.predict(test_training_data)
print(prediction)
print(test_racing_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[[141.92313]
 [141.92313]]
[2113. 2244.]
