In [481]:
import pandas as pd
import importlib
import activity_data_importer

# Switch pandas to copy-on-write mode; background in the user-guide section linked here:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.options.mode.copy_on_write = True

# reload my module each time I execute to get new changes without restarting kernel
importlib.reload(activity_data_importer)

# import data and filter out erroneous rows
# (what counts as "erroneous" is decided inside activity_data_importer.drop_erroneous_rows)
df = activity_data_importer.import_activity_data()
filtered_df = activity_data_importer.drop_erroneous_rows(df)
# report how many rows the filter removed (raw row count minus filtered row count)
print(f"Dropped {len(df.index) - len(filtered_df.index)} records due to erroneous measurements.")
# from here on `df` names the filtered frame; `df` and `filtered_df` refer to the same object
df = filtered_df

Dropped 4 records due to erroneous measurements.


In [482]:
# creating maps for converting string types to numbers and back

# getting all unique values in the columns
activity_types = set(df['Activity Type'].unique())
event_types = set(df['Event Type'].unique())

# Fix the numbering order before assigning codes. Iterating a set of strings
# directly follows hash order, which Python randomises per process, so the
# integer codes would change from one kernel start to the next. Sorting makes
# the codes depend only on the labels present in the data; key=str keeps the
# sort well-defined even if a column contains a non-string value.
activity_type_order = sorted(activity_types, key=str)
event_type_order = sorted(event_types, key=str)

# code -> label, and the inverse label -> code, for activity types
int_to_activity_type = dict(enumerate(activity_type_order))
activity_type_to_int = {activity_type: code for code, activity_type in int_to_activity_type.items()}

# code -> label, and the inverse label -> code, for event types
int_to_event_type = dict(enumerate(event_type_order))
event_type_to_int = {event_type: code for code, event_type in int_to_event_type.items()}

In [483]:
from datetime import timezone, datetime

# do any conversions that are required for feeding the data into our model

def _utc_epoch_seconds(dt):
    """Stamp ``dt`` with the UTC timezone and return its POSIX timestamp."""
    return dt.replace(tzinfo=timezone.utc).timestamp()

# durations -> total seconds, then rename the column so the header matches the new unit
df['Duration (h:m:s)'] = df['Duration (h:m:s)'].apply(lambda duration: duration.total_seconds())
df.rename(columns={'Duration (h:m:s)': 'Duration (s)'}, inplace=True)

# string labels -> integer codes, using the lookup tables built from this same data
for label_column, label_codes in (('Activity Type', activity_type_to_int),
                                  ('Event Type', event_type_to_int)):
    df[label_column] = df[label_column].apply(lambda label, codes=label_codes: codes[label])

# start times -> UTC epoch seconds
df['Start Time'] = df['Start Time'].apply(_utc_epoch_seconds)

# every remaining NaN becomes 0
df.fillna(0, inplace=True)

# columns that are not used for training the model
columns_to_drop = [
    'Min. Temp (°C)', 'Max. Temp (°C)', 'Stride Length', 'Steps',
    'Avg. Cadence (rpm)', 'Max. Cadence (rpm)', 'Avg. Run Cadence', 'Max. Run Cadence',
    'VO2max', 'End Time', 'Average Moving Speed (km/h)', 'Elevation Loss (m)',
    'Elevation Min. (m)', 'Elevation Max. (m)',
]
df.drop(columns=columns_to_drop, inplace=True)

# activity types that are not interesting for training; look up their codes and filter the rows out
activity_types_to_drop = [
    activity_type_to_int[name]
    for name in ('Other', 'Transition', 'Walking', 'Strength Training')
]
df = df[~df['Activity Type'].isin(activity_types_to_drop)]

df

Unnamed: 0,Start Time,Duration (s),Activity Type,Event Type,Distance (km),Average Speed (km/h),Max. Speed (km/h),Elevation Gain (m),Max. Heart Rate (bpm),Average Heart Rate (bpm),Calories,Aerobic Training Effect,Anaerobic Training Effect,Avg. Temp (°C)
1,1733045408.00000,2142.00000,5,0,9.95481,16.72920,23.85000,79.45000,186.00000,163.00000,615,5.00000,1.50000,11.49034
2,1733044326.00000,665.00000,5,1,1.57320,8.51760,13.43520,8.68000,125.00000,106.00000,106,1.20000,0.00000,22.90281
3,1732953144.00000,6933.00000,5,1,23.37705,12.13920,15.68520,472.01000,166.00000,148.00000,1500,4.20000,0.00000,17.63128
4,1732892652.00000,3526.00000,11,1,20.12749,20.54880,43.76880,374.48000,166.00000,128.00000,516,2.40000,0.30000,13.00000
6,1732775741.00000,1627.00000,5,1,5.66210,12.52800,13.43520,54.83000,165.00000,150.00000,360,2.80000,0.00000,16.18451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,1637765343.00000,2727.00000,9,1,8.05562,10.63440,16.83000,167.00000,163.00000,147.00000,500,2.90000,0.90000,21.05870
1097,1637687915.00000,5121.00000,5,1,14.81816,10.41480,20.18880,59.00000,164.00000,144.00000,838,3.30000,0.50000,19.25532
1098,1637493441.00000,7486.00000,9,1,20.10268,9.66600,17.70120,415.00000,169.00000,141.00000,1082,3.10000,2.00000,17.49492
1099,1637391420.00000,2688.00000,9,1,8.14423,10.90440,17.26560,180.00000,164.00000,146.00000,486,3.20000,0.40000,20.72532


In [484]:
# getting all 10km races: running activities flagged as a race whose distance
# lies between 9 km and 11 km (both bounds included)
is_running = df['Activity Type'] == activity_type_to_int['Running']
is_race = df['Event Type'] == event_type_to_int['Race']
is_about_ten_km = (df['Distance (km)'] >= 9) & (df['Distance (km)'] <= 11)

ten_k_races = df.loc[is_running & is_race & is_about_ten_km]

In [485]:
from dateutil.relativedelta import relativedelta

# creating a list of 3 month blocks before each race
# pipeline: epoch seconds -> UTC datetime -> minus three calendar months -> epoch seconds
block_start_timestamps = (
    ten_k_races['Start Time']
    .apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc))
    .apply(lambda dt: dt - relativedelta(months=3))
    .apply(lambda dt: dt.replace(tzinfo=timezone.utc).timestamp())
)

# one row per race: where its training block begins and when the race itself started
ten_k_race_training_blocks = pd.DataFrame({
    'Training Block Start': block_start_timestamps,
    'Race Date': ten_k_races['Start Time'],
})

# getting all activities within each 3 month block
def get_training_block_activities():
    """Collect, for every race, the activities recorded during its training block.

    Reads the module-level ``ten_k_race_training_blocks`` (columns
    'Training Block Start' and 'Race Date') and ``df``. A row of ``df`` belongs
    to a block when its 'Start Time' is at or after the block start and
    strictly before the race date, so the race itself is left out.

    Returns:
        list: one filtered view of ``df`` per row of
        ``ten_k_race_training_blocks``, in row order.
    """
    activity_start = df['Start Time']
    block_bounds = zip(
        ten_k_race_training_blocks['Training Block Start'],
        ten_k_race_training_blocks['Race Date'],
    )
    return [
        df[(activity_start >= block_start) & (activity_start < race_date)]
        for block_start, race_date in block_bounds
    ]

# creating a series of training block activities: [[block1_activities], [block2_activities], ...]
# Each element is the DataFrame of activities for one training block, in the
# same order as the rows of ten_k_race_training_blocks (the Series gets a default 0..n-1 index).
ten_k_race_training_block_activities = pd.Series(get_training_block_activities())

# further reduce data by summing up activities on a per-week basis
def generate_training_weeks():
    """Aggregate every training block into weekly totals.

    Reads the module-level ``ten_k_race_training_block_activities`` (one
    DataFrame of activities per training block). For each block the columns
    listed in ``columns_to_drop`` are removed, 'Start Time' (UTC epoch seconds)
    is turned into a UTC datetime, the rows are grouped into weekly bins
    (pandas ``'W'`` frequency) and every remaining column is summed per bin.

    The input frames are not modified: all work happens on the new frame
    returned by ``drop``, so the function can be called repeatedly and the
    source Series keeps its original columns and numeric 'Start Time'.

    Returns:
        list: one DataFrame per training block, in input order, with one row
        per weekly bin. 'Start Time' holds the bin label converted back to
        UTC epoch seconds; the other columns hold the weekly sums.
    """
    columns_to_drop = ['Activity Type', 'Event Type', 'Average Speed (km/h)', 'Max. Speed (km/h)',
                       'Max. Heart Rate (bpm)', 'Average Heart Rate (bpm)', 'Avg. Temp (°C)']

    training_block_weeks_list = []

    for _, training_block in ten_k_race_training_block_activities.items():
        # drop() hands back a new frame, so the assignments below never touch the caller's data
        weekly_input = training_block.drop(columns=columns_to_drop)
        # the weekly Grouper needs datetime values, not raw epoch seconds
        weekly_input['Start Time'] = weekly_input['Start Time'].apply(
            lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc)
        )

        training_block_weeks = (
            weekly_input
            .groupby(pd.Grouper(key='Start Time', freq='W'))
            .sum()
            .reset_index()
        )
        # back to epoch seconds so every column is numeric again
        training_block_weeks['Start Time'] = training_block_weeks['Start Time'].apply(
            lambda dt: dt.replace(tzinfo=timezone.utc).timestamp()
        )

        training_block_weeks_list.append(training_block_weeks)

    return training_block_weeks_list

# Creating a series of training block weeks
# Each element is the DataFrame of weekly sums for one training block, in the
# order returned by generate_training_weeks() (default 0..n-1 index).
ten_k_race_training_block_weeks = pd.Series(generate_training_weeks())

In [486]:
from sklearn import preprocessing

# normalize and scale input data to resolve big differences in feature magnitudes
# i.e. Start Time is a big number (> 1 billion), event type is a very small number (0..2)

scaler = preprocessing.StandardScaler()


def _standardize_block(block):
    """Fit the shared scaler on one training block and return the scaled values.

    The scaler is refitted on every call, so each block is scaled using only
    its own rows. The result is wrapped in a plain DataFrame, which is why the
    displayed output below has positional column labels instead of the
    original column names.
    """
    return pd.DataFrame(scaler.fit_transform(block))


ten_k_race_training_block_weeks = ten_k_race_training_block_weeks.apply(_standardize_block)

ten_k_race_training_block_weeks.iloc[0]

Unnamed: 0,0,1,2,3,4,5,6
0,-1.60357,0.96016,0.96094,-1.12584,0.98232,-0.12852,-1.42237
1,-1.33631,-0.91817,-1.40761,-1.36754,-1.09034,-1.38159,-1.33597
2,-1.06904,0.32667,0.59413,1.23868,0.53529,1.09998,1.86105
3,-0.80178,0.86432,0.98922,0.92103,0.8813,1.17369,1.34262
4,-0.53452,1.4401,1.82336,1.534,1.37631,1.05084,-0.47191
5,-0.26726,-0.94088,-0.83491,-0.38162,-0.66604,0.19089,0.21934
6,0.0,-0.61192,-0.32062,0.16417,-0.99352,-0.91476,-1.42237
7,0.26726,-1.98303,-1.54282,-1.14351,-2.14855,-2.16784,0.39215
8,0.53452,0.92917,0.21625,0.18612,0.73229,0.53487,0.56496
9,0.80178,-1.2147,-1.20341,-1.54868,-0.80831,-0.37422,0.04653


In [487]:
import numpy as np

# one (weeks x features) array per race, taken positionally so that entry k of
# ten_k_race_training_block_weeks pairs with row k of ten_k_races
X = [
    np.array(ten_k_race_training_block_weeks.iloc[position])
    for position in range(len(ten_k_races))
]

# the longest training block decides how many week-rows every input gets
max_input_size = max((block.shape[0] for block in X), default=0)

# create reference shape, second dimension is constant
input_shape = (max_input_size, X[0].shape[1])


def _pad_to_input_shape(block):
    """Copy ``block`` into the top-left corner of a zero array of ``input_shape``."""
    padded = np.zeros(input_shape)
    padded[:block.shape[0], :block.shape[1]] = block
    return padded


# pad each input with 0 rows to ensure each input has a consistent shape,
# then flatten every padded block into a single feature row
X = np.array([_pad_to_input_shape(block) for block in X])
X = X.reshape(X.shape[0], -1)

# solve for the duration
y = np.array(ten_k_races['Duration (s)'])

In [488]:
# Hold out the last two races for testing and fit on everything before them.
# NOTE(review): the frame displayed earlier lists the most recent activities
# first, so the last two rows appear to be the oldest races - confirm that this
# is the intended hold-out.
fit_rows = slice(None, -2)
holdout_rows = slice(-2, None)

train_training_data = X[fit_rows]
test_training_data = X[holdout_rows]
train_racing_data = y[fit_rows]
test_racing_data = y[holdout_rows]

print(train_training_data.shape, test_training_data.shape)
print(train_racing_data.shape, test_racing_data.shape)

(8, 98) (2, 98)
(8,) (2,)


In [489]:
from sklearn.linear_model import LinearRegression

# Plain linear regression (with an intercept term) from the flattened weekly
# training features to the race duration in seconds.
model_lr = LinearRegression(fit_intercept=True)

# NOTE(review): the shapes printed by the split cell are (8, 98) for the
# training inputs, i.e. far fewer rows than features. A least-squares fit is
# underdetermined in that regime and can match the training rows exactly, so
# treat the held-out scores with caution.
model_lr.fit(train_training_data, train_racing_data)

In [490]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted model on the held-out races
predictions = model_lr.predict(test_training_data)

print(f"predictions: {predictions}, actual race times: {test_racing_data}")

# Every metric is called as metric(actual, predicted).
# NOTE(review): R2 divides by the spread of the actual values; with only two
# held-out races whose times are almost equal that spread is tiny, so R2 can
# become a very large negative number even for moderate absolute errors.
mse, mae, r2 = (
    metric(test_racing_data, predictions)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

predictions: [2171.98755995 2244.781326  ], actual race times: [2339. 2328.]
Mean Squared Error (MSE): 17409.251417173306
Mean Absolute Error (MAE): 125.1155570248277
R-squared (R2): -574.5124435429192
