In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the four raw CSV exports in one pass. rider_season_data is read here but is
# not referenced again in the cells visible in this notebook.
leaderboard_data, racecourse_data, rider_season_data, rider_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'leaderboard_data.csv',
        'race_course_data.csv',
        'rider_season_data.csv',
        'rider_data.csv',
    )
)

# # changing time to gaps in seconds
# def convert_to_seconds(time):
#     time = time.split(':')
#     if len(time) < 2:
#         return -1
#     elif len(time) > 2:
#         return 0
#     return int(time[0])*60 + int(time[1])

# leaderboard_data['time'] = leaderboard_data['time'].apply(convert_to_seconds)

# Strip 'date', 'year', 'name' and 'stage' from the course table before joining
# (presumably because the leaderboard rows already carry them -- TODO confirm).
racecourse_data = racecourse_data.drop(['date', 'year', 'name', 'stage'], axis='columns')

# Attach the course attributes to every leaderboard row. Inner join on 'url';
# validate='many_to_one' makes pandas raise if a url is duplicated in the course table.
leaderboard_course_merged = leaderboard_data.merge(
    racecourse_data,
    how='inner',
    on='url',
    validate='many_to_one',
)

# Give the rider table's generic column names a rider-specific meaning.
rider_data = rider_data.rename(columns={'url': 'rider_url', 'name': 'rider_name'})

def standardize_name(name):
    """Return the whitespace-separated tokens of ``name`` sorted and joined by single spaces.

    Sorting makes the result independent of token order, so 'tadej pogacar' and
    'pogacar tadej' both become 'pogacar tadej'. Runs of whitespace collapse to one space.
    """
    return ' '.join(sorted(name.split()))

# Build a lower-cased, token-order-insensitive join key on both tables.
rider_data['merge_name'] = (
    rider_data['rider_name'].str.lower().apply(standardize_name)
)
leaderboard_course_merged['merge_name'] = (
    leaderboard_course_merged['rider'].str.lower().apply(standardize_name)
)

# Inner-join the rider attributes onto each result row via the normalised name, then
# discard the raw leaderboard rider string and the temporary key.
# validate='many_to_one' makes pandas raise if two riders share the same merge_name.
leaderboard_course_rider_merged = (
    leaderboard_course_merged
    .merge(rider_data, how='inner', on='merge_name', validate='many_to_one')
    .drop(columns=['rider', 'merge_name'])
)

In [3]:
# Work on an independent deep copy so the merged frame from the previous cell stays
# untouched while its text columns are converted to numbers below.
merged_data = leaderboard_course_rider_merged.copy(deep=True)

# change rank to a number
def convert_rank(rank):
    """Convert a finishing position to ``int``; return 0 when it is not numeric.

    Anything ``int()`` cannot convert (a text status, ``None``, NaN, infinity) maps to 0.
    NOTE(review): 0 is numerically smaller than rank 1, so downstream code should treat
    it as a "no numeric rank" sentinel rather than as a finishing position.
    """
    try:
        return int(rank)
    except (TypeError, ValueError, OverflowError):
        # Swallow conversion failures only; a bare ``except`` would also hide
        # KeyboardInterrupt and SystemExit raised while the cell is running.
        return 0
    
# Element-wise conversion of the raw rank column into integers.
merged_data['rank'] = leaderboard_course_rider_merged['rank'].map(convert_rank)

# change distance to a number
def convert_distance(distance):
    """Parse a distance such as ``'184.5 km'`` into a float (its first whitespace token).

    Non-string values (an already-numeric cell, or the float NaN pandas uses for a
    missing cell) are returned unchanged instead of raising AttributeError, matching
    how the weight and height converters in this cell treat floats. A blank string
    yields NaN instead of raising IndexError.
    """
    if not isinstance(distance, str):
        return distance
    tokens = distance.split()
    return float(tokens[0]) if tokens else float('nan')
    
# Element-wise conversion of the raw distance column into floats.
merged_data['distance'] = leaderboard_course_rider_merged['distance'].map(convert_distance)

# change speed to a number
def convert_speed(speed):
    """Parse a speed such as ``'42.5 km/h'`` into a float; return ``None`` when missing.

    A string needs at least two whitespace tokens (value and unit); anything shorter is
    treated as missing and yields ``None``. ``None`` and float NaN (a missing CSV cell)
    also yield ``None`` instead of raising AttributeError, and an already-numeric value
    is returned as a float.
    """
    if speed is None or (isinstance(speed, float) and speed != speed):
        # ``speed != speed`` is only true for NaN.
        return None
    if not isinstance(speed, str):
        return float(speed)
    tokens = speed.split()
    if len(tokens) < 2:
        return None
    return float(tokens[0])

# Element-wise conversion of the raw speed column into floats (missing stays missing).
merged_data['speed'] = leaderboard_course_rider_merged['speed'].map(convert_speed)

#change weight to a number
def convert_weight(weight):
    """Return ``weight`` as a float.

    Floats (including NaN) pass through untouched; any other value is expected to be a
    string whose first whitespace-separated token is the number.
    """
    return weight if isinstance(weight, float) else float(weight.split()[0])

# Element-wise conversion of the raw weight column into floats.
merged_data['weight'] = leaderboard_course_rider_merged['weight'].map(convert_weight)

# change height to a number
def convert_height(height):
    """Return ``height`` as a float.

    Floats (including NaN) pass through untouched; any other value is expected to be a
    string whose first whitespace-separated token is the number.
    """
    return height if isinstance(height, float) else float(height.split()[0])

# Element-wise conversion of the raw height column into floats.
merged_data['height'] = leaderboard_course_rider_merged['height'].map(convert_height)

def merge_name_stage(row):
    """Return the row's 'name' and 'stage' strings joined by a single space."""
    return ' '.join((row['name'], row['stage']))

# change ranking to a number
def convert_ranking(ranking):
    """Convert a ranking value to ``int``; return 0 when it is not numeric.

    Anything ``int()`` cannot convert (text, ``None``, NaN, infinity) maps to 0.
    """
    try:
        return int(ranking)
    except (TypeError, ValueError, OverflowError):
        # Swallow conversion failures only; a bare ``except`` would also hide
        # KeyboardInterrupt and SystemExit raised while the cell is running.
        return 0
    
merged_data['ranking'] = leaderboard_course_rider_merged['ranking'].map(convert_ranking)

# Fold the stage label into the race name so that 'name' identifies a single stage,
# then drop the columns that are not used as features or targets further down.
merged_data['name'] = merged_data.apply(merge_name_stage, axis=1)
merged_data = merged_data.drop(
    ['rider_url', 'url', 'stage', 'time', 'won'],
    axis='columns',
)

In [11]:
# Column groups consumed by the preprocessing pipelines below.
race_numerical = ['distance', 'vertical_meters', 'speed', 'year', 'score', 'quality', 'ranking']
race_categorical = ['name']
rider_numerical = ['weight', 'height', 'one_day', 'gc', 'tt', 'sprint', 'climber', 'hills', 'age']
rider_categorical_low = ['speciality']
rider_categorical_high = ['nationality', 'team', 'rider_name']

# Temporal hold-out: seasons before 2024 train, the 2024 season tests.
# Rows whose year is later than 2024 or missing end up in neither split.
season = merged_data['year']
train_data = merged_data[season < 2024]
test_data = merged_data[season == 2024]

# Largest number of rows in any (name, year) group; every race is padded to this size.
max_riders_train, max_riders_test = (
    split.groupby(['name', 'year']).size().max()
    for split in (train_data, test_data)
)
max_riders = max(max_riders_train, max_riders_test)
print(f"Maximum number of riders: {max_riders}")

def _scaled_numeric_pipeline():
    """Return a fresh pipeline: mean-impute missing values, then min-max scale."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ])


def _encoded_categorical_pipeline(step_name, encoder):
    """Return a fresh pipeline: fill missing categories with 'Unknown', then apply ``encoder``."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        (step_name, encoder),
    ])


def _one_hot_encoder():
    """Return the dense one-hot encoder configuration shared by the low-cardinality columns."""
    return OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')


# One independent pipeline per feature group.
race_numeric_pipeline = _scaled_numeric_pipeline()
race_categorical_pipeline = _encoded_categorical_pipeline('onehot', _one_hot_encoder())
rider_numeric_pipeline = _scaled_numeric_pipeline()
rider_categorical_low_pipeline = _encoded_categorical_pipeline('onehot', _one_hot_encoder())
# High-cardinality columns get integer codes; categories unseen during fit encode as -1.
rider_categorical_high_pipeline = _encoded_categorical_pipeline(
    'ordinal',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Fit on the training split only, so the test split is transformed with train statistics.
for group_pipeline, group_columns in (
    (race_numeric_pipeline, race_numerical),
    (race_categorical_pipeline, race_categorical),
    (rider_numeric_pipeline, rider_numerical),
    (rider_categorical_low_pipeline, rider_categorical_low),
    (rider_categorical_high_pipeline, rider_categorical_high),
):
    group_pipeline.fit(train_data[group_columns])

def _pad_rows(rows, n_rows):
    """Zero-pad or truncate the 2-D array ``rows`` along axis 0 to exactly ``n_rows`` rows."""
    fixed = np.zeros((n_rows, rows.shape[1]), dtype=rows.dtype)
    kept = min(rows.shape[0], n_rows)
    fixed[:kept] = rows[:kept]
    return fixed


def _podium_target(ranks, n_slots, podium_size=3):
    """Build one race's target vector of length ``n_slots``.

    The first ``podium_size`` entries of ``ranks`` (taken in row order) receive a
    softmax over their negated values; every other slot stays 0. When a race has fewer
    than ``podium_size`` riders the softmax covers only the riders that exist, so no
    probability mass is written into padding slots and the assignment cannot fail on a
    shape mismatch.
    """
    target = np.zeros(n_slots)
    n_scored = min(podium_size, len(ranks), n_slots)
    if n_scored:
        weights = np.exp(-ranks[:n_scored])
        target[:n_scored] = weights / np.sum(weights)
    return target


def build_race_examples(data, max_riders):
    """Turn every (name, year) race in ``data`` into fixed-size model inputs.

    Uses the fitted module-level pipelines and feature-column lists defined earlier in
    this cell.

    Parameters:
        data: DataFrame of result rows (one row per rider per race).
        max_riders: number of rider slots every race is padded or truncated to.

    Returns:
        (races, targets, rider_names): three parallel lists with one entry per race --
        a (max_riders, n_features) feature matrix, a length-``max_riders`` target
        vector, and a length-``max_riders`` list of rider names padded with 'PAD'.

    A race whose processing raises is reported with ``print`` and skipped; nothing is
    appended for it, so the three lists always stay aligned.
    """
    races, targets, rider_names = [], [], []

    for (race_name, year), group in data.groupby(['name', 'year']):
        try:
            # Race-level features are constant within a race, so the first row is enough.
            race_features = np.hstack((
                race_numeric_pipeline.transform(group[race_numerical].iloc[[0]]),
                race_categorical_pipeline.transform(group[race_categorical].iloc[[0]]),
            ))

            # Rider-level features: one row per rider, in the group's row order.
            rider_features = np.hstack((
                rider_numeric_pipeline.transform(group[rider_numerical]),
                rider_categorical_low_pipeline.transform(group[rider_categorical_low]),
                rider_categorical_high_pipeline.transform(group[rider_categorical_high]),
            ))

            # Repeat the race features on every slot (padding slots included) and put
            # the zero-padded rider features next to them.
            feature_matrix = np.hstack((
                np.tile(race_features, (max_riders, 1)),
                _pad_rows(rider_features, max_riders),
            ))

            # NOTE(review): the target scores the first three rows of the group, which
            # presumes rows arrive in finishing order -- confirm against the leaderboard CSV.
            target = _podium_target(group['rank'].values, max_riders)

            riders = group['rider_name'].tolist()[:max_riders]
            riders = riders + ['PAD'] * (max_riders - len(riders))
        except Exception as e:
            print(f"Error processing race {race_name} {year}: {e}")
            continue

        races.append(feature_matrix)
        targets.append(target)
        rider_names.append(riders)

    return races, targets, rider_names


# Build the training examples, then the test examples with the same train-fitted pipelines.
races_train, targets_train, rider_names_train = build_race_examples(train_data, max_riders)
races_test, targets_test, rider_names_test = build_race_examples(test_data, max_riders)
# Re-derive the padded field size from the rider-name lists that were actually built.
# Every list was already padded to max_riders above, so this normally leaves the value
# unchanged. ``default`` keeps the current value instead of raising ValueError when no
# race was produced (for example an empty test split or every race failing).
max_riders = max(
    (len(riders) for riders in [*rider_names_train, *rider_names_test]),
    default=max_riders,
)

def pad_riders(rider_list, max_riders, pad_value='Unknown'):
    """Return ``rider_list`` padded with ``pad_value`` or truncated to ``max_riders`` entries.

    A new list is returned in both cases; the input list is not modified.
    """
    shortfall = max_riders - len(rider_list)
    if shortfall <= 0:
        return rider_list[:max_riders]
    return rider_list + [pad_value] * shortfall
    
# # Pad rider names to max_riders
# rider_names_train = [pad_riders(riders, max_riders) for riders in rider_names_train]
# rider_names_test = [pad_riders(riders, max_riders) for riders in rider_names_test]

# # Initialize lists for padded training data
# races_train_padded = []
# targets_train_padded = []

# for features, targets in zip(races_train, targets_train):
#     n_riders = features.shape[0]
#     if n_riders < max_riders:
#         pad_width = max_riders - n_riders
#         padded_features = np.pad(features, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
#         padded_targets = np.pad(targets, (0, pad_width), mode='constant', constant_values=0)
#     else:
#         padded_features = features[:max_riders, :]
#         padded_targets = targets[:max_riders]
#     races_train_padded.append(padded_features)
#     targets_train_padded.append(padded_targets)

# # Initialize lists for padded testing data
# races_test_padded = []
# targets_test_padded = []

# for features, targets in zip(races_test, targets_test):
#     n_riders = features.shape[0]
#     if n_riders < max_riders:
#         pad_width = max_riders - n_riders
#         padded_features = np.pad(features, ((0, pad_width), (0, 0)), mode='constant', constant_values=0)
#         padded_targets = np.pad(targets, (0, pad_width), mode='constant', constant_values=0)
#     else:
#         padded_features = features[:max_riders, :]
#         padded_targets = targets[:max_riders]
#     races_test_padded.append(padded_features)
#     targets_test_padded.append(padded_targets)

# # Find maximum number of features
# max_features = max(
#     races_train[0].shape[1] if races_train else 0,
#     races_test[0].shape[1] if races_test else 0
# )

# # Function to pad races, targets, and rider names
# def pad_sequence(sequence, max_length, padding_value=0):
#     sequence = list(sequence)
#     padding_needed = max_length - len(sequence)
#     if padding_needed > 0:
#         sequence.extend([padding_value] * padding_needed)
#     return sequence

# Stack the per-race entries into arrays. The rider-name arrays hold Python strings,
# hence dtype=object.
X_train, y_train = np.array(races_train), np.array(targets_train)
rider_names_train = np.array(rider_names_train, dtype=object)

X_test, y_test = np.array(races_test), np.array(targets_test)
rider_names_test = np.array(rider_names_test, dtype=object)

# Write each array to its .npy file in the current working directory.
arrays_by_filename = {
    'X_train.npy': X_train,
    'y_train.npy': y_train,
    'rider_names_train.npy': rider_names_train,
    'X_test.npy': X_test,
    'y_test.npy': y_test,
    'rider_names_test.npy': rider_names_test,
}
for npy_filename, npy_array in arrays_by_filename.items():
    np.save(npy_filename, npy_array)

print("Data preprocessing completed and saved.")

Maximum number of riders: 207
Data preprocessing completed and saved.
