In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('../data/cleaned_data.csv')

In [4]:
df['Rtg.'] = pd.to_numeric(df['Rtg.'], errors = 'coerce')

# Utility function

In [5]:
def reduce_cardinality(series, min_freq = 10):
    counts = series.value_counts()
    rare = counts[counts < min_freq].index
    return series.apply(lambda x: 'unknown' if x in rare else x)

In [6]:
def encode_placing(group):
    finish_mask = group['Pla.'].astype(str).str.isdigit()
    max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
    return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

# Approach 1 (training with all variables)

In [7]:
df1 = df.copy()

In [8]:
df1 = df1[~df1['Pla.'].isin(['UR', 'FE', 'TNP', 'PU', 'DNF', 'DISQ'])]

In [9]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt', 'Declar.Horse Wt.'
]

In [10]:
# missingness indicator for 'age'
df1['age_missing'] = df1['age'].isnull().astype(int)

# impute age with median
age_median = df1['age'].median()
df1['age'] = df1['age'].fillna(age_median)

In [11]:
# fill missing categoricals with 'unknown'
for col in categorical_cols:
    df1[col] = df1[col].fillna('unknown')

# cardinality of the categories being reduced
for col in categorical_cols:
    df1[col] = reduce_cardinality(df1[col])

for col in categorical_cols:
    df1[col] = df1[col].astype(str)

cardinality in Trainer, Jockey, Sire, Dam, Dam sire, and course reduced 

In [12]:
numerical_cols_updated = ['age_missing']

for col in numerical_cols_updated:
    df1[col] = pd.to_numeric(df1[col], errors = 'coerce').fillna(0)

In [13]:
# define the features, target and groups
X = df1[categorical_cols + numerical_cols_updated]
y = pd.to_numeric(df1[target], errors = 'coerce') # ensure target is int
groups = df1[group_col]

In [14]:
# split by race index for train and test sets
unique_races = groups.unique()
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

train_mask = groups.isin(train_races)
test_mask = groups.isin(test_races)

X_train, y_train, group_train = X[train_mask], y[train_mask], groups[train_mask]
X_test, y_test, group_test = X[test_mask], y[test_mask], groups[test_mask]

In [15]:
# sort by group_id for requirements
train_sorted_idx = group_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
group_train = group_train.iloc[train_sorted_idx]

test_sorted_idx = group_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
group_test = group_test.iloc[test_sorted_idx]

In [16]:
train_pool = Pool(
    data = X_train,
    label = y_train,
    group_id = group_train,
    cat_features = categorical_cols
)

test_pool = Pool(
    data = X_test,
    label = y_test,
    group_id = group_test,
    cat_features = categorical_cols
)

In [17]:
# Initialize and train CatBoost ranking model
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.02,
    depth=3,
    loss_function='YetiRank',
    eval_metric='NDCG',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100
)

In [18]:
model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8461919	best: 0.8461919 (0)	total: 81.7ms	remaining: 1m 21s
100:	test: 0.9025298	best: 0.9026374 (98)	total: 2.06s	remaining: 18.3s
200:	test: 0.9066555	best: 0.9066555 (200)	total: 4.14s	remaining: 16.4s
300:	test: 0.9092138	best: 0.9092199 (299)	total: 6.26s	remaining: 14.5s
400:	test: 0.9105801	best: 0.9105969 (392)	total: 8.39s	remaining: 12.5s
500:	test: 0.9108325	best: 0.9109858 (491)	total: 10.5s	remaining: 10.5s
600:	test: 0.9112805	best: 0.9113739 (588)	total: 12.6s	remaining: 8.38s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.911504219
bestIteration = 607

Shrink model to first 608 iterations.


<catboost.core.CatBoostRanker at 0x112900ec0>

In [19]:
test_preds = model.predict(X_test)

# Compute predicted ranks within each group (race)
test_results = X_test.copy()
test_results['true_pla'] = y_test
test_results['pred_score'] = test_preds
test_results['race_index'] = group_test

test_results['pred_rank'] = test_results.groupby('race_index')['pred_score'].rank(ascending=False, method='min')

In [20]:
# Evaluate using Spearman rank correlation per race
def race_spearman(group):
    if len(group) <= 1:
        return np.nan
    return spearmanr(group['true_pla'], group['pred_rank']).correlation

spearman_scores = test_results.groupby('race_index').apply(race_spearman)
mean_spearman = spearman_scores.dropna().mean()
print(f'Mean Spearman rank correlation on test races: {mean_spearman:.4f}')

Mean Spearman rank correlation on test races: -0.4465


  spearman_scores = test_results.groupby('race_index').apply(race_spearman)


# Updated approach (include unfinished horses)

In [49]:
df2 = df.copy()

In [50]:
# encode placing including unfinished horses with large numbers
def encode_placing(group):  
    finish_mask = group['Pla.'].astype(str).str.isdigit()
    max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
    return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

In [51]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.'
]

In [52]:
# Apply encoding for places (including unfinished horses)
df2 [target] = encode_placing(df)

In [53]:
# train test split
unique_races = df2[group_col].unique()

# split the race id for masking
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

In [54]:
# define mask of trianing and testing
train_mask = df2[group_col].isin(train_races)
test_mask = df2[group_col].isin(test_races)

# training and testing set
df2_train = df2.loc[train_mask].copy()
df2_test = df2.loc[test_mask].copy()

In [55]:
# median imputer for numerical variables
num_imputer = SimpleImputer(strategy = 'median')
df2_train[numerical_cols] = num_imputer.fit_transform(df2_train[numerical_cols])
df2_test[numerical_cols] = num_imputer.transform(df2_test[numerical_cols])

In [56]:
# fill missing with 'unknown' for categorical
for col in categorical_cols:
    df2_train[col] = df2_train[col].astype(str).fillna('unknown')
    df2_test[col] = df2_test[col].astype(str).fillna('unknown')

In [57]:
# defind the x and y for train and test
X_train = df2_train[categorical_cols + numerical_cols]
y_train = df2_train[target]
groups_train = df2_train[group_col]

X_test = df2_test[categorical_cols + numerical_cols]
y_test = df2_test[target]
groups_test = df2_test[group_col]

# sort the data according to group col
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]


train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [66]:
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.05,
    depth=5,
    loss_function='YetiRank',
    eval_metric='NDCG:top=4',
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.7476582	best: 0.7476582 (0)	total: 38.3ms	remaining: 38.3s
100:	test: 0.8186548	best: 0.8190583 (93)	total: 2.84s	remaining: 25.3s
200:	test: 0.8226846	best: 0.8228631 (181)	total: 5.57s	remaining: 22.2s
300:	test: 0.8227460	best: 0.8235735 (280)	total: 8.38s	remaining: 19.5s
400:	test: 0.8245765	best: 0.8245765 (400)	total: 11.1s	remaining: 16.6s
500:	test: 0.8241866	best: 0.8245846 (401)	total: 13.9s	remaining: 13.8s
600:	test: 0.8245139	best: 0.8253808 (567)	total: 16.7s	remaining: 11.1s
700:	test: 0.8266187	best: 0.8271406 (696)	total: 19.4s	remaining: 8.29s
800:	test: 0.8261030	best: 0.8271406 (696)	total: 22.2s	remaining: 5.52s
900:	test: 0.8272253	best: 0.8274233 (888)	total: 25s	remaining: 2.75s
999:	test: 0.8269414	best: 0.8276140 (910)	total: 27.8s	remaining: 0us

bestTest = 0.8276140152
bestIteration = 910

Shrink model to first 911 iterations.


<catboost.core.CatBoostRanker at 0x114a699d0>

In [67]:
y_pred = model.predict(X_test)

In [68]:
# You can use ranking metrics like NDCG, MAP, etc.
from sklearn.metrics import ndcg_score

# Evaluate NDCG Score
ndcg = ndcg_score([y_test], [y_pred], k=4)
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.7901939816695625


In [69]:
print(groups_test.unique()[:5])

[1080924 2100923 2110922 3080924 4080924]


In [75]:
# Choose the race you want to inspect
specific_race = 4080924  # replace with an actual race_index value from groups_test

# Filter for the specific race in test set
race_mask = (groups_test == specific_race)

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask]
race_data = X_test[race_mask]

# Combine into a DataFrame for easy comparison
import pandas as pd

comparison_df = pd.DataFrame({
    'True_Rank': race_true.values,
    'Predicted_Score': race_pred
})

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Predicted_Score', ascending=False)

# Sort by true rank (ascending: 1 is best rank)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Rank')

print("Sorted by Predicted Ranking Scores:")
print(comparison_df_sorted_by_pred)

print("\nSorted by True Ranking:")
print(comparison_df_sorted_by_true)


Sorted by Predicted Ranking Scores:
    True_Rank  Predicted_Score
7          12         3.009545
0           3         0.915008
1           7         0.856832
6          11         0.846419
9          10         0.831640
2           6         0.525494
11          9         0.216802
5           2        -0.352640
10          8        -0.535012
3           4        -0.787465
4           5        -0.936605
8           1        -2.610698

Sorted by True Ranking:
    True_Rank  Predicted_Score
8           1        -2.610698
5           2        -0.352640
0           3         0.915008
3           4        -0.787465
4           5        -0.936605
2           6         0.525494
1           7         0.856832
10          8        -0.535012
11          9         0.216802
9          10         0.831640
6          11         0.846419
7          12         3.009545


In [79]:
# Choose the race you want to inspect
specific_race = 2100923  # replace with an actual race_index value from groups_test

# Filter for the specific race in test set
race_mask = (groups_test == specific_race)

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask]
race_data = X_test[race_mask]

# Combine into a DataFrame for easy comparison
import pandas as pd

comparison_df = pd.DataFrame({
    'True_Rank': race_true.values,
    'Predicted_Score': race_pred
})

# If you observe predicted scores are inversely related to true ranks, invert them for intuitive comparison
comparison_df['Adjusted_Predicted_Score'] = -comparison_df['Predicted_Score']

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Adjusted_Predicted_Score', ascending=False)

# Sort by true rank (ascending: 1 is best rank)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Rank')

print("Sorted by Predicted Ranking Scores (Adjusted):")
print(comparison_df_sorted_by_pred)

print("\nSorted by True Ranking:")
print(comparison_df_sorted_by_true)


Sorted by Predicted Ranking Scores (Adjusted):
   True_Rank  Predicted_Score  Adjusted_Predicted_Score
6          9        -1.757405                  1.757405
7          1        -0.887398                  0.887398
4          2        -0.828287                  0.828287
5          3        -0.277257                  0.277257
2          5        -0.261049                  0.261049
0          6         0.465285                 -0.465285
8          7         0.642855                 -0.642855
3          4         0.727797                 -0.727797
1          8         1.740684                 -1.740684

Sorted by True Ranking:
   True_Rank  Predicted_Score  Adjusted_Predicted_Score
7          1        -0.887398                  0.887398
4          2        -0.828287                  0.828287
5          3        -0.277257                  0.277257
3          4         0.727797                 -0.727797
2          5        -0.261049                  0.261049
0          6         0.465285   

In [80]:
import numpy as np

correlation = np.corrcoef(y_pred, y_test)[0, 1]
print(f"Correlation between predicted scores and true ranks: {correlation:.3f}")


Correlation between predicted scores and true ranks: 0.565
