In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('../data/cleaned_data.csv')

In [4]:
df.head()

Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB
0,11,1000,G,5,9,37,K L Man,Y L Chung,154.0,129,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,142301024,HV,Turf,C+3,AUS,,Chestnut,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,12,1200,G,5,2,39,K L Man,E C W Wong,135.0,126,1023,J006,PPG,Capitalist,Dorodansa,Bellamy Road,48250924,HV,Turf,C,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False
2,10,1200,GF,4,6,44,T P Yung,H T Mo,205.0,117,1013,J006,PPG,Capitalist,Dorodansa,Bellamy Road,661110524,ST,Turf,C,AUS,,Chestnut,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
3,14,1600,G,4,12,47,T P Yung,C L Chau,221.0,121,1029,J006,PPG,Capitalist,Dorodansa,Bellamy Road,468030324,ST,Turf,B+2,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,12,1200,G,4,2,50,T P Yung,K C Leung,67.0,128,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,403070224,HV,Turf,B,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


# Utility function

In [69]:
def reduce_cardinality(series, min_freq = 10):
    counts = series.value_counts()
    rare = counts[counts < min_freq].index
    return series.apply(lambda x: 'unknown' if x in rare else x)

In [24]:
def encode_placing(group):
    finish_mask = group['Pla.'].astype(str).str.isdigit()
    max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
    return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

# Approach 1 (training with all variables)

In [25]:
df1 = df.copy()

In [26]:
df1 = df1[~df1['Pla.'].isin(['UR', 'FE', 'TNP', 'PU', 'DNF', 'DISQ'])]

In [27]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt', 'Declar.Horse Wt.'
]

In [28]:
# missingness indicator for 'age'
df1['age_missing'] = df1['age'].isnull().astype(int)

# impute age with median
age_median = df1['age'].median()
df1['age'] = df1['age'].fillna(age_median)

In [29]:
# fill missing categoricals with 'unknown'
for col in categorical_cols:
    df1[col] = df1[col].fillna('unknown')

# cardinality of the categories being reduced
for col in categorical_cols:
    df1[col] = reduce_cardinality(df1[col])

for col in categorical_cols:
    df1[col] = df1[col].astype(str)

cardinality in Trainer, Jockey, Sire, Dam, Dam sire, and course reduced 

In [30]:
numerical_cols_updated = ['age_missing']

for col in numerical_cols_updated:
    df1[col] = pd.to_numeric(df1[col], errors = 'coerce').fillna(0)

In [31]:
# define the features, target and groups
X = df1[categorical_cols + numerical_cols_updated]
y = pd.to_numeric(df1[target], errors = 'coerce') # ensure target is int
groups = df1[group_col]

In [32]:
# split by race index for train and test sets
unique_races = groups.unique()
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

train_mask = groups.isin(train_races)
test_mask = groups.isin(test_races)

X_train, y_train, group_train = X[train_mask], y[train_mask], groups[train_mask]
X_test, y_test, group_test = X[test_mask], y[test_mask], groups[test_mask]

In [33]:
# sort by group_id for requirements
train_sorted_idx = group_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
group_train = group_train.iloc[train_sorted_idx]

test_sorted_idx = group_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
group_test = group_test.iloc[test_sorted_idx]

In [34]:
train_pool = Pool(
    data = X_train,
    label = y_train,
    group_id = group_train,
    cat_features = categorical_cols
)

test_pool = Pool(
    data = X_test,
    label = y_test,
    group_id = group_test,
    cat_features = categorical_cols
)

In [35]:
# Initialize and train CatBoost ranking model
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.02,
    depth=3,
    loss_function='YetiRank',
    eval_metric='NDCG',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100
)

In [36]:
model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8082862	best: 0.8082862 (0)	total: 20.7ms	remaining: 20.7s
100:	test: 0.8988427	best: 0.8993684 (97)	total: 952ms	remaining: 8.47s
200:	test: 0.9041507	best: 0.9041507 (200)	total: 2.01s	remaining: 7.98s
300:	test: 0.9080305	best: 0.9080305 (300)	total: 2.99s	remaining: 6.94s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9088402074
bestIteration = 316

Shrink model to first 317 iterations.


<catboost.core.CatBoostRanker at 0x157c6ead0>

In [None]:
test_preds = model.predict(X_test)

# Compute predicted ranks within each group (race)
test_results = X_test.copy()
test_results['true_pla'] = y_test
test_results['pred_score'] = test_preds
test_results['race_index'] = group_test

test_results['pred_rank'] = test_results.groupby('race_index')['pred_score'].rank(ascending=False, method='min')

In [39]:
# Evaluate using Spearman rank correlation per race
def race_spearman(group):
    if len(group) <= 1:
        return np.nan
    return spearmanr(group['true_pla'], group['pred_rank']).correlation

spearman_scores = test_results.groupby('race_index').apply(race_spearman)
mean_spearman = spearman_scores.dropna().mean()
print(f'Mean Spearman rank correlation on test races: {mean_spearman:.4f}')

Mean Spearman rank correlation on test races: -0.4355


  spearman_scores = test_results.groupby('race_index').apply(race_spearman)


# Approach 2 (handling non-finishers)

In [86]:
df1 = df.copy()

In [87]:
# define target and groupings
df1['Pla_encoded'] = df1.groupby('race_index', group_keys = False).apply(encode_placing)
target = 'Pla_encoded'

group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt', 'Declar.Horse Wt.'
]

  df1['Pla_encoded'] = df1.groupby('race_index', group_keys = False).apply(encode_placing)


In [88]:
df1

Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB,Pla_encoded
0,11,1000,G,5,9,37,K L Man,Y L Chung,154.0,129,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,142301024,HV,Turf,C+3,AUS,,Chestnut,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,11
1,12,1200,G,5,2,39,K L Man,E C W Wong,135.0,126,1023,J006,PPG,Capitalist,Dorodansa,Bellamy Road,48250924,HV,Turf,C,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,12
2,10,1200,GF,4,6,44,T P Yung,H T Mo,205.0,117,1013,J006,PPG,Capitalist,Dorodansa,Bellamy Road,661110524,ST,Turf,C,AUS,,Chestnut,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,10
3,14,1600,G,4,12,47,T P Yung,C L Chau,221.0,121,1029,J006,PPG,Capitalist,Dorodansa,Bellamy Road,468030324,ST,Turf,B+2,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,14
4,12,1200,G,4,2,50,T P Yung,K C Leung,67.0,128,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,403070224,HV,Turf,B,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20139,03,1400,G,3,11,68,C Fownes,C Y Ho,3.7,123,1152,H459,PP,Impending,Isola Blu,Blackfriars,278231223,ST,Turf,C,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,3
20140,02,1200,G,3,10,68,C Fownes,C Y Ho,13.0,127,1153,H459,PP,Impending,Isola Blu,Blackfriars,240101223,ST,Turf,A,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2
20141,02,1200,GF,3,6,68,C Fownes,C Y Ho,16.0,124,1155,H459,PP,Impending,Isola Blu,Blackfriars,185191123,ST,Turf,B+2,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2
20142,06,1200,G,4,11,53,M Newnham,Z Purton,2.0,128,1127,K334,PPG,Street Boss,Varanasi,Encosta de Lago,801010725,ST,Turf,C,AUS,4.0,Grey,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,6


In [89]:
# missingness indicator for 'age'
df1['age_missing'] = df1['age'].isnull().astype(int)

# impute age with median
age_median = df1['age'].median()
df1['age'] = df1['age'].fillna(age_median)

In [90]:
# fill missing categoricals with 'unknown'
for col in categorical_cols:
    df1[col] = df1[col].fillna('unknown')

# cardinality of the categories being reduced
for col in categorical_cols:
    df1[col] = reduce_cardinality(df1[col])

for col in categorical_cols:
    df1[col] = df1[col].astype(str)

In [91]:
# Update numerical columns used (including age_missing)
numerical_cols_updated = ['age_missing']

for col in numerical_cols_updated:
    df1[col] = pd.to_numeric(df1[col], errors='coerce').fillna(0)

# Define features, target and groups
X = df1[categorical_cols + numerical_cols_updated]
y = df1[target]
groups = df1['race_index']

In [92]:
unique_races = groups.unique()
train_races, test_races = train_test_split(unique_races, test_size=0.2, random_state=42)

train_mask = groups.isin(train_races)
test_mask = groups.isin(test_races)

X_train, y_train, group_train = X[train_mask], y[train_mask], groups[train_mask]
X_test, y_test, group_test = X[test_mask], y[test_mask], groups[test_mask]

# Sort by group_id for CatBoost requirement
train_sorted_idx = group_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
group_train = group_train.iloc[train_sorted_idx]

test_sorted_idx = group_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
group_test = group_test.iloc[test_sorted_idx]

In [93]:
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=group_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=group_test,
    cat_features=categorical_cols
)

# Initialize and train CatBoost ranking model
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.02,
    depth=3,
    loss_function='YetiRank',
    eval_metric='NDCG',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8319435	best: 0.8319435 (0)	total: 12.8ms	remaining: 12.7s
100:	test: 0.9042307	best: 0.9042619 (99)	total: 967ms	remaining: 8.61s
200:	test: 0.9091864	best: 0.9091864 (200)	total: 1.99s	remaining: 7.89s
300:	test: 0.9124712	best: 0.9125766 (297)	total: 2.96s	remaining: 6.86s
400:	test: 0.9128133	best: 0.9134384 (368)	total: 3.92s	remaining: 5.86s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9134383854
bestIteration = 368

Shrink model to first 369 iterations.


<catboost.core.CatBoostRanker at 0x157f07d10>

In [94]:
test_preds = model.predict(X_test)

# Compute predicted ranks within each group (race)
test_results = X_test.copy()
test_results['true_pla'] = y_test
test_results['pred_score'] = test_preds
test_results['race_index'] = group_test

test_results['pred_rank'] = test_results.groupby('race_index')['pred_score'].rank(ascending=False, method='min')

In [96]:
max_pla = test_results['true_pla'].max()
test_results['true_pla_inverted'] = max_pla - test_results['true_pla'] + 1

def race_spearman_inverted(group):
    if len(group) <= 1:
        return np.nan
    return spearmanr(group['true_pla_inverted'], group['pred_rank']).correlation

spearman_scores = test_results.groupby('race_index').apply(race_spearman_inverted)
mean_spearman = spearman_scores.dropna().mean()
print(f'Mean Spearman rank correlation on test races (inverted true_pla): {mean_spearman:.4f}')


Mean Spearman rank correlation on test races (inverted true_pla): 0.4516


  spearman_scores = test_results.groupby('race_index').apply(race_spearman_inverted)


In [80]:
max_pla = test_results['true_pla'].max()

In [51]:
max_pla

np.int64(14)

In [52]:
test_results['true_pla_inverted'] = max_pla - test_results['true_pla'] + 1

In [53]:
test_results

Unnamed: 0,Dist.,track_condition,RaceClass,gate_position,Trainer,Jockey,Import type,Sire,Dam,Dam sire,rc,track,course,origin,age,colour,sex,age_missing,true_pla,pred_score,race_index,pred_rank,true_pla_inverted
11397,1400,G,2,7,K W Lui,Z Purton,PP,Nicconi,Jolie Joy,Kendargent,ST,Turf,A,AUS,7.0,Chestnut,Gelding,0,2,-0.465721,7080924,5.0,13
12398,1400,G,2,4,A S Cruz,M Chadwick,PP,Acclamation,Up In Time,Noverre,ST,Turf,A,IRE,5.0,Bay,Gelding,0,7,-0.045584,7080924,2.0,8
4259,1400,G,2,8,P C Ng,Y L Chung,PP,Frankel,Janey Muddles,Lawman,ST,Turf,A,GB,6.0,Bay,Gelding,0,8,-0.428069,7080924,4.0,7
4453,1400,G,2,5,C Fownes,C Y Ho,PPG,Turn Me Loose,Lovingthelimelight,Lemon Drop Kid,ST,Turf,A,AUS,7.0,Chestnut,Gelding,0,3,-1.247153,7080924,8.0,12
19408,1400,G,2,2,F C Lor,K Teetan,PP,All Too Hard,Urban Rocker,Fastnet Rock,ST,Turf,A,AUS,7.0,Bay,Gelding,0,4,-0.267889,7080924,3.0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1747,1200,GF,2,4,M Newnham,M F Poon,PPG,Va Pensiero,Ready To Impress,Orpen,HV,Turf,B,AUS,5.0,Bay,Gelding,1,12,-0.234406,846160725,7.0,3
1722,1200,GF,2,1,T P Yung,M Chadwick,PP,Justify,Fine Scent,All Too Hard,HV,Turf,B,AUS,5.0,Bay,Gelding,0,6,-0.073074,846160725,5.0,9
18645,1200,GF,2,3,J Size,H Bowman,PPG,Per Incanto,Perfect Beat,Magnus,HV,Turf,B,NZ,6.0,Bay,Gelding,0,7,-1.608375,846160725,12.0,8
12972,1200,GF,2,11,K W Lui,K De Melo,PPG,Myboycharlie,Isador Amore,Hidden Dragon,HV,Turf,B,AUS,7.0,Bay,Gelding,0,8,0.103046,846160725,3.0,7
