In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr

In [6]:
pd.set_option('display.max_columns', None)

In [7]:
df = pd.read_csv('../data/cleaned_data.csv')

In [8]:
df.head()

Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB
0,11,1000,G,5,9,37,K L Man,Y L Chung,154.0,129,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,142301024,HV,Turf,C+3,AUS,,Chestnut,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,12,1200,G,5,2,39,K L Man,E C W Wong,135.0,126,1023,J006,PPG,Capitalist,Dorodansa,Bellamy Road,48250924,HV,Turf,C,AUS,,Chestnut,Gelding,False,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False
2,10,1200,GF,4,6,44,T P Yung,H T Mo,205.0,117,1013,J006,PPG,Capitalist,Dorodansa,Bellamy Road,661110524,ST,Turf,C,AUS,,Chestnut,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
3,14,1600,G,4,12,47,T P Yung,C L Chau,221.0,121,1029,J006,PPG,Capitalist,Dorodansa,Bellamy Road,468030324,ST,Turf,B+2,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
4,12,1200,G,4,2,50,T P Yung,K C Leung,67.0,128,1019,J006,PPG,Capitalist,Dorodansa,Bellamy Road,403070224,HV,Turf,B,AUS,,Chestnut,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


In [30]:
df['Rtg.'] = pd.to_numeric(df['Rtg.'], errors = 'coerce')

# Utility function

In [9]:
def reduce_cardinality(series, min_freq = 10):
    counts = series.value_counts()
    rare = counts[counts < min_freq].index
    return series.apply(lambda x: 'unknown' if x in rare else x)

In [10]:
def encode_placing(group):
    finish_mask = group['Pla.'].astype(str).str.isdigit()
    max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
    return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

# Approach 1 (training with all variables)

In [25]:
df1 = df.copy()

In [26]:
df1 = df1[~df1['Pla.'].isin(['UR', 'FE', 'TNP', 'PU', 'DNF', 'DISQ'])]

In [27]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt', 'Declar.Horse Wt.'
]

In [28]:
# missingness indicator for 'age'
df1['age_missing'] = df1['age'].isnull().astype(int)

# impute age with median
age_median = df1['age'].median()
df1['age'] = df1['age'].fillna(age_median)

In [29]:
# fill missing categoricals with 'unknown'
for col in categorical_cols:
    df1[col] = df1[col].fillna('unknown')

# cardinality of the categories being reduced
for col in categorical_cols:
    df1[col] = reduce_cardinality(df1[col])

for col in categorical_cols:
    df1[col] = df1[col].astype(str)

cardinality in Trainer, Jockey, Sire, Dam, Dam sire, and course reduced 

In [30]:
numerical_cols_updated = ['age_missing']

for col in numerical_cols_updated:
    df1[col] = pd.to_numeric(df1[col], errors = 'coerce').fillna(0)

In [31]:
# define the features, target and groups
X = df1[categorical_cols + numerical_cols_updated]
y = pd.to_numeric(df1[target], errors = 'coerce') # ensure target is int
groups = df1[group_col]

In [32]:
# split by race index for train and test sets
unique_races = groups.unique()
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

train_mask = groups.isin(train_races)
test_mask = groups.isin(test_races)

X_train, y_train, group_train = X[train_mask], y[train_mask], groups[train_mask]
X_test, y_test, group_test = X[test_mask], y[test_mask], groups[test_mask]

In [33]:
# sort by group_id for requirements
train_sorted_idx = group_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
group_train = group_train.iloc[train_sorted_idx]

test_sorted_idx = group_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
group_test = group_test.iloc[test_sorted_idx]

In [34]:
train_pool = Pool(
    data = X_train,
    label = y_train,
    group_id = group_train,
    cat_features = categorical_cols
)

test_pool = Pool(
    data = X_test,
    label = y_test,
    group_id = group_test,
    cat_features = categorical_cols
)

In [35]:
# Initialize and train CatBoost ranking model
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.02,
    depth=3,
    loss_function='YetiRank',
    eval_metric='NDCG',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100
)

In [36]:
model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8082862	best: 0.8082862 (0)	total: 20.7ms	remaining: 20.7s
100:	test: 0.8988427	best: 0.8993684 (97)	total: 952ms	remaining: 8.47s
200:	test: 0.9041507	best: 0.9041507 (200)	total: 2.01s	remaining: 7.98s
300:	test: 0.9080305	best: 0.9080305 (300)	total: 2.99s	remaining: 6.94s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9088402074
bestIteration = 316

Shrink model to first 317 iterations.


<catboost.core.CatBoostRanker at 0x157c6ead0>

In [None]:
test_preds = model.predict(X_test)

# Compute predicted ranks within each group (race)
test_results = X_test.copy()
test_results['true_pla'] = y_test
test_results['pred_score'] = test_preds
test_results['race_index'] = group_test

test_results['pred_rank'] = test_results.groupby('race_index')['pred_score'].rank(ascending=False, method='min')

In [39]:
# Evaluate using Spearman rank correlation per race
def race_spearman(group):
    if len(group) <= 1:
        return np.nan
    return spearmanr(group['true_pla'], group['pred_rank']).correlation

spearman_scores = test_results.groupby('race_index').apply(race_spearman)
mean_spearman = spearman_scores.dropna().mean()
print(f'Mean Spearman rank correlation on test races: {mean_spearman:.4f}')

Mean Spearman rank correlation on test races: -0.4355


  spearman_scores = test_results.groupby('race_index').apply(race_spearman)


# Updated approach (include unfinished horses)

In [31]:
df2 = df.copy()

In [32]:
# encode placing including unfinished horses with large numbers
def encode_placing(group):  
    finish_mask = group['Pla.'].astype(str).str.isdigit()
    max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
    return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

In [33]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.'
]

In [34]:
# Apply encoding for places (including unfinished horses)
df2 [target] = encode_placing(df)

In [35]:
# train test split
unique_races = df2[group_col].unique()

# split the race id for masking
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

In [36]:
# define mask of trianing and testing
train_mask = df2[group_col].isin(train_races)
test_mask = df2[group_col].isin(test_races)

# training and testing set
df2_train = df2.loc[train_mask].copy()
df2_test = df2.loc[test_mask].copy()

In [38]:
# median imputer for numerical variables
num_imputer = SimpleImputer(strategy = 'median')
df2_train[numerical_cols] = num_imputer.fit_transform(df2_train[numerical_cols])
df2_test[numerical_cols] = num_imputer.transform(df2_test[numerical_cols])

In [39]:
# fill missing with 'unknown' for categorical
for col in categorical_cols:
    df2_train[col] = df2_train[col].astype(str).fillna('unknown')
    df2_test[col] = df2_test[col].astype(str).fillna('unknown')

In [41]:
# defind the x and y for train and test
X_train = df2_train[categorical_cols + numerical_cols]
y_train = df2_train[target]
groups_train = df2_train[group_col]

X_test = df2_test[categorical_cols + numerical_cols]
y_test = df2_test[target]
groups_test = df2_test[group_col]

# sort the data according to group col
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]


train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [69]:
model = CatBoostRanker(
    iterations=1000,
    learning_rate=0.1,
    depth=5,
    loss_function='YetiRankPairwise',
    eval_metric='MRR',
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

Pairwise scoring loss functions on CPU do not support one hot features. OneHotMaxSize set to 1
0:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 62.4ms	remaining: 1m 2s
100:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 5.97s	remaining: 53.2s
200:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 12.6s	remaining: 50s
300:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 19.3s	remaining: 44.7s
400:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 26s	remaining: 38.8s
500:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 32.8s	remaining: 32.7s
600:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 39.9s	remaining: 26.5s
700:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 47.1s	remaining: 20.1s
800:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 54.5s	remaining: 13.5s
900:	learn: 1.0000000	test: 1.0000000	best: 1.0000000 (0)	total: 1m 1s	remaining: 6.76s
999:	learn: 1.0000000	test: 1.

<catboost.core.CatBoostRanker at 0x12dfc3a70>

In [70]:
test_preds = model.predict(X_test)

In [71]:
import numpy as np
import pandas as pd
from catboost import Pool

# Assuming you have your prediction dataframe `test_results` ready:
# X_test, y_test, groups_test, and test_preds as from your earlier code

# Prepare results DataFrame
test_results = X_test.copy()
test_results['true_pla'] = y_test.values
test_results['pred_score'] = test_preds
test_results['race_index'] = groups_test.values

# Convert placing to CatBoost-like relevance (higher is better)
def placing_to_relevance(placing):
    max_plac = placing.max()
    return max_plac + 1 - placing  # Invert placing so 1 -> max relevance

test_results['true_rel'] = placing_to_relevance(test_results['true_pla'])

# Compute predicted ranks within each group (higher score better rank)
test_results['pred_rank'] = test_results.groupby('race_index')['pred_score'] \
                                       .rank(ascending=False, method='min')

# NDCG@4 aligned with CatBoost metric
def dcg_at_k(relevance_scores, k=4):
    relevance = np.array(relevance_scores)[:k].astype(float)
    if relevance.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, relevance.size + 2))
    return np.sum((2 ** relevance - 1) / discounts)

def ndcg_at_4_aligned(group):
    true_relevance = group['true_rel']
    order = np.argsort(-group['pred_score'])
    sorted_true = true_relevance.iloc[order].values
    ideal = np.sort(true_relevance.values)[::-1][:4]

    dcg_val = dcg_at_k(sorted_true, 4)
    idcg_val = dcg_at_k(ideal, 4)
    return dcg_val / idcg_val if idcg_val > 0 else 0.0

# Precision@4 aligned
def precision_at_4_aligned(group):
    true_top4 = set(group.nlargest(4, 'true_rel').index)
    pred_top4 = set(group.nsmallest(4, 'pred_rank').index)
    return len(true_top4.intersection(pred_top4)) / 4

# Calculate per race
ndcg_scores = test_results.groupby('race_index').apply(ndcg_at_4_aligned)
precision_scores = test_results.groupby('race_index').apply(precision_at_4_aligned)

# Aggregate results
mean_ndcg4 = ndcg_scores.mean()
mean_precision4 = precision_scores.mean()

print(f'Mean NDCG@4 aligned with CatBoost: {mean_ndcg4:.4f}')
print(f'Mean Precision@4 aligned with CatBoost: {mean_precision4:.4f}')


Mean NDCG@4 aligned with CatBoost: 0.0808
Mean Precision@4 aligned with CatBoost: 0.1347


  ndcg_scores = test_results.groupby('race_index').apply(ndcg_at_4_aligned)
  precision_scores = test_results.groupby('race_index').apply(precision_at_4_aligned)
