In [40]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr

In [41]:
pd.set_option('display.max_columns', None)

In [42]:
df = pd.read_csv('../data/cleaned_data.csv')

In [97]:
df

Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB
0,12,1600,G,G1,10,,T Yasuda,Y Kitamura,64.0,126,1187,H811,VIS,Just A Way,Epic Love,Dansili,238101223,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,05,2000,G,G1,7,,T Yasuda,C Y Ho,11.0,126,1179,H811,VIS,Just A Way,Epic Love,Dansili,623300423,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2,02,2000,G,G1,6,,T Yasuda,Y Kitamura,15.0,126,1150,H811,VIS,Just A Way,Epic Love,Dansili,240111222,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,11,1200,G,5,2,18.0,C H Yip,C Wong,7.1,108,1045,C017,PPG,Smart Missile,Pyrography,Danzero,402060221,ST,Turf,C,AUS,,Brown,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,05,1200,G,5,14,18.0,C H Yip,M F Poon,42.0,113,1058,C017,PPG,Smart Missile,Pyrography,Danzero,296261220,ST,Turf,A+3,AUS,,Brown,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50539,03,1400,G,3,11,68.0,C Fownes,C Y Ho,3.7,123,1152,H459,PP,Impending,Isola Blu,Blackfriars,278231223,ST,Turf,C,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50540,02,1200,G,3,10,68.0,C Fownes,C Y Ho,13.0,127,1153,H459,PP,Impending,Isola Blu,Blackfriars,240101223,ST,Turf,A,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50541,02,1200,GF,3,6,68.0,C Fownes,C Y Ho,16.0,124,1155,H459,PP,Impending,Isola Blu,Blackfriars,185191123,ST,Turf,B+2,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50542,06,1200,G,4,11,53.0,M Newnham,Z Purton,2.0,128,1127,K334,PPG,Street Boss,Varanasi,Encosta de Lago,801010725,ST,Turf,C,AUS,4.0,Grey,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# Utility function

In [43]:
def reduce_cardinality(series, min_freq = 10):
    counts = series.value_counts()
    rare = counts[counts < min_freq].index
    return series.apply(lambda x: 'unknown' if x in rare else x)

In [45]:
def encode_placing(group):  
   finish_mask = group['Pla.'].astype(str).str.isdigit()
   max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
   # Assign non-finisher a score of 0, and invert numeric ranks
   return group['Pla.'].apply(
       lambda x: max_rank - int(x) + 1 if str(x).isdigit() else 0
   )


# Full model fit test

In [98]:
df1 = df.copy()

In [99]:
# Apply encoding for places (including unfinished horses)
df1 ['target'] = df1.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)

  df1 ['target'] = df1.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)


In [100]:
# define target and groupings
target = 'target'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.'
]

In [101]:
# train test split
unique_races = df1[group_col].unique()

# split the race id for masking
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

In [102]:
# define mask of trianing and testing
train_mask = df1[group_col].isin(train_races)
test_mask = df1[group_col].isin(test_races)

# training and testing set
df1_train = df1.loc[train_mask].copy()
df1_test = df1.loc[test_mask].copy()

In [103]:
# median imputer for numerical variables
num_imputer = SimpleImputer(strategy = 'median')
df1_train[numerical_cols] = num_imputer.fit_transform(df1_train[numerical_cols])
df1_test[numerical_cols] = num_imputer.transform(df1_test[numerical_cols])

In [104]:
# fill missing with 'unknown' for categorical
for col in categorical_cols:
    df1_train[col] = df1_train[col].astype(str).fillna('unknown')
    df1_test[col] = df1_test[col].astype(str).fillna('unknown')

In [105]:
# defind the x and y for train and test
X_train = df1_train[categorical_cols + numerical_cols]
y_train = df1_train[target]
groups_train = df1_train[group_col]

X_test = df1_test[categorical_cols + numerical_cols]
y_test = df1_test[target]
groups_test = df1_test[group_col]

# sort the data according to group col
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]


train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [106]:
model = CatBoostRanker(
    iterations=500,
    learning_rate=0.1,
    depth= 6,
    loss_function='YetiRank',
    eval_metric='NDCG:top=3',
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.6902925	best: 0.6902925 (0)	total: 140ms	remaining: 1m 9s
100:	test: 0.8043334	best: 0.8047950 (95)	total: 14.2s	remaining: 55.9s
200:	test: 0.8047155	best: 0.8047950 (95)	total: 32.2s	remaining: 48s
300:	test: 0.8083186	best: 0.8085282 (296)	total: 51s	remaining: 33.7s
400:	test: 0.8100002	best: 0.8100029 (399)	total: 1m 9s	remaining: 17.2s
499:	test: 0.8078542	best: 0.8103265 (416)	total: 1m 28s	remaining: 0us

bestTest = 0.8103264984
bestIteration = 416

Shrink model to first 417 iterations.


<catboost.core.CatBoostRanker at 0x23d22e0a090>

In [107]:
y_pred = model.predict(X_test)

In [108]:
# You can use ranking metrics like NDCG, MAP, etc.
from sklearn.metrics import ndcg_score

# Evaluate NDCG Score
ndcg = ndcg_score([y_test], [y_pred], k=3)
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.7565831301379807


In [109]:
print(groups_test.unique()[5:10])

[5100923 5110922 6050921 7050921 7060920]


In [110]:
# Choose the race you want to inspect
specific_race = 7060920  # replace with an actual race_index value from groups_test

# Filter for the specific race in test set
race_mask = (groups_test == specific_race)
horse_ids = df1_test.loc[race_mask, 'Horse_id']

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask]
race_data = X_test[race_mask]

comparison_df = pd.DataFrame({
    'Horse_id': horse_ids,
    'True_Score': race_true.values,
    'Predicted_Score': race_pred
})

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Predicted_Score', ascending=False)

# Sort by true rank (ascending: 1 is best rank)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Score', ascending= False)

print("Sorted by Predicted Ranking Scores:")
print(comparison_df_sorted_by_pred)

print("\nSorted by True Ranking:")
print(comparison_df_sorted_by_true)


Sorted by Predicted Ranking Scores:
      Horse_id  True_Score  Predicted_Score
38940     C004          14         1.494786
16062     C373           2         0.553541
37235     C134          10         0.373689
39349     D387           4         0.171587
20576     D324          12         0.125443
27727     D265           5         0.113760
35979     C314           9        -0.232496
16781     D368          13        -0.563926
37671     C240          11        -0.633255
18750     C483           3        -1.496893
5889      D174           6        -1.549770
946       D270           1        -2.748548
12521     C474           7        -2.848469
4792      D383           8        -3.025296

Sorted by True Ranking:
      Horse_id  True_Score  Predicted_Score
38940     C004          14         1.494786
16781     D368          13        -0.563926
20576     D324          12         0.125443
37671     C240          11        -0.633255
37235     C134          10         0.373689
35979     C314 

# Grid search for hyperparameters


In [111]:
df2 = df.copy()
# Apply encoding for places (including unfinished horses)
df2 ['target'] = df2.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)
# define target and groupings
target = 'target'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.'
]
# train test split
unique_races = df2[group_col].unique()

# split the race id for masking
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)
# define mask of trianing and testing
train_mask = df2[group_col].isin(train_races)
test_mask = df2[group_col].isin(test_races)

# training and testing set
df2_train = df2.loc[train_mask].copy()
df2_test = df2.loc[test_mask].copy()
# median imputer for numerical variables
num_imputer = SimpleImputer(strategy = 'median')
df2_train[numerical_cols] = num_imputer.fit_transform(df2_train[numerical_cols])
df2_test[numerical_cols] = num_imputer.transform(df2_test[numerical_cols])
# fill missing with 'unknown' for categorical
for col in categorical_cols:
    df2_train[col] = df2_train[col].astype(str).fillna('unknown')
    df2_test[col] = df2_test[col].astype(str).fillna('unknown')
# defind the x and y for train and test
X_train = df2_train[categorical_cols + numerical_cols]
y_train = df2_train[target]
groups_train = df2_train[group_col]

X_test = df2_test[categorical_cols + numerical_cols]
y_test = df2_test[target]
groups_test = df2_test[group_col]

# sort the data according to group col
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]


train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)
model = CatBoostRanker(
    iterations=500,
    learning_rate=0.1,
    depth= 6,
    loss_function='YetiRank',
    eval_metric='NDCG:top=3',
    random_seed=42,
    verbose=100
)

param_grid = {
    'iterations': [300, 500],
    'learning_rate': [0.05, 0.1],
    'depth': [4, 6]
}

model.grid_search(param_grid, train_pool, cv = 3)

  df2 ['target'] = df2.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)


0:	test: 0.6761537	best: 0.6761537 (0)	total: 103ms	remaining: 30.7s
100:	test: 0.8039150	best: 0.8039150 (100)	total: 9.72s	remaining: 19.2s
200:	test: 0.8051830	best: 0.8060114 (180)	total: 22.1s	remaining: 10.9s
299:	test: 0.8065837	best: 0.8069722 (292)	total: 34.5s	remaining: 0us

bestTest = 0.8069722046
bestIteration = 292

Metric PFound is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric NDCG is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.8069722	best: 0.8069722 (0)	total: 34.6s	remaining: 4m 2s
0:	test: 0.6761537	best: 0.6761537 (0)	total: 127ms	remaining: 38s
100:	test: 0.8039397	best: 0.8051396 (87)	total: 13.3s	remaining: 26.2s
200:	test: 0.8070355	best: 0.8079099 (193)	total: 26.5s	remaining: 13.1s
299:	test: 0.8073180	best: 0.8079099 (193)	total: 39.4s	remaining: 0us

bestTest = 0.8079099141
bestIteration = 193

{'params': {'depth': 6, 'learning_rate': 0.1, 'iterations': 500},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    