In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df = pd.read_csv('../data/cleaned_data.csv')

# Utility function

In [5]:
def reduce_cardinality(series, min_freq = 10):
    counts = series.value_counts()
    rare = counts[counts < min_freq].index
    return series.apply(lambda x: 'unknown' if x in rare else x)

In [None]:
# encode placing including unfinished horses with large numbers
# def encode_placing(group):  
#   finish_mask = group['Pla.'].astype(str).str.isdigit()
#   max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
#   return group['Pla.'].apply(lambda x: int(x) if str(x).isdigit() else max_rank + 1)

In [6]:
def encode_placing(group):  
   finish_mask = group['Pla.'].astype(str).str.isdigit()
   max_rank = group.loc[finish_mask, 'Pla.'].astype(int).max()
   # Assign non-finisher a score of 0, and invert numeric ranks
   return group['Pla.'].apply(
       lambda x: max_rank - int(x) + 1 if str(x).isdigit() else 0
   )


# Approach 1 (full model)

In [34]:
df1 = df.copy()

In [35]:
# define target and groupings
target = 'Pla.'
group_col = 'race_index'

# define categorical variables
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'gate_position', 'Trainer', 'Jockey', 'Import type', 'Sire', 'Dam', 'Dam sire', 'rc', 'track', 'course', 'origin', 'age', 'colour', 'sex'
]

# define numerical variables
numerical_cols = [
    'Rtg.', 'Win Odds', 'Act.Wt.', 'Declar.Horse Wt.'
]

In [36]:
# Apply encoding for places (including unfinished horses)
df1 [target] = df.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)

  df1 [target] = df.groupby('race_index').apply(encode_placing).reset_index(level=0, drop=True)


In [37]:
df1

Unnamed: 0,Pla.,Dist.,track_condition,RaceClass,gate_position,Rtg.,Trainer,Jockey,Win Odds,Act.Wt.,Declar.Horse Wt.,Horse_id,Import type,Sire,Dam,Dam sire,race_index,rc,track,course,origin,age,colour,sex,B,BO,CC,CP,CO,E,H,P,PC,PS,SB,SR,TT,V,VO,XB
0,3,1600,G,G1,10,,T Yasuda,Y Kitamura,64.0,126,1187,H811,VIS,Just A Way,Epic Love,Dansili,238101223,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,3,2000,G,G1,7,,T Yasuda,C Y Ho,11.0,126,1179,H811,VIS,Just A Way,Epic Love,Dansili,623300423,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2,11,2000,G,G1,6,,T Yasuda,Y Kitamura,15.0,126,1150,H811,VIS,Just A Way,Epic Love,Dansili,240111222,ST,Turf,A,JPN,,Bay,Horse,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2,1200,G,5,2,18.0,C H Yip,C Wong,7.1,108,1045,C017,PPG,Smart Missile,Pyrography,Danzero,402060221,ST,Turf,C,AUS,,Brown,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,10,1200,G,5,14,18.0,C H Yip,M F Poon,42.0,113,1058,C017,PPG,Smart Missile,Pyrography,Danzero,296261220,ST,Turf,A+3,AUS,,Brown,Gelding,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50539,12,1400,G,3,11,68.0,C Fownes,C Y Ho,3.7,123,1152,H459,PP,Impending,Isola Blu,Blackfriars,278231223,ST,Turf,C,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50540,13,1200,G,3,10,68.0,C Fownes,C Y Ho,13.0,127,1153,H459,PP,Impending,Isola Blu,Blackfriars,240101223,ST,Turf,A,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50541,13,1200,GF,3,6,68.0,C Fownes,C Y Ho,16.0,124,1155,H459,PP,Impending,Isola Blu,Blackfriars,185191123,ST,Turf,B+2,AUS,6.0,Bay,Gelding,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
50542,7,1200,G,4,11,53.0,M Newnham,Z Purton,2.0,128,1127,K334,PPG,Street Boss,Varanasi,Encosta de Lago,801010725,ST,Turf,C,AUS,4.0,Grey,Gelding,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [38]:
# train test split
unique_races = df1[group_col].unique()

# split the race id for masking
train_races, test_races = train_test_split(unique_races, test_size = 0.2, random_state = 42)

In [39]:
# define mask of trianing and testing
train_mask = df1[group_col].isin(train_races)
test_mask = df1[group_col].isin(test_races)

# training and testing set
df1_train = df1.loc[train_mask].copy()
df1_test = df1.loc[test_mask].copy()

In [40]:
# median imputer for numerical variables
num_imputer = SimpleImputer(strategy = 'median')
df1_train[numerical_cols] = num_imputer.fit_transform(df1_train[numerical_cols])
df1_test[numerical_cols] = num_imputer.transform(df1_test[numerical_cols])

In [41]:
# fill missing with 'unknown' for categorical
for col in categorical_cols:
    df1_train[col] = df1_train[col].astype(str).fillna('unknown')
    df1_test[col] = df1_test[col].astype(str).fillna('unknown')

In [42]:
# defind the x and y for train and test
X_train = df1_train[categorical_cols + numerical_cols]
y_train = df1_train[target]
groups_train = df1_train[group_col]

X_test = df1_test[categorical_cols + numerical_cols]
y_test = df1_test[target]
groups_test = df1_test[group_col]

# sort the data according to group col
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]


train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [43]:
model = CatBoostRanker(
    iterations=500,
    learning_rate=0.1,
    depth= 6 ,
    loss_function='YetiRank',
    eval_metric='NDCG:top=3',
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.6903947	best: 0.6903947 (0)	total: 66ms	remaining: 32.9s
100:	test: 0.8033327	best: 0.8039037 (82)	total: 3.04s	remaining: 12s
200:	test: 0.8045467	best: 0.8056627 (175)	total: 6.02s	remaining: 8.96s
300:	test: 0.8074434	best: 0.8080683 (288)	total: 9.03s	remaining: 5.97s
400:	test: 0.8069399	best: 0.8080683 (288)	total: 12s	remaining: 2.97s
499:	test: 0.8058708	best: 0.8080683 (288)	total: 15s	remaining: 0us

bestTest = 0.8080682702
bestIteration = 288

Shrink model to first 289 iterations.


<catboost.core.CatBoostRanker at 0x152704a50>

In [44]:
y_pred = model.predict(X_test)

In [45]:
# You can use ranking metrics like NDCG, MAP, etc.
from sklearn.metrics import ndcg_score

# Evaluate NDCG Score
ndcg = ndcg_score([y_test], [y_pred], k=4)
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.6975692610676515


In [46]:
print(groups_test.unique()[5:10])

[5100923 5110922 6050921 7050921 7060920]


In [47]:
# Choose the race you want to inspect
specific_race = 5100923  # replace with an actual race_index value from groups_test

# Filter for the specific race in test set
race_mask = (groups_test == specific_race)

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask]
race_data = X_test[race_mask]

# Combine into a DataFrame for easy comparison
import pandas as pd

comparison_df = pd.DataFrame({
    'True_Rank': race_true.values,
    'Predicted_Score': race_pred
})

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Predicted_Score', ascending=False)

# Sort by true rank (ascending: 1 is best rank)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Rank')

print("Sorted by Predicted Ranking Scores:")
print(comparison_df_sorted_by_pred)

print("\nSorted by True Ranking:")
print(comparison_df_sorted_by_true)


Sorted by Predicted Ranking Scores:
    True_Rank  Predicted_Score
0           8         1.225398
8          12         0.910826
9          10         0.892642
3          11         0.818908
4           7         0.616022
2          13         0.551849
5          14         0.485534
13          3         0.016926
1           9        -0.251933
10          5        -0.974198
12          4        -1.657595
11          2        -1.690669
6           6        -2.057887
7           1        -2.270043

Sorted by True Ranking:
    True_Rank  Predicted_Score
7           1        -2.270043
11          2        -1.690669
13          3         0.016926
12          4        -1.657595
10          5        -0.974198
6           6        -2.057887
4           7         0.616022
0           8         1.225398
1           9        -0.251933
9          10         0.892642
3          11         0.818908
8          12         0.910826
2          13         0.551849
5          14         0.485534


# updated