In [171]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import ndcg_score
import warnings
import joblib

In [172]:
# Ignore every warning of category UserWarning for the rest of the session.
# NOTE(review): this is a blanket filter - it hides all UserWarnings, including ones that may matter.
warnings.filterwarnings('ignore', category = UserWarning)

In [173]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/cleaned_data_20250930.csv')

In [174]:
# Parse 'Date' into datetimes. errors='coerce' turns unparseable values into NaT
# instead of raising, so malformed dates pass silently; NaT rows then satisfy
# neither side of a date-cutoff comparison.
df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce')

In [175]:
def temporal_cv_split(df, n_splits=5):
    """
    Yield expanding-window (train_races, val_races) folds in chronological order.

    Races are ordered by the first 'Date' on which they appear and cut into
    ``n_splits + 1`` chunks of ``fold_size`` races each. Fold ``i`` trains on
    the first ``i + 1`` chunks and validates on the chunk right after them.
    Races left over after the last full chunk are not used by any fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date' and 'race_index'.
    n_splits : int, default 5
        Number of folds to generate.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Unique 'race_index' values of the training and the validation part.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, or if there are fewer than
        ``n_splits + 1`` distinct races (every fold would be empty).
        Because this is a generator, the error surfaces on first iteration.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    # Stable sort: rows sharing a date keep their original relative order, so
    # the race order (and therefore every fold) is reproducible.
    df_sorted = df.sort_values('Date', kind='stable')
    unique_races = df_sorted['race_index'].unique()

    fold_size = len(unique_races) // (n_splits + 1)
    if fold_size == 0:
        raise ValueError(
            f"Need at least {n_splits + 1} distinct races for n_splits={n_splits}, "
            f"got {len(unique_races)}"
        )

    for i in range(n_splits):
        train_end = (i + 1) * fold_size
        val_end = train_end + fold_size

        # Validation chunk starts exactly where the training window ends.
        yield unique_races[:train_end], unique_races[train_end:val_end]

# splitting train, val, test

In [176]:
# Earliest and latest race date in the full dataset, one per line.
print(df['Date'].min(), df['Date'].max(), sep='\n')

2020-09-06 00:00:00
2025-09-28 00:00:00


In [177]:
# train test split
min_date = pd.to_datetime(df['Date'].min())
max_date = pd.to_datetime(df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days=cutoff_days)

train_df = df[df['Date'] <= cutoff_date]
test_df = df[df['Date'] > cutoff_date]


In [178]:
# Earliest and latest race date in the training part, one per line.
print(train_df['Date'].min(), train_df['Date'].max(), sep='\n')

2020-09-06 00:00:00
2024-09-22 00:00:00


In [179]:
# train validation split
min_date = pd.to_datetime(train_df['Date'].min())
max_date = pd.to_datetime(train_df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days = cutoff_days)

val_df = train_df[train_df['Date'] > cutoff_date]
train_df = train_df[train_df['Date'] <= cutoff_date]

In [180]:
# Show the available column names before choosing the model features.
df.columns

Index(['Date', 'race_index', 'RaceClass', 'rc', 'track', 'course', 'Dist.',
       'track_condition', 'Horse_id', 'Declar.Horse Wt.', 'Act.Wt.',
       'gate_position', 'Rtg.', 'age', 'colour', 'sex', 'origin',
       'Import type', 'Trainer', 'Jockey', 'Sire', 'Dam', 'Dam sire',
       'Finish Time', 'Gear', 'target', 'recent_3_win_rate_horse',
       'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
       'recent_3_consistency', 'jockey_trainer_combo_rate',
       'horse_track_distance_rate'],
      dtype='object')

# define features

In [None]:
# define features
# Column order matters: the feature matrices are built as
# categorical_cols + numerical_cols, in exactly this order.

# Passed to CatBoost as categorical features (cast to strings before training).
categorical_cols = [
    'Dist.',
    'track_condition',
    'RaceClass',
    'Trainer',
    'Jockey',
    'Dam sire',
    'rc',
    'track',
    'course',
    'Import type',
    'Sire',
    'Dam',
    'origin',
    'age',
    'colour',
    'sex',
]

# Median-imputed numeric features.
numerical_cols = [
    'Rtg.',
    'Act.Wt.',
    'Declar.Horse Wt.',
    'recent_3_win_rate_horse',
    'recent_3_win_rate_jockey',
    'recent_5_avg_finish_pos',
    'recent_3_consistency',
    'jockey_trainer_combo_rate',
    'horse_track_distance_rate',
]

In [182]:
# Median-impute the numeric features. The imputer is fitted on the training
# split only and then re-used for validation and test.
num_imputer = SimpleImputer(strategy='median')
train_df[numerical_cols] = num_imputer.fit_transform(train_df[numerical_cols])
val_df[numerical_cols] = num_imputer.transform(val_df[numerical_cols])
test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])

# Categorical features are cast to strings, with missing values labelled
# 'unknown'. The missing-value mask is taken from the ORIGINAL column:
# astype(str) alone turns NaN into the literal string 'nan', after which a
# fillna('unknown') has nothing left to fill.
for split_df in (train_df, val_df, test_df):
    for col in categorical_cols:
        values = split_df[col]
        split_df[col] = np.where(values.isna(), 'unknown', values.astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str).fillna('unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str).fillna('unknown')
A value is tryin

In [183]:
# For every split: feature matrix (categoricals first, then numerics),
# the 'target' label and the race id used as ranking group.
X_train, y_train, groups_train = (
    train_df[categorical_cols + numerical_cols],
    train_df['target'],
    train_df['race_index'],
)

X_val, y_val, groups_val = (
    val_df[categorical_cols + numerical_cols],
    val_df['target'],
    val_df['race_index'],
)

X_test, y_test, groups_test = (
    test_df[categorical_cols + numerical_cols],
    test_df['target'],
    test_df['race_index'],
)

In [184]:
# Sort by groups
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

val_sorted_idx = groups_val.argsort()
X_val = X_val.iloc[val_sorted_idx]
y_val = y_val.iloc[val_sorted_idx]
groups_val = groups_val.iloc[val_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]

# training

In [185]:
# Create CatBoost Pools
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    group_id=groups_val,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [186]:
# Train the model with optimized parameters
model = CatBoostRanker(
    iterations=1500,
    learning_rate=0.01,
    depth=6,
    loss_function='YetiRank',
    eval_metric='NDCG:top=4',
    l2_leaf_reg=10,
    random_strength=5,
    bagging_temperature=2,
    has_time=True,
    verbose=100,
    early_stopping_rounds=300
)

# Train with validation monitoring
model.fit(train_pool, eval_set=val_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.3494636	best: 0.3494636 (0)	total: 56.7ms	remaining: 1m 25s
100:	test: 0.3962826	best: 0.4269484 (9)	total: 1.43s	remaining: 19.9s
200:	test: 0.3850226	best: 0.4269484 (9)	total: 2.84s	remaining: 18.3s
300:	test: 0.3862548	best: 0.4269484 (9)	total: 4.35s	remaining: 17.3s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.4269483527
bestIteration = 9

Shrink model to first 10 iterations.


<catboost.core.CatBoostRanker at 0x12c97b4a0>

In [187]:
# Score every test row. y_pred is a positional array in the same row order as
# X_test (and therefore as y_test / groups_test).
y_pred = model.predict(X_test)

In [188]:
def calculate_ndcg_per_group(y_true, y_pred, groups, k=4):
    """
    Calculate NDCG@k separately for every race group and average the results.

    Parameters
    ----------
    y_true : array-like of shape (n_rows,)
        True relevance of each row.
    y_pred : array-like of shape (n_rows,)
        Predicted score of each row.
    groups : array-like of shape (n_rows,)
        Group (race) id of each row.
    k : int, default 4
        Rank cut-off; capped at the size of the group.

    The three arrays are matched by POSITION (any pandas index is ignored),
    so they must be in the same row order.

    Returns
    -------
    (mean_ndcg, ndcg_scores) : tuple of (float, list of float)
        ``ndcg_scores`` holds one value per scored group, in ascending group-id
        order; skipped groups are omitted. ``(0.0, [])`` is returned when no
        group could be scored, so the result can always be unpacked.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    groups = np.asarray(groups)
    if not (len(y_true) == len(y_pred) == len(groups)):
        raise ValueError(
            "y_true, y_pred and groups must have the same length, got "
            f"{len(y_true)}, {len(y_pred)} and {len(groups)}"
        )

    ndcg_scores = []

    for group_id in np.unique(groups):
        # Get data for this specific race
        group_mask = groups == group_id
        group_true = y_true[group_mask]
        group_pred = y_pred[group_mask]

        # A ranking needs at least 2 rows; single-row groups are skipped.
        if len(group_true) < 2:
            continue

        # Calculate NDCG for this race
        try:
            race_ndcg = ndcg_score([group_true], [group_pred], k=min(k, len(group_true)))
        except ValueError:
            # Skip groups the metric rejects as invalid input; any other
            # error is a real bug and is allowed to propagate.
            continue
        ndcg_scores.append(race_ndcg)

    if not ndcg_scores:
        return 0.0, []

    return np.mean(ndcg_scores), ndcg_scores

# Evaluate on the test split: NDCG@4 is computed per race and then averaged,
# and the spread across races is reported alongside the mean.
avg_ndcg, individual_ndcg = calculate_ndcg_per_group(y_test, y_pred, groups_test, k=4)
print(f"Average NDCG@4 across all races: {avg_ndcg:.4f}")
print(f"NDCG calculated on {len(individual_ndcg)} races")
print(f"NDCG std: {np.std(individual_ndcg):.4f}")


Average NDCG@4 across all races: 0.3838
NDCG calculated on 856 races
NDCG std: 0.2501


# saving model

In [145]:
# Write the trained ranker to disk in CatBoost's 'cbm' format.
model.save_model(fname='../model/catboost_ranker.cbm', format='cbm')

In [146]:
# Persist the fitted numeric imputer alongside the model so the same medians
# can be applied to new data.
joblib.dump(num_imputer, '../model/num_imputer.pkl')

['../model/num_imputer.pkl']

# further analysis

In [112]:
def analyze_performance_by_race_type(test_df, y_test, y_pred, groups_test, individual_ndcg):
    """
    Break the per-race NDCG scores down by race class, distance and track.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test rows; must contain 'RaceClass', 'Dist.' and 'track'.
    y_test, y_pred
        Not used by this function; kept so existing calls keep working.
    groups_test : pandas.Series or array-like
        Race id of each test row, usable as a boolean-mask source for test_df.
    individual_ndcg : sequence of float
        One score per race that has at least two rows, in ascending race-id
        order (the order produced by ``np.unique``).

    Returns
    -------
    pandas.DataFrame
        One row per race with the columns race_index, ndcg_score, race_class,
        distance, track and field_size. Races with fewer than two rows have
        no score and get NaN, which the printed group statistics ignore.

    Raises
    ------
    ValueError
        If the number of scores differs from the number of races with at least
        two rows - scores could then not be matched to races reliably.
    """
    columns = ['race_index', 'race_class', 'distance', 'track', 'field_size']
    results_by_race = []

    for group_id in np.unique(groups_test):
        group_mask = groups_test == group_id
        race_data = test_df[group_mask].iloc[0]  # Get race characteristics

        results_by_race.append({
            'race_index': group_id,
            'race_class': race_data['RaceClass'],
            'distance': race_data['Dist.'],
            'track': race_data['track'],
            'field_size': np.sum(group_mask)
        })

    race_analysis = pd.DataFrame(results_by_race, columns=columns)

    # Scores exist only for races with >= 2 rows. Assign them through that mask
    # instead of by loop position, so a skipped race cannot shift every later
    # score onto the wrong race.
    scorable = race_analysis['field_size'].to_numpy(dtype=int) >= 2
    if int(scorable.sum()) != len(individual_ndcg):
        raise ValueError(
            f"Got {len(individual_ndcg)} NDCG scores for {int(scorable.sum())} "
            "races with at least two rows; scores cannot be matched to races"
        )
    ndcg_column = np.full(len(race_analysis), np.nan)
    ndcg_column[scorable] = individual_ndcg
    race_analysis.insert(1, 'ndcg_score', ndcg_column)

    print("PERFORMANCE BY RACE CHARACTERISTICS:")
    print("="*50)
    print("By Race Class:")
    print(race_analysis.groupby('race_class')['ndcg_score'].agg(['mean', 'std', 'count']))

    print("\nBy Distance:")
    print(race_analysis.groupby('distance')['ndcg_score'].agg(['mean', 'std', 'count']))

    print("\nBy Track:")
    print(race_analysis.groupby('track')['ndcg_score'].agg(['mean', 'std', 'count']))

    return race_analysis

# Break the test-set NDCG down by race class, distance and track to see where
# the model is strong or weak; the per-race table is kept in race_analysis.
race_analysis = analyze_performance_by_race_type(test_df, y_test, y_pred, groups_test, individual_ndcg)


PERFORMANCE BY RACE CHARACTERISTICS:
By Race Class:
                mean       std  count
race_class                           
1           0.464241  0.188248      8
2           0.442483  0.238687     56
3           0.427184  0.237781    243
3R          0.139595       NaN      1
4           0.427368  0.240003    389
4R          0.254241  0.145738      4
4YO         0.208491  0.086757      3
5           0.387695  0.249550    116
G1          0.465919  0.304390     12
G2          0.577168  0.221679      7
G3          0.324665  0.277465     12
GRIFFIN     0.648099  0.243188      5

By Distance:
              mean       std  count
distance                           
1000      0.429561  0.261763     78
1200      0.433062  0.237072    342
1400      0.445232  0.245586    147
1600      0.398353  0.249318     64
1650      0.415151  0.237003    133
1800      0.364238  0.249060     58
2000      0.399064  0.235734     20
2200      0.393517  0.215073     11
2400      0.286610  0.181937      3

By Tr

In [76]:
# Feature importances computed on the training pool, plus the model's feature
# names. The two are zipped together later, so they are assumed to share the
# same order.
feature_importance = model.get_feature_importance(data = train_pool)
feature_names = model.feature_names_

In [77]:
# Walk the features from most to least important and keep adding them until
# the running share of the summed importance reaches 90%.
important_features = []
cumulative_importance = 0
total_importance = feature_importance.sum()

for name, score in sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
):
    important_features.append(name)
    cumulative_importance += score
    if cumulative_importance / total_importance >= 0.9:
        break


In [78]:
# Features that together reach the 90% cumulative-importance threshold, most important first.
important_features

['Jockey', 'horse_track_distance_rate', 'recent_5_avg_finish_pos']

# sampling

In [189]:
# Peek at the first ten distinct race ids in the test split to pick one for inspection.
print(groups_test.unique()[:10])

[ 1070925  2070925  3070925  4070925  5070925  6070925  7070925  8070925
  9070925 10070925]


In [190]:
specific_race = 5070925

race_mask = (groups_test == specific_race)

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask.to_numpy()]  # y_pred is positional, so use a plain boolean array
race_data = X_test[race_mask]

# y_test / y_pred follow the group-sorted row order, which can differ from
# test_df's row order. Look the horse ids up by index label in that same
# order so every id is paired with its own scores.
horse_ids = test_df.loc[race_true.index, 'Horse_id']

comparison_df = pd.DataFrame({
    'Horse_id': horse_ids.values,
    'True_Score': race_true.values,
    'Predicted_Score': race_pred
}, index=race_true.index)

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Predicted_Score', ascending=False)

# Sort by true score (descending: presumably a higher target means a better finish)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Score', ascending= False)

In [191]:
# Show the same race twice: ordered by model score, then by true score.
print("Sorted by Predicted Ranking Scores:", comparison_df_sorted_by_pred, sep="\n")

print("\nSorted by True Ranking:", comparison_df_sorted_by_true, sep="\n")

Sorted by Predicted Ranking Scores:
      Horse_id  True_Score  Predicted_Score
40701     G427    0.000000         0.030592
14783     H459    1.000000         0.021121
1218      J542    0.606531        -0.000800
43349     J392    0.000000        -0.002190
51182     K420    0.000000        -0.011262
25952     K273    0.223130        -0.015843
4109      K039    0.000000        -0.018184
35140     K305    0.000000        -0.019911
33022     J315    0.367879        -0.020000
7397      K364    0.000000        -0.021622
26639     J444    0.000000        -0.025355
30712     J152    0.000000        -0.025868
47773     J529    0.000000        -0.028471
15554     K299    0.000000        -0.031423

Sorted by True Ranking:
      Horse_id  True_Score  Predicted_Score
14783     H459    1.000000         0.021121
1218      J542    0.606531        -0.000800
33022     J315    0.367879        -0.020000
25952     K273    0.223130        -0.015843
4109      K039    0.000000        -0.018184
7397      K364 