In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import ndcg_score

In [2]:
df = pd.read_csv('../data/cleaned_data_new.csv')

In [3]:
df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce')

In [4]:
def temporal_cv_split(df, n_splits=5):
    df_sorted = df.sort_values('Date')
    unique_races = df_sorted['race_index'].unique()
    
    fold_size = len(unique_races) // (n_splits + 1)
    
    for i in range(n_splits):
        train_end = (i + 1) * fold_size
        val_start = train_end
        val_end = train_end + fold_size
        
        train_races = unique_races[:train_end]
        val_races = unique_races[val_start:val_end]
        
        yield train_races, val_races

# spliting train, val, test

In [5]:
print(df['Date'].min())
print(df['Date'].max())

2020-09-06 00:00:00
2025-07-16 00:00:00


In [6]:
# train test split
min_date = pd.to_datetime(df['Date'].min())
max_date = pd.to_datetime(df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days=cutoff_days)

train_df = df[df['Date'] <= cutoff_date]
test_df = df[df['Date'] > cutoff_date]


In [7]:
print(train_df['Date'].min())
print(train_df['Date'].max())

2020-09-06 00:00:00
2024-07-14 00:00:00


In [8]:
# train validation split
min_date = pd.to_datetime(train_df['Date'].min())
max_date = pd.to_datetime(train_df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days = cutoff_days)

val_df = train_df[train_df['Date'] > cutoff_date]
train_df = train_df[train_df['Date'] <= cutoff_date]

In [9]:
df.columns

Index(['Date', 'race_index', 'RaceClass', 'rc', 'track', 'course', 'Dist.',
       'track_condition', 'Horse_id', 'Declar.Horse Wt.', 'Act.Wt.',
       'gate_position', 'Rtg.', 'age', 'colour', 'sex', 'origin',
       'Import type', 'Trainer', 'Jockey', 'Sire', 'Dam', 'Dam sire',
       'Finish Time', 'Gear', 'target', 'recent_3_win_rate_horse',
       'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
       'recent_3_consistency', 'jockey_trainer_combo_rate',
       'horse_track_distance_rate'],
      dtype='object')

# define features

In [20]:
# Define features 
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'Trainer', 'Jockey', 'Dam sire', 'rc', 'track', 'course', 
    'Import type', 'Sire', 'Dam', 'origin', 'age', 'colour', 'sex'
]

numerical_cols = [
    'Rtg.', 'Act.Wt.', 'Declar.Horse Wt.','recent_3_win_rate_horse',
       'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
       'recent_3_consistency', 'jockey_trainer_combo_rate',
       'horse_track_distance_rate'
]

In [11]:
categorical_cols = [
    'Jockey', 'track_condition', 'Trainer'
]

numerical_cols = [
 'jockey_trainer_combo_rate', 'recent_5_avg_finish_pos', 'horse_track_distance_rate'
]

In [12]:
categorical_cols = [
    'Jockey'
]

numerical_cols = [
    'recent_5_avg_finish_pos', 'horse_track_distance_rate'
]

In [21]:
num_imputer = SimpleImputer(strategy='median')
train_df[numerical_cols] = num_imputer.fit_transform(train_df[numerical_cols])
val_df[numerical_cols] = num_imputer.transform(val_df[numerical_cols])
test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])

for col in categorical_cols:
    train_df[col] = train_df[col].astype(str).fillna('unknown')
    val_df[col] = val_df[col].astype(str).fillna('unknown')
    test_df[col] = test_df[col].astype(str).fillna('unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str).fillna('unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str).fillna('unknown')
A value is tryin

In [22]:
X_train = train_df[categorical_cols + numerical_cols]
y_train = train_df['target']
groups_train = train_df['race_index']

X_val = val_df[categorical_cols + numerical_cols]
y_val = val_df['target']
groups_val = val_df['race_index']

X_test = test_df[categorical_cols + numerical_cols]
y_test = test_df['target']
groups_test = test_df['race_index']

In [23]:
# Sort by groups
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

val_sorted_idx = groups_val.argsort()
X_val = X_val.iloc[val_sorted_idx]
y_val = y_val.iloc[val_sorted_idx]
groups_val = groups_val.iloc[val_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]

In [24]:
# Create CatBoost Pools
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    group_id=groups_val,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [28]:
# Train the model with optimized parameters
model = CatBoostRanker(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    loss_function='YetiRank',
    eval_metric='NDCG:top=1',
    l2_leaf_reg=3,
    random_strength=2,
    bagging_temperature=1,
    has_time=True,
    verbose=100,
    random_seed=42,
    early_stopping_rounds=150
)

# Train with validation monitoring
model.fit(train_pool, eval_set=val_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.3370678	best: 0.3370678 (0)	total: 249ms	remaining: 6m 13s
100:	test: 0.3066825	best: 0.3611238 (3)	total: 23.9s	remaining: 5m 30s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.3611238065
bestIteration = 3

Shrink model to first 4 iterations.


<catboost.core.CatBoostRanker at 0x263c62caf90>

In [26]:
y_pred = model.predict(X_test)

In [None]:
def calculate_ndcg_per_group(y_true, y_pred, groups, k=4):
    """
    Calculate NDCG per race group and return average
    """
    ndcg_scores = []
    unique_groups = np.unique(groups)
    
    for group_id in unique_groups:
        # Get data for this specific race
        group_mask = groups == group_id
        group_true = y_true[group_mask]
        group_pred = y_pred[group_mask]
        
        # Skip if not enough horses in race for k=4
        if len(group_true) < 2:  # Need at least 2 horses to rank
            continue
            
        # Calculate NDCG for this race
        try:
            race_ndcg = ndcg_score([group_true], [group_pred], k=min(k, len(group_true)))
            ndcg_scores.append(race_ndcg)
        except:
            # Skip problematic races (e.g., all zeros)
            continue
    
    if len(ndcg_scores) == 0:
        return 0.0
        
    return np.mean(ndcg_scores), ndcg_scores

# CORRECT EVALUATION:
avg_ndcg, individual_ndcg = calculate_ndcg_per_group(y_test, y_pred, groups_test, k=1)
print(f"Average NDCG@4 across all races: {avg_ndcg:.4f}")
print(f"NDCG calculated on {len(individual_ndcg)} races")
print(f"NDCG std: {np.std(individual_ndcg):.4f}")


Average NDCG@4 across all races: 0.4386
NDCG calculated on 847 races
NDCG std: 0.2438


In [172]:
# Evaluate NDCG Score
ndcg = ndcg_score([y_test], [y_pred], k=4)
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.8057825400371074


In [173]:
feature_importance = model.get_feature_importance(data = train_pool)
feature_names = model.feature_names_

In [174]:
total_importance = feature_importance.sum()
cumulative_importance = 0
important_features = []
for feat, imp in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    cumulative_importance += imp
    important_features.append(feat)
    if cumulative_importance / total_importance >= 0.9:
        break


In [175]:
important_features

['Jockey', 'horse_track_distance_rate', 'recent_5_avg_finish_pos']

# sampling

In [176]:
print(groups_test.unique()[5:10])

[ 6080924  7080924  8080924  9080924 10080924]


In [177]:
specific_race = 8080924

race_mask = (groups_test == specific_race)
horse_ids = test_df.loc[race_mask, 'Horse_id']

# Get the horse info, true placing, predicted scores for the race
race_true = y_test[race_mask]
race_pred = y_pred[race_mask]
race_data = X_test[race_mask]

comparison_df = pd.DataFrame({
    'Horse_id': horse_ids,
    'True_Score': race_true.values,
    'Predicted_Score': race_pred
})

# Sort by predicted score (descending: highest score means better rank)
comparison_df_sorted_by_pred = comparison_df.sort_values(by='Predicted_Score', ascending=False)

# Sort by true rank (ascending: 1 is best rank)
comparison_df_sorted_by_true = comparison_df.sort_values(by='True_Score', ascending= False)

In [178]:
print("Sorted by Predicted Ranking Scores:")
print(comparison_df_sorted_by_pred)

print("\nSorted by True Ranking:")
print(comparison_df_sorted_by_true)

Sorted by Predicted Ranking Scores:
      Horse_id  True_Score  Predicted_Score
5805      H357    0.223130         0.459401
6810      J079    0.367879         0.387656
36941     G307    0.000000         0.382204
18268     J157    0.000000         0.300779
3552      H326    0.000000         0.182650
34224     H047    0.000000         0.150387
25204     G078    0.000000        -0.106592
8466      J040    0.000000        -0.106783
14392     J390    1.000000        -0.204517
598       J225    0.000000        -0.318930
39650     C531    0.606531        -0.386026
15068     D359    0.000000        -0.571422
10643     G112    0.000000        -0.694960

Sorted by True Ranking:
      Horse_id  True_Score  Predicted_Score
14392     J390    1.000000        -0.204517
39650     C531    0.606531        -0.386026
6810      J079    0.367879         0.387656
5805      H357    0.223130         0.459401
598       J225    0.000000        -0.318930
3552      H326    0.000000         0.182650
8466      J040 