In [89]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import ndcg_score
from sklearn.utils.class_weight import compute_class_weight

In [90]:
df = pd.read_csv('../data/cleaned_data_new.csv')

In [91]:
df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce')

In [92]:
def temporal_cv_split(df, n_splits=5):
    df_sorted = df.sort_values('Date')
    unique_races = df_sorted['race_index'].unique()
    
    fold_size = len(unique_races) // (n_splits + 1)
    
    for i in range(n_splits):
        train_end = (i + 1) * fold_size
        val_start = train_end
        val_end = train_end + fold_size
        
        train_races = unique_races[:train_end]
        val_races = unique_races[val_start:val_end]
        
        yield train_races, val_races

# spliting train, val, test

In [93]:
print(df['Date'].min())
print(df['Date'].max())

2020-09-06 00:00:00
2025-07-16 00:00:00


In [94]:
# train test split
min_date = pd.to_datetime(df['Date'].min())
max_date = pd.to_datetime(df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days=cutoff_days)

train_df = df[df['Date'] <= cutoff_date]
test_df = df[df['Date'] > cutoff_date]


In [95]:
print(train_df['Date'].min())
print(train_df['Date'].max())

2020-09-06 00:00:00
2024-07-14 00:00:00


In [96]:
# train validation split
min_date = pd.to_datetime(train_df['Date'].min())
max_date = pd.to_datetime(train_df['Date'].max())

total_days = (max_date - min_date).days
cutoff_days = int(total_days * 0.8)
cutoff_date = min_date + pd.Timedelta(days = cutoff_days)

val_df = train_df[train_df['Date'] > cutoff_date]
train_df = train_df[train_df['Date'] <= cutoff_date]

In [97]:
df.columns

Index(['Date', 'race_index', 'RaceClass', 'rc', 'track', 'course', 'Dist.',
       'track_condition', 'Horse_id', 'Declar.Horse Wt.', 'Act.Wt.',
       'gate_position', 'Rtg.', 'age', 'colour', 'sex', 'origin',
       'Import type', 'Trainer', 'Jockey', 'Sire', 'Dam', 'Dam sire',
       'Finish Time', 'Gear', 'target', 'recent_3_win_rate_horse',
       'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
       'recent_3_consistency', 'jockey_trainer_combo_rate',
       'horse_track_distance_rate'],
      dtype='object')

# define features

In [98]:
# Define features 
categorical_cols = [
    'Dist.', 'track_condition', 'RaceClass', 'Trainer', 'Jockey', 'Dam sire', 'rc', 'track', 'course', 
    'Import type', 'Sire', 'Dam', 'origin', 'age', 'colour', 'sex'
]

numerical_cols = [
    'Rtg.', 'Act.Wt.', 'Declar.Horse Wt.','recent_3_win_rate_horse',
       'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
       'recent_3_consistency', 'jockey_trainer_combo_rate',
       'horse_track_distance_rate'
]

In [99]:
categorical_cols = [
    'Jockey'
]

numerical_cols = [
    'recent_5_avg_finish_pos', 'horse_track_distance_rate'
]

In [100]:
num_imputer = SimpleImputer(strategy='median')
train_df[numerical_cols] = num_imputer.fit_transform(train_df[numerical_cols])
val_df[numerical_cols] = num_imputer.transform(val_df[numerical_cols])
test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])

for col in categorical_cols:
    train_df[col] = train_df[col].astype(str).fillna('unknown')
    val_df[col] = val_df[col].astype(str).fillna('unknown')
    test_df[col] = test_df[col].astype(str).fillna('unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str).fillna('unknown')


In [101]:
X_train = train_df[categorical_cols + numerical_cols]
y_train = train_df['target']
groups_train = train_df['race_index']

X_val = val_df[categorical_cols + numerical_cols]
y_val = val_df['target']
groups_val = val_df['race_index']

X_test = test_df[categorical_cols + numerical_cols]
y_test = test_df['target']
groups_test = test_df['race_index']

In [102]:
# Sort by groups
train_sorted_idx = groups_train.argsort()
X_train = X_train.iloc[train_sorted_idx]
y_train = y_train.iloc[train_sorted_idx]
groups_train = groups_train.iloc[train_sorted_idx]

val_sorted_idx = groups_val.argsort()
X_val = X_val.iloc[val_sorted_idx]
y_val = y_val.iloc[val_sorted_idx]
groups_val = groups_val.iloc[val_sorted_idx]

test_sorted_idx = groups_test.argsort()
X_test = X_test.iloc[test_sorted_idx]
y_test = y_test.iloc[test_sorted_idx]
groups_test = groups_test.iloc[test_sorted_idx]

In [103]:
# Create CatBoost Pools
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=categorical_cols
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    group_id=groups_val,
    cat_features=categorical_cols
)

test_pool = Pool(
    data=X_test,
    label=y_test,
    group_id=groups_test,
    cat_features=categorical_cols
)

In [104]:
# Train the model with optimized parameters
model = CatBoostRanker(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    loss_function='YetiRank',
    eval_metric='NDCG:top=4',
    l2_leaf_reg=3,
    random_strength=2,
    bagging_temperature=1,
    min_data_in_leaf=10,
    has_time=True,
    verbose=100,
    random_seed=42,
    early_stopping_rounds=250
)

# Train with validation monitoring
model.fit(train_pool, eval_set=val_pool)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.4268511	best: 0.4268511 (0)	total: 22.7ms	remaining: 34.1s
100:	test: 0.4523976	best: 0.4557359 (55)	total: 1.19s	remaining: 16.5s
200:	test: 0.4548589	best: 0.4557359 (55)	total: 2.36s	remaining: 15.3s
300:	test: 0.4525273	best: 0.4557359 (55)	total: 3.52s	remaining: 14s
Stopped by overfitting detector  (250 iterations wait)

bestTest = 0.4557359235
bestIteration = 55

Shrink model to first 56 iterations.


<catboost.core.CatBoostRanker at 0x142117360>

In [105]:
y_pred = model.predict(X_test)

In [106]:
# Evaluate NDCG Score
ndcg = ndcg_score([y_test], [y_pred], k=4)
print(f"NDCG Score: {ndcg}")

NDCG Score: 0.7705300790228885


In [107]:
feature_importance = model.get_feature_importance(data = train_pool)
feature_names = model.feature_names_

In [108]:
total_importance = feature_importance.sum()
cumulative_importance = 0
important_features = []
for feat, imp in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True):
    cumulative_importance += imp
    important_features.append(feat)
    if cumulative_importance / total_importance >= 0.9:
        break


In [109]:
important_features

['Jockey', 'horse_track_distance_rate', 'recent_5_avg_finish_pos']