In [1]:
from py.race_function import scrape_current_race, scrape_current_race_no_odds
from catboost import CatBoostRanker
import pandas as pd
import numpy as np
import joblib

# inferences functions

In [2]:
def prediction(race_url, odds_url):
    model = CatBoostRanker()
    model.load_model('model/catboost_ranking_model.cbm')

    df = scrape_current_race(race_url, odds_url)
    X = df.iloc[:, :-1]
    predictions = model.predict(X)

    rated_scores = pd.DataFrame(predictions, columns = ['score'])

    name = df.iloc[:, -1]

    # Combine horse names and predicted scores into one DataFrame
    predicted = pd.concat([name.reset_index(drop=True), rated_scores.reset_index(drop=True)], axis=1)

    # Optionally rename columns if name column has no name
    predicted.columns = ['Horse_name', 'Predicted_Score']

    predicted = predicted.sort_values('Predicted_Score', ascending=False)

    return predicted

In [3]:
def prediction_no_odds(race_url):
    model = CatBoostRanker()
    model.load_model('model/catboost_ranker.cbm')

    df = scrape_current_race_no_odds(race_url)
    X = df.iloc[:, :-1]
    predictions = model.predict(X)

    rated_scores = pd.DataFrame(predictions, columns = ['score'])

    name = df.iloc[:, -1]

    # Combine horse names and predicted scores into one DataFrame
    predicted = pd.concat([name.reset_index(drop=True), rated_scores.reset_index(drop=True)], axis=1)

    # Optionally rename columns if name column has no name
    predicted.columns = ['Horse_name', 'Predicted_Score']

    predicted = predicted.sort_values('Predicted_Score', ascending=False)

    return predicted

In [4]:
def prediction_no_odds_new(race_url):
    model = CatBoostRanker()
    model.load_model('model/catboost_ranker_1.cbm')

    df = scrape_current_race_no_odds(race_url)
    X = df.iloc[:, :-1]
    predictions = model.predict(X)

    rated_scores = pd.DataFrame(predictions, columns = ['score'])

    name = df.iloc[:, -1]

    # Combine horse names and predicted scores into one DataFrame
    predicted = pd.concat([name.reset_index(drop=True), rated_scores.reset_index(drop=True)], axis=1)

    # Optionally rename columns if name column has no name
    predicted.columns = ['Horse_name', 'Predicted_Score']

    predicted = predicted.sort_values('Predicted_Score', ascending=False)

    return predicted

In [5]:
def ver_2_prediction(race_url, date, history):

    """
    function for prediction of race of the given url
    'url': english url of the current race
    'date': for feature engineering, enter the date of the race
    'history': latest dataframe of pass history data
    """

    model = CatBoostRanker()
    model.load_model('model/v2/catboost_ranker_v2.cbm')

    num_imputer = joblib.load('model/v2/num_imputer_v2.pkl')

    df1 = pd.read_csv(history)
    df = scrape_current_race_no_odds(race_url)

    df['Date'] = pd.to_datetime(date)
    new_df = pd.concat([df1, df], ignore_index = True)

    new_df['Date'] = pd.to_datetime(new_df['Date'])

    new_df = new_df.sort_values(['Horse_id', 'Date'])

    new_df['recent_3_win_rate_horse'] = (
        new_df.groupby('Horse_id')['target']
        .transform(lambda x: x.shift().rolling(window=3, min_periods = 3).mean())
    )

    new_df = new_df.sort_values(['Jockey', 'Date'])

    new_df['recent_3_win_rate_jockey'] = (
        new_df.groupby('Jockey')['target']
        .transform(lambda x: x.shift().rolling(window=3, min_periods = 3).mean())
    )

    # Expand your rolling statistics beyond just win rate
    new_df['recent_5_avg_finish_pos'] = (
        new_df.groupby('Horse_id')['target']
        .transform(lambda x: x.shift().rolling(window=5, min_periods=2).mean())
    )

    new_df['recent_3_consistency'] = (
        new_df.groupby('Horse_id')['target']
        .transform(lambda x: x.shift().rolling(window=3, min_periods=2).std())
    )

    # Jockey-Trainer combination performance
    new_df['jockey_trainer_combo_rate'] = (
        new_df.groupby(['Jockey', 'Trainer'])['target']
        .transform(lambda x: x.shift().expanding(min_periods=5).mean())
    )

    # Horse performance on specific track/distance combinations
    new_df['horse_track_distance_rate'] = (
        new_df.groupby(['Horse_id', 'track', 'Dist.'])['target']
        .transform(lambda x: x.shift().expanding(min_periods=2).mean())
    )

    overall_mean_win_rate = np.mean(new_df['target'] == 4)
    new_df['recent_3_win_rate_horse'] = new_df['recent_3_win_rate_horse'].fillna(overall_mean_win_rate)
    new_df['recent_3_win_rate_jockey'] = new_df['recent_3_win_rate_jockey'].fillna(overall_mean_win_rate)
    new_df['recent_5_avg_finish_pos'] = new_df['recent_5_avg_finish_pos'].fillna(overall_mean_win_rate)
    new_df['recent_3_consistency'] = new_df['recent_3_consistency'].fillna(overall_mean_win_rate)
    new_df['jockey_trainer_combo_rate'] = new_df['jockey_trainer_combo_rate'].fillna(overall_mean_win_rate)
    new_df['horse_track_distance_rate'] = new_df['horse_track_distance_rate'].fillna(overall_mean_win_rate)

    # slice only the current date data
    new_df = new_df[new_df['Date'] == pd.to_datetime(date)]

    # Define training features
    categorical_cols = [
        'Dist.', 'track_condition', 'RaceClass', 'Trainer', 'Jockey', 'Dam sire', 'rc', 'track', 'course', 
        'Import type', 'Sire', 'Dam', 'origin', 'age', 'colour', 'sex'
    ]
    
    numerical_cols = [
        'Rtg.', 'Act.Wt.', 'Declar.Horse Wt.', 'recent_3_win_rate_horse',
        'recent_3_win_rate_jockey', 'recent_5_avg_finish_pos',
        'recent_3_consistency', 'jockey_trainer_combo_rate',
        'horse_track_distance_rate'
    ]

    features = categorical_cols + numerical_cols

    new_df[numerical_cols] = num_imputer.transform(new_df[numerical_cols])

    X = new_df[features].copy()
    predictions = model.predict(X)

    rated_scores = pd.DataFrame(predictions, columns = ['score'])

    name = new_df['Horse_name']

    predicted = pd.concat([name.reset_index(drop=True), rated_scores.reset_index(drop=True)], axis=1)

    # Optionally rename columns if name column has no name
    predicted.columns = ['Horse_name', 'Predicted_Score']

    predicted = predicted.sort_values('Predicted_Score', ascending=False)

    return predicted



# Inferences

In [10]:
# model with NDCG = top4
df = ver_2_prediction('https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2025/10/12&Racecourse=ST&RaceNo=8', '2025-10-12', 'data/cleaned_data_20251011.csv')

track condition unknown


In [11]:
df

Unnamed: 0,Horse_name,Predicted_Score
11,MR ENERGIA,0.063803
4,MUST GO,0.04951
3,SOLID SHALAA,0.009891
10,BLAZING WIND,0.000505
5,SUGAR SUGAR,-0.003634
2,GIANT LEAP,-0.021643
1,CAMP LAND,-0.026623
7,NEW FOREST,-0.033515
6,GUMMY GUMMY,-0.038882
8,FATAL BLOW,-0.040702


In [17]:
race_url = 'https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2025/09/10&Racecourse=HV&RaceNo=8'
odds_url = 'https://bet.hkjc.com/en/racing/wp/2025-09-10/HV/8'

# df = prediction(race_url, odds_url)
df = prediction(race_url, odds_url)
df1 = prediction_no_odds(race_url)
df2 = prediction_no_odds_new(race_url)

In [18]:
df

Unnamed: 0,Horse_name,Predicted_Score
5,STORMING DRAGON,1.063786
2,SOVEREIGN FUND,0.874724
9,KING OF FIGHTERS,0.624747
0,DRAGON FOUR SEAS,0.126276
11,MONARCH COUNTY,-0.09845
8,VICTOR THE RAPID,-0.254325
4,REGAL GEM,-0.294512
3,GIANT LEAP,-0.401168
10,THRIVING BROTHERS,-0.571141
1,POWER KOEPP,-0.572337


In [19]:
df1

Unnamed: 0,Horse_name,Predicted_Score
5,STORMING DRAGON,0.57956
11,MONARCH COUNTY,0.573378
3,GIANT LEAP,0.13533
2,SOVEREIGN FUND,0.049979
1,POWER KOEPP,-0.067203
0,DRAGON FOUR SEAS,-0.089463
9,KING OF FIGHTERS,-0.11506
10,THRIVING BROTHERS,-0.549415
7,LUCKY PLANET,-0.761189
8,VICTOR THE RAPID,-1.000591


In [20]:
df2

Unnamed: 0,Horse_name,Predicted_Score
5,STORMING DRAGON,0.741216
11,MONARCH COUNTY,0.522
2,SOVEREIGN FUND,0.184918
3,GIANT LEAP,0.166472
0,DRAGON FOUR SEAS,-0.01021
1,POWER KOEPP,-0.012245
9,KING OF FIGHTERS,-0.089332
10,THRIVING BROTHERS,-0.519442
7,LUCKY PLANET,-0.748513
8,VICTOR THE RAPID,-0.882446
