In [1]:
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler

In [2]:
# import dataset

ds = xr.open_dataset('processed_race_data_with_results.nc')

In [3]:
print(ds.data_vars)  # Lists all data variables



Data variables:
    horse                          (race, starter) <U23 379kB ...
    jockey                         (race, starter) <U15 247kB ...
    trainer                        (race, starter) <U15 247kB ...
    program_number                 (race, starter) <U2 33kB ...
    surface                        (race) <U1 716B ...
    distance_f                     (race) float64 1kB ...
    purse                          (race) float64 1kB ...
    recent_race_id                 (race, starter, past_race) <U21 2MB ...
    recent_finish_pos              (race, starter, past_race) float32 82kB ...
    recent_lengths_back_finish     (race, starter, past_race) float32 82kB ...
    recent_lengths_back_last_call  (race, starter, past_race) float32 82kB ...
    recent_last_call_pos           (race, starter, past_race) int16 41kB ...
    recent_surface                 (race, starter, past_race) <U1 82kB ...
    recent_distance                (race, starter, past_race) float32 82kB ...
    rece

In [4]:
# converting to pandas dataframe
df = ds.to_dataframe()
df.tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,horse,jockey,trainer,program_number,surface,distance_f,purse,recent_race_id,recent_finish_pos,recent_lengths_back_finish,...,recent_surface,recent_distance,recent_date,recent_purse,recent_start_pos,recent_num_starters,recent_jockey,recent_trainer,finish_position,scratched
race,starter,past_race,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
CD-05-29-23-R09,19,0,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,19,1,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,19,2,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,19,3,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,19,4,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,20,0,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,20,1,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,20,2,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,20,3,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False
CD-05-29-23-R09,20,4,,,,,T,9.0,120000.0,,,,...,,,,,0,0,,,0,False


### feature engineering

In [5]:
# --- STEP 1: AGGREGATE PAST PERFORMANCE FEATURES ---

# Median of recent_finish_pos and recent_lengths_back_finish
median_finish_pos = ds["recent_finish_pos"].median(dim="past_race")
median_lengths_back = ds["recent_lengths_back_finish"].median(dim="past_race")

In [6]:
# Range (max - min) of recent_distance
distance_range = ds["recent_distance"].max(dim="past_race") - ds["recent_distance"].min(dim="past_race")

# Average gate position (recent_start_pos)
avg_start_pos = ds["recent_start_pos"].mean(dim="past_race")

# Average number of starters (recent_num_starters)
avg_num_starters = ds["recent_num_starters"].mean(dim="past_race")

In [7]:
# --- STEP 2: BASE FEATURES ---

program_number = ds["program_number"]
surface_code = (ds["surface"] == "D").astype(int)  # Dirt=1, Turf=0
distance_f = ds["distance_f"]
purse = ds["purse"]

In [8]:
# --- STEP 3: ENCODE recent_jockey and recent_trainer via frequency encoding ---

# Convert to DataFrame
recent_jockey_df = ds["recent_jockey"].to_dataframe(name="recent_jockey").reset_index()
recent_trainer_df = ds["recent_trainer"].to_dataframe(name="recent_trainer").reset_index()

# Frequency of appearance
jockey_freq_map = recent_jockey_df["recent_jockey"].value_counts().to_dict()
trainer_freq_map = recent_trainer_df["recent_trainer"].value_counts().to_dict()

# Map frequencies
recent_jockey_df["jockey_freq"] = recent_jockey_df["recent_jockey"].map(jockey_freq_map)
recent_trainer_df["trainer_freq"] = recent_trainer_df["recent_trainer"].map(trainer_freq_map)

# Aggregate frequency by median across past_race
jockey_encoded = recent_jockey_df.groupby(["race", "starter"])["jockey_freq"].median().reset_index()
trainer_encoded = recent_trainer_df.groupby(["race", "starter"])["trainer_freq"].median().reset_index()


In [9]:
# --- STEP 4: ASSEMBLE FINAL FEATURE SET ---

df = xr.Dataset({
    "program_number": program_number,
    "surface_code": surface_code,
    "distance_f": distance_f,
    "purse": purse,
    "median_finish_pos": median_finish_pos,
    "median_lengths_back": median_lengths_back,
    "distance_range": distance_range,
    "avg_start_pos": avg_start_pos,
    "avg_num_starters": avg_num_starters
}).to_dataframe().reset_index()


In [10]:
# Merge in encoded jockey and trainer
df = df.merge(jockey_encoded, on=["race", "starter"], how="left")
df = df.merge(trainer_encoded, on=["race", "starter"], how="left")

In [11]:
# Extract 'scratched' info and merge if not yet in df
if 'scratched' not in df.columns:
    scratched_df = ds["scratched"].to_dataframe().reset_index()
    df = df.merge(scratched_df, on=["race", "starter"], how="left")

# Filter
df = df[df["scratched"] == False]

# df is now your clean, engineered feature matrix ready for modeling
df.head()

Unnamed: 0,race,starter,program_number,surface_code,distance_f,purse,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched
0,CD-05-02-23-R01,0,1,1,7.0,30000.0,7.0,1250.0,1.0,2.4,8.4,90.0,40.0,False
1,CD-05-02-23-R01,1,2,1,7.0,30000.0,9.5,2042.5,2.0,4.0,8.0,165.0,80.0,False
3,CD-05-02-23-R01,3,4,1,7.0,30000.0,7.0,850.0,2.0,8.2,9.0,20.0,10.0,False
4,CD-05-02-23-R01,4,5,1,7.0,30000.0,4.0,400.0,1.5,4.0,10.0,70.0,54.0,False
5,CD-05-02-23-R01,5,6,1,7.0,30000.0,8.0,1355.0,0.5,4.6,6.6,200.0,34.0,False


In [12]:
# add the target
# Extract the target variable
finish_position_df = ds["finish_position"].to_dataframe().reset_index()

# Merge into your feature DataFrame
df = df.merge(finish_position_df, on=["race", "starter"], how="left")

df

Unnamed: 0,race,starter,program_number,surface_code,distance_f,purse,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched,finish_position
0,CD-05-02-23-R01,0,1,1,7.0,30000.0,7.0,1250.0,1.0,2.4,8.4,90.0,40.0,False,2
1,CD-05-02-23-R01,1,2,1,7.0,30000.0,9.5,2042.5,2.0,4.0,8.0,165.0,80.0,False,3
2,CD-05-02-23-R01,3,4,1,7.0,30000.0,7.0,850.0,2.0,8.2,9.0,20.0,10.0,False,1
3,CD-05-02-23-R01,4,5,1,7.0,30000.0,4.0,400.0,1.5,4.0,10.0,70.0,54.0,False,4
4,CD-05-02-23-R01,5,6,1,7.0,30000.0,8.0,1355.0,0.5,4.6,6.6,200.0,34.0,False,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3835,CD-05-29-23-R09,18,,0,9.0,120000.0,,,,0.0,0.0,13724.0,13724.0,False,0
3836,CD-05-29-23-R09,19,,0,9.0,120000.0,,,,0.0,0.0,13724.0,13724.0,False,0
3837,CD-05-29-23-R09,20,,0,9.0,120000.0,,,,0.0,0.0,13724.0,13724.0,False,0
3838,CD-05-29-23-R09,21,,0,9.0,120000.0,,,,0.0,0.0,13724.0,13724.0,False,0


In [13]:
# Add horse info
horse_df = ds["horse"].to_dataframe(name="horse").reset_index()
df = df.merge(horse_df, on=["race", "starter"], how="left")

# Reorder columns and drop 'starter'
cols = ['race', 'starter', 'horse'] + [col for col in df.columns if col not in ['race', 'horse', 'starter']]
df = df[cols]


In [14]:
df.head(20)

Unnamed: 0,race,starter,horse,program_number,surface_code,distance_f,purse,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched,finish_position
0,CD-05-02-23-R01,0,Gormleyesque,1.0,1,7.0,30000.0,7.0,1250.0,1.0,2.4,8.4,90.0,40.0,False,2
1,CD-05-02-23-R01,1,Kentucky Reign,2.0,1,7.0,30000.0,9.5,2042.5,2.0,4.0,8.0,165.0,80.0,False,3
2,CD-05-02-23-R01,3,Dogwoodsmilliejane,4.0,1,7.0,30000.0,7.0,850.0,2.0,8.2,9.0,20.0,10.0,False,1
3,CD-05-02-23-R01,4,Girls House,5.0,1,7.0,30000.0,4.0,400.0,1.5,4.0,10.0,70.0,54.0,False,4
4,CD-05-02-23-R01,5,Recite,6.0,1,7.0,30000.0,8.0,1355.0,0.5,4.6,6.6,200.0,34.0,False,5
5,CD-05-02-23-R01,6,Start Class,7.0,1,7.0,30000.0,8.0,3450.0,4.0,6.4,9.4,27.0,5.0,False,6
6,CD-05-02-23-R01,7,,,1,7.0,30000.0,,,,0.0,0.0,13724.0,13724.0,False,0
7,CD-05-02-23-R01,8,,,1,7.0,30000.0,,,,0.0,0.0,13724.0,13724.0,False,0
8,CD-05-02-23-R01,9,,,1,7.0,30000.0,,,,0.0,0.0,13724.0,13724.0,False,0
9,CD-05-02-23-R01,10,,,1,7.0,30000.0,,,,0.0,0.0,13724.0,13724.0,False,0


In [15]:
df.tail(20)

Unnamed: 0,race,starter,horse,program_number,surface_code,distance_f,purse,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched,finish_position
3820,CD-05-29-23-R09,0,Blessing the Flag,1.0,0,9.0,120000.0,2.0,525.0,0.5,3.6,9.6,123.0,37.0,False,6
3821,CD-05-29-23-R09,1,Mad Caper,2.0,0,9.0,120000.0,5.5,800.0,0.0,2.2,3.0,13724.0,13724.0,False,12
3822,CD-05-29-23-R09,2,Abbreviation,3.0,0,9.0,120000.0,4.5,562.5,2.0,4.4,6.8,41.0,115.0,False,10
3823,CD-05-29-23-R09,3,Lady Dynamo,4.0,0,9.0,120000.0,4.5,600.0,2.5,1.8,3.2,13724.0,13724.0,False,4
3824,CD-05-29-23-R09,4,Boppy,5.0,0,9.0,120000.0,2.0,100.0,0.0,0.8,1.6,13724.0,13724.0,False,3
3825,CD-05-29-23-R09,5,Summertime Rose,6.0,0,9.0,120000.0,5.5,652.5,0.0,6.2,8.4,177.0,108.0,False,9
3826,CD-05-29-23-R09,6,Dixie Supreme,7.0,0,9.0,120000.0,6.5,592.5,0.5,2.4,3.8,13724.0,13724.0,False,7
3827,CD-05-29-23-R09,7,Harmonica,8.0,0,9.0,120000.0,6.0,1275.0,1.5,1.8,6.4,200.0,40.0,False,8
3828,CD-05-29-23-R09,8,Sri Lanka,9.0,0,9.0,120000.0,3.0,112.5,0.5,2.8,3.8,13724.0,13724.0,False,2
3829,CD-05-29-23-R09,9,Freydis the Red (FR),10.0,0,9.0,120000.0,2.0,300.0,0.5,5.0,9.4,102.0,205.0,False,1


In [16]:
df.where(df['race'] == 'CD-05-29-23-R01').dropna().head(22)

Unnamed: 0,race,starter,horse,program_number,surface_code,distance_f,purse,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched,finish_position
3645,CD-05-29-23-R01,0.0,Lucky Phoenix,1,1.0,8.0,50000.0,4.0,2130.0,1.5,1.0,2.2,13724.0,13724.0,False,6.0
3646,CD-05-29-23-R01,1.0,Origami Girl,2,1.0,8.0,50000.0,7.0,900.0,2.0,4.6,5.8,182.0,56.0,False,3.0
3647,CD-05-29-23-R01,2.0,Lotsandlotsofgold,3,1.0,8.0,50000.0,4.0,975.0,2.5,8.0,8.8,123.0,28.0,False,1.0
3648,CD-05-29-23-R01,3.0,Merrily Rush,4,1.0,8.0,50000.0,7.0,1175.0,0.0,-0.6,1.6,13724.0,13724.0,False,4.0
3649,CD-05-29-23-R01,4.0,Malibu Smart,5,1.0,8.0,50000.0,7.0,1600.0,0.0,-0.6,0.8,13724.0,13724.0,False,2.0
3650,CD-05-29-23-R01,5.0,Embraceable,6,1.0,8.0,50000.0,6.0,1625.0,0.0,0.8,0.8,13724.0,13724.0,False,5.0


In [17]:
# concate the first 3 columns and name them as race_id
df['race_id'] = df['race'].astype(str) + '-' + df['starter'].astype(str) + '-' + df['horse'].astype(str)
df = df[['race_id'] + [col for col in df.columns if col not in ['race', 'starter', 'horse', 'race_id', 'purse']]]

df.head(20)


Unnamed: 0,race_id,program_number,surface_code,distance_f,median_finish_pos,median_lengths_back,distance_range,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,scratched,finish_position
0,CD-05-02-23-R01-0-Gormleyesque,1.0,1,7.0,7.0,1250.0,1.0,2.4,8.4,90.0,40.0,False,2
1,CD-05-02-23-R01-1-Kentucky Reign,2.0,1,7.0,9.5,2042.5,2.0,4.0,8.0,165.0,80.0,False,3
2,CD-05-02-23-R01-3-Dogwoodsmilliejane,4.0,1,7.0,7.0,850.0,2.0,8.2,9.0,20.0,10.0,False,1
3,CD-05-02-23-R01-4-Girls House,5.0,1,7.0,4.0,400.0,1.5,4.0,10.0,70.0,54.0,False,4
4,CD-05-02-23-R01-5-Recite,6.0,1,7.0,8.0,1355.0,0.5,4.6,6.6,200.0,34.0,False,5
5,CD-05-02-23-R01-6-Start Class,7.0,1,7.0,8.0,3450.0,4.0,6.4,9.4,27.0,5.0,False,6
6,CD-05-02-23-R01-7-,,1,7.0,,,,0.0,0.0,13724.0,13724.0,False,0
7,CD-05-02-23-R01-8-,,1,7.0,,,,0.0,0.0,13724.0,13724.0,False,0
8,CD-05-02-23-R01-9-,,1,7.0,,,,0.0,0.0,13724.0,13724.0,False,0
9,CD-05-02-23-R01-10-,,1,7.0,,,,0.0,0.0,13724.0,13724.0,False,0


In [18]:
print(df.columns)

Index(['race_id', 'program_number', 'surface_code', 'distance_f',
       'median_finish_pos', 'median_lengths_back', 'distance_range',
       'avg_start_pos', 'avg_num_starters', 'jockey_freq', 'trainer_freq',
       'scratched', 'finish_position'],
      dtype='object')


#### Working on the Test data to have the same format as df

In [19]:
pred_df = pd.read_csv('C:/inClassTemp/fsan830spring2025/students/okediran_tunmbi/preprocessing/test/CDX0426_processed.csv')
pred_df.head()



Unnamed: 0,track_code,race_date,race_number,post_position,entry,distance,surface_code,race_type,claiming_price_category,race_class,...,trainerPrevYrShows,jockeyPrevYrStarts,jockeyPrevYrWins,jockeyPrevYrPlaces,jockeyPrevYrShows,currentSireStudFee,BrisDirtPedigree,BrisMudPedigree,BrisTurfPedigree,BrisDistPedigree
0,CD,20250426,1,1,,1760,D,C,CUN,Clm 12500,...,22,1015,155,133,128,7500.0,104,103,109,103
1,CD,20250426,1,2,,1760,D,C,CUN,Clm 12500,...,6,365,58,42,41,3500.0,101,100,99,99
2,CD,20250426,1,3,,1760,D,C,CUN,Clm 12500,...,19,449,51,69,50,,111,111,109,110
3,CD,20250426,1,4,,1760,D,C,CUN,Clm 12500,...,6,290,28,29,35,7500.0,101,101,104,100
4,CD,20250426,1,5,,1760,D,C,CUN,Clm 12500,...,4,280,41,26,37,,104,104,104,104


In [20]:
today_df = pd.read_csv('C:/inClassTemp/fsan830spring2025/students/okediran_tunmbi/preprocessing/test/CDX0515_processed.csv')
today_df.head()

Unnamed: 0,track_code,race_date,race_number,post_position,entry,distance,surface_code,race_type,claiming_price_category,race_class,...,trainerPrevYrShows,jockeyPrevYrStarts,jockeyPrevYrWins,jockeyPrevYrPlaces,jockeyPrevYrShows,currentSireStudFee,BrisDirtPedigree,BrisMudPedigree,BrisTurfPedigree,BrisDistPedigree
0,CD,20250515,1,1,,1430,D,C,BUM,Clm 20000b,...,2,151,8,16,14,,102,101,103,102
1,CD,20250515,1,2,,1430,D,C,BUM,Clm 20000b,...,28,694,97,68,86,10000.0,109,107,108,109
2,CD,20250515,1,3,,1430,D,C,BUM,Clm 20000b,...,19,618,79,88,81,15000.0,106,106,104,106
3,CD,20250515,1,4,,1430,D,C,BUM,Clm 20000b,...,11,713,105,117,111,5000.0,102,102,107,103
4,CD,20250515,1,5,,1430,D,C,BUM,Clm 20000b,...,9,252,16,17,19,2500.0,100,101,100,101


In [21]:
def process_test_data(test_df, scaler=None):
    # Create initial features
    processed_df = pd.DataFrame()

    # Convert race_date from int/str to datetime
    race_date = pd.to_datetime(test_df['race_date'].astype(str), format='%Y%m%d')

    # Format as MM-DD-YY
    formatted_date = race_date.dt.strftime('%m-%d-%y')

    # Create race and starter columns
    processed_df['race'] = test_df['track_code'] + '-' + formatted_date + '-R' + test_df['race_number'].astype(str).str.zfill(2)
    processed_df['starter'] = test_df.groupby(['track_code', 'race_date', 'race_number']).cumcount()

    # Add horse name
    processed_df['horse_name'] = test_df['horse_name']

    # Basic features
    processed_df['surface_code'] = (test_df['surface_code'] == 'D').astype(int)
    processed_df['distance_f'] = test_df['distance'].astype(float) / 220
    processed_df['purse'] = test_df['purse'].astype(float)
    processed_df['scratched'] = test_df.get('scratched', False)

    # Aggregated features
    recent_finish_cols = [f'recentFinishPosition{i}' for i in range(1, 11)]
    for col in recent_finish_cols:
        test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
    processed_df['median_finish_pos'] = test_df[recent_finish_cols].median(axis=1)

    processed_df['median_lengths_back'] = np.nan  # placeholder
    processed_df['distance_range'] = np.nan       # placeholder

    recent_post_cols = [f'recentPostPosition{i}' for i in range(1, 11)]
    processed_df['avg_start_pos'] = test_df[recent_post_cols].mean(axis=1)

    recent_entrants_cols = [f'recentNumEntrants{i}' for i in range(1, 11)]
    processed_df['avg_num_starters'] = test_df[recent_entrants_cols].mean(axis=1)

    # Frequencies
    processed_df['jockey_freq'] = test_df['jockeyPrevYrStarts']
    processed_df['trainer_freq'] = test_df['trainerPrevYrStarts']
    processed_df['finish_position'] = 0

    # Reset and construct race_id
    processed_df = processed_df.reset_index(drop=True)
    processed_df['race_id'] = (
        processed_df['race'].astype(str) + '-' +
        processed_df['starter'].astype(str) + '-' +
        processed_df['horse_name'].astype(str)
    ).str.lower()

    # Reorder and drop unnecessary columns
    processed_df = processed_df[['race_id'] + [col for col in processed_df.columns
                                               if col not in ['race', 'starter', 'horse_name', 'race_id', 'purse']]]

    # Optional scaling
    if scaler is not None:
        numeric_cols = ['surface_code', 'distance_f', 'median_finish_pos', 'median_lengths_back',
                        'distance_range', 'avg_start_pos', 'avg_num_starters',
                        'jockey_freq', 'trainer_freq']
        processed_df[numeric_cols] = scaler.transform(processed_df[numeric_cols])

    return processed_df


In [22]:
pred_df = process_test_data(pred_df)

In [23]:
today_df = process_test_data(today_df)



In [24]:
df['race_id'] = df['race_id'].str.lower()
df = df.drop(columns=['program_number', 'distance_range', 'scratched'])
df.head()



Unnamed: 0,race_id,surface_code,distance_f,median_finish_pos,median_lengths_back,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,finish_position
0,cd-05-02-23-r01-0-gormleyesque,1,7.0,7.0,1250.0,2.4,8.4,90.0,40.0,2
1,cd-05-02-23-r01-1-kentucky reign,1,7.0,9.5,2042.5,4.0,8.0,165.0,80.0,3
2,cd-05-02-23-r01-3-dogwoodsmilliejane,1,7.0,7.0,850.0,8.2,9.0,20.0,10.0,1
3,cd-05-02-23-r01-4-girls house,1,7.0,4.0,400.0,4.0,10.0,70.0,54.0,4
4,cd-05-02-23-r01-5-recite,1,7.0,8.0,1355.0,4.6,6.6,200.0,34.0,5


In [25]:
df['finish_position'] = np.where(df['finish_position'].isin([1]), 1, 0)
df.head()


Unnamed: 0,race_id,surface_code,distance_f,median_finish_pos,median_lengths_back,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,finish_position
0,cd-05-02-23-r01-0-gormleyesque,1,7.0,7.0,1250.0,2.4,8.4,90.0,40.0,0
1,cd-05-02-23-r01-1-kentucky reign,1,7.0,9.5,2042.5,4.0,8.0,165.0,80.0,0
2,cd-05-02-23-r01-3-dogwoodsmilliejane,1,7.0,7.0,850.0,8.2,9.0,20.0,10.0,1
3,cd-05-02-23-r01-4-girls house,1,7.0,4.0,400.0,4.0,10.0,70.0,54.0,0
4,cd-05-02-23-r01-5-recite,1,7.0,8.0,1355.0,4.6,6.6,200.0,34.0,0


In [26]:
pred_df = pred_df.drop(columns=['distance_range', 'scratched'])
pred_df.head()

Unnamed: 0,race_id,surface_code,distance_f,median_finish_pos,median_lengths_back,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,finish_position
0,cd -04-26-25-r01-0-tiberius mercurius,1,8.0,8.0,,3.8,10.2,1015,177,0
1,cd -04-26-25-r01-1-where is lisa,1,8.0,4.5,,4.3,7.9,365,43,0
2,cd -04-26-25-r01-2-coyote road,1,8.0,3.0,,5.2,8.1,449,144,0
3,cd -04-26-25-r01-3-ready pursuit,1,8.0,5.5,,4.9,8.6,290,44,0
4,cd -04-26-25-r01-4-peruvian lucky,1,8.0,3.5,,6.2,10.0,280,29,0


In [27]:
today_df = today_df.drop(columns=['distance_range', 'scratched'])
today_df.head()


Unnamed: 0,race_id,surface_code,distance_f,median_finish_pos,median_lengths_back,avg_start_pos,avg_num_starters,jockey_freq,trainer_freq,finish_position
0,cd -05-15-25-r01-0-balladry,1,6.5,4.5,,4.8,7.8,151,42,0
1,cd -05-15-25-r01-1-where's the wine,1,6.5,3.0,,5.0,9.0,694,217,0
2,cd -05-15-25-r01-2-princess pom pom,1,6.5,3.5,,5.4,9.0,618,115,0
3,cd -05-15-25-r01-3-ask amanda,1,6.5,3.5,,6.166667,8.666667,713,120,0
4,cd -05-15-25-r01-4-spirit rules,1,6.5,3.5,,4.7,7.9,252,79,0


In [28]:
df = df.set_index('race_id')
pred_df = pred_df.set_index('race_id')
today_df = today_df.set_index('race_id')

In [29]:
today_df.columns

Index(['surface_code', 'distance_f', 'median_finish_pos',
       'median_lengths_back', 'avg_start_pos', 'avg_num_starters',
       'jockey_freq', 'trainer_freq', 'finish_position'],
      dtype='object')

### PyMC BART Regression

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pymc as pm
import pymc_bart as pmb
import arviz as az
import matplotlib.pyplot as plt

In [31]:
# === Load and preprocess data ===
# Assuming df is already loaded and indexed by race_id
features = ['surface_code', 'distance_f', 'median_finish_pos', 'avg_start_pos']
target_col = 'finish_position'

# Replace empty strings with NaN and cast to float
df[features] = df[features].replace(r'^\s*$', np.nan, regex=True).astype(float)
y = df[target_col].astype(float)
X = df[features]

In [None]:


# Drop non-numeric columns if needed (defensive)
numeric_cols = X.select_dtypes(include=[np.number]).columns
if len(numeric_cols) < X.shape[1]:
    print("Warning: Dropping non-numeric columns:", set(X.columns) - set(numeric_cols))
    X = X[numeric_cols]

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Optional: Apply SMOTE (good if you're treating finish_position as ordinal score)
smote = SMOTE(random_state=123)
X_train, y_train = smote.fit_resample(X_train, y_train)

# === Build and train BART model ===
with pm.Model() as model:
    X_shared = pm.Data("X", X_train)
    μ = pmb.BART("μ", X=X_shared, Y=y_train, m=50)

    p = pm.Deterministic("p", pm.math.sigmoid(μ))
   
    y_obs = pm.Bernoulli("y_obs", p=p, observed=y_train)
    trace = pm.sample(draws=1000, tune=1000, chains=4, cores=4, random_seed=42, return_inferencedata=True)

Multiprocess sampling (4 chains in 4 jobs)
PGBART: [μ]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 307 seconds.
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details


In [33]:
# === Predict on test set ===
with model:
    pm.set_data({"X": X_test})
    ppc = pm.sample_posterior_predictive(trace, var_names=["p"])#, random_seed=42)

Sampling: [μ]


Output()

In [34]:
# Extract predictions
# Extract predictions
y_pred_samples = ppc.posterior_predictive["p"]#.values
y_pred_samples_array = y_pred_samples.values
# y_pred_mean = y_pred_samples_array.mean(axis=0) #.squeeze()
# y_pred_q05 = np.quantile(y_pred_samples, 0.05, axis=0)
# y_pred_q95 = np.quantile(y_pred_samples, 0.95, axis=0)

# Reshape to (total_samples, n_obs)
n_chains, n_draws, n_obs = y_pred_samples_array.shape
y_pred_samples_array = y_pred_samples_array.reshape(-1, n_obs)  # shape: (4000, n_obs)



In [35]:
# Compute posterior predictive summaries
y_pred_mean = y_pred_samples_array.mean(axis=0)   # mean predicted probability
y_pred_q05 = np.quantile(y_pred_samples_array, 0.05, axis=0)
y_pred_q95 = np.quantile(y_pred_samples_array, 0.95, axis=0)

In [36]:
az.rhat(trace)


In [37]:
# az.plot_trace(trace, var_names=["p"])


### Prediction

In [38]:
today_df[features] = today_df[features].replace(r'^\s*$', np.nan, regex=True).astype(float)

# Drop non-numeric columns defensively
numeric_cols = today_df[features].select_dtypes(include=[np.number]).columns
if len(numeric_cols) < len(features):
    print("Warning: Dropping non-numeric columns from today_df:", set(features) - set(numeric_cols))
    today_df = today_df[numeric_cols]

# Impute using previously fitted imputer
X_today_imputed = imputer.transform(today_df[features])

In [39]:
X_today_scaled = scaler.transform(X_today_imputed)

In [40]:
# === Step 2: Predict using the BART model ===
# with model:
#     pm.set_data({"X": X_today_scaled})
#     ppc_today = pm.sample_posterior_predictive(trace, var_names=["μ"], random_seed=42)

# # === Predict on test set ===
with model:
    pm.set_data({"X": X_today_scaled})
    ppc_today = pm.sample_posterior_predictive(trace, var_names=["p"])#, random_seed=42)

Sampling: [μ]


Output()

In [41]:
# Extract predictions
# Extract predictions
y_pred_samples = ppc.posterior_predictive["p"]#.values
y_pred_samples_array = y_pred_samples.values
# y_pred_mean = y_pred_samples_array.mean(axis=0) #.squeeze()
# y_pred_q05 = np.quantile(y_pred_samples, 0.05, axis=0)
# y_pred_q95 = np.quantile(y_pred_samples, 0.95, axis=0)

# Reshape to (total_samples, n_obs)
n_chains, n_draws, n_obs = y_pred_samples_array.shape
y_pred_samples_array = y_pred_samples_array.reshape(-1, n_obs)  # shape: (4000, n_obs)

In [45]:
# === Step 3: Extract posterior mean predictions ===
y_today_samples = ppc_today.posterior_predictive["p"]  # shape: (1000, n_today_obs)
y_today_samples_array = y_today_samples.values

# Reshape to (total_samples, n_obs)
n_chains, n_draws, n_obs = y_today_samples_array.shape
y_today_samples_array = y_today_samples_array.reshape(-1, n_obs)  # shape: (4000, n_obs)

In [46]:
# Compute posterior predictive summaries
y_pred_mean = y_pred_samples_array.mean(axis=0)   # mean predicted probability
y_pred_q05 = np.quantile(y_pred_samples_array, 0.05, axis=0)
y_pred_q95 = np.quantile(y_pred_samples_array, 0.95, axis=0)

In [47]:
y_today_mean = y_today_samples_array.mean(axis=0) 
y_today_q05 = np.quantile(y_today_samples, 0.05, axis=0)
y_today_q95 = np.quantile(y_today_samples, 0.95, axis=0)

In [48]:
print("Raw shape:", y_today_samples_array.shape)  # Should be (1000, 86)



Raw shape: (4000, 86)


In [49]:
print("today_df.shape:", today_df.shape)
print("y_today_mean shape:", y_today_mean.shape)


today_df.shape: (86, 9)
y_today_mean shape: (86,)


In [57]:
# === Step 4: Save or view results ===
today_predictions = pd.DataFrame({
    "race_id": today_df.index,
    "predicted_mean": y_today_mean
})

today_predictions

Unnamed: 0,race_id,predicted_mean
0,cd -05-15-25-r01-0-balladry,0.712093
1,cd -05-15-25-r01-1-where's the wine,0.795484
2,cd -05-15-25-r01-2-princess pom pom,0.748742
3,cd -05-15-25-r01-3-ask amanda,0.685241
4,cd -05-15-25-r01-4-spirit rules,0.759900
...,...,...
81,cd -05-15-25-r08-9-nyquist frequency,0.638055
82,cd -05-15-25-r08-10-electioneering,0.591449
83,cd -05-15-25-r08-11-tapakena,0.691892
84,cd -05-15-25-r08-12-mo jackson,0.759106


In [58]:
today_predictions.head(12)

Unnamed: 0,race_id,predicted_mean
0,cd -05-15-25-r01-0-balladry,0.712093
1,cd -05-15-25-r01-1-where's the wine,0.795484
2,cd -05-15-25-r01-2-princess pom pom,0.748742
3,cd -05-15-25-r01-3-ask amanda,0.685241
4,cd -05-15-25-r01-4-spirit rules,0.7599
5,cd -05-15-25-r01-5-itsablingthing,0.725071
6,cd -05-15-25-r02-0-chiquita's way,0.757566
7,cd -05-15-25-r02-1-swiftliketaylor,0.677314
8,cd -05-15-25-r02-2-color comin' in,0.677314
9,cd -05-15-25-r02-3-shez twisted,0.677314


In [70]:
## Extracting winners

today_predictions['raceID'] = today_predictions["race_id"].str.extract(r"(cd\s*-\d{2}-\d{2}-\d{2}-r\d{2})")
today_predictions["horse_name"] = today_predictions["race_id"].str.extract(r"-r\d{2}-\d+-(.*)$")

# Get horse with highest predicted_mean per race
winners = today_predictions.loc[today_predictions.groupby("raceID")["predicted_mean"].idxmax()]

In [71]:
# Clean up display
winners = winners[["raceID", "horse_name", "predicted_mean"]].sort_values(by="raceID").reset_index(drop=True)

winners = winners.rename(columns={
    "horse_name": "Winner(horsename)",
    "predicted_mean": "probability"
})

winners


Unnamed: 0,raceID,Winner(horsename),probability
0,cd -05-15-25-r01,where's the wine,0.795484
1,cd -05-15-25-r02,chiquita's way,0.757566
2,cd -05-15-25-r03,maerdama,0.817671
3,cd -05-15-25-r04,intermittent fast,0.770992
4,cd -05-15-25-r05,ghost prince,0.74461
5,cd -05-15-25-r06,pharoah's heart,0.782369
6,cd -05-15-25-r07,stellar asset,0.786861
7,cd -05-15-25-r08,ambridge,0.798849
