In [11]:
import pandas as pd
import numpy as np

In [15]:
def process_brisnet_file(raw_csv_path, output_csv_path):
    # Step 1: Load raw BRISNET CSV (no headers)
    df = pd.read_csv(raw_csv_path, header=None)

    # Step 2: Compute jockey-level stats from columns 616–625 (past 10 finishes)
    jockey_col = 32
    finish_cols = list(range(616, 626))  # 10 past finish positions

    jockey_finish_df = df[[jockey_col] + finish_cols].copy()
    jockey_finish_df.columns = ["Jockey"] + [f"PP{i}_FinishPosition" for i in range(1, 11)]

    # Melt into long format and clean
    jockey_perf = (
        jockey_finish_df
        .melt(id_vars="Jockey", value_vars=[f"PP{i}_FinishPosition" for i in range(1, 11)],
              value_name="FinishPosition")
        .dropna()
    )
    jockey_perf["FinishPosition"] = pd.to_numeric(jockey_perf["FinishPosition"], errors="coerce")

    # Compute average finish position and win rate
    jockey_avg_finish = jockey_perf.groupby("Jockey")["FinishPosition"].mean().to_dict()
    jockey_winrate = (
        jockey_perf.assign(Won=jockey_perf["FinishPosition"] == 1)
        .groupby("Jockey")["Won"]
        .mean()
        .to_dict()
    )

    # Step 3: Select relevant columns for engineered dataset
    column_map = {
        "Track": 0,
        "Date": 1,
        "RaceNumber": 2,
        "Distance": 5,
        "Surface": 6,
        "RaceType": 8,
        "Jockey": 32,
        "Odds": 43,
        "HorseName": 44,
        "Sex": 48,
        "Color": 49,
        "WeightCarried": 50,
        "FinishPosition": 624,
    }

    for i in range(5):
        column_map.update({
            f"PP{i+1}_FinishPosition": 615 + i,
            f"PP{i+1}_Distance": 315 + i,
            f"PP{i+1}_Surface": 325 + i,
            f"PP{i+1}_LengthsBack": 745 + i,
            f"PP{i+1}_NumStarters": 345 + i,
            f"PP{i+1}_Purse": 555 + i,
            f"PP{i+1}_Jockey": 1065 + i,
            f"PP{i+1}_Trainer": 1055 + i,
        })

    # Step 4: Extract and rename columns
    df = df[list(column_map.values())].copy()
    df.columns = list(column_map.keys())

    # Step 5: Standardize Surface
    for col in [c for c in df.columns if "Surface" in c]:
        df[col] = df[col].replace({"D": "Dirt", "T": "Turf"})

    # Step 6: Cap LengthsBack at 100
    for col in [c for c in df.columns if "LengthsBack" in c]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
        df[col] = df[col].apply(lambda x: min(x, 100.0) if pd.notna(x) else x)

    # Step 7: Handle missing values
    numerical_pp_cols = [c for c in df.columns if c.startswith("PP") and any(x in c for x in ["Distance", "Purse", "Lengths", "NumStarters", "Position"])]
    categorical_pp_cols = [c for c in df.columns if c.startswith("PP") and any(x in c for x in ["Surface", "Jockey", "Trainer"])]

    df[numerical_pp_cols] = df[numerical_pp_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
    df[categorical_pp_cols] = df[categorical_pp_cols].fillna("Unknown")

    # Step 8: Feature Engineering
    pp_finish_cols = [f"PP{i}_FinishPosition" for i in range(1, 6)]
    pp_distance_cols = [f"PP{i}_Distance" for i in range(1, 6)]

    # 1. AvgPastFinishPosition
    df["AvgPastFinishPosition"] = df[pp_finish_cols].replace(0, np.nan).mean(axis=1).fillna(0)

    # 2. WinRate (fraction of past races won)
    df["WinRate"] = df[pp_finish_cols].apply(lambda row: (row == 1).sum() / (row != 0).sum() if (row != 0).sum() > 0 else 0, axis=1)

    # 3. DistanceSuitability
    df["Distance"] = pd.to_numeric(df["Distance"], errors="coerce")
    for col in pp_distance_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["DistanceSuitability"] = df[pp_distance_cols].sub(df["Distance"], axis=0).abs().mean(axis=1).fillna(0)

    # 4. Jockey metrics
    df["JockeyWinRate"] = df["Jockey"].map(jockey_winrate).fillna(0)
    df["JockeyAvgFinish"] = df["Jockey"].map(jockey_avg_finish).fillna(np.nan)

    # 5. FieldStrength and RaceID
    df["RaceID"] = df["Track"].astype(str) + "_" + df["Date"].astype(str) + "_" + df["RaceNumber"].astype(str)
    df["FieldStrength"] = df.groupby("RaceID")["AvgPastFinishPosition"].transform("mean")
    df
    # 6. SurfaceWinRate
    def surface_win_rate(row):
        matches, total = 0, 0
        for i in range(1, 6):
            surf = row.get(f"PP{i}_Surface")
            win = row.get(f"PP{i}_FinishPosition")
            if pd.notna(surf) and surf != "Unknown":
                total += 1
                if win == 1 and surf == row["Surface"]:
                    matches += 1
        return matches / total if total > 0 else 0

    df["SurfaceWinRate"] = df.apply(surface_win_rate, axis=1)

    # Normalize Distance
    df["Distance"] = (df["Distance"] - df["Distance"].min()) / (df["Distance"].max() - df["Distance"].min())

    # One-hot encode Surface
    df = pd.get_dummies(df, columns=["Surface"], prefix="Surface")

    # Step 9: Select final columns
    final_columns = [
        "HorseName", "RaceID", "RaceNumber",
        "Distance", "Surface_Dirt", "Surface_Turf",
        "AvgPastFinishPosition", "WinRate", "DistanceSuitability",
        "JockeyWinRate", "JockeyAvgFinish", "FieldStrength", "SurfaceWinRate"
    ]
    df = df[final_columns]

    # Step 10: Save
    df.to_csv(output_csv_path, index=False)
    print("DONE")
    return df



In [16]:
# Run the function
df = process_brisnet_file("data/CDX0515.csv", "data/CDX0515_processed_for_prediction.csv")

DONE


In [14]:
df.head(100)

Unnamed: 0,HorseName,RaceID,RaceNumber,Distance,Surface_Dirt,Surface_Turf,AvgPastFinishPosition,WinRate,DistanceSuitability,JockeyWinRate,JockeyAvgFinish,FieldStrength,SurfaceWinRate
0,BALLADRY,CD _20250515_1,1,0.375,True,False,5.6,0.0,110.0,0.100000,5.300000,4.233333,0.0
1,WHERE'S THE WINE,CD _20250515_1,1,0.375,True,False,2.8,0.4,1466.0,0.166667,3.250000,4.233333,0.2
2,PRINCESS POM POM,CD _20250515_1,1,0.375,True,False,3.4,0.4,638.0,0.178571,4.535714,4.233333,0.4
3,ASK AMANDA,CD _20250515_1,1,0.375,True,False,4.0,0.2,308.0,0.333333,3.500000,4.233333,0.2
4,SPIRIT RULES,CD _20250515_1,1,0.375,True,False,3.4,0.4,154.0,0.111111,5.666667,4.233333,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,NYQUIST FREQUENCY,CD _20250515_8,8,0.500,True,False,5.2,0.0,792.0,0.071429,5.642857,5.271429,0.0
82,ELECTIONEERING,CD _20250515_8,8,0.500,True,False,0.0,0.0,1540.0,0.000000,4.857143,5.271429,0.0
83,TAPAKENA,CD _20250515_8,8,0.500,True,False,6.0,0.0,154.0,0.142857,4.619048,5.271429,0.0
84,MO JACKSON,CD _20250515_8,8,0.500,True,False,3.0,0.0,1298.0,0.230769,4.153846,5.271429,0.0
