In [3]:
import pandas as pd

# Load with low_memory=False to avoid dtype guessing errors
df = pd.read_csv("raceform.csv", low_memory=False)

# Show shape and top rows
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
df.head()


Rows: 1679393
Columns: 37


Unnamed: 0,date,course,race_id,off,race_name,type,class,pattern,rating_band,age_band,...,trainer,prize,or,rpr,ts,sire,dam,damsire,owner,comment
0,2015-01-01,Catterick,615704,12:30,Happy New Year Novices Hurdle,Hurdle,Class 4,,,4yo+,...,Brian Ellison,4873.5,–,114,80,Definite Article (GB),The Red Wench (IRE),Aahsaylad,P J Martin,Tracked leaders - effort 3 out - led approachi...
1,2015-01-01,Tramore (IRE),616859,12:35,2015 Waterford & Tramore Racecourse Supporters...,Hurdle,,,80-109,4yo+,...,P J Rothwell,,81,–,–,Pyrus (USA),Lorli (IRE),Mukaddamah,Michael Cawe,Always in rear - never a factor - pulled up be...
2,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,W P Mullins,€6037.50,–,122,32,Saint Des Saints (FR),Rainbow Crest (FR),Baryshnikov,Shanakiel Racing Syndicate,Soon chased leaders in 3rd - progress to dispu...
3,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,Michael Winters,€1400,–,116,26,Blueprint (IRE),Garrisker (IRE),Kings Ride,John J Madden,Chased leaders in 4th until took closer order ...
4,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,Paul Nolan,€612.50,–,95,–,Librettist (USA),Zulbis (TUR),Down The Flag,D P Sharkey,Led from 1st until headed after 3 out - not qu...


In [5]:
df.columns
df.head()


Unnamed: 0,date,course,race_id,off,race_name,type,class,pattern,rating_band,age_band,...,trainer,prize,or,rpr,ts,sire,dam,damsire,owner,comment
0,2015-01-01,Catterick,615704,12:30,Happy New Year Novices Hurdle,Hurdle,Class 4,,,4yo+,...,Brian Ellison,4873.5,–,114,80,Definite Article (GB),The Red Wench (IRE),Aahsaylad,P J Martin,Tracked leaders - effort 3 out - led approachi...
1,2015-01-01,Tramore (IRE),616859,12:35,2015 Waterford & Tramore Racecourse Supporters...,Hurdle,,,80-109,4yo+,...,P J Rothwell,,81,–,–,Pyrus (USA),Lorli (IRE),Mukaddamah,Michael Cawe,Always in rear - never a factor - pulled up be...
2,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,W P Mullins,€6037.50,–,122,32,Saint Des Saints (FR),Rainbow Crest (FR),Baryshnikov,Shanakiel Racing Syndicate,Soon chased leaders in 3rd - progress to dispu...
3,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,Michael Winters,€1400,–,116,26,Blueprint (IRE),Garrisker (IRE),Kings Ride,John J Madden,Chased leaders in 4th until took closer order ...
4,2015-01-01,Tramore (IRE),616860,1:05,Padraig Curran - South East Cleaners Maiden Hu...,Hurdle,,,,5yo+,...,Paul Nolan,€612.50,–,95,–,Librettist (USA),Zulbis (TUR),Down The Flag,D P Sharkey,Led from 1st until headed after 3 out - not qu...


In [7]:
df.columns



Index(['date', 'course', 'race_id', 'off', 'race_name', 'type', 'class',
       'pattern', 'rating_band', 'age_band', 'sex_rest', 'dist', 'going',
       'ran', 'num', 'pos', 'draw', 'ovr_btn', 'btn', 'horse', 'age', 'sex',
       'wgt', 'hg', 'time', 'sp', 'jockey', 'trainer', 'prize', 'or', 'rpr',
       'ts', 'sire', 'dam', 'damsire', 'owner', 'comment'],
      dtype='object')

In [15]:
# Horse Racing Win Prediction Model – Full Notebook Code

# STEP 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# STEP 2: Utility Functions
def convert_stone_pounds(wgt_str):
    """Convert 'stone-pounds' (e.g., '9-0') to total pounds."""
    try:
        if isinstance(wgt_str, str) and '-' in wgt_str:
            stone, pounds = wgt_str.split('-')
            return int(stone) * 14 + int(pounds)
        return float(wgt_str)
    except:
        return np.nan

def frac_to_dec(odds):
    """Convert fractional odds (e.g., '4/1') or decimal odds to float."""
    try:
        if isinstance(odds, str) and '/' in odds:
            num, denom = odds.split('/')
            return round(float(num) / float(denom) + 1, 2)
        elif isinstance(odds, str):
            return float(odds)
        return odds
    except:
        return np.nan

# STEP 3: Load Data
df = pd.read_csv(
    "raceform.csv",
    low_memory=False,
    parse_dates=['date'],
    na_values=['–', '-', '']
)
print("Initial shape:", df.shape)

# STEP 4: Convert weight column
df['wgt'] = df['wgt'].apply(convert_stone_pounds)

# STEP 5: Clean + Filter rows with necessary data
df = df[
    df['pos'].notna() &
    df['sp'].notna() &
    df['jockey'].notna() &
    df['trainer'].notna()
]
df = df[df['pos'].astype(str).str.isnumeric()]

# STEP 6: Create target variable
df['win'] = df['pos'].astype(int).apply(lambda x: 1 if x == 1 else 0)

# STEP 7: Convert Odds and compute implied probability
df['sp_dec'] = df['sp'].apply(frac_to_dec)
df['implied_prob'] = 1 / df['sp_dec']
df = df.dropna(subset=['sp_dec', 'implied_prob'])

# STEP 8: Clean numeric columns, coercing invalid entries to NaN
for col in ['draw', 'age', 'wgt', 'or', 'rpr', 'ts']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna(subset=['draw', 'age', 'wgt', 'or', 'rpr', 'ts'])

# STEP 9: Encode categorical columns
le_jockey  = LabelEncoder()
le_trainer = LabelEncoder()
le_going   = LabelEncoder()
le_type    = LabelEncoder()
le_course  = LabelEncoder()

df['jockey_enc']  = le_jockey.fit_transform(df['jockey'].str.lower())
df['trainer_enc'] = le_trainer.fit_transform(df['trainer'].str.lower())
df['going_enc']   = le_going.fit_transform(df['going'].str.lower())
df['type_enc']    = le_type.fit_transform(df['type'].str.lower())
df['course_enc']  = le_course.fit_transform(df['course'].str.lower())

# STEP 10: Define features and target
features = [
    'draw', 'age', 'wgt', 'or', 'rpr', 'ts', 'implied_prob',
    'jockey_enc', 'trainer_enc', 'going_enc', 'type_enc', 'course_enc'
]
X = df[features]
y = df['win']

# STEP 11: Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# STEP 12: Train the model
model = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42)
model.fit(X_train, y_train)

# STEP 13: Evaluate
y_pred = model.predict(X_test)
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_pred))

# STEP 14: Prediction function for new race data
def predict_race(model, race_df):
    df2 = race_df.copy()
    # convert weight and odds
    df2['wgt']    = df2['wgt'].apply(convert_stone_pounds)
    df2['sp_dec'] = df2['sp'].apply(frac_to_dec)
    df2['implied_prob'] = 1 / df2['sp_dec']
    # clean numeric
    for col in ['draw', 'age', 'wgt', 'or', 'rpr', 'ts']:
        df2[col] = pd.to_numeric(df2[col], errors='coerce')
    # encode categorical
    df2['jockey_enc']  = le_jockey.transform(df2['jockey'].str.lower())
    df2['trainer_enc'] = le_trainer.transform(df2['trainer'].str.lower())
    df2['going_enc']   = le_going.transform(df2['going'].str.lower())
    df2['type_enc']    = le_type.transform(df2['type'].str.lower())
    df2['course_enc']  = le_course.transform(df2['course'].str.lower())
    # drop NA and predict
    df2 = df2.dropna(subset=features)
    df2['Predicted_Win_Prob'] = model.predict_proba(df2[features])[:, 1]
    return df2.sort_values('Predicted_Win_Prob', ascending=False)

# STEP 15: Example usage
example_race = pd.DataFrame({
    "draw": [2, 4, 1],
    "age": [5, 4, 6],
    "wgt": ["9-4", "8-12", "10-0"],
    "or": [75, 72, 78],
    "rpr": [80, 76, 82],
    "ts": [60, 58, 65],
    "sp": ["4/1", "10/1", "7/2"],
    "jockey": ["ryan moore", "oisin murphy", "ben curtis"],
    "trainer": ["a p o'brien", "andrew balding", "karl burke"],
    "going": ["good", "good", "good"],
    "type": ["Hurdle", "Hurdle", "Hurdle"],
    "course": ["Catterick", "Tramore (IRE)", "Tramore (IRE)"]
})

print("\n📈 Example race predictions:")



Initial shape: (1679393, 37)

📊 Classification Report:

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     82382
           1       0.87      0.17      0.29      8331

    accuracy                           0.92     90713
   macro avg       0.89      0.59      0.62     90713
weighted avg       0.92      0.92      0.90     90713


📈 Example race predictions:


In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# y_test and y_pred should already exist from your train/test step:
# y_pred = model.predict(X_test)

# 1) Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"🔹 Accuracy: {accuracy:.2%}")

# 2) Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("🔹 Confusion Matrix:")
print(conf_mat)

# 3) ROC AUC
# (needs predicted probabilities for the positive class)
y_prob = model.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"🔹 ROC AUC: {roc_auc:.3f}")


🔹 Accuracy: 92.17%
🔹 Confusion Matrix:
[[82156   226]
 [ 6878  1453]]
🔹 ROC AUC: 0.944


In [19]:
import joblib

# assuming you have these names in your notebook
joblib.dump(model,      "race_model.joblib")
joblib.dump(le_jockey,  "le_jockey.joblib")
joblib.dump(le_trainer, "le_trainer.joblib")
joblib.dump(le_going,   "le_going.joblib")
joblib.dump(le_type,    "le_type.joblib")
joblib.dump(le_course,  "le_course.joblib")


['le_course.joblib']

In [23]:
import os, shutil

# 1) Make the folder
os.makedirs('horse_predictor_api', exist_ok=True)

# 2) Copy your .joblib files
for fname in [
    'race_model.joblib',
    'le_jockey.joblib',
    'le_trainer.joblib',
    'le_going.joblib',
    'le_type.joblib',
    'le_course.joblib'
]:
    shutil.copy(fname, 'horse_predictor_api/')

# 3) Write requirements.txt
with open('horse_predictor_api/requirements.txt', 'w') as f:
    f.write('\n'.join([
        'fastapi',
        'uvicorn[standard]',
        'pandas',
        'scikit-learn',
        'joblib'
    ]) + '\n')

print("✅ horse_predictor_api/ created with your model files and requirements.txt")



✅ horse_predictor_api/ created with your model files and requirements.txt


In [25]:
!pip install -r horse_predictor_api/requirements.txt


Collecting fastapi (from -r horse_predictor_api/requirements.txt (line 1))
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn[standard] (from -r horse_predictor_api/requirements.txt (line 2))
  Downloading uvicorn-0.34.3-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi->-r horse_predictor_api/requirements.txt (line 1))
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting httptools>=0.6.3 (from uvicorn[standard]->-r horse_predictor_api/requirements.txt (line 2))
  Downloading httptools-0.6.4-cp312-cp312-win_amd64.whl.metadata (3.7 kB)
Collecting watchfiles>=0.13 (from uvicorn[standard]->-r horse_predictor_api/requirements.txt (line 2))
  Downloading watchfiles-1.0.5-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting websockets>=10.4 (from uvicorn[standard]->-r horse_predictor_api/requirements.txt (line 2))
  Downloading websockets-15.0.1-cp312-cp312-win_amd64.whl.metadata (7.0 kB)
Downloadi

In [None]:
!uvicorn horse_predictor_api.app:app --reload
