In [None]:
'''
Project summary:
In this project, I used a model to predict the results of horse racing.
I used a Python function `parse_past_performance(xml_path)` to extract structured information from the original data. The extracted information has the following structure:
- Horse: name, birth year, origin.
- Trainer and Jockey: last name + first name concatenation.
- Race-specific info: program number, post position, weight carried, odds.
Each row in the resulting DataFrame corresponds to a unique horse entry in a race.
Then I used a second function, `parse_results(xml_path)`, to handle the resultsData XML files by:
- Extracting <RACE> and <ENTRY> nodes to access outcome-related data.
- Capturing features like:
  - Official Finish: the final ranking of the horse.
  - Finish Time and Speed Rating: performance measures.
  - Dollar Odds: betting market odds at race time.
This information provides the ground truth necessary to train supervised models.
I used Python’s glob and os modules to:
- Locate all relevant .xml files in each dataset folder.
- Loop through the files and apply the parsing functions.
- Combine all parsed records into large DataFrames for training.
This step is essential to scale the project from parsing a few races to processing the entire dataset.
While this notebook has not yet executed any predictive modeling, it forms a robust data preprocessing pipeline. It accurately extracts and cleans critical racing information and aligns well with best practices in data engineering for ML.
A few important reflections:
- Modularity: Functions like parse_past_performance and parse_results are reusable, improving maintainability.
- Readability: Despite the use of raw XMLs (which are often difficult to parse), the logic is clearly structured.
- Scalability: The use of glob to handle bulk data processing ensures this pipeline can scale to hundreds or thousands of races.
Conclusion: This project marks a strong foundation for more advanced work on race outcome prediction and underscores the essential role of data compatibility, especially in high-stakes prediction problems such as horse racing.

'''
!git clone https://github.com/flyaflya/fsan830spring2025.git
%cd fsan830spring2025

%cd fsan830spring2025

!pip install -U numpy pymc pymc-bart arviz xarray matplotlib scikit-learn

import pymc, pymc_bart
print("pymc version:", pymc.__version__)
print("pymc-bart version:", pymc_bart.__version__)

import pandas as pd
import numpy as np
import pymc as pm
from pymc_bart import BART
from glob import glob
import os
import xml.etree.ElementTree as ET

def parse_past_performance(xml_path):
    """Parse one past-performance XML file into a DataFrame of starters.

    Every ``Starters`` element found under a top-level ``Race`` element
    yields one row, provided it contains a ``Horse`` with a non-empty
    ``HorseName``.

    Args:
        xml_path: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` whose columns are ``RaceNumber`` followed by
        whichever of ``HorseName``, ``YearOfBirth``, ``FoalingArea``,
        ``PostPosition``, ``ProgramNumber``, ``WeightCarried``, ``Odds``,
        ``Trainer`` and ``Jockey`` were present. All values are stripped
        strings (or ``None`` when the element had no text).
    """
    horse_fields = ('HorseName', 'YearOfBirth', 'FoalingArea')
    scalar_tags = ('PostPosition', 'ProgramNumber', 'WeightCarried', 'Odds')
    person_tags = ('Trainer', 'Jockey')

    rows = []
    for race_node in ET.parse(xml_path).getroot().findall('Race'):
        race_no = race_node.findtext('RaceNumber')
        for starter in race_node.findall('Starters'):
            row = {'RaceNumber': race_no}
            for child in starter:
                tag = child.tag
                if tag == 'Horse':
                    for field in horse_fields:
                        row[field] = child.findtext(field)
                elif tag in scalar_tags:
                    row[tag] = child.text
                elif tag in person_tags:
                    # Last and first name are joined without a separator.
                    last = child.findtext('LastName') or ''
                    first = child.findtext('FirstName') or ''
                    row[tag] = last + first

            # Starters without a usable horse name are dropped.
            if not row.get('HorseName'):
                continue

            rows.append({
                key: value.strip() if isinstance(value, str) else value
                for key, value in row.items()
            })
    return pd.DataFrame(rows)

def parse_results(xml_path):
    """Parse one results XML file into a DataFrame of race entries.

    Every ``ENTRY`` element that is a direct child of a ``RACE`` element
    (searched at any depth below the root) yields one row.

    Args:
        xml_path: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``RaceNumber`` (the RACE
        ``NUMBER`` attribute), ``HorseName``, ``OfficialFinish``,
        ``FinishTime``, ``SpeedRating`` and ``DollarOdds``. Values are
        stripped strings, or ``None`` where the attribute/element is absent.
    """
    # Output column -> child tag of <ENTRY> it is read from.
    entry_fields = (
        ('HorseName', 'NAME'),
        ('OfficialFinish', 'OFFICIAL_FIN'),
        ('FinishTime', 'FINISH_TIME'),
        ('SpeedRating', 'SPEED_RATING'),
        ('DollarOdds', 'DOLLAR_ODDS'),
    )

    def _clean(text):
        return text.strip() if isinstance(text, str) else text

    rows = []
    for race_el in ET.parse(xml_path).getroot().findall('.//RACE'):
        race_no = _clean(race_el.get('NUMBER'))
        for entry_el in race_el.findall('ENTRY'):
            row = {'RaceNumber': race_no}
            for column, tag in entry_fields:
                row[column] = _clean(entry_el.findtext(tag))
            rows.append(row)
    return pd.DataFrame(rows)

# Directories with the raw training XML: past-performance files provide the
# features, results files provide the labels.
pp_dir = 'data/rawDataForTraining/pastPerformanceData'
res_dir = 'data/rawDataForTraining/resultsData'

# Collect every .xml file directly inside each directory.
pp_files = glob(os.path.join(pp_dir, '*.xml'))
res_files = glob(os.path.join(res_dir, '*.xml'))

# Report how many files of each kind were found.
print(f"共发现PastPerformance xml: {len(pp_files)} 份，Result xml: {len(res_files)} 份")

# Parse each file into its own DataFrame.
pp_dfs = [parse_past_performance(f) for f in pp_files]
res_dfs = [parse_results(f) for f in res_files]

# Stack the per-file frames into one feature table and one label table.
# pd.concat raises ValueError when given an empty list, i.e. when no XML
# files were found in a directory.
df_feat = pd.concat(pp_dfs, ignore_index=True)
df_label = pd.concat(res_dfs, ignore_index=True)

print("特征集 shape:", df_feat.shape, "标签集 shape:", df_label.shape)

# Normalise the join keys in both tables: lower-cased, stripped horse names
# and stripped race numbers, all as strings.
for df in [df_feat, df_label]:
    df['HorseName'] = df['HorseName'].astype(str).str.lower().str.strip()
    df['RaceNumber'] = df['RaceNumber'].astype(str).str.strip()

# Keep only entries present in both tables.
# NOTE(review): the join key is only (RaceNumber, HorseName). Neither parser
# records the source file, track or date, so rows from different files that
# share a race number and horse name would be matched to each other — confirm
# this cannot happen in the data.
df_full = pd.merge(df_feat, df_label, on=['RaceNumber', 'HorseName'], how='inner', suffixes=('_pp', '_res'))
print("最终训练集 shape:", df_full.shape)

def odds_str_to_float(odds):
    """Convert an odds value to a float.

    Accepts plain numbers (``4``, ``"3.5"``) and fractional odds written
    with either a slash or a dash (``"5/2"``, ``"5-2"``), which are
    returned as numerator divided by denominator. Both separators are
    supported so that the training data and the prediction CSV are parsed
    by the same rules.

    Args:
        odds: The raw odds value; may be a string, a number or missing.

    Returns:
        The odds as a ``float``, or ``None`` when the value is missing
        (``None``/NaN) or cannot be interpreted (including a zero
        denominator).
    """
    if pd.isna(odds):
        return None

    # Plain numbers first, so negative values and exponents such as
    # "1e-3" are not mistaken for dash-separated fractions.
    try:
        return float(odds)
    except (TypeError, ValueError):
        pass

    text = str(odds)
    for separator in ('/', '-'):
        if separator in text:
            try:
                num, den = text.split(separator)
                return float(num) / float(den)
            except (ValueError, ZeroDivisionError):
                # Wrong number of parts, non-numeric part, or x/0.
                return None
    return None

# Parse the past-performance odds strings into floats (None when unparsable).
df_full['Odds_float'] = df_full['Odds'].apply(odds_str_to_float)
# Convert the remaining string columns to numbers; errors='coerce' turns
# values that cannot be parsed into NaN.
df_full['DollarOdds'] = pd.to_numeric(df_full['DollarOdds'], errors='coerce')
df_full['OfficialFinish'] = pd.to_numeric(df_full['OfficialFinish'], errors='coerce')
df_full['FinishTime'] = pd.to_numeric(df_full['FinishTime'], errors='coerce')
df_full['SpeedRating'] = pd.to_numeric(df_full['SpeedRating'], errors='coerce')
df_full['WeightCarried'] = pd.to_numeric(df_full['WeightCarried'], errors='coerce')

# Save the merged table to the current working directory and show a preview
# plus the list of available columns.
df_full.to_csv('final_supervised_training_set.csv', index=False)
print(df_full.head())
print("可用于建模的特征：", df_full.columns.tolist())

# Columns used as model inputs.
# NOTE(review): SpeedRating is produced by parse_results, i.e. it comes from
# the results files — confirm it is legitimate to use as a predictor.
feature_cols = ['Odds_float', 'WeightCarried', 'SpeedRating']  # add or remove features as needed
# The bare string below is an inactive, wider feature list kept for reference;
# it has no effect at runtime.
'''
feature_cols = [
    'YearOfBirth',
    'FoalingArea',
    'PostPosition',
    'WeightCarried',
    'Trainer',
    'Jockey',
    'SpeedRating',
    'Odds_float'
]
'''
# Feature matrix: missing values are replaced by 0 before conversion to NumPy.
X = df_full[feature_cols].fillna(0).values
# Target vector: the official finishing position.
# NOTE(review): OfficialFinish was coerced with errors='coerce' above, so y may
# contain NaN — confirm against the data.
y = df_full['OfficialFinish'].values

In [None]:


# Column positions (as passed to read_csv's `usecols`) of the fields of
# interest in the header-less prediction CSV. Only the fields named in
# `columns` are loaded; the other entries are kept as a reference for
# extending the feature set.
csv_column_positions = {
    'RaceNumber': 2,
    'HorseName': 44,
    'YearOfBirth': 45,
    'FoalingArea': 56,
    'PostPosition': 3,
    'ProgramNumber': 42,
    'WeightCarried': 50,
    'Trainer': 27,
    'Jockey': 32,
    'Odds': 515,            # odds of the first historical race
    'OfficialFinish': 615,  # finishing position of the first historical race
    'FinishTime': 1035,     # finish time of the first historical race
    'SpeedRating': 845,     # speed rating of the first historical race
    'DollarOdds': 43,
}
feature_cols = ['Odds_float', 'WeightCarried', 'SpeedRating']

columns = ['Odds', 'WeightCarried', 'SpeedRating']
column_indices = [csv_column_positions[name] for name in columns]

X_test = pd.read_csv('data/rawDataForPrediction/CDX0426.csv', header=None, usecols=column_indices)
# read_csv ignores the element order of `usecols` and returns the columns in
# file order (50, 515, 845). Reorder them to match `columns` before renaming,
# otherwise Odds and WeightCarried end up under each other's name.
X_test = X_test[column_indices]
X_test.columns = columns

def odds_str_to_float(s):
    """Convert an odds value to a float.

    Accepts plain numbers (``4``, ``"3.5"``) and fractional odds written
    with either a dash or a slash (``"5-2"``, ``"5/2"``), which are
    returned as numerator divided by denominator. Both separators are
    supported so that the prediction CSV and the training data are parsed
    by the same rules.

    Args:
        s: The raw odds value; may be a string, a number or missing.

    Returns:
        The odds as a ``float``, or ``None`` when the value is missing
        (``None``/NaN) or cannot be interpreted (including a zero
        denominator).
    """
    if pd.isna(s):
        return None

    # Plain numbers first, so negative values and exponents such as
    # "1e-3" are not mistaken for dash-separated fractions.
    try:
        return float(s)
    except (TypeError, ValueError):
        pass

    text = str(s)
    for separator in ('-', '/'):
        if separator in text:
            try:
                a, b = text.split(separator)
                return float(a) / float(b)
            except (ValueError, ZeroDivisionError):
                # Wrong number of parts, non-numeric part, or x-0.
                return None
    return None

# Parse the raw odds column into floats (None where it cannot be parsed).
X_test['Odds_float'] = X_test['Odds'].apply(odds_str_to_float)

# Keep only the model's feature columns, in the order given by feature_cols.
X_test_final = X_test[feature_cols]

print(X_test_final)


# BART regression of finishing position (y) on the training features (X),
# with a Normal likelihood and a HalfNormal prior on its standard deviation.
# NOTE(review): the same model is built and sampled again further down in this
# cell (from X_train / y_train), overwriting `model` and `trace` — confirm
# whether this first fit is still needed.
with pm.Model() as model:
    μ = BART("μ", X, y)
    σ = pm.HalfNormal("σ", sigma=1.0)
    y_obs = pm.Normal("y_obs", mu=μ, sigma=σ, observed=y)
    # 1000 tuning + 1000 kept draws, run sequentially with a fixed seed.
    trace = pm.sample(1000, tune=1000, target_accept=0.95, cores=1, random_seed=42)
    print("模型训练完毕！")

# Sanity check that the BART variable is registered under the name "μ".
assert μ.name == "μ", "BART 的 name 必须与你 predictions_input 的 key 一致"

import pymc as pm
from pymc_bart import BART
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from datetime import datetime

# Training matrix and target, built the same way as X / y above: missing
# feature values are replaced by 0.
X_train = df_full[feature_cols].fillna(0).values
y_train = df_full['OfficialFinish'].values

# BART regression with a Normal likelihood and a HalfNormal prior on sigma.
with pm.Model() as model:
    μ = BART("μ", X_train, y_train)
    σ = pm.HalfNormal("σ", sigma=1.0)
    y_obs = pm.Normal("y_obs", mu=μ, sigma=σ, observed=y_train)

    trace = pm.sample(
        1000,
        tune=1000,
        target_accept=0.95,
        cores=1,
        random_seed=42,
        idata_kwargs={"log_likelihood": True}  # NOTE(review): this asks for the log-likelihood to be stored; the original comment said it ensures μ is recorded in the trace — confirm
    )

print("✅ 模型训练完毕！")

# Posterior mean of μ for every training row, averaged over chains and draws.
μ_train_mean = trace.posterior["μ"].mean(dim=("chain", "draw")).values

# Scatter of actual vs. fitted finishing position; the dashed diagonal marks
# a perfect fit.
plt.figure(figsize=(6, 6))
plt.scatter(y_train, μ_train_mean, alpha=0.6)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--')
plt.xlabel("Actual (Train)")
plt.ylabel("Predicted")
plt.title("Training Fit: PyMC-BART")
plt.grid(True)
plt.show()

# In-sample fit metrics (computed on the training data itself, not on
# held-out data).
r2 = r2_score(y_train, μ_train_mean)
rmse = np.sqrt(mean_squared_error(y_train, μ_train_mean))
print(f"训练集 R² Score: {r2:.4f}")
print(f"训练集 RMSE: {rmse:.2f}")

from datetime import datetime

def bart_train_and_predict(df_train, df_test_raw, feature_cols, target_col):
    """Fit a BART regression on the training rows and predict the test rows.

    Training and test features are stacked into one design matrix so that a
    single BART variable covers both. Only the training part is tied to
    observed targets; the test part feeds an unobserved Normal variable
    (``y_pred``) whose draws are summarised per test row.

    Args:
        df_train: DataFrame containing ``feature_cols`` and ``target_col``.
            Rows whose target is missing are ignored.
        df_test_raw: DataFrame containing ``feature_cols``; all of its
            columns are carried through to the result.
        feature_cols: Names of the feature columns. Missing feature values
            are replaced by 0.
        target_col: Name of the target column in ``df_train``.

    Returns:
        A copy of ``df_test_raw`` with the added columns ``PredictedFinish``
        (mean of the ``y_pred`` draws), ``PredictedStd`` (their standard
        deviation) and ``PredictedRank`` (1 = lowest predicted finish, ties
        share the lowest rank), sorted by ``PredictedFinish`` ascending with
        a fresh index.

    Raises:
        ValueError: If the number of predicted columns does not match the
            number of test rows.
    """
    # A NaN target cannot serve as an observation and would also corrupt the
    # target array handed to BART, so such rows are dropped up front.
    df_train = df_train.dropna(subset=[target_col])

    # Force a float matrix: a column that is entirely None (e.g. odds that
    # all failed to parse) would otherwise yield an object array.
    X_train = df_train[feature_cols].fillna(0).to_numpy(dtype=float)
    y_train = df_train[target_col].values

    X_test = df_test_raw[feature_cols].fillna(0).to_numpy(dtype=float)
    n_train = len(y_train)
    n_test = X_test.shape[0]

    # One design matrix for train + test; the test targets are placeholders.
    # NOTE(review): the zero placeholders are part of the Y array given to
    # BART — confirm they do not distort how BART uses Y.
    X_all = np.vstack([X_train, X_test])
    y_all = np.concatenate([y_train, np.zeros(n_test)])

    with pm.Model() as model:
        μ_all = BART("μ", X_all, y_all)
        σ = pm.HalfNormal("σ", sigma=1.0)

        # Likelihood on the training rows only.
        y_obs = pm.Normal("y_obs", mu=μ_all[:n_train], sigma=σ, observed=y_train)

        # Unobserved variable for the test rows.
        y_pred = pm.Normal("y_pred", mu=μ_all[n_train:], sigma=σ)

        trace = pm.sample(
            1000, tune=1000, target_accept=0.95, cores=1, random_seed=42
        )

        ppc = pm.sample_posterior_predictive(
            trace, var_names=["y_pred"], return_inferencedata=False
        )

    # Flatten the leading sample dimensions into one: (n_samples_total, n_test).
    y_pred_dist = ppc["y_pred"]
    y_pred_dist = y_pred_dist.reshape(-1, y_pred_dist.shape[-1])

    if y_pred_dist.shape[1] != n_test:
        raise ValueError(f"预测维度异常: y_pred shape={y_pred_dist.shape}, 预期测试样本数={n_test}")

    y_pred_mean = y_pred_dist.mean(axis=0)
    y_pred_std = y_pred_dist.std(axis=0)

    results_df = df_test_raw.reset_index(drop=True).copy()
    results_df["PredictedFinish"] = y_pred_mean
    results_df["PredictedStd"] = y_pred_std
    results_df["PredictedRank"] = results_df["PredictedFinish"].rank(method="min").astype(int)

    return results_df.sort_values("PredictedFinish").reset_index(drop=True)

# Model inputs and target used for the final prediction run.
feature_cols = ['Odds_float', 'WeightCarried', 'SpeedRating']
target_col = 'OfficialFinish'

# Train on the merged training table and predict the rows of X_test; the
# result is sorted by predicted finish, lowest first.
results_df_sorted = bart_train_and_predict(df_full, X_test, feature_cols, target_col)

print("预测完成时间：", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print(results_df_sorted.head(10))



# The five rows with the lowest predicted finish.
# NOTE(review): the bars are labelled only "Rank #1".."Rank #5"; X_test holds
# no horse name or program number, so the plot does not identify the horses.
top5 = results_df_sorted.head(5).copy()
top5_labels = [f"Rank #{i+1}" for i in range(len(top5))]

# Horizontal bars of the predicted finish, with the standard deviation of the
# prediction draws as error bars; the y-axis is inverted so Rank #1 is on top.
plt.figure(figsize=(8, 4))
plt.barh(top5_labels, top5["PredictedFinish"], xerr=top5["PredictedStd"])
plt.xlabel("Predicted Finish")
plt.title("Top 5 Predicted Results")
plt.gca().invert_yaxis()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()