In [None]:
import os

# Delete every regular file in the current working directory.
# Sub-directories are skipped by the os.path.isfile() check.
failed_deletions = []
for filename in os.listdir():
    if os.path.isfile(filename):
        try:
            os.remove(filename)
        except OSError as e:
            # Keep going: one undeletable file should not stop the clean-up.
            failed_deletions.append(filename)
            print(f"Could not delete {filename}: {e}")

# Only claim success when every removal actually succeeded.
if failed_deletions:
    print(f"{len(failed_deletions)} file(s) could not be deleted.")
else:
    print("All uploaded files have been deleted.")


All uploaded files have been deleted.


In [None]:
# Upload XML files
# The object returned by files.upload() is iterated by file name further down
# in this cell, and uploaded[fname] is wrapped in BytesIO, so each value is
# consumed as the raw bytes of one uploaded file.
from google.colab import files
uploaded = files.upload()

# Import libraries
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from io import BytesIO
import re

# Robust distance parser
def parse_distance(dist_str):
    """Convert a published race-distance string into furlongs.

    Recognised forms (case-insensitive):
      * miles, e.g. "1M", "1 1/16M", "7/8 m"  -> value * 8
      * miles plus yards, e.g. "1M 70Y"       -> miles * 8 + yards / 220
      * furlongs, e.g. "6F", "5 1/2f"         -> value
      * yards only, e.g. "330Y"               -> yards / 220
      * a bare number, e.g. "6"               -> returned as-is

    Args:
        dist_str: Distance text; may be None or empty.

    Returns:
        float: The distance in furlongs, or ``np.nan`` when the input is
        missing, not a string, or cannot be parsed.
    """
    furlongs_per_mile = 8.0
    yards_per_furlong = 220.0

    def _fraction_or_number(token):
        # "7/8" -> 0.875, "6" -> 6.0
        if '/' in token:
            num, denom = map(float, token.split('/'))
            return num / denom
        return float(token)

    def _mixed_number(text):
        # "1 1/16" -> 1.0625; a single token may be a whole number or a fraction.
        tokens = text.split()
        if not tokens:
            raise ValueError("no numeric value before the unit")
        if len(tokens) == 1:
            return _fraction_or_number(tokens[0])
        whole = float(tokens[0])
        num, denom = map(float, tokens[1].split('/'))
        return whole + (num / denom)

    def _trailing_yards_in_furlongs(text):
        # Picks up the "70y" in "1m 70y"; contributes nothing when absent.
        match = re.search(r"(\d+(?:\.\d+)?)\s*y", text)
        return float(match.group(1)) / yards_per_furlong if match else 0.0

    try:
        if not dist_str:
            return np.nan
        text = dist_str.lower()
        if 'm' in text:
            miles_part, _, remainder = text.partition('m')
            return (_mixed_number(miles_part) * furlongs_per_mile
                    + _trailing_yards_in_furlongs(remainder))
        if 'f' in text:
            return _mixed_number(text.split('f')[0])
        if 'y' in text:
            return float(text.split('y')[0]) / yards_per_furlong
        return float(text)
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Non-string input, malformed numbers or a zero denominator: the
        # distance is treated as missing rather than aborting the parse.
        return np.nan

# Main parser
def _to_float_or_nan(text):
    """Return ``float(text)``, or NaN when ``text`` is missing or not numeric."""
    try:
        return float(text) if text else np.nan
    except (TypeError, ValueError):
        return np.nan


def _summarize_points_of_call(start_elem):
    """Extract start, finish and last printed-call figures from a <Start> element."""
    summary = {
        "start_pos": np.nan,
        "finish_pos": np.nan,
        "last_call_pos": np.nan,
        "lengths_finish": np.nan,
        "lengths_last_call": np.nan,
    }
    for call in start_elem.findall("PointOfCall"):
        poc = call.findtext("PointOfCall")
        position = call.findtext("Position")
        lengths_behind = call.findtext("LengthsBehind")

        if poc == 'S':
            if position:
                summary["start_pos"] = int(position)
        elif poc == 'F':
            if position:
                summary["finish_pos"] = int(position)
            # LengthsBehind at the finish is not recorded for a first-place finish.
            if summary["finish_pos"] != 1 and lengths_behind:
                summary["lengths_finish"] = float(lengths_behind)

        # Every call flagged PointOfCallPrint == 'Y' overwrites the previous
        # one, so the values kept are those of the last flagged call.
        if call.findtext("PointOfCallPrint") == 'Y':
            if position:
                summary["last_call_pos"] = int(position)
            if lengths_behind:
                summary["lengths_last_call"] = float(lengths_behind)
    return summary


def _past_performance_fields(perf, start_elem, prefix):
    """Build the ``<prefix>*`` columns for one <PastPerformance> element."""
    race_date = perf.findtext("RaceDate")
    surface = perf.find("Course/Surface/Value")
    distance = perf.find("Distance/PublishedValue")
    num_starters = perf.findtext("NumberOfStarters")
    track = perf.find("Track/TrackID")
    race_num = perf.find("RaceNumber")
    calls = _summarize_points_of_call(start_elem)

    if track is not None and race_num is not None:
        past_race_id = f"{track.text}-{race_date}-R{race_num.text}"
    else:
        past_race_id = None

    return {
        prefix + "RaceID": past_race_id,
        prefix + "RaceDate": race_date,
        prefix + "Surface": surface.text if surface is not None else None,
        prefix + "Distance": parse_distance(distance.text) if distance is not None else np.nan,
        prefix + "Purse": _to_float_or_nan(perf.findtext("PurseUSA")),
        prefix + "NumStarters": int(num_starters) if num_starters else np.nan,
        prefix + "FinishPosition": calls["finish_pos"],
        prefix + "LengthsBackFinish": calls["lengths_finish"],
        prefix + "LengthsBackLastCall": calls["lengths_last_call"],
        prefix + "StartPosition": calls["start_pos"],
        prefix + "LastCallPosition": calls["last_call_pos"],
        prefix + "Jockey": start_elem.findtext("Jockey/LastName") or '',
        prefix + "Trainer": start_elem.findtext("Trainer/LastName") or '',
    }


def parse_past_performance_xml_colab(xml_file, filename="unknown.xml"):
    """Parse one past-performance XML file into one row per starter.

    Each row carries the race-level fields (RaceID, RaceNumber, PostTime,
    Distance, Surface, RaceType, Purse), the starter fields (HorseName,
    Jockey, Trainer, ProgramNumber) and, for each of the starter's first five
    <PastPerformance> elements that has a <Start> child, a set of
    ``PP<n>_*`` columns where ``n`` is the 1-based position of that element.

    Args:
        xml_file: File name or file-like object accepted by ``ET.parse``.
        filename: Label used to build ``RaceID`` and in printed messages.

    Returns:
        pandas.DataFrame: The parsed rows. An empty DataFrame is returned
        (after printing the error) when the file cannot be processed.
    """
    try:
        root = ET.parse(xml_file).getroot()
        all_rows = []

        for race in root.findall("Race"):
            race_number = race.findtext("RaceNumber")
            distance = race.find("Distance/PublishedValue")
            surface = race.find("Course/CourseType/Description")
            race_type = race.find("RaceType/Description")

            race_fields = {
                "RaceID": f"{filename}_R{race_number}",
                "RaceNumber": race_number,
                "PostTime": race.findtext("PostTime"),
                "Distance": parse_distance(distance.text) if distance is not None else np.nan,
                "Surface": surface.text if surface is not None else None,
                "RaceType": race_type.text if race_type is not None else None,
                # A malformed purse becomes NaN instead of aborting the whole file.
                "Purse": _to_float_or_nan(race.findtext("PurseUSA")),
            }

            for starter in race.findall("Starters"):
                horse_elem = starter.find("Horse")
                if horse_elem is None:
                    continue

                horse_name = horse_elem.findtext("HorseName")
                jockey = starter.find("Jockey/LastName")
                trainer = starter.find("Trainer/LastName")

                # Copy the race fields first so the column order stays
                # race -> starter -> past performances.
                current_row = dict(race_fields)
                current_row.update({
                    "HorseName": horse_name,
                    "Jockey": jockey.text if jockey is not None else '',
                    "Trainer": trainer.text if trainer is not None else '',
                    "ProgramNumber": starter.findtext("ProgramNumber"),
                })

                for i, perf in enumerate(starter.findall('PastPerformance')[:5]):
                    start_elem = perf.find('Start')
                    if start_elem is None:
                        continue
                    try:
                        current_row.update(
                            _past_performance_fields(perf, start_elem, f"PP{i+1}_")
                        )
                    except Exception as e:
                        # Best effort: a bad past performance is skipped, but
                        # it is reported so the data loss is visible.
                        print(f"⚠️ Skipping past performance {i+1} of {horse_name} in {filename}: {e}")
                        continue

                all_rows.append(current_row)

        return pd.DataFrame(all_rows)

    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")
        return pd.DataFrame()

# Parse all uploaded files
all_dfs = []

for fname in uploaded:
    print(f"📄 Parsing {fname}...")
    df = parse_past_performance_xml_colab(BytesIO(uploaded[fname]), filename=fname)
    if df.empty:
        # The parser returns an empty frame on failure or when no starters
        # were found; such files contribute nothing and are left out.
        print(f"⚠️ No rows parsed from {fname}; skipping.")
        continue
    all_dfs.append(df)

# Fail with a clear message instead of an opaque concat/KeyError further down.
if not all_dfs:
    raise ValueError("No past-performance rows were parsed from the uploaded files.")

full_df = pd.concat(all_dfs, ignore_index=True)
# "Distance" is in furlongs; one furlong is 220 yards.
full_df["DistanceYards"] = full_df["Distance"] * 220
full_df.to_csv("training_data_from_xml.csv", index=False)

from google.colab import files
files.download("training_data_from_xml.csv")


Saving SIMD20230502CD_USA.xml to SIMD20230502CD_USA.xml
Saving SIMD20230503CD_USA.xml to SIMD20230503CD_USA.xml
Saving SIMD20230504CD_USA.xml to SIMD20230504CD_USA.xml
Saving SIMD20230505CD_USA.xml to SIMD20230505CD_USA.xml
Saving SIMD20230506CD_USA.xml to SIMD20230506CD_USA.xml
Saving SIMD20230511CD_USA.xml to SIMD20230511CD_USA.xml
Saving SIMD20230512CD_USA.xml to SIMD20230512CD_USA.xml
Saving SIMD20230513CD_USA.xml to SIMD20230513CD_USA.xml
Saving SIMD20230514CD_USA.xml to SIMD20230514CD_USA.xml
Saving SIMD20230518CD_USA.xml to SIMD20230518CD_USA.xml
Saving SIMD20230519CD_USA.xml to SIMD20230519CD_USA.xml
Saving SIMD20230520CD_USA.xml to SIMD20230520CD_USA.xml
Saving SIMD20230521CD_USA.xml to SIMD20230521CD_USA.xml
Saving SIMD20230525CD_USA.xml to SIMD20230525CD_USA.xml
Saving SIMD20230526CD_USA.xml to SIMD20230526CD_USA.xml
Saving SIMD20230527CD_USA.xml to SIMD20230527CD_USA.xml
Saving SIMD20230528CD_USA.xml to SIMD20230528CD_USA.xml
Saving SIMD20230529CD_USA.xml to SIMD20230529CD_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Upload race result XMLs
# The returned object is iterated by file name later in this cell, and
# uploaded_results[fname] is wrapped in BytesIO before being parsed.
from google.colab import files
uploaded_results = files.upload()

# Imports
import xml.etree.ElementTree as ET
import pandas as pd
import re
from io import BytesIO

# Helpers
def create_race_id(track_code, date_str, race_number):
    """Build a race key of the form ``<track>-MM-DD-YY-RNN``.

    Args:
        track_code: Track abbreviation placed at the front of the key.
        date_str: Date as three '-'-separated parts, month-day-year. Month and
            day are zero-padded to two digits; only the last two characters
            of the year are kept.
        race_number: Race number (anything ``int()`` accepts); rendered with
            at least two digits.

    Returns:
        str | None: The race key, or ``None`` when the date does not have
        exactly three parts or the race number is not an integer.
    """
    try:
        month, day, year = date_str.split("-")
        return (
            f"{track_code}-{month.zfill(2)}-{day.zfill(2)}-{year[-2:]}"
            f"-R{int(race_number):02d}"
        )
    except (AttributeError, TypeError, ValueError):
        # AttributeError: date_str is not a string; ValueError: wrong number
        # of date parts or non-numeric race number; TypeError: race_number
        # is None or another type int() rejects.
        return None

def standardize_race_id(race_id):
    """Rewrite a features-file RaceID into the ``CD-MM-DD-YY-RNN`` form.

    IDs shaped like ``SIMD20231104CD_USA.xml_R1`` are converted to
    ``CD-11-04-23-R01``. Anything else, including IDs that are already in the
    ``CD-MM-DD-YY-RNN`` form and non-string values, is returned unchanged.

    Args:
        race_id: The RaceID value to standardize.

    Returns:
        The standardized ID, or ``race_id`` itself when it does not match.
    """
    try:
        match = re.match(r"SIMD(\d{4})(\d{2})(\d{2})CD_USA\.xml_R(\d+)", race_id)
    except TypeError:
        # Non-string values (e.g. NaN read from a CSV) are passed through.
        return race_id
    if match is None:
        return race_id
    year, month, day, race_num = match.groups()
    return f"CD-{month}-{day}-{year[-2:]}-R{int(race_num):02d}"

def normalize_horse_name(name):
    """Return a lower-case, punctuation-free form of a horse name for matching.

    Falsy or NaN input yields an empty string. Otherwise the name is
    lower-cased, curly and back-tick apostrophes become a plain apostrophe,
    every character other than a-z, 0-9, apostrophe or whitespace is removed,
    and runs of whitespace collapse to a single space.
    """
    if not name or pd.isna(name):
        return ""
    cleaned = str(name).lower().strip()
    # Unify apostrophe variants before stripping punctuation
    for variant in ("’", "`"):
        cleaned = cleaned.replace(variant, "'")
    cleaned = re.sub(r"[^a-z0-9'\s]", "", cleaned)
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

def parse_race_result_xml(xml_file, filename="unknown.xml"):
    """Parse one race-result XML file into (RaceID, HorseName, FinishPosition) rows.

    For every RACE element the date and number come from its RACE_DATE and
    RACE_NUMBER children. When either is missing, the date is taken from a
    file name shaped like ``cd<YYYYMMDD>tch*.xml`` and the race number from
    the race's 1-based position in the file; races that cannot be dated
    either way are skipped. Only ENTRY elements with a NAME and an
    all-digit OFFICIAL_FIN produce a row.

    Args:
        xml_file: File name or file-like object accepted by ``ET.parse``.
        filename: Original file name, used for the date fallback and in
            printed messages.

    Returns:
        pandas.DataFrame: One row per finisher. An empty DataFrame is
        returned (after printing the error) when the file cannot be parsed.
    """
    try:
        root = ET.parse(xml_file).getroot()
        results = []
        # The track code is fixed for every race in this parser.
        track_code = "CD"

        races = root.findall("RACE") if root.tag != "CHART" else root.findall(".//RACE")

        for idx, race in enumerate(races):
            race_date = race.findtext("RACE_DATE")
            race_number = race.findtext("RACE_NUMBER")

            if not race_date or not race_number:
                match = re.match(r"cd(\d{8})tch.*\.xml", filename)
                if not match:
                    continue
                stamp = match.group(1)  # YYYYMMDD
                race_date = f"{stamp[4:6]}-{stamp[6:8]}-{stamp[2:4]}"
                race_number = str(idx + 1)

            # Standardize race_date to MM-DD-YY; skip dates that do not have
            # exactly three '-'-separated parts.
            try:
                month, day, year = race_date.split("-")
            except ValueError:
                continue
            race_date = f"{month.zfill(2)}-{day.zfill(2)}-{year[-2:]}"

            race_id = create_race_id(track_code, race_date, race_number)
            if not race_id:
                continue

            for entry in race.findall("ENTRY"):
                horse_name = entry.findtext("NAME")
                finish = entry.findtext("OFFICIAL_FIN")
                if horse_name and finish and finish.isdigit():
                    results.append({
                        "RaceID": race_id,
                        "HorseName": normalize_horse_name(horse_name),
                        "FinishPosition": int(finish)
                    })

        return pd.DataFrame(results)

    except Exception as e:
        print(f"❌ Error parsing {filename}: {e}")
        return pd.DataFrame()

# Parse result files
all_results = []
for fname in uploaded_results:
    print(f"📄 Parsing {fname}...")
    df = parse_race_result_xml(BytesIO(uploaded_results[fname]), filename=fname)
    if df.empty:
        # Nothing usable in this file; leaving it out keeps the concat clean.
        print(f"⚠️ No results parsed from {fname}; skipping.")
        continue
    all_results.append(df)

# Fail with a clear message instead of an opaque concat/KeyError further down.
if not all_results:
    raise ValueError("No race results were parsed from the uploaded files.")

results_df = pd.concat(all_results, ignore_index=True)

# Load features
features_df = pd.read_csv("training_data_from_xml.csv")
features_df["RaceID"] = features_df["RaceID"].apply(standardize_race_id)
features_df["HorseName"] = features_df["HorseName"].apply(normalize_horse_name)
results_df["HorseName"] = results_df["HorseName"].apply(normalize_horse_name)

# A result key that appears more than once (e.g. the same chart uploaded
# twice) would duplicate feature rows in the left merge below.
duplicate_results = results_df.duplicated(subset=["RaceID", "HorseName"])
if duplicate_results.any():
    print(f"⚠️ Dropping {int(duplicate_results.sum())} duplicate result rows.")
    results_df = results_df[~duplicate_results].reset_index(drop=True)

# Log unique RaceIDs for debugging
print("Unique RaceIDs in features_df:", features_df["RaceID"].unique())
print("Unique RaceIDs in results_df:", results_df["RaceID"].unique())

# Merge with diagnostics
merged_df = pd.merge(
    features_df,
    results_df,
    on=["RaceID", "HorseName"],
    how="left",
    indicator=True  # Add merge indicator to track matched/unmatched rows
)

# Analyze unmatched rows
unmatched = merged_df[merged_df["_merge"] == "left_only"]
print(f"Number of unmatched rows (potential -1 FinishPosition): {len(unmatched)}")
if len(unmatched) > 0:
    print("Sample of unmatched rows:")
    print(unmatched[["RaceID", "HorseName"]].head())

# Instead of filling NaN with -1, drop unmatched rows (or handle differently based on your needs)
merged_df = merged_df[merged_df["_merge"] == "both"].drop(columns=["_merge"])

# Ensure FinishPosition is an integer (the left merge made it float because
# unmatched rows held NaN before they were dropped)
merged_df["FinishPosition"] = merged_df["FinishPosition"].astype(int)

# Save
merged_df.to_csv("final_training_dataset.csv", index=False)
files.download("final_training_dataset.csv")

print(f"✅ Final dataset shape: {merged_df.shape}")
print("Finish position distribution:\n", merged_df["FinishPosition"].value_counts().sort_index())

Saving cd20230502tch.xml to cd20230502tch.xml
Saving cd20230503tch.xml to cd20230503tch.xml
Saving cd20230504tch.xml to cd20230504tch.xml
Saving cd20230505tch.xml to cd20230505tch.xml
Saving cd20230506tch.xml to cd20230506tch.xml
Saving cd20230511tch.xml to cd20230511tch.xml
Saving cd20230512tch.xml to cd20230512tch.xml
Saving cd20230513tch.xml to cd20230513tch.xml
Saving cd20230514tch.xml to cd20230514tch.xml
Saving cd20230518tch.xml to cd20230518tch.xml
Saving cd20230519tch.xml to cd20230519tch.xml
Saving cd20230520tch.xml to cd20230520tch.xml
Saving cd20230521tch.xml to cd20230521tch.xml
Saving cd20230525tch.xml to cd20230525tch.xml
Saving cd20230526tch.xml to cd20230526tch.xml
Saving cd20230527tch.xml to cd20230527tch.xml
Saving cd20230528tch.xml to cd20230528tch.xml
Saving cd20230529tch.xml to cd20230529tch.xml
📄 Parsing cd20230502tch.xml...
📄 Parsing cd20230503tch.xml...
📄 Parsing cd20230504tch.xml...
📄 Parsing cd20230505tch.xml...
📄 Parsing cd20230506tch.xml...
📄 Parsing cd20230

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Final dataset shape: (1470, 78)
Finish position distribution:
 FinishPosition
1     177
2     175
3     177
4     178
5     176
6     160
7     134
8     104
9      71
10     55
11     33
12     19
13      5
14      2
15      1
16      1
17      1
18      1
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load the final dataset
df = pd.read_csv("final_training_dataset.csv")

# Every row is one entry; distinct RaceID values identify the races
num_entries = df.shape[0]
num_races = df["RaceID"].nunique()

# Print results
print(f"Number of unique races: {num_races}")
print(f"Total number of entries: {num_entries}")

# Optional: average entries per race (skipped when there are no races to divide by)
if num_races:
    avg_entries_per_race = num_entries / num_races
    print(f"Average entries per race: {avg_entries_per_race:.2f}")

Number of unique races: 179
Total number of entries: 1470
Average entries per race: 8.21


In [None]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("final_training_dataset.csv")

# --- Step 1: Standardize Surface ---
# Applies to the current-race "Surface" column and every PP*_Surface column.
for col in [col for col in df.columns if "Surface" in col]:
    df[col] = df[col].replace({"D": "Dirt", "T": "Turf"})

# --- Step 2: Cap LengthsBack outliers at 100.0 ---
# clip() leaves missing values untouched.
for col in [col for col in df.columns if "LengthsBack" in col]:
    df[col] = df[col].clip(upper=100.0)

# --- Step 3: Handle missing past performance data ---
numerical_pp_cols = [col for col in df.columns if col.startswith("PP") and
                    ("Distance" in col or "Purse" in col or "Lengths" in col or
                     "NumStarters" in col or "Position" in col)]
categorical_pp_cols = [col for col in df.columns if col.startswith("PP") and
                      ("Surface" in col or "Jockey" in col or "Trainer" in col)]

df[numerical_pp_cols] = df[numerical_pp_cols].fillna(0)
df[categorical_pp_cols] = df[categorical_pp_cols].fillna("Unknown")

# --- Step 4: Feature Engineering ---

# Past finish positions with the Step 3 zero-fill turned back into "missing".
# Computed once with replace() rather than the deprecated DataFrame.applymap.
pp_finish_cols = [f"PP{i}_FinishPosition" for i in range(1, 6)]
past_finishes = df[pp_finish_cols].replace(0, np.nan)

# 1. AvgPastFinishPosition (0 when the horse has no recorded past finish)
df["AvgPastFinishPosition"] = past_finishes.mean(axis=1).fillna(0)

# 2. WinRate (fraction of past races won; 0/0 becomes NaN and is filled with 0)
past_races_run = past_finishes.notna().sum(axis=1)
df["WinRate"] = (past_finishes.eq(1).sum(axis=1) / past_races_run).fillna(0)

# 3. DistanceSuitability (mean absolute gap between past and current distance)
# NOTE(review): missing past distances were filled with 0 in Step 3, so each
# one contributes the full current distance to this mean. Confirm that this
# is intended; the finish-position features above treat 0 as missing.
pp_distance_cols = [f"PP{i}_Distance" for i in range(1, 6)]
df["DistanceSuitability"] = df[pp_distance_cols].sub(df["Distance"], axis=0).abs().mean(axis=1).fillna(0)

# 4. JockeyWinRate (global average win rate per jockey)
# NOTE(review): this is computed from FinishPosition, which is the target of
# these same rows, so every row's own outcome leaks into its feature.
# Consider a leave-one-out or prior-races-only rate before training on it.
df["JockeyWinRate"] = df.groupby("Jockey")["FinishPosition"].transform(lambda x: (x == 1).mean())

# 5. FieldStrength (average past finish position across the race's field)
# NOTE(review): the group mean includes the horse itself, and horses without
# history contribute AvgPastFinishPosition == 0.
df["FieldStrength"] = df.groupby("RaceID")["AvgPastFinishPosition"].transform("mean")

# 6. SurfaceWinRate
pp_surface_cols = [f"PP{i}_Surface" for i in range(1, 6)]
def surface_win_rate(row):
    """Wins on the current race's surface divided by all past races with a known surface."""
    matches = 0
    total = 0
    for i in range(1, 6):
        surf = row.get(f"PP{i}_Surface")
        win = row.get(f"PP{i}_FinishPosition")
        if pd.notna(surf) and surf != "Unknown":
            total += 1
            if win == 1 and surf == row["Surface"]:
                matches += 1
    return matches / total if total > 0 else 0
df["SurfaceWinRate"] = df.apply(surface_win_rate, axis=1)

# --- Step 5: Normalize Race Distance ---
# Min-max scaling; when every race has the same distance the range is 0, so
# the centred values (all 0) are kept instead of dividing by zero.
distance_centered = df["Distance"] - df["Distance"].min()
distance_range = df["Distance"].max() - df["Distance"].min()
df["Distance"] = distance_centered / distance_range if distance_range != 0 else distance_centered

# --- Step 6: One-hot encode Surface ---
df = pd.get_dummies(df, columns=["Surface"], prefix="Surface")
# get_dummies only creates columns for values present in the data; make sure
# both columns selected below exist even if one surface never occurs.
for dummy_col in ("Surface_Dirt", "Surface_Turf"):
    if dummy_col not in df.columns:
        df[dummy_col] = False

# --- Step 7: Select final features ---
selected_columns = [
    "HorseName", "RaceID", "FinishPosition",  # Target = FinishPosition
    "Distance", "Surface_Dirt", "Surface_Turf",
    "AvgPastFinishPosition", "WinRate", "DistanceSuitability",
    "JockeyWinRate", "FieldStrength", "SurfaceWinRate"
]
final_df = df[selected_columns]

# --- Step 8: Save to CSV ---
final_df.to_csv("selected_features_dataset.csv", index=False)

# --- Summary ---
print("✅ Dataset created for FinishPosition regression.")
print(f"Shape: {final_df.shape}")
print("Columns:", final_df.columns.tolist())
print("\nSample rows:")
print(final_df.head())


✅ Dataset created for FinishPosition regression.
Shape: (1470, 12)
Columns: ['HorseName', 'RaceID', 'FinishPosition', 'Distance', 'Surface_Dirt', 'Surface_Turf', 'AvgPastFinishPosition', 'WinRate', 'DistanceSuitability', 'JockeyWinRate', 'FieldStrength', 'SurfaceWinRate']

Sample rows:
            HorseName           RaceID  FinishPosition  Distance  \
0        gormleyesque  CD-05-02-23-R01               2  0.333333   
1      kentucky reign  CD-05-02-23-R01               3  0.333333   
2  dogwoodsmilliejane  CD-05-02-23-R01               1  0.333333   
3         girls house  CD-05-02-23-R01               4  0.333333   
4              recite  CD-05-02-23-R01               5  0.333333   

   Surface_Dirt  Surface_Turf  AvgPastFinishPosition  WinRate  \
0          True         False               6.750000      0.0   
1          True         False               8.000000      0.0   
2          True         False               7.000000      0.0   
3          True         False               

  df["AvgPastFinishPosition"] = df[pp_finish_cols].applymap(lambda x: np.nan if x == 0 else x).mean(axis=1).fillna(0)
  df["WinRate"] = df[pp_finish_cols].applymap(lambda x: np.nan if x == 0 else x).eq(1).sum(axis=1) / \
  df[pp_finish_cols].applymap(lambda x: np.nan if x == 0 else x).notna().sum(axis=1)
