In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the processed dataset
df = pd.read_csv('/Users/jerry/Desktop/fsan830spring2025/data/processed/CDX0515_processed.csv')

# Display the first few rows to understand the data structure
print("Dataset loaded successfully. Here's a preview:")
display(df.head())

# Basic information about the dataset
print("\nDataset Information:")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum().sum())
print(f"Percentage of missing values: {(df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100:.2f}%")

# Display the dataframe
df.head()

# Function to extract first name from trainer and jockey names
def extract_first_name(name):
    if pd.isna(name):
        return name
    # Convert to string if it's not already
    if not isinstance(name, str):
        return name
    # Handle special cases with suffixes like "Jr." or "II"
    if "," in name:
        # For names like "Hernandez, Jr." - return just the last name
        return name.split(",")[0].strip()
    # For regular names, take the first word
    return name.split()[0].strip()

# Let's create a new dataframe with the features we want to keep
features_df = df[['race_number', 'distance', 'purse', 'horse_name', 'surface_code','trainer_name','trainer_roi']]
# Apply the function to extract first names
features_df['jockey_name'] = features_df['trainer_roi'].apply(extract_first_name)
features_df['trainer_name'] = features_df['trainer_name'].apply(extract_first_name)

# Display the first few rows after transformation
print("\nAfter extracting first names:")
display(features_df[['jockey_name', 'trainer_name']].head())
# Load the mapping files for horse_name, surface_code, jockey, and trainer
horse_mapping = pd.read_csv('/Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/horse_mapping.csv')
surface_mapping = pd.read_csv('/Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/surface_mapping.csv')
jockey_mapping = pd.read_csv('/Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/jockey_mapping.csv')
trainer_mapping = pd.read_csv('/Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/trainer_mapping.csv')  # Added error handling for potential bad lines


# Convert names to lowercase for case-insensitive matching
features_df['horse_name_lower'] = features_df['horse_name'].astype(str).str.lower()
horse_mapping['horse_name_lower'] = horse_mapping['horse_name'].astype(str).str.lower()
features_df['jockey_name_lower'] = features_df['jockey_name'].astype(str).str.lower()
jockey_mapping['jockey_name_lower'] = jockey_mapping['jockey_name'].astype(str).str.lower()
features_df['trainer_name_lower'] = features_df['trainer_name'].astype(str).str.lower()
trainer_mapping['trainer_name_lower'] = trainer_mapping['trainer_name'].astype(str).str.lower()

# Create dictionaries for quick lookup
horse_dict = dict(zip(horse_mapping['horse_name_lower'], horse_mapping['horse_id']))
surface_dict = dict(zip(surface_mapping['surface_type'], surface_mapping['surface_id']))
jockey_dict = dict(zip(jockey_mapping['jockey_name_lower'], jockey_mapping['jockey_id']))
trainer_dict = dict(zip(trainer_mapping['trainer_name_lower'], trainer_mapping['trainer_id']))

# Add encoded columns to the features dataframe
features_df['horse_id'] = features_df['horse_name_lower'].map(horse_dict)
features_df['surface_id'] = features_df['surface_code'].map(surface_dict)
features_df['jockey_id'] = features_df['jockey_name_lower'].map(jockey_dict)
features_df['trainer_id'] = features_df['trainer_name_lower'].map(trainer_dict)

# Remove the temporary lowercase columns
features_df = features_df.drop(['horse_name_lower', 'jockey_name_lower', 'trainer_name_lower'], axis=1)

# Check if any horses, surfaces, jockeys, or trainers weren't found in the mapping
missing_horses = features_df[features_df['horse_id'].isna()]['horse_name'].unique()
missing_surfaces = features_df[features_df['surface_id'].isna()]['surface_code'].unique()
missing_jockeys = features_df[features_df['jockey_id'].isna()]['jockey_name'].unique()
missing_trainers = features_df[features_df['trainer_id'].isna()]['trainer_name'].unique()

# Print the number of missing values for each ID column
print(f"Missing horse_id: {features_df['horse_id'].isna().sum()}")
print(f"Missing surface_id: {features_df['surface_id'].isna().sum()}")
print(f"Missing jockey_id: {features_df['jockey_id'].isna().sum()}")
print(f"Missing trainer_id: {features_df['trainer_id'].isna().sum()}")

# For missing jockey_ids, assign random values from the existing mapping
if features_df['jockey_id'].isna().sum() > 0:
    # Get all available jockey IDs from the mapping file
    available_jockey_ids = jockey_mapping['jockey_id'].values
    
    # For each row with missing jockey_id, assign a random ID from the available ones
    for idx in features_df[features_df['jockey_id'].isna()].index:
        features_df.at[idx, 'jockey_id'] = np.random.choice(available_jockey_ids)
    
    print(f"Assigned random jockey_ids to {features_df['jockey_id'].isna().sum()} missing values")

# For missing trainer_ids, assign random values from the existing mapping
if features_df['trainer_id'].isna().sum() > 0:
    # Get all available trainer IDs from the mapping file
    available_trainer_ids = trainer_mapping['trainer_id'].values
    
    # For each row with missing trainer_id, assign a random ID from the available ones
    for idx in features_df[features_df['trainer_id'].isna()].index:
        features_df.at[idx, 'trainer_id'] = np.random.choice(available_trainer_ids)
    
    print(f"Assigned random trainer_ids to {features_df['trainer_id'].isna().sum()} missing values")

# For missing horse_ids, assign random values from the existing mapping
if features_df['horse_id'].isna().sum() > 0:
    # Get all available horse IDs from the mapping file
    available_horse_ids = horse_mapping['horse_id'].values
    
    # For each row with missing horse_id, assign a random ID from the available ones
    for idx in features_df[features_df['horse_id'].isna()].index:
        features_df.at[idx, 'horse_id'] = np.random.choice(available_horse_ids)
    
    print(f"Assigned random horse_ids to {features_df['horse_id'].isna().sum()} missing values")

# For missing surface_ids, assign random values from the existing mapping
if features_df['surface_id'].isna().sum() > 0:
    # Get all available surface IDs from the mapping file
    available_surface_ids = surface_mapping['surface_id'].values
    
    # For each row with missing surface_id, assign a random ID from the available ones
    for idx in features_df[features_df['surface_id'].isna()].index:
        features_df.at[idx, 'surface_id'] = np.random.choice(available_surface_ids)
    
    print(f"Assigned random surface_ids to {features_df['surface_id'].isna().sum()} missing values")

# Check if there are still any missing values after random assignment
print("\nAfter random assignment:")
print(f"Missing horse_id: {features_df['horse_id'].isna().sum()}")
print(f"Missing surface_id: {features_df['surface_id'].isna().sum()}")
print(f"Missing jockey_id: {features_df['jockey_id'].isna().sum()}")
print(f"Missing trainer_id: {features_df['trainer_id'].isna().sum()}")


if len(missing_horses) > 0:
    print(f"Warning: {len(missing_horses)} horses not found in mapping file")
    
if len(missing_surfaces) > 0:
    print(f"Warning: {len(missing_surfaces)} surface types not found in mapping file")

# Rename race_number to race_num as requested
features_df = features_df.rename(columns={'race_number': 'race_num'})

# Display the first few rows of our selected features
print("Selected features:")
display(features_df.head())

# Get basic statistics for our selected features
print("\nStatistics for selected features:")
display(features_df.describe())


# Save the dataset to a CSV file
output_path = '/Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/features_dataset_0515.csv'
features_df.to_csv(output_path, index=False)

print(f"\nDataset saved to {output_path}")
print(f"Final dataset shape: {features_df.shape}")
print("\nColumns in the final dataset:")
print(features_df.columns.tolist())


Dataset loaded successfully. Here's a preview:


Unnamed: 0,track_code,race_date,race_number,post_position,entry,distance,surface_code,col_8,race_type,claiming_price_category,...,col_1426,col_1427,col_1428,col_1429,col_1430,col_1431,col_1432,col_1433,col_1434,col_1435
0,CD,20250515,1,1,,1430,D,,C,BUM,...,fClm50000NW2L,fClm40000,fOC100000NW1X,Clm20000C,,,,,O ?/a{,
1,CD,20250515,1,2,,1430,D,,C,BUM,...,fMdSpWt,fMdSpWt,,Clm20000C,,,,,Q2CzaH,
2,CD,20250515,1,3,,1430,D,,C,BUM,...,fMd30000,fMdSpWt,fMdSpWt,Clm20000C,,,,,MfLO8.,
3,CD,20250515,1,4,,1430,D,,C,BUM,...,,,,Clm20000C,,,,,>#>0uM,
4,CD,20250515,1,5,,1430,D,,C,BUM,...,fMd50000,fMdSpWt,fMdSpWt,Clm20000C,,,,,HS##@Q,



Dataset Information:
Number of rows: 86
Number of columns: 1435

Missing values per column:
69095
Percentage of missing values: 55.99%

After extracting first names:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['jockey_name'] = features_df['trainer_roi'].apply(extract_first_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['trainer_name'] = features_df['trainer_name'].apply(extract_first_name)


Unnamed: 0,jockey_name,trainer_name
0,ROCCO,PITTS
1,LOVEBERRY,CATALANO
2,LANERIE,FORSTER
3,PEDROZA,BAHENA
4,SAEZ,SANTAMARIA


Missing horse_id: 80
Missing surface_id: 0
Missing jockey_id: 20
Missing trainer_id: 19
Assigned random jockey_ids to 0 missing values
Assigned random trainer_ids to 0 missing values
Assigned random horse_ids to 0 missing values

After random assignment:
Missing horse_id: 0
Missing surface_id: 0
Missing jockey_id: 0
Missing trainer_id: 0
Selected features:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['horse_name_lower'] = features_df['horse_name'].astype(str).str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['jockey_name_lower'] = features_df['jockey_name'].astype(str).str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df['trainer_name_lower'] =

Unnamed: 0,race_num,distance,purse,horse_name,surface_code,trainer_name,trainer_roi,jockey_name,horse_id,surface_id,jockey_id,trainer_id
0,1,1430,60000,BALLADRY,D,PITTS,ROCCO J S JR,ROCCO,6562.0,1,63.0,39.0
1,1,1430,60000,WHERE'S THE WINE,D,CATALANO,LOVEBERRY JARETH,LOVEBERRY,6736.0,1,84.0,553.0
2,1,1430,60000,PRINCESS POM POM,D,FORSTER,LANERIE C J,LANERIE,4804.0,1,65.0,898.0
3,1,1430,60000,ASK AMANDA,D,BAHENA,PEDROZA MARCELINO,PEDROZA,5051.0,1,6.0,867.0
4,1,1430,60000,SPIRIT RULES,D,SANTAMARIA,SAEZ GABRIEL,SAEZ,5296.0,1,42.0,106.0



Statistics for selected features:


Unnamed: 0,race_num,distance,purse,horse_id,surface_id,jockey_id,trainer_id
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,4.953488,1404.418605,90465.116279,4802.139535,1.372093,53.476744,471.918605
std,2.269527,298.839342,32563.674339,2858.738184,0.486198,26.367416,279.801914
min,1.0,1100.0,35000.0,10.0,1.0,6.0,29.0
25%,3.0,1100.0,67000.0,1903.75,1.0,36.0,244.0
50%,5.0,1320.0,83000.0,4818.0,1.0,46.5,410.5
75%,7.0,1540.0,120000.0,7041.5,2.0,78.0,720.25
max,8.0,1980.0,127000.0,9843.0,2.0,98.0,986.0



Dataset saved to /Users/jerry/Desktop/fsan830spring2025/students/Zhiyuan_Dong/features_dataset_0515.csv
Final dataset shape: (86, 12)

Columns in the final dataset:
['race_num', 'distance', 'purse', 'horse_name', 'surface_code', 'trainer_name', 'trainer_roi', 'jockey_name', 'horse_id', 'surface_id', 'jockey_id', 'trainer_id']


In [32]:
# Let's filter the dataframe to find horses in race 4567
race_4567 = df[df['race_number'] == 4567]

# Check if we found any horses for this race
if len(race_4567) > 0:
    # Print the horse names and their starting positions
    print("Horses and starting positions for race 4567:")
    for _, row in race_4567.iterrows():
        horse_name = row['horse_name'] if 'horse_name' in row else "Name not available"
        post_position = row['post_position'] if 'post_position' in row else "Position not available"
        print(f"Horse: {horse_name}, Starting Position: {post_position}")
else:
    print("No horses found for race 4567.")
    
    # Let's also check what race numbers are available in the dataset
    available_races = df['race_number'].unique()
    print(f"Available race numbers in the dataset: {sorted(available_races)}")


No horses found for race 4567.
Available race numbers in the dataset: [1, 2, 3, 4, 5, 6, 7, 8]
