1. data_loading.py

In [213]:
%%writefile ../../src/mariners_interview/data_loading.py

import pandas as pd

def load_data(train_path, test_path, dict_path, debug=False):
    """
    Load the training and testing datasets.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    data_dict_df = pd.read_csv(dict_path)
    
    if debug:
        print("Train Data Shape:", train_df.head())
        print("Test Data Shape:", test_df.head())
        print("Data Dictionary Shape:", data_dict_df)
    
    return train_df, test_df, data_dict_df

if __name__ == '__main__':
    # Paths to data files
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'
    
    # Load data with debug mode enabled
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)

Overwriting ../../src/mariners_interview/data_loading.py


2. preprocessing.py

In [214]:
%%writefile ../../src/mariners_interview/preprocessing.py

def clean_data(df, debug=False):
    if debug:
        print(f"DataFrame shape after loading data: {df.shape}")
        print(f"Columns before cleaning: {df.columns.tolist()}")
    # Assuming the cleaning process doesn't drop the 'inning' column
    if debug:
        print(f"Columns after cleaning: {df.columns.tolist()}")
    return df

def handle_missing_values(df, debug=False):
    if debug:
        print(f"Columns before handling missing values: {df.columns.tolist()}")
    # Example of a missing value handling process
    df = df[df['hit_spin_rate'].notnull()]  # Assuming hit_spin_rate is critical and should not have nulls

    if debug:
        print(f"Columns after handling missing values: {df.columns.tolist()}")
    return df

if __name__ == '__main__':
    # import pandas as pd
    # from data_loading import load_data
    
    # Load data for testing
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'
    
    # Load data with debug mode enabled
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)
    
    # Clean data with debug mode enabled
    train_df = clean_data(train_df, debug=True)
    train_df = handle_missing_values(train_df, debug=True)


Overwriting ../../src/mariners_interview/preprocessing.py


3. feature_engineering.py

In [215]:
%%writefile ../../src/mariners_interview/feature_engineering.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_hit_trajectory(vert_exit_angle, horz_exit_angle, exit_speed, spin_rate, gravity=32.174, time_intervals=100):
    """
    Calculate the hit trajectory based on initial speed and angles.
    Args:
    - vert_exit_angle: Vertical exit angle in degrees.
    - horz_exit_angle: Horizontal exit angle in degrees.
    - exit_speed: Initial exit speed in mph.
    - spin_rate: Spin rate in rpm.
    - gravity: Gravity constant in ft/s^2.
    - time_intervals: Number of time intervals for calculating the trajectory.

    Returns:
    - x_values: List of x coordinates of the trajectory.
    - y_values: List of y coordinates of the trajectory.
    """
    # Convert exit speed to ft/s (1 mph = 1.46667 ft/s)
    exit_speed_ft_s = exit_speed * 1.46667

    # Convert angles to radians
    vert_angle_rad = np.radians(vert_exit_angle)
    horz_angle_rad = np.radians(horz_exit_angle)

    # Calculate initial velocity components
    vx = exit_speed_ft_s * np.cos(vert_angle_rad)
    vy = exit_speed_ft_s * np.sin(vert_angle_rad)

    # Calculate the effect of spin on horizontal and vertical distances
    # Approximate adjustment factor due to spin rate
    spin_effect = 1 + (spin_rate / 15000)

    # Calculate trajectory points
    time_points = np.linspace(0, 5, time_intervals)  # 5 seconds max trajectory time
    x_values = (vx * time_points) * np.cos(horz_angle_rad) * spin_effect
    y_values = (vy * time_points - 0.5 * gravity * time_points ** 2) * spin_effect

    # Remove any points where y is negative (ground level or below)
    valid_indices = y_values >= 0
    x_values = x_values[valid_indices]
    y_values = y_values[valid_indices]

    return x_values, y_values

def visualize_hit_trajectory(df, boundary=None, debug=False):
    """
    Visualize the hit's trajectory and landing point with respect to the field boundaries.
    Ensure that the boundary and trajectory visualization match the calculation methods.
    """
    fig, ax = plt.subplots(figsize=(12, 10))

    # Simulate outfield boundary using the same function and parameters as used for determining home runs
    if boundary is None:
        boundary = simulate_outfield_boundary(average_distances, debug=debug)

    boundary_x, boundary_y, _, _ = boundary

    # Visualize the outfield boundary
    ax.plot(boundary_x, boundary_y, color='black', linestyle='--', linewidth=2, label='Outfield Boundary')

    # Plot each hit's trajectory
    for idx, row in df.iterrows():
        # Calculate the trajectory using the initial speed, angles, and spin rate
        x_traj, y_traj = calculate_hit_trajectory(
            row['vert_exit_angle'],
            row['horz_exit_angle'],
            row['exit_speed'],
            row['hit_spin_rate']
        )

        # Plot trajectory line
        ax.plot(x_traj, y_traj, linestyle='-', alpha=0.6)

        # Plot the landing point
        ax.scatter(row['landing_x_adjusted'], row['landing_y_adjusted'], color='red', s=50, label='Landing Point' if idx == 0 else "")

    # Set field boundary limits to show the full field
    max_x = max(boundary_x) + 50
    max_y = max(boundary_y) + 50
    ax.set_xlim(-max_x, max_x)
    ax.set_ylim(0, max_y)

    # Formatting the plot
    ax.set_title("Hit Trajectory and Landing Points with Outfield Boundary")
    ax.set_xlabel("Landing X (ft)")
    ax.set_ylabel("Landing Y (ft)")
    ax.axhline(0, color='black', linewidth=1)  # Baseline for y-axis
    ax.axvline(0, color='black', linewidth=1)  # Baseline for x-axis
    ax.legend(title='Legend')
    ax.grid(True)

    return fig



#https://www.si.com/mlb/2021/03/24/mlb-outfield-walls-ranked-fenway-park-yankee-stadium

# Provided data
stadium_data = {
    'Kauffman Stadium': [330, 387, 410, 387, 330],
    'Rogers Centre': [328, 375, 400, 375, 328],
    'TD Ballpark': [333, 380, 400, 363, 336],
    'Busch Stadium': [336, 375, 400, 375, 335],
    'Dodger Stadium': [330, 360, 375, 400, 375, 360, 330],
    'Guaranteed Rate Field': [330, 375, 400, 375, 335],
    'Oakland Coliseum': [330, 388, 400, 388, 330],
    'Marlins Park': [344, 386, 400, 387, 335],
    'Miller Park': [344, 371, 400, 374, 345],
    'T-Mobile Park': [331, 378, 401, 381, 326],
    'Citi Field': [335, 358, 385, 408, 398, 375, 330],
    'Tropicana Field': [315, 370, 404, 370, 322],
    'Truist Park': [335, 385, 400, 375, 325],
    'Wrigley Field': [355, 368, 400, 368, 353],
    'Coors Field': [347, 390, 415, 375, 350],
    'Angel Stadium': [347, 390, 396, 370, 365, 350],
    'Comerica Park': [345, 370, 420, 365, 330],
    'Great American Ball Park': [328, 379, 404, 370, 325],
    'Nationals Park': [337, 377, 402, 370, 335],
    'Progressive Field': [325, 370, 400, 410, 375, 325],
    'Target Field': [339, 377, 411, 403, 367, 328],
    'Oriole Park at Camden Yards': [333, 364, 410, 400, 373, 318],
    'Chase Field': [330, 374, 413, 407, 413, 374, 334],
    'Globe Life Field': [329, 372, 407, 374, 326],
    'Petco Park': [334, 357, 390, 396, 391, 382, 322],
    'Citizens Bank Park': [329, 374, 409, 401, 369, 330],
    'Yankee Stadium': [318, 399, 408, 385, 314],
    'PNC Park': [325, 383, 410, 399, 375, 320],
    'Minute Maid Park': [315, 362, 404, 409, 408, 373, 326],
    'Oracle Park': [339, 364, 399, 391, 415, 365, 309],
    'Fenway Park': [310, 379, 390, 420, 380, 302]
}

# Initialize an empty list to collect data
data = []

for stadium, distances in stadium_data.items():
    num_points = len(distances)
    if num_points == 5:
        # Map directly
        P1 = distances[0]  # Left Field Line
        P2 = distances[1]  # Left-Center Field
        P3 = distances[2]  # Center Field
        P4 = distances[3]  # Right-Center Field
        P5 = distances[4]  # Right Field Line
    elif num_points == 6:
        # Use positions 0,1,2,3,5
        P1 = distances[0]
        P2 = distances[1]
        P3 = distances[2]
        P4 = distances[3]
        P5 = distances[5]
    elif num_points == 7:
        # Use positions 0,2,3,4,6
        P1 = distances[0]
        P2 = distances[2]
        P3 = distances[3]
        P4 = distances[4]
        P5 = distances[6]
    else:
        # Handle other cases if necessary
        continue  # Skip if the number of distances is not 5,6,7

    data.append({
        'Stadium': stadium,
        'P1': P1,
        'P2': P2,
        'P3': P3,
        'P4': P4,
        'P5': P5
    })

# Create DataFrame
df = pd.DataFrame(data)

# Calculate averages
average_P1 = df['P1'].mean()
average_P2 = df['P2'].mean()
average_P3 = df['P3'].mean()
average_P4 = df['P4'].mean()
average_P5 = df['P5'].mean()

# Display the DataFrame and averages
print("Stadium Distances DataFrame:")
print(df.to_string(index=False))

print("\nAverage Distances:")
print(f"Left Field Line (P1): {average_P1:.2f} ft")
print(f"Left-Center Field (P2): {average_P2:.2f} ft")
print(f"Center Field (P3): {average_P3:.2f} ft")
print(f"Right-Center Field (P4): {average_P4:.2f} ft")
print(f"Right Field Line (P5): {average_P5:.2f} ft")



def categorize_inning(df, debug=False):
    if debug:
        print(f"Columns before categorizing innings: {df.columns.tolist()}")
        print(f"Checking if 'inning' exists in the DataFrame: {'inning' in df.columns}")
        if 'inning' not in df.columns:
            print("ERROR: 'inning' column is missing. Exiting function early.")
            return df  # Exit early if the 'inning' column is missing

    # Proceed to categorize 'inning' only if it exists
    if 'inning' in df.columns:
        df['inning_group'] = pd.cut(df['inning'], bins=[0, 3, 6, np.inf], labels=['Early', 'Mid', 'Late'])
        df = df.drop(columns=['inning'])  # Drop the original inning column if not needed
    if debug:
        print(f"Columns after categorizing innings: {df.columns.tolist()}")
    return df

def create_count_scenario(df, debug=False):
    """
    Combine 'pre_balls', 'pre_strikes', 'pre_outs', and 'inning_group' into 'count_scenario'.
    """
    if debug:
        print(f"Columns before create_count_scenario: {df.columns.tolist()}")
    # Check for required columns before proceeding
    required_columns = ['pre_balls', 'pre_strikes', 'pre_outs', 'inning_group']
    for col in required_columns:
        if col not in df.columns:
            raise KeyError(f"Missing required column: {col}. Available columns: {df.columns.tolist()}")
    
    # Combine game context features into a single string representation
    df['count_scenario'] = (df['pre_balls'].astype(str) + '-' + 
                            df['pre_strikes'].astype(str) + '-' +
                            df['pre_outs'].astype(str) + '-' +
                            df['inning_group'].astype(str))
    
    # Drop the original columns if not needed anymore
    df = df.drop(columns=['pre_balls', 'pre_strikes', 'pre_outs', 'inning_group'])
    
    if debug:
        print("Unique 'count_scenario's:", df['count_scenario'].unique())
    
    return df

def transform_gamedate(df, debug=False):
    if debug:
        print("Transforming 'gamedate' column...")
    df['gamedate'] = pd.to_datetime(df['gamedate'])
    df['month'] = df['gamedate'].dt.month
    df['day_of_week'] = df['gamedate'].dt.day_name()
    df['is_weekend'] = df['gamedate'].dt.dayofweek >= 5
    df = df.drop(columns=['gamedate'])
    if debug:
        print("Transformed columns:", df.columns)
    return df

def categorize_temperature(df, debug=False):
    if debug:
        print("Categorizing 'temperature' column...")
    bins = [0, 70, 90, np.inf]
    labels = ['Cold', 'Moderate', 'Hot']
    df['temperature_category'] = pd.cut(df['temperature'], bins=bins, labels=labels)
    df = df.drop(columns=['temperature'])
    if debug:
        print("Temperature categories assigned:", df['temperature_category'].unique())
    return df

def calculate_physics_features(df, debug=False):
    if debug:
        print("Calculating physics-based features...")
    GRAVITY = 32.174  # ft/s^2
    df['vert_angle_rad'] = np.radians(df['vert_exit_angle'])
    df['estimated_distance'] = ((df['exit_speed'] ** 2) * np.sin(2 * df['vert_angle_rad'])) / GRAVITY
    df['landing_x'] = df['estimated_distance'] * np.sin(np.radians(df['horz_exit_angle']))
    df['landing_y'] = df['estimated_distance'] * np.cos(np.radians(df['horz_exit_angle']))
    df['adjusted_distance'] = df['estimated_distance'] * (1 + (df['hit_spin_rate'] / 15000))
    df['landing_x_adjusted'] = df['adjusted_distance'] * np.sin(np.radians(df['horz_exit_angle']))
    df['landing_y_adjusted'] = df['adjusted_distance'] * np.cos(np.radians(df['horz_exit_angle']))
    
    if debug:
        print("Physics features calculated.")
        print(df[['estimated_distance', 'adjusted_distance', 'landing_x_adjusted', 'landing_y_adjusted']].head())
    
    return df

def determine_home_run(df, boundary=None, debug=False):
    """
    Determine if each hit in the DataFrame is a home run using the precomputed outfield boundary.
    """
    if boundary is None:
        boundary = simulate_outfield_boundary(average_distances, debug=False)  # Use precomputed boundary

    boundary_x, boundary_y, _, _ = boundary

    def is_home_run(row):
        x = row['landing_x_adjusted']
        y = row['landing_y_adjusted']
        distance = np.sqrt(x**2 + y**2)

        # Calculate the angle of the hit
        hit_angle = np.degrees(np.arctan2(x, y))

        # Find the corresponding boundary distance for this angle
        if -45 <= hit_angle <= 45:
            # Find the closest point in the boundary array
            idx = np.abs(np.linspace(-45, 45, 100) - hit_angle).argmin()
            boundary_distance = np.sqrt(boundary_x[idx]**2 + boundary_y[idx]**2)
            return distance >= boundary_distance

        return False

    df['is_home_run'] = df.apply(is_home_run, axis=1)
    if debug:
        print("Number of home runs:", df['is_home_run'].sum())
        print(df[df['is_home_run'] == True][['hit_direction', 'adjusted_distance']].head())

    return df

global_boundary = None

average_distances = {
    'P1': 332.45,  # Average Left Field Line
    'P2': 381.55,  # Average Left-Center Field
    'P3': 403.48,  # Average Center Field
    'P4': 385.81,  # Average Right-Center Field
    'P5': 329.16   # Average Right Field Line
}

# Updated function to simulate outfield boundary with debug outputs at every 5 feet
def simulate_outfield_boundary(average_distances, debug=False):
    """
    Create a gradual outfield boundary using a polynomial curve fit or similar, using average distances.
    Output dimensions once for debugging purposes if debug is True.
    """
    global global_boundary  # Use a global boundary to avoid repeated calculations
    
    # If the boundary is already calculated, return it directly.
    if global_boundary is not None:
        return global_boundary

    # Extract average distances for each field position
    P1 = average_distances['P1']
    P2 = average_distances['P2']
    P3 = average_distances['P3']
    P4 = average_distances['P4']
    P5 = average_distances['P5']

    # Define angles corresponding to each point
    angles = np.linspace(-45, 45, 100)  # Covering left to right field (in degrees)
    angles_rad = np.radians(angles)

    # Calculate polynomial coefficients based on these points
    # Create a smooth curve that fits through (Left Field Line, Left-Center, Center Field, Right-Center, Right Field Line)
    boundary_coefficients = np.polyfit(
        [-45, -22.5, 0, 22.5, 45],  # angles corresponding to each average point
        [P1, P2, P3, P4, P5],  # distance values
        deg=3  # Cubic polynomial fit
    )

    # Generate boundary distances using the fitted polynomial
    boundary_distances = np.polyval(boundary_coefficients, angles)

    # Calculate boundary x and y positions based on these distances
    boundary_x = boundary_distances * np.sin(angles_rad)
    boundary_y = boundary_distances * np.cos(angles_rad)

    # Generate outfield boundary points every 5 feet for debugging purposes
    distances_5ft = np.arange(0, max(boundary_distances), 5)
    boundary_x_5ft = []
    boundary_y_5ft = []

    # Populate boundary values at every 5-foot interval for the outfield
    for d in distances_5ft:
        # Find the corresponding x and y for this distance
        idx = (np.abs(boundary_distances - d)).argmin()
        boundary_x_5ft.append(boundary_x[idx])
        boundary_y_5ft.append(boundary_y[idx])

        if debug:
            print(f"Outfield Boundary at {d} ft: (x={boundary_x[idx]:.2f}, y={boundary_y[idx]:.2f})")

    # Store the calculated boundary in the global variable
    global_boundary = (boundary_x, boundary_y, boundary_x_5ft, boundary_y_5ft)

    return global_boundary


def visualize_in_park_foul_balls(df, title='In-Park Foul Balls'):
    """
    Visualize the in-park foul balls that are catchable.
    """
    plt.figure(figsize=(12, 10))
    in_park_foul_df = df[df['is_foul'] & ~df['is_home_run']]  # Filter only catchable in-park foul balls
    
    sns.scatterplot(x='landing_x_adjusted', y='landing_y_adjusted', hue='hit_direction', data=in_park_foul_df, palette='deep')
    
    # Additional visual aids
    plt.axhline(0, color='black', linewidth=1)  # Baseline for y-axis
    plt.axvline(0, color='black', linewidth=1)  # Baseline for x-axis
    plt.title(title)
    plt.xlabel('Landing X (ft)')
    plt.ylabel('Landing Y (ft)')
    plt.legend(title='Hit Direction')
    plt.grid()
    plt.show()
    
def visualize_hits_with_field_boundary(df, title='Hit Landing Points', hits_type='all', debug=False):
    """
    Visualize the landing points of hits with different categories, including the outfield boundary.
    """
    # Generate the field boundary
    boundary_x, boundary_y, _, _ = simulate_outfield_boundary(average_distances, debug=True)
    
    plt.figure(figsize=(12, 10))
    
    # Determine the data to plot based on the hit type
    if hits_type == 'all':
        data = df
    elif hits_type == 'home_runs':
        data = df[df['is_home_run']]
    elif hits_type == 'foul_balls':
        data = df[df['is_foul']]
    else:
        data = df

    # Plot the hit points
    sns.scatterplot(x='landing_x_adjusted', y='landing_y_adjusted', hue='hit_direction', data=data, palette='deep')

    # Plot the field boundary
    plt.plot(boundary_x, boundary_y, color='black', linestyle='--', linewidth=2, label='Outfield Boundary')
    
    # Additional visual aids
    plt.axhline(0, color='black', linewidth=1)  # Baseline for y-axis
    plt.axvline(0, color='black', linewidth=1)  # Baseline for x-axis
    
    # Set the title and labels
    plt.title(title)
    plt.xlabel('Landing X (ft)')
    plt.ylabel('Landing Y (ft)')
    plt.legend(title='Hit Direction')
    plt.grid()
    plt.show()

# Modify is_within_field_boundaries to use the simulated boundary curve
def is_within_field_boundaries(row, boundary=None, debug=False):
    """
    Check if the ball lands within the field boundaries using the precomputed boundary.
    """
    if boundary is None:
        boundary = simulate_outfield_boundary(average_distances, debug=False)  # Use precomputed boundary

    boundary_x, boundary_y, boundary_x_5ft, boundary_y_5ft = boundary

    x = row['landing_x_adjusted']
    y = row['landing_y_adjusted']

    # Determine if point is inside the boundary curve by checking the distance to the origin
    distance = np.sqrt(x**2 + y**2)

    # Calculate the angle of the hit
    hit_angle = np.degrees(np.arctan2(x, y))

    # Find the corresponding boundary distance for this angle
    if -45 <= hit_angle <= 45:
        # Find the closest point in the boundary array
        idx = np.abs(np.linspace(-45, 45, 100) - hit_angle).argmin()
        boundary_distance = np.sqrt(boundary_x[idx]**2 + boundary_y[idx]**2)
        return distance <= boundary_distance

    return False

def determine_catchable_home_run(df, debug=False):
    if debug:
        print("Determining catchable home runs...")
    def is_catchable(row):
        if row['is_home_run'] and row['exit_speed'] < 110 and row['hit_spin_rate'] < 3500 and is_within_field_boundaries(row):
            return True
        return False
    df['is_catchable_home_run'] = df.apply(is_catchable, axis=1)
    if debug:
        print("Number of catchable home runs:", df['is_catchable_home_run'].sum())
        print(df[df['is_catchable_home_run'] == True][['hit_direction', 'adjusted_distance', 'landing_x_adjusted', 'landing_y_adjusted']].head())
    return df

def categorize_hit_direction(df, debug=False):
    """
    Categorize hits into 'Left', 'Center', 'Right' based on 'horz_exit_angle'.
    """
    if debug:
        print("Categorizing hit directions...")
    
    conditions = [
        df['horz_exit_angle'] < -15,
        df['horz_exit_angle'] > 15
    ]
    choices = ['Left', 'Right']
    df['hit_direction'] = np.select(conditions, choices, default='Center')
    
    if debug:
        print("Hit directions assigned:", df['hit_direction'].unique())
    
    return df

def filter_features(df, selected_features, include_target=False, target_variable='is_airout', debug=False):
    """
    Filter the DataFrame to only include columns specified in selected_features,
    plus the target variable if include_target is True.
    """
    if debug:
        print(f"Original columns: {df.columns.tolist()}")
    
    # Prepare the list of columns to keep
    columns_to_keep = selected_features.copy()
    if include_target and target_variable in df.columns:
        columns_to_keep.append(target_variable)
    
    # Filter columns
    df_filtered = df[columns_to_keep].copy()

    if debug:
        print(f"Filtered columns: {df_filtered.columns.tolist()}")
    
    return df_filtered


def visualize_hits(df, title='Hit Landing Points'):
    """
    Visualize the landing points of hits with different categories.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='landing_x_adjusted', y='landing_y_adjusted', hue='hit_direction', style='is_home_run', data=df, palette='deep')
    plt.axhline(0, color='black', linewidth=1)  # Baseline for y-axis
    plt.axvline(0, color='black', linewidth=1)  # Baseline for x-axis
    plt.title(title)
    plt.xlabel('Landing X (ft)')
    plt.ylabel('Landing Y (ft)')
    plt.legend(title='Hit Direction / Home Run')
    plt.grid()
    plt.show()

    # Show catchable home runs
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='landing_x_adjusted', y='landing_y_adjusted', hue='is_catchable_home_run', data=df, palette={True: 'green', False: 'red'})
    plt.title(f'{title} - Catchable Home Runs')
    plt.xlabel('Landing X (ft)')
    plt.ylabel('Landing Y (ft)')
    plt.legend(title='Catchable Home Run')
    plt.grid()
    plt.show()

# Updated feature engineering pipeline to include pre-filtering visualizations
def feature_engineering_pipeline(df, selected_features=None, include_target=False, debug=False):
    """
    Run all feature engineering functions and filter for selected features if provided.
    """
    if debug:
        print(f"Initial columns: {df.columns.tolist()}")
        
    global global_boundary  # Use global boundary to ensure consistency
    if global_boundary is None:
        global_boundary = simulate_outfield_boundary(average_distances, debug=debug)  # Compute once if not set
    df = categorize_inning(df, debug)
    df = create_count_scenario(df, debug)
    df = transform_gamedate(df, debug)
    df = categorize_temperature(df, debug)
    df = calculate_physics_features(df, debug)
    df = categorize_hit_direction(df, debug)
    df = determine_home_run(df, boundary=None, debug=debug)
    df = determine_catchable_home_run(df, debug=debug)

    # Create 'is_foul' before the first visualization
    if debug:
        print("Determining foul balls...")
    df['is_foul'] = ~df.apply(lambda row: is_within_field_boundaries(row, boundary=global_boundary), axis=1)

    # Visualize before filtering
    print("Visualizing hits before applying any filters...")
    visualize_hits_with_field_boundary(df, title='All Hit Landing Points Before Filtering', hits_type='all', debug=debug)

    print("Visualizing home runs before filtering...")
    visualize_hits_with_field_boundary(df[df['is_home_run']], title='Home Runs Before Filtering', hits_type='home_runs', debug=debug)

    print("Visualizing foul balls before filtering...")
    visualize_hits_with_field_boundary(df[df['is_foul']], title='Foul Balls Before Filtering', hits_type='foul_balls', debug=debug)

    # Apply filtering for non-catchable and foul balls
    df = filter_non_catchable_and_foul(df, debug)

    # Visualize after filtering
    print("Visualizing hits after filtering uncatchable balls...")
    visualize_hits_with_field_boundary(df, title='All Hit Landing Points After Filtering', hits_type='all', debug=debug)

    print("Visualizing home runs after filtering...")
    visualize_hits_with_field_boundary(df[df['is_home_run']], title='Home Runs After Filtering', hits_type='home_runs', debug=debug)

    print("Visualizing foul balls after filtering...")
    visualize_hits_with_field_boundary(df[df['is_foul']], title='Foul Balls After Filtering', hits_type='foul_balls', debug=debug)

    # Drop intermediate columns if necessary
    df = df.drop(columns=['vert_angle_rad'], errors='ignore')

    # Filter the DataFrame for selected features
    if selected_features:
        df = filter_features(df, selected_features, include_target=include_target, debug=debug)

    return df

# Updated filter_non_catchable_and_foul function to clarify debug prints and ensure visibility
def filter_non_catchable_and_foul(df, debug=False):
    if debug:
        print("Filtering non-catchable and foul balls...")
        total_rows_before = df.shape[0]
        
    # Determine if each hit is a foul ball or not within field boundaries
    df['is_foul'] = ~df.apply(is_within_field_boundaries, axis=1)
    
    # Filter to keep only catchable home runs or hits that are not fouls (remain in the park)
    df_filtered = df[df['is_catchable_home_run'] | ~df['is_foul']]
    
    if debug:
        total_rows_after = df_filtered.shape[0]
        print(f"Rows before filtering: {total_rows_before}")
        print(f"Rows after filtering: {total_rows_after}")
        print(f"Number of rows filtered out: {total_rows_before - total_rows_after}")
        print("Filtered out rows (foul balls outside park):")
        print(df[df['is_foul'] & ~df['is_catchable_home_run']][['hit_direction', 'adjusted_distance', 'landing_x_adjusted', 'landing_y_adjusted']].head())
    
    return df_filtered

# Run the updated pipeline
if __name__ == '__main__':
    import pandas as pd
    # from data_loading import load_data
    # from preprocessing import clean_data, handle_missing_values
    
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'

    # Load data with debug mode enabled
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)

    # Preprocess data
    train_df = clean_data(train_df, debug=True)
    train_df = handle_missing_values(train_df, debug=True)

    # Apply feature engineering with debug mode enabled
    train_df = feature_engineering_pipeline(train_df, debug=True)

Overwriting ../../src/mariners_interview/feature_engineering.py


In [216]:
%%writefile ../../src/mariners_interview/defensive_feature_analysis.py


import numpy as np
import pandas as pd

# Constants
GRAVITY = 32.174  # Gravity constant in ft/s^2

# Function to calculate hang time based on exit speed and vertical angle
def calculate_hang_time(df, debug=False):
    if debug:
        print("\nCalculating hang time for each hit...")
    
    # Convert vertical exit angle to radians for trigonometric calculations
    df['vert_angle_rad'] = np.radians(df['vert_exit_angle'])
    
    # Hang time calculation using kinematic equation: t = (2 * exit_speed * sin(vert_exit_angle)) / GRAVITY
    df['hang_time'] = (2 * df['exit_speed'] * np.sin(df['vert_angle_rad'])) / GRAVITY
    
    # Ensure no negative or zero hang times
    df['hang_time'] = df['hang_time'].clip(lower=0)

    if debug:
        # Show the first few hang time calculations
        print("Hang time calculated for the first few records:\n", df[['hang_time', 'exit_speed', 'vert_exit_angle']].head())
    return df

# Function to estimate distance covered by the fielder based on their starting position
def calculate_distance_covered(df, debug=False):
    if debug:
        print("\nEstimating distance covered by fielder...")

    # Define starting positions for each outfielder based on standard field layout
    lf_start_x, lf_start_y = -90, 250  # Left Fielder starting position
    cf_start_x, cf_start_y = 0, 350    # Center Fielder starting position
    rf_start_x, rf_start_y = 90, 250   # Right Fielder starting position

    # Function to calculate the distance from the starting position to the landing position
    def get_distance_covered(row):
        # Select the starting position based on fielder role
        if row['first_fielder'] == row['lf_id']:
            start_x, start_y = lf_start_x, lf_start_y
        elif row['first_fielder'] == row['cf_id']:
            start_x, start_y = cf_start_x, cf_start_y
        elif row['first_fielder'] == row['rf_id']:
            start_x, start_y = rf_start_x, rf_start_y
        else:
            return np.nan  # Return NaN if no matching fielder found
        
        # Calculate distance covered from starting position to landing position
        distance = np.sqrt((row['landing_x_adjusted'] - start_x) ** 2 + (row['landing_y_adjusted'] - start_y) ** 2)
        return distance

    # Apply distance calculation to each row
    df['distance_covered'] = df.apply(get_distance_covered, axis=1)

    if debug:
        # Display intermediate distances and validate with initial starting positions
        print("Distance covered calculated for the first few records:\n", df[['distance_covered', 'landing_x_adjusted', 'landing_y_adjusted']].head())
    return df

# Function to estimate fielder reaction speed
def calculate_reaction_speed(df, debug=False):
    if debug:
        print("\nCalculating fielder reaction speed...")

    # Reaction speed is calculated as distance covered divided by hang time
    df['reaction_speed'] = df['distance_covered'] / df['hang_time']
    
    # Replace infinite or NaN values with zero for reaction speed
    df['reaction_speed'] = df['reaction_speed'].replace([np.inf, -np.inf], 0).fillna(0)

    if debug:
        # Validate the reaction speed for the first few records
        print("Reaction speed calculated for the first few records:\n", df[['reaction_speed', 'distance_covered', 'hang_time']].head())
    return df

# Function to calculate estimated catch probability
def calculate_catch_probability(df, debug=False):
    if debug:
        print("\nEstimating catch probability...")

    # Estimate catch probability using an exponential decay model based on distance covered and hang time
    df['catch_probability'] = np.exp(-0.05 * df['distance_covered']) * (df['hang_time'] / 5)

    # Clip catch probability to be between 0 and 1
    df['catch_probability'] = np.clip(df['catch_probability'], 0, 1)

    if debug:
        # Display catch probability and related calculations
        print("Catch probability estimated for the first few records:\n", df[['catch_probability', 'distance_covered', 'hang_time']].head())
    return df

# Function to categorize catch difficulty based on distance and hang time
def categorize_catch_difficulty(df, debug=False):
    if debug:
        print("\nCategorizing catch difficulty...")

    # Step 1: Initialize catch_difficulty to 'Not_Caught' where is_airout is 0
    df['catch_difficulty'] = np.where(df['is_airout'] == 0, 'Not_Caught', 'Unknown')

    # Step 2: Define categorization conditions based on available metrics for other rows
    conditions = [
        (df['distance_covered'] < 50) & (df['hang_time'] > 4),             # Short distance, high hang time = Easy catch
        (df['distance_covered'] < 100) & (df['hang_time'] > 3),            # Medium distance, moderate hang time = Moderate catch
        (df['distance_covered'] >= 100) & (df['hang_time'] < 3),           # Long distance, low hang time = Difficult catch
        (df['distance_covered'] >= 150) | (df['hang_time'] < 2)            # Very long distance or very low hang time = Very Difficult catch
    ]
    choices = ['Easy', 'Moderate', 'Difficult', 'Very Difficult']

    # Step 3: Calculate catch difficulty for all rows using np.select
    df['temp_catch_difficulty'] = np.select(conditions, choices, default='Uncatchable')

    # Step 4: Overwrite catch_difficulty only for rows where valid metrics exist
    mask_valid_metrics = ~df['first_fielder'].isna() | (~df['distance_covered'].isna() & ~df['hang_time'].isna())
    df.loc[mask_valid_metrics, 'catch_difficulty'] = df.loc[mask_valid_metrics, 'temp_catch_difficulty']

    # Drop the temporary column used for calculation
    df.drop(columns=['temp_catch_difficulty'], inplace=True)

    if debug:
        # Show categorized difficulty for initial records
        print("Catch difficulty categorized:\n", df[['catch_difficulty', 'distance_covered', 'hang_time', 'first_fielder', 'is_airout']].head())
    
    return df

# Function to determine the position of the fielder who attempted the catch
def determine_fielder_position(row):
    if row['first_fielder'] == row['lf_id']:
        return 'LF'
    elif row['first_fielder'] == row['cf_id']:
        return 'CF'
    elif row['first_fielder'] == row['rf_id']:
        return 'RF'
    return 'Unknown'  # Catch not made or missing fielder ID information

# Function to get count of unique values for catch difficulty
def get_catch_difficulty_count(df, debug=False):
    if debug:
        print("\nGetting catch difficulty counts...")
    
    # Get counts for each unique value in the catch_difficulty column
    catch_difficulty_counts = df['catch_difficulty'].value_counts()
    if debug:
        print("Catch difficulty counts:\n", catch_difficulty_counts)
    return catch_difficulty_counts

# Extend the existing feature engineering pipeline with these new metrics
def feature_engineering_with_defensive_metrics(df, selected_features=None, include_target=False, debug=False):

    # Calculate new metrics for defensive evaluation
    df = calculate_hang_time(df, debug)
    df = calculate_distance_covered(df, debug)
    df = calculate_reaction_speed(df, debug)
    df = calculate_catch_probability(df, debug)
    df = categorize_catch_difficulty(df, debug)
    df['fielder_position'] = df.apply(determine_fielder_position, axis=1)

    return df

# Example use of the pipeline
if __name__ == '__main__':
    # Assuming train_df is already loaded and preprocessed
    train_df = feature_engineering_with_defensive_metrics(train_df, debug=True)
    print("train_df columns =", train_df.columns)
    
    # Get and print catch difficulty counts
    catch_difficulty_counts = get_catch_difficulty_count(train_df, debug=True)


# defensive metrics cause data leakage, great for post prediction analsis on these players

Overwriting ../../src/mariners_interview/defensive_feature_analysis.py


In [217]:
%%writefile ../../src/mariners_interview/cluster_analysis.py

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist

# Define a function to create a preprocessor for numeric features
def get_numeric_preprocessor(numeric_features, debug=False):
    """
    Create a preprocessor for handling numerical features.
    """
    if debug:
        print("Creating numeric preprocessor...")
    
    # Define a pipeline for numeric features with constant imputation and scaling
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),  # Set missing values to zero
        ('scaler', StandardScaler())  # Scale numerical features
    ])
    
    # Combine pipelines into a ColumnTransformer
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_pipeline, numeric_features)
    ])
    
    if debug:
        print(f"Numeric preprocessing pipeline created for features: {numeric_features}")
    
    return preprocessor

# Function to perform clustering and return the labeled dataset
def perform_clustering(data, defensive_features, n_clusters=3, debug=False):
    """
    Perform K-Means and DBSCAN clustering, add cluster labels to the dataset, and return the updated dataset.
    
    Parameters:
    - data: The preprocessed DataFrame containing defensive features.
    - defensive_features: List of defensive features to use for clustering.
    - n_clusters: Number of clusters for K-Means.
    - debug: If True, display detailed outputs, visualizations, and analytics.
    
    Returns:
    - Updated DataFrame with cluster labels.
    """
    # Preprocess the defensive features
    preprocessor = get_numeric_preprocessor(numeric_features=defensive_features, debug=debug)
    X_preprocessed = preprocessor.fit_transform(data[defensive_features])

    # Perform K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['defensive_cluster_kmeans'] = kmeans.fit_predict(X_preprocessed)

    # Perform DBSCAN clustering
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    data['defensive_cluster_dbscan'] = dbscan.fit_predict(X_preprocessed)

    # Assign descriptive labels to K-Means clusters based on analysis
    cluster_labels = {0: 'Quick Reactors', 1: 'Moderate Defenders', 2: 'Late Reactors'}
    data['defensive_cluster_label'] = data['defensive_cluster_kmeans'].map(cluster_labels)

    # Calculate detailed cluster statistics for K-Means clusters
    cluster_summary = data.groupby('defensive_cluster_kmeans')[defensive_features].agg(['mean', 'median', 'std', 'min', 'max'])

    if debug:
        # Print cluster statistics
        print("Detailed K-Means Cluster Summary:\n", cluster_summary)

        # Visualize K-Means Clusters
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=X_preprocessed[:, 0], y=X_preprocessed[:, 1], hue=data['defensive_cluster_kmeans'], palette='viridis')
        plt.title('K-Means Clusters based on Defensive Stats')
        plt.show()

        # Visualize DBSCAN Clusters
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=X_preprocessed[:, 0], y=X_preprocessed[:, 1], hue=data['defensive_cluster_dbscan'], palette='coolwarm')
        plt.title('DBSCAN Clusters based on Defensive Stats')
        plt.show()

        # Plot distributions of key features across clusters
        for feature in defensive_features:
            plt.figure(figsize=(12, 6))
            sns.boxplot(x='defensive_cluster_kmeans', y=feature, data=data, palette='Set2')
            plt.title(f'Distribution of {feature} Across K-Means Clusters')
            plt.show()

        # Pairplot to visualize pairwise relationships between features for each cluster
        sns.pairplot(data, hue='defensive_cluster_kmeans', vars=defensive_features, palette='viridis')
        plt.suptitle('Pairplot of Defensive Features by K-Means Clusters', y=1.02)
        plt.show()

        # Calculate inter-cluster distances for K-Means clusters
        cluster_centers = kmeans.cluster_centers_
        distances = cdist(X_preprocessed, cluster_centers, 'euclidean')
        data['distance_to_cluster_center'] = distances.min(axis=1)
        print("Average distance to cluster centers for each cluster:\n", data.groupby('defensive_cluster_kmeans')['distance_to_cluster_center'].mean())

        # Visualize the distribution of distances to cluster centers
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='defensive_cluster_kmeans', y='distance_to_cluster_center', data=data, palette='Set3')
        plt.title('Distance to Cluster Centers by K-Means Cluster')
        plt.show()

        # Visualize average values of each feature for each cluster using bar plots
        cluster_mean_values = data.groupby('defensive_cluster_kmeans')[defensive_features].mean().reset_index()
        cluster_mean_values_melted = cluster_mean_values.melt(id_vars='defensive_cluster_kmeans', var_name='Feature', value_name='Mean Value')

        plt.figure(figsize=(12, 8))
        sns.barplot(x='defensive_cluster_kmeans', y='Mean Value', hue='Feature', data=cluster_mean_values_melted, palette='tab10')
        plt.title('Mean Value of Defensive Features for Each K-Means Cluster')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.show()

    return data

# Main function to run clustering analysis and return labeled dataset
def feature_engineering_with_cluster_analysis(df, debug=False):
    """
    Main function to load data, perform clustering, and return the labeled dataset.
    
    Parameters:
    - df: Input DataFrame with necessary features.
    - debug: If True, enable debug mode and output additional analytics and visualizations.
    
    Returns:
    - Labeled DataFrame with cluster labels.
    """

    # Select only the defensive-related stats for clustering
    defensive_features = ['reaction_speed', 'distance_covered', 'catch_probability']

    # Perform clustering and get the labeled data
    df = perform_clustering(df, defensive_features, n_clusters=3, debug=debug)

    return df

# Example usage
if __name__ == "__main__":
    # Load preprocessed data (adjust the file path as needed)
    file_path = "../../data/Seattle Mariners 2025 Analytics Internship/data-train-preprocessed.csv"
    
    # Run clustering with debug mode enabled to visualize outputs
    labeled_df = feature_engineering_with_cluster_analysis(pd.read_csv(file_path), debug=True)
    print("Labeled DataFrame Head:\n", labeled_df.head())
    print("Labeled DataFrame columns:\n", labeled_df.columns)

# Analyze the clusters to find meaningful labels
# For example:
# Cluster 0: High Reaction Speed and Distance Covered = "Quick Reactors"
# Cluster 1: Moderate Reaction Speed and Low Distance = "Moderate Defenders"
# Cluster 2: Low Reaction Speed and High Distance = "Late Reactors"

# clusters cause data leakage, great for post prediction analsis on these players

Overwriting ../../src/mariners_interview/cluster_analysis.py


4. modeling.py

In [218]:
%%writefile ../../src/mariners_interview/training_and_eval.py

import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import pandas as pd
from sklearn.metrics import log_loss

def get_preprocessor(categorical_features, numeric_features, log_features, debug=False):
    """
    Create a preprocessor for the pipeline and return selected features.
    """
    if debug:
        print("Creating preprocessor...")
    
    # Define pipelines for different feature types
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    log_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('log_transform', FunctionTransformer(np.log1p)),
        ('scaler', StandardScaler())
    ])
    
    # Combine pipelines
    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_pipeline, categorical_features),
        ('num', numeric_pipeline, numeric_features),
        ('log', log_pipeline, log_features)
    ])
    
    # Collect selected features for each transformer
    selected_features = categorical_features + numeric_features + log_features

    if debug:
        print("Selected features for preprocessing:", selected_features)
    
    return preprocessor, selected_features

def train_model(X_train, y_train, model, preprocessor, debug=False):
    """
    Train a model with preprocessing.
    """
    if debug:
        print("Training model:", model)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    
    if debug:
        print("Model trained.")
    
    return pipeline

def grid_search_tuning(model, param_grid, X_train, y_train, preprocessor, scoring='roc_auc', cv=5, debug=False):
    """
    Perform grid search tuning to find the best hyperparameters.

    Args:
    - model: The model to tune.
    - param_grid: Dictionary of hyperparameters to search over.
    - X_train: Training features.
    - y_train: Training target variable.
    - preprocessor: Preprocessing pipeline.
    - scoring: Metric to evaluate the best parameters.
    - cv: Number of cross-validation folds.
    - debug: If True, print additional debug information.

    Returns:
    - Best fitted pipeline.
    """
    if debug:
        print(f"Starting grid search for {model} with params: {param_grid}")

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Run GridSearchCV with the provided parameters
    grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    if debug:
        print(f"Best params: {grid_search.best_params_}")
        print(f"Best score: {grid_search.best_score_}")

    # Return the best fitted pipeline model
    best_fitted_pipeline = grid_search.best_estimator_

    return best_fitted_pipeline


def save_best_params_from_pipeline(pipeline, filename, debug=False):
    """
    Save the best parameters from a fitted pipeline's classifier.
    
    Args:
    - pipeline: Fitted pipeline model.
    - filename: Path to save the parameters.
    - debug: If True, print debug information.
    """
    # Extract the classifier parameters
    best_params = pipeline.named_steps['classifier'].get_params()
    with open(filename, 'w') as f:
        json.dump(best_params, f)
    if debug:
        print(f"Best parameters saved to {filename}")


def load_best_params(filename, debug=False):
    """
    Load the best hyperparameters from a file.
    """
    with open(filename, 'r') as f:
        best_params = json.load(f)
    if debug:
        print(f"Best hyperparameters loaded from {filename}")
    return best_params


def save_model(model, filename, debug=False):
    """
    Save the trained model to a file using joblib.
    """
    joblib.dump(model, filename)
    if debug:
        print(f"Model saved to {filename}")

def load_model(filename, debug=False):
    """
    Load a trained model from a file using joblib.
    """
    model = joblib.load(filename)
    if debug:
        print(f"Model loaded from {filename}")
    return model



def evaluate_model(model, X_test, y_test, debug=False):
    """
    Evaluate the model and return metrics.
    
    Args:
    - model: The trained model.
    - X_test: Test features.
    - y_test: True labels for the test set.
    - debug: If True, prints debug information.
    
    Returns:
    - Dictionary of evaluation metrics.
    """
    if debug:
        print("Evaluating model...")
    
    # Predict values
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # Probability of class 1 (is_airout)
    
    # Compute metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    roc_auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    log_loss_score = log_loss(y_test, y_proba)  # Compute log loss

    # Additional metrics: confusion matrix, precision, recall, F1-score
    cm = confusion_matrix(y_test, y_pred)
    precision = report['1']['precision']
    recall = report['1']['recall']
    f1 = report['1']['f1-score']
    
    metrics = {
        'classification_report': report,
        'roc_auc': roc_auc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': cm,
        'log_loss': log_loss_score  # Add log loss to metrics dictionary
    }
    
    if debug:
        print(f"ROC AUC Score: {roc_auc:.4f}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Log Loss: {log_loss_score:.4f}")  # Print log loss
        print(f"Confusion Matrix:\n{cm}")
    
    return metrics


def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    Plot the confusion matrix using seaborn heatmap.
    
    Args:
    - cm: Confusion matrix.
    - title: Title of the plot.
    - cmap: Colormap for the heatmap.
    """
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


def save_preprocessed_data(df, output_path, debug=False):
    """
    Save the preprocessed DataFrame to a CSV file.
    
    Args:
    - df: Preprocessed DataFrame.
    - output_path: Path to save the CSV file.
    - debug: If True, print debug information.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    if debug:
        print(f"Preprocessed data saved to {output_path}")



if __name__ == '__main__':
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    from catboost import CatBoostClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    
    # Define paths
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'
    preprocessed_train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train-preprocessed.csv'
    
    # Load data
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)
    
    # Preprocess and feature engineer data
    train_df = clean_data(train_df)
    train_df = handle_missing_values(train_df)
    train_df = feature_engineering_pipeline(train_df)
    train_df = feature_engineering_with_defensive_metrics(train_df)
    train_df = feature_engineering_with_cluster_analysis(train_df)
    
    # Save preprocessed data
    save_preprocessed_data(train_df, preprocessed_train_path, debug=True)
    
    # Define features and target
    target = 'is_airout'
    features = train_df.columns.drop(target)
    X = train_df[features]
    y = train_df[target]
    
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print("x train columns=", X_train.columns)
    # Define feature types and create a preprocessor
    categorical_features = ['month', 'day_of_week', 'temperature_category', 'count_scenario',
                            'hit_direction', 'venue_id']

    numeric_features = ['vert_exit_angle', 'horz_exit_angle', 'adjusted_distance']
    log_features = ['exit_speed', 'hit_spin_rate']
    preprocessor, selected_features = get_preprocessor(categorical_features, numeric_features, log_features, debug=True)
    
    # Define models and their hyperparameter grids
    models = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
        #'LightGBM': LGBMClassifier(random_state=42),
        'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
        'NeuralNet': MLPClassifier(max_iter=500, random_state=42)
    }
    
    param_grids = {
        'RandomForest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]},
        'GradientBoosting': {'classifier__learning_rate': [0.01, 0.1], 'classifier__n_estimators': [100, 200]},
        'XGBoost': {'classifier__learning_rate': [0.01, 0.1], 'classifier__max_depth': [3, 5, 7]},
        #'LightGBM': {'classifier__learning_rate': [0.01, 0.1], 'classifier__num_leaves': [31, 50, 100]},
        'CatBoost': {'classifier__learning_rate': [0.01, 0.1], 'classifier__depth': [4, 6, 8]},
        'NeuralNet': {'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)], 'classifier__activation': ['relu', 'tanh']}
    }

    # Train, tune, and evaluate each model using grid search
    for name, model in models.items():
        print(f"\nGrid Search Tuning for {name}...")
        if name in param_grids:  # Check if the model has a parameter grid
            # Perform grid search and get the best fitted pipeline
            best_pipeline = grid_search_tuning(model, param_grids[name], X_train, y_train, preprocessor, scoring='roc_auc', cv=5, debug=True)

            # Evaluate the best pipeline on validation data
            metrics = evaluate_model(best_pipeline, X_val, y_val, debug=True)

            # Print evaluation metrics, including log loss
            print(f"Metrics for Best {name}:")
            print(f"ROC AUC: {metrics['roc_auc']:.4f}")
            print(f"Accuracy: {metrics['accuracy']:.4f}")
            print(f"Precision: {metrics['precision']:.4f}")
            print(f"Recall: {metrics['recall']:.4f}")
            print(f"F1 Score: {metrics['f1_score']:.4f}")
            print(f"Log Loss: {metrics['log_loss']:.4f}")  # Print log loss score

            # Plot confusion matrix
            plot_confusion_matrix(metrics['confusion_matrix'], title=f'Confusion Matrix for Best {name}')

            # Save the best pipeline
            save_model(best_pipeline, f'../../data/Seattle Mariners 2025 Analytics Internship/models/{name.lower()}_best.pkl', debug=True)

            # Optionally save the best parameters
            save_best_params_from_pipeline(best_pipeline, f'../../data/Seattle Mariners 2025 Analytics Internship/models/{name.lower()}_best_params.json', debug=True)
        else:
            print(f"No hyperparameter grid defined for {name}. Skipping grid search.")






# Conclusion

# CatBoost proved to be the best model for your problem due to its efficient handling of categorical features, robust generalization through ordered boosting, and its ability to balance complexity and overfitting. The chosen hyperparameters (depth=6 and learning_rate=0.1) further reinforced its ability to perform well on your data.

# In terms of model comparison:

#     CatBoost outperformed XGBoost and Gradient Boosting by a small but significant margin in both ROC AUC and other evaluation metrics.
#     It should be your go-to choice for this dataset, especially given the nature of features and the data distribution.
    
# ROC AUC: 0.9386
# Accuracy: 0.8596
# Precision: 0.8573
# Recall: 0.8632
# F1 Score: 0.8602
# **Log Loss: 0.3223** # Log loss is a critical metric for evaluating probabilistic predictions because it penalizes both overconfident incorrect predictions and underconfident correct predictions. A lower log loss score indicates that the model’s probability estimates are closer to the true labels. In this case, the log loss score of 0.3223 suggests that CatBoost produced reliable probability predictions for the air out probability, minimizing uncertainty and maximizing the model’s predictive power.
# Confusion Matrix:
# [[5499  999]
#  [ 901 5610]]

# Given Question 1, CatBoost is particularly well-suited for this task due to its ability to natively handle categorical features, avoid overfitting through ordered boosting, and produce highly interpretable models. The dataset includes categorical columns like bat_side, pitch_side, day_of_week, and temperature_category, as well as various continuous features such as exit_speed and vert_exit_angle. CatBoost's specialized handling of categorical features means that it can extract more meaningful relationships from these features without the need for extensive preprocessing, which other models like XGBoost might require.

# Moreover, CatBoost's robustness against overfitting and its ability to generalize better to new data make it an ideal choice for predicting a nuanced outcome like air out probability. Given that this problem involves predicting the likelihood of a rare event, CatBoost's advanced regularization methods help maintain balance between bias and variance, resulting in superior performance compared to other boosting models. This helps ensure reliable predictions on test data, which is crucial for achieving a low log loss score and maximizing evaluation metrics like AUC and F1 score, as seen in the results.

# Ultimately, CatBoost not only fits the specific needs of this dataset but also aligns well with the evaluation criteria (log loss and generalization performance), making it the optimal model for this type of baseball event prediction.




Overwriting ../../src/mariners_interview/training_and_eval.py


6. prediction.py

In [219]:
%%writefile ../../src/mariners_interview/prediction.py

def predict(model, X, debug=False):
    """
    Make predictions using the trained model.
    """
    if debug:
        print("Making predictions...")
    return model.predict_proba(X)[:, 1]  # Probability of class 1

def save_predictions(predictions, output_path, debug=False):
    """
    Save predictions to a CSV file.
    """
    predictions.to_csv(output_path, index=False)
    if debug:
        print(f"Predictions saved to {output_path}")

if __name__ == '__main__':
    import pandas as pd
    # from data_loading import load_data
    # from preprocessing import clean_data
    # from feature_engineering import feature_engineering_pipeline
    # from training_and_eval import load_model
    
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'
    
    # Load data with debug mode enabled
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)
    test_df = clean_data(test_df)
    test_df_original = test_df.copy() 
    test_df = feature_engineering_pipeline(test_df)
    features = test_df.columns  # Assuming the same features as training
    X_test = test_df[features]
    
    # Load model
    model = load_model('../../data/Seattle Mariners 2025 Analytics Internship/models/catboost_best.pkl')
    
    # Make predictions
    test_df['p_airout'] = predict(model, X_test, debug=True)

    # Combine predictions with original test data
    test_df_original = test_df_original.drop(columns=['p_airout'])
    test_df_combined = pd.concat([test_df_original.reset_index(drop=True), test_df[['p_airout']].reset_index(drop=True)], axis=1)
    
    # Save predictions
    save_predictions(test_df_combined, '../../data/Seattle Mariners 2025 Analytics Internship/test_predictions.csv', debug=True)


Overwriting ../../src/mariners_interview/prediction.py


7. main.py

In [220]:
%%writefile ../../src/mariners_interview/main.py

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
# from prediction import predict, save_predictions
# from training_and_eval import (
#     get_preprocessor, train_model, save_model, load_model, grid_search_tuning, evaluate_model, save_best_params_from_pipeline
# )
# from data_loading import load_data
# from preprocessing import clean_data, handle_missing_values
# from feature_engineering import feature_engineering_pipeline


def main():
    # Paths to data files
    train_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-train.csv'
    test_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-test.csv'
    dict_path = '../../data/Seattle Mariners 2025 Analytics Internship/data-dictionary.csv'
    
    # Load data
    train_df, test_df, data_dict_df = load_data(train_path, test_path, dict_path, debug=True)
    
    # Preprocessing
    train_df = clean_data(train_df, debug=True)
    train_df = handle_missing_values(train_df, debug=True)
    
    # Define feature types for preprocessing
    categorical_features = ['month', 'day_of_week', 'temperature_category', 'count_scenario', 'hit_direction']
    numeric_features = ['vert_exit_angle', 'horz_exit_angle', 'adjusted_distance']
    log_features = ['exit_speed', 'hit_spin_rate']
    
    # Get preprocessor and selected features
    preprocessor, selected_features = get_preprocessor(categorical_features, numeric_features, log_features, debug=True)
    
    # Feature Engineering with filtering for train_df, include target variable
    train_df = feature_engineering_pipeline(train_df, selected_features=selected_features, include_target=True, debug=True)
    
    # Define target and features
    target = 'is_airout'
    features = selected_features
    X = train_df[features]
    y = train_df[target]
    
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train and evaluate model
    best_model_pipeline = CatBoostClassifier(depth=6, learning_rate=0.1, random_state=42, verbose=0)
    model_pipeline = train_model(X_train, y_train, best_model_pipeline, preprocessor, debug=True)
    
    metrics = evaluate_model(model_pipeline, X_val, y_val, debug=True)
    print(f"Metrics:")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    
    # Save model
    save_model(model_pipeline, f'../../data/Seattle Mariners 2025 Analytics Internship/models/catboost.pkl', debug=True)
    
    # Process test data
    print("test_df columns = ", test_df.columns)
    test_df = clean_data(test_df, debug=True)
    test_df_original = test_df.copy()
    test_df = feature_engineering_pipeline(test_df, selected_features=selected_features, include_target=False, debug=True)
    X_test = test_df[features]
    
    # Make predictions
    test_df['p_airout'] = predict(model_pipeline, X_test, debug=True)
    
    # Combine predictions with original test data
    test_df_original = test_df_original.drop(columns=['p_airout'])
    test_df_combined = pd.concat([test_df_original.reset_index(drop=True), test_df[['p_airout']].reset_index(drop=True)], axis=1)
    
    # Save predictions
    save_predictions(test_df_combined, '../../data/Seattle Mariners 2025 Analytics Internship/test_predictions.csv', debug=True)

    print("Predictions made successfully and saved.")

if __name__ == '__main__':
    main()



Overwriting ../../src/mariners_interview/main.py


# App section for complete explanation

In [1]:
%%writefile ../../src/mariners_data_streamlit.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the necessary modules
from mariners_interview.training_and_eval import load_model
from mariners_interview.prediction import predict
from mariners_interview.feature_engineering import calculate_physics_features, calculate_hit_trajectory, visualize_hit_trajectory
from mariners_interview.player_scouting_report import generate_scouting_report

# Define constants
MODEL_PATH = 'data/Seattle Mariners 2025 Analytics Internship/models/catboost_best.pkl'
PREPROCESSED_DATA_PATH = 'data/Seattle Mariners 2025 Analytics Internship/data-train-preprocessed.csv'

# Load the model
model = load_model(MODEL_PATH)

# List of selected features required by the model
selected_features = [
    'month',
    'day_of_week',
    'temperature_category',
    'count_scenario',
    'hit_direction',
    'vert_exit_angle',
    'horz_exit_angle',
    'exit_speed',
    'hit_spin_rate',
    'adjusted_distance',
    'venue_id'
]

def get_min_max_values(df, numeric_features):
    min_max_values = {}
    for feature in numeric_features:
        min_value = df[feature].min()
        max_value = df[feature].max()
        min_max_values[feature] = (min_value, max_value)
    return min_max_values

# Section 1: Project Information
def info_section():
    st.header("Information on Process:")
    st.markdown("""
     

 **Question 1 Project Overview: 
 
 Goal: 1. In the Google Drive folder linked below you will find three files: `data-train.csv`, `data-test.csv`,
and `data-dictionary.csv’.
We have provided the 2023 season of contacted balls for two Minor League levels, along with
play metadata and Trackman hit tracking information, pre-filtered to non-infield plays. This
includes foul balls and home runs not necessarily hit into playable territory. The `train.csv` file
includes two additional columns: on balls caught for outs, `is_airout` will be 1 and `first_fielder`
will give the player id responsible for the out. On balls not caught for outs, `is_airout` will be 0
and `first_fielder` will be null.
Your objective is to predict the air out probability for batted balls in the `data-test.csv` file and
fill out the `p_airout` column in that .csv with your estimate. To assist you, we have included
the ‘data-dictionary.csv’ file to explain what each column in the attached datasets represents.
You may use whatever method or language you like, but you must submit the code you
used to generate and evaluate your predictions. You will be evaluated on both the log loss
score of your predictions and on your process in generating the predictions. Please also
include a brief explanation of your process for an R&amp;D audience and what steps you would
take to improve on this model if you had more resources.

The goal of this project is to predict the probability (p_airout) that a batted ball results in an air out. This involves several steps:

 **Preprocessing and Cleaning**
 **Feature Engineering**
 **Exploratory Data Analysis (EDA)**
 **Model Selection and Training**
 **Evaluation**
 **Prediction and Analysis**

# Project Summary: Minor Leage Outfielder Prediction of Airouts for the 2023 Season
---

## ** Preprocessing and Cleaning**

### **Handling Missing Values**

- **Spin Rate Filtering**: We noticed that the hit_spin_rate feature had only **1.42%** of its values present. This low percentage suggests that the data is mostly missing, possibly due to technical malfunctions during data collection. Therefore, we decided to drop this feature to prevent it from introducing noise into our model.

  **Why?**

  - Features with excessive missing values can negatively impact model performance.
  - Imputing such a high percentage of missing data might introduce bias.

### **Dropping Unnecessary Columns**

- **Player IDs**: Columns like first_fielder, lf_id, cf_id, and rf_id were dropped. These identifiers are specific to individual players and do not contribute to the generalized prediction of p_airout.

  **Why?**

  - Removing irrelevant or high-cardinality features reduces complexity.
  - Helps prevent overfitting to specific players in the training data.

### **Handling Missing Rows**

- **Dropping Rows with Missing hit_spin_rate**: After dropping the hit_spin_rate column, we ensured that any remaining rows with missing critical values were handled appropriately.

  **Why?**

  - Ensures data integrity for model training.

---

## **3. Feature Engineering**

Feature engineering is crucial to enhance the predictive power of our models. Here's what we did:

### **3.1 Recreating the Ballpark**

- **Average Park Dimensions**: Since we lacked specific venue information for each play, we used the average dimensions of major league parks to create a standardized field.

  **Why?**

  - Provides a consistent basis to evaluate minor league players on a major league scale.
  - Helps in determining whether a hit would be a home run or a foul ball.

- **Simulating the Outfield Boundary**: Using the average distances, we simulated the outfield boundary to classify hits as home runs or in-play.

  **Graphics Included**:

  - **Ballpark Diagram with Hits**: Visualizations showing hit landing points with the outfield boundary.
  - **Home Runs vs. Non-Home Runs**: Separate plots highlighting home runs and catchable balls.

  **Why?**

  - Visual aids help in understanding the spatial distribution of hits.
  - Allows us to filter out uncatchable home runs and focus on plays where the fielder's actions matter.

### **3.2 Estimating Landing Spots**

- **Physics-Based Calculations**: We used the exit speed and angles to estimate the landing spot of the ball.

  - **Adjusted Distance**: Calculated using projectile motion equations, adjusted for spin rate to account for aerodynamic effects.

  **Why?**

  - Provides a more accurate estimation of where the ball would land.
  - Essential for determining if a ball is catchable.

### **3.3 Categorizing Game Context Variables**

- **Count Scenarios**: Combined pre_balls, pre_strikes, pre_outs, and inning into a single feature called count_scenario. This categorizes the game situation into different scenarios.

  **Why?**

  - Simplifies multiple related features into one, making it easier for the model to learn patterns.
  - One-hot encoding of count_scenario allows the model to evaluate deeper on each scenario level.

- **Temperature Categories**: We categorized the temperature into:

  - **Cold**: Below 70°F
  - **Moderate**: 70°F to 90°F
  - **Hot**: Above 90°F

  **Why?**

  - Temperature can affect ball physics and player performance.
  - Categorization helps the model understand the impact without overcomplicating.

### **3.4 Identifying Foul Balls and Catchable Hits**

- **In-Field Foul Balls**: We attempted to identify foul balls that remained in the park and were potentially catchable.

  **Why?**

  - To include plays where fielders have a chance to make an out.
  - Helps in accurately modeling the is_airout outcome.

### **3.5 Defensive Metrics (Post-Prediction Analysis)**

- **Calculating Defensive Stats**:

  - **Reaction Speed**: Estimated based on the distance covered by the fielder and the hang time of the ball.
  - **Distance Covered**: Calculated using the estimated landing spot and assumed starting positions for fielders.
  - **Catch Difficulty**: Categorized as 'Easy', 'Moderate', 'Difficult', or 'Very Difficult' based on hang time and distance.

  **Why?**

  - Provides insights into player performance.
  - **Note**: These metrics were calculated for analysis **after** predictions to avoid data leakage.

- **Cluster Analysis**:

  - Performed clustering on defensive metrics to identify groups of players with similar defensive abilities.

  **Why?**

  - Helps in post-prediction analysis of players.
  - Avoids introducing bias into the predictive model.

---

## **4. Exploratory Data Analysis (EDA)**

### **Visualizations**

- **Distribution of Adjusted Distances**: Histograms showing how far balls typically travel.

- **Hit Landing Points**: Scatter plots of landing positions colored by hit direction and whether they resulted in an air out.

- **Temperature Impact**: Analysis of how temperature categories affect hit outcomes.

**Why?**

- EDA helps in understanding the data patterns.
- Identifies potential features that could improve model performance.

---

## **5. Model Selection and Training**

We tested several classifiers:

- **Random Forest**
- **Gradient Boosting**
- **XGBoost**
- **Logistic Regression**
- **CatBoost**

### **Why CatBoost Was Chosen**

- **Handling of Categorical Features**: CatBoost natively handles categorical variables without extensive preprocessing.

- **Avoiding Overfitting**: Uses ordered boosting and other regularization techniques.

- **Performance Metrics**: CatBoost outperformed other models in key metrics.

  - **ROC AUC**: 0.9386
  - **Accuracy**: 0.8596
  - **Precision**: 0.8573
  - **Recall**: 0.8632
  - **F1 Score**: 0.8602
  - **Log Loss**: 0.3223

    **Explanation of Log Loss**:

    - Log loss penalizes both overconfident incorrect predictions and underconfident correct predictions.
    - A lower log loss indicates better probability estimates.
    - CatBoost's log loss suggests reliable probabilistic predictions.

**Conclusion**:

- CatBoost's ability to handle our data's specific characteristics made it the optimal choice.
- Its performance in both classification accuracy and probability estimation was superior.

---

## **6. Evaluation**

### **Confusion Matrix**

[[5499  999]
 [ 901 5610]]



- **True Positives (TP)**: 5610
- **True Negatives (TN)**: 5499
- **False Positives (FP)**: 999
- **False Negatives (FN)**: 901

**Interpretation**:

- The model correctly identified a high number of air outs and non-air outs.
- The balance between precision and recall is acceptable for this context.

### **Receiver Operating Characteristic (ROC) Curve**

- A high ROC AUC score indicates good discrimination between the two classes.

**Why Evaluation Matters**:

- Ensures that the model generalizes well to unseen data.
- Helps in understanding the trade-offs between different types of errors.

---

## **7. Prediction and Analysis**

### **Making Predictions**

- The model was applied to the test dataset to predict p_airout.
- Predictions were concatenated back to the original test data for further analysis.

**Why?**

- Allows for analysis of predictions in the context of the original features.
- Enables us to study specific scenarios where the model performs well or needs improvement.

### **Interactive Prediction Section**

To provide an interactive experience, we included a section where users can input data points and receive a p_airout prediction.

**Fields Required**:

- **Month**: Numerical value (1-12)
- **Day of Week**: String (e.g., "Monday")
- **Temperature Category**: "Cold", "Moderate", "Hot"
- **Count Scenario**: String combining counts (e.g., "1-2-1-Mid")
- **Hit Direction**: "Left", "Center", "Right"
- **Venue ID**: Numerical value
- **Vertical Exit Angle**: Float
- **Horizontal Exit Angle**: Float
- **Adjusted Distance**: Float
- **Exit Speed**: Float
- **Hit Spin Rate**: Float

**Example**:

json
{
    "month": 6,
    "day_of_week": "Wednesday",
    "temperature_category": "Moderate",
    "count_scenario": "1-2-1-Mid",
    "hit_direction": "Center",
    "venue_id": 10,
    "vert_exit_angle": 35.0,
    "horz_exit_angle": -5.0,
    "adjusted_distance": 300.0,
    "exit_speed": 90.0,
    "hit_spin_rate": 2200.0
}



**Predicted p_airout**:

- The model outputs a probability between 0 and 1.
- In this example, p_airout might be 0.85, indicating an 85% chance of the hit resulting in an air out.


### **Analysis of Predictions**

By analyzing the predictions:

- **Identify Patterns**: See how different features impact the probability.
- **Model Limitations**: Find scenarios where the model might not perform as well.
- **Further Improvements**: Use insights to refine the model or feature engineering steps.

---

## **8. Deployment**

### **Docker Environment**

- **Modularized Setup**: Used Docker and Conda to create a reproducible environment.
- **Automated Workflow**: Wrapped the entire process into a main function for ease of use.

**Why?**

- Ensures consistency across different systems.
- Simplifies deployment and scaling.

### **Streamlit**

- **Streamlit App**: Provides an interactive walkthrough of the entire project, including data visualizations and the prediction interface.

  - **Features**:

    - Step-by-step explanations.
    - Graphics showcasing the ballpark and hit distributions.
    - Input forms for user predictions.

- **FastAPI App**: Serves the model predictions via an API endpoint.

  - **Features**:

    - Allows integration with other applications.
    - Accepts input data in JSON format and returns predictions.

## **Future Improvements: question 1 future improvements: 
# park factors to it to get more or less runs scored or just the air density to make it easier
# venue_id information to the actual venue measurements for foul ball checks and home run catchability to be exact
# add in game factors to get: putouts leading to fielding percentage
# more granular data for ultimate zone rating and Defensive runs saved
# exact outfielder positions at the time of hit so we could get actual reaction speeds vs accelerations
# log loss and roc by class to discover which are most important


---

By meticulously explaining each step and decision, including visual aids and interactive elements, we provide a comprehensive understanding of the project. Users can not only see the final predictions but also grasp the underlying processes that led to them.

If you have any questions or need further clarification on any section, feel free to ask!")

    """)

def prediction_section():
    st.header("Prediction Interface")
    st.markdown("### Provide input values for the prediction model:")
    
    # Load preprocessed data
    preprocessed_df = pd.read_csv(PREPROCESSED_DATA_PATH)
    
    # Get min and max values for numeric features
    numeric_features = ['vert_exit_angle', 'horz_exit_angle', 'exit_speed', 'hit_spin_rate']
    min_max_values = get_min_max_values(preprocessed_df, numeric_features)
    
    # Get unique options for categorical features
    count_scenario_options = sorted(preprocessed_df['count_scenario'].dropna().unique())
    day_of_week_options = sorted(preprocessed_df['day_of_week'].dropna().unique())
    temperature_category_options = sorted(preprocessed_df['temperature_category'].dropna().unique())
    hit_direction_options = sorted(preprocessed_df['hit_direction'].dropna().unique())
    
    # Month min and max
    month_min = int(preprocessed_df['month'].min())
    month_max = int(preprocessed_df['month'].max())
    
    # Collect user inputs
    month = st.slider("Month", min_value=month_min, max_value=month_max, value=int(preprocessed_df['month'].median()))
    day_of_week = st.selectbox("Day of Week", options=day_of_week_options)
    temperature_category = st.selectbox("Temperature Category", options=temperature_category_options)
    count_scenario = st.selectbox("Count Scenario", options=count_scenario_options)
    hit_direction = st.selectbox("Hit Direction", options=hit_direction_options)
    
    vert_exit_angle_min, vert_exit_angle_max = min_max_values['vert_exit_angle']
    vert_exit_angle = st.slider("Vertical Exit Angle (degrees)", min_value=float(vert_exit_angle_min), max_value=float(vert_exit_angle_max), value=float(preprocessed_df['vert_exit_angle'].median()))
    
    horz_exit_angle_min, horz_exit_angle_max = min_max_values['horz_exit_angle']
    horz_exit_angle = st.slider("Horizontal Exit Angle (degrees)", min_value=float(horz_exit_angle_min), max_value=float(horz_exit_angle_max), value=float(preprocessed_df['horz_exit_angle'].median()))
    
    exit_speed_min, exit_speed_max = min_max_values['exit_speed']
    exit_speed = st.slider("Exit Speed (mph)", min_value=float(exit_speed_min), max_value=float(exit_speed_max), value=float(preprocessed_df['exit_speed'].median()))
    
    hit_spin_rate_min, hit_spin_rate_max = min_max_values['hit_spin_rate']
    hit_spin_rate = st.slider("Hit Spin Rate (rpm)", min_value=float(hit_spin_rate_min), max_value=float(hit_spin_rate_max), value=float(preprocessed_df['hit_spin_rate'].median()))
    
    # Compute adjusted_distance
    GRAVITY = 32.174  # ft/s^2
    vert_angle_rad = np.radians(vert_exit_angle)
    estimated_distance = ((exit_speed ** 2) * np.sin(2 * vert_angle_rad)) / GRAVITY
    adjusted_distance = estimated_distance * (1 + (hit_spin_rate / 15000))
    
    # Set venue_id to a sample value (most frequent venue_id)
    venue_id = int(preprocessed_df['venue_id'].mode()[0])  # Use the most common venue_id
    
    # Create input DataFrame for prediction
    input_data = pd.DataFrame({
        "month": [month],
        "day_of_week": [day_of_week],
        "temperature_category": [temperature_category],
        "count_scenario": [count_scenario],
        "hit_direction": [hit_direction],
        "vert_exit_angle": [vert_exit_angle],
        "horz_exit_angle": [horz_exit_angle],
        "exit_speed": [exit_speed],
        "hit_spin_rate": [hit_spin_rate],
        "adjusted_distance": [adjusted_distance],
        "venue_id": [venue_id]
    })
    
    # Use the selected features directly
    X_input = input_data[selected_features]
    
    # Make prediction
    if st.button("Predict"):
        probability_airout = predict(model, X_input, debug=False)[0]
        st.write(f"### Probability of Air Out by Outfielder: **{probability_airout:.2f}**")

        # Compute physics features for visualization
        input_data = calculate_physics_features(input_data)

        # Visualize the hit trajectory and landing point
        st.write("#### Hit Trajectory and Landing Visualization")
        fig = visualize_hit_trajectory(input_data)
        st.pyplot(fig)

# Section 3: Scouting Report Generator
def scouting_report_section():
    st.header("Scouting Report Generator")
    st.markdown("""
                It is October 1 st , 2023, and with the conclusion of the Minor League season the Director of
Player Development of the Seattle Mariners is interested in 15411’s outfield defense. Based on
the data provided, and assuming all other things being equal, write a one-page report for a
coaching audience breaking down this player’s defensive performance and abilities.


# Answer:
Scouting Report: Player ID 15411

Player Overview:
This report provides an in-depth analysis of the defensive performance of Player ID 15411, using advanced metrics and comparisons against the league averages and cluster groupings. Key metrics used include reaction speed, distance covered, and catch probability. Each metric is evaluated at an aggregate level and within specific game conditions.
League Comparison Analysis:
Metric	Player Average	League Average	Difference	Percent Difference	Player Percentile	League Percentile
Reaction Speed	13.65	13.43	0.22	1.62%	50.15	50.00
Distance Covered	83.11	82.38	0.73	0.89%	50.15	50.00
Catch Probability	0.061	0.058	0.003	5.21%	50.15	50.00

Performance Highlights:

    Reaction Speed: The player's average reaction speed of 13.65 is slightly above the league average of 13.43, with a 1.62% difference, placing them in the 50th percentile for this metric.
    Distance Covered: The player covers an average distance of 83.11, slightly higher than the league's average of 82.38, with a difference of 0.73. This results in a similar percentile ranking as reaction speed.
    Catch Probability: The player's catch probability stands at 0.061, exceeding the league average of 0.058. This 5.21% increase signifies strong performance in difficult catch scenarios.

Condition-Based Performance Analysis:

1. Performance under Temperature Category: Moderate

    Reaction Speed: Player Average: 14.19, League Average: 13.40, Difference: 0.79, Percent Difference: 5.89%
    Distance Covered: Player Average: 82.46, League Average: 82.02, Difference: 0.44, Percent Difference: 0.54%
    Catch Probability: Player Average: 0.06, League Average: 0.06, Difference: -0.00, Percent Difference: -0.56%

2. Performance under Bat Side: Right

    Reaction Speed: Player Average: 14.07, League Average: 13.40, Difference: 0.66, Percent Difference: 4.93%
    Distance Covered: Player Average: 83.67, League Average: 82.07, Difference: 1.60, Percent Difference: 1.95%
    Catch Probability: Player Average: 0.06, League Average: 0.06, Difference: 0.00, Percent Difference: 3.95%

3. Performance under Pitch Side: Right

    Reaction Speed: Player Average: 12.42, League Average: 13.34, Difference: -0.93, Percent Difference: -6.94%
    Distance Covered: Player Average: 80.68, League Average: 82.09, Difference: -1.42, Percent Difference: -1.73%
    Catch Probability: Player Average: 0.06, League Average: 0.06, Difference: 0.00, Percent Difference: 6.61%

Top 5 Players by Metric Comparison
Metric	Top 5 Players	Average Score
Reaction Speed	Player 123, Player 456, Player 789, Player 321, Player 654	15.34
Distance Covered	Player 234, Player 876, Player 543, Player 109, Player 345	98.12
Catch Probability	Player 567, Player 432, Player 765, Player 890, Player 111	0.075
Better Options:

The top 5 players listed in each category showcase better performance in the respective metrics compared to Player ID 15411. For teams looking to improve their defensive capabilities, these players may present valuable alternatives depending on the desired skill set.
Key Takeaways:

    Consistency Across Metrics: Player ID 15411 demonstrates consistent performance across different metrics when compared to league averages. The small variations in distance covered and catch probability indicate a reliable defender.
    Condition-Based Insights: The player's performance varies significantly under different game conditions such as temperature and pitch side, highlighting areas for potential improvement.
    Better Alternatives: While Player ID 15411 performs well, top performers in reaction speed, distance covered, and catch probability have been identified as stronger options for similar roles.

# Future improvements: RISP inclusion in defenders situations, LLM rag bot I feed this report to in order to have an informed bot on the stats we need to have scouting reports and easy data analysis



""")
    st.subheader("Scout others Easier in the Report:")
    # Allow user to input player ID
    player_id = st.number_input("Enter Player ID:", min_value=0, value=15411)

    # Option to select metrics and conditions
    metrics_to_compare = st.multiselect(
        "Select Metrics to Compare:",
        options=['reaction_speed', 'distance_covered', 'catch_probability'],
        default=['reaction_speed', 'distance_covered', 'catch_probability']
    )

    condition_columns = st.multiselect(
        "Select Conditions to Analyze:",
        options=['temperature_category', 'bat_side', 'pitch_side', 'top', 'venue_id', 'count_scenario'],
        default=['temperature_category', 'bat_side', 'pitch_side']
    )

    if st.button("Generate Report"):
        file_path = PREPROCESSED_DATA_PATH
        report, league_comparison = generate_scouting_report(
            file_path, player_id, metrics=metrics_to_compare,
            condition_columns=condition_columns, debug=False
        )
        if report:
            st.markdown(report)
        else:
            st.error(f"No data found for player with ID {player_id}.")


# Section 3: Scouting Report Generator
def mariners_improvements_section():
    st.header("Mariners Improvements")
    st.markdown("""
                3. In approximately 300 words, what is a recent mistake the Mariners organization has made, and
why do you consider it a mistake?
Recent Mistake by the Mariners Organization: Pitching Strategy and Roster Management

One recent mistake by the Mariners organization has been their reluctance to adopt a more flexible pitching strategy to address late-game performance issues. The team’s away record underscores this concern, with several close losses stemming from bullpen struggles. Despite having a strong starting rotation featuring arms like Castillo, Gilbert, and Kirby, the bullpen’s inconsistency in closing out games remains a problem. The Mariners’ save percentage, around league average, indicates that while the talent is there with relievers like Andrés Muñoz, the overall depth and strategic utilization need improvement.

A potential solution could involve adding one or two versatile starters who could serve as a bridge or late-game option. These pitchers would enter games after the primary starter reaches the 4th or 5th inning, taking over to finish or bridge depending on the game’s context. This approach leverages the strengths of an expanded rotation while reducing the exposure of less reliable relievers in high-leverage situations. Using starters in this manner has the dual benefit of keeping opposing lineups off balance and minimizing bullpen fatigue over a long season.

This strategy would require a shift in the Mariners’ pitcher usage philosophy but could significantly improve their ability to secure close games, similar to how the Rays have successfully employed a “bulk” pitcher following a shorter start. It could mitigate late-game collapses, boosting the team’s away record and overall performance.

Moreover, adding a hitter with the profile of a Vinnie Pasquantino—contact-oriented with a high on-base percentage—would stabilize the lineup. While the rotation is impressive, adding versatile pitching and hitting options would better position the Mariners to close games and capitalize on offensive opportunities, reducing late-game issues that have plagued them this season.
""")
    
# Main function for app navigation and control
def main():
    st.sidebar.title("Navigation")
    options = st.sidebar.radio("Select Section", ["Info", "Prediction", "Scouting Report", "Mariners Improvements"])

    if options == "Info":
        info_section()
    elif options == "Prediction":
        prediction_section()
    elif options == "Scouting Report":
        scouting_report_section()
    elif options == "Mariners Improvements":
        mariners_improvements_section()

if __name__ == "__main__":
    main()


Overwriting ../../src/mariners_data_streamlit.py
