In [5]:
# Play with PyBaseball - player game data each row is a game
from pybaseball import statcast_batter
from pybaseball import playerid_lookup
import pandas as pd
from datetime import datetime, timedelta


In [None]:
# DATA COLLECTION FOR SINGLE GAME
import pybaseball
from datetime import datetime, timedelta

def get_player_data(player_name: str, date: str, dataframe: "pd.DataFrame | None") -> "pd.DataFrame | None":
    '''
    Description: Pull Statcast pitch-level data for a single date, reduce it to one
                 summary row per game for the requested player, and return a NEW
                 dataframe holding the previous rows plus the new one(s). The
                 dataframe passed in is never modified in place.
    Parameters:
                player_name: name of the player you want to get data on
                             format should be: 'last_name, first_name'
                date: date you want to search for game data on.
                             format should be a string 'YYYY-MM-DD'
                dataframe: Pandas dataframe to append the new rows to; pass None
                           (or an empty dataframe) to start a new one

    Returns: result_dataframe: dataframe with the new information, either newly created
                               or appended to the parameter dataframe. If no data is
                               found for the player on that date, the parameter
                               dataframe is returned unchanged.

    Output columns (one row per game):
                game_date, player_name, game_pk,
                hits      - plate appearances ending in single/double/triple/home_run
                at_bats   - number of distinct at_bat_number values seen for the player
                            (i.e. plate appearances; walks etc. are NOT excluded)
                home_runs - plate appearances ending in a home_run
                home_team, starting_pitcher_id, starting_pitcher_throws,
                batter_days_since_prev_game - taken from the first recorded pitch of
                            the player's first plate appearance in that game
    '''

    # Enable cache so repeated requests for the same date do not hit the network again
    pybaseball.cache.enable()

    # Pull every pitch thrown on the requested date (start and end are the same day)
    data = pybaseball.statcast(start_dt=date, end_dt=date)

    # Days without any games can come back as an empty frame; selecting columns
    # from it would raise a KeyError, so bail out early.
    if data is None or data.empty:
        print(f"No game data found for {player_name} on {date} ")
        return dataframe

    # Columns relevant to batting performance. Each name is listed exactly once so
    # the selection does not create duplicate column labels.
    batting_cols = [
        'game_date', 'batter', 'player_name', 'game_pk', 'events', 'at_bat_number',
        'pitch_type', 'balls', 'strikes', 'pitcher', 'p_throws',
        'pitch_number', 'home_team', 'batter_days_since_prev_game',
    ]

    # NOTE(review): in league-wide Statcast pulls the 'player_name' column may hold
    # the PITCHER's name rather than the batter's - confirm against the Statcast
    # CSV documentation; if so, the batter should be matched on the 'batter' id.
    player_df = data.loc[data['player_name'] == player_name, batting_cols].reset_index(drop=True)

    print(len(player_df))

    if player_df.empty:
        print(f"No game data found for {player_name} on {date} ")
        return dataframe

    # Game context is derived PER GAME (game_pk) so a doubleheader does not get the
    # first game's pitcher/home team copied onto the second game's row. The first
    # recorded pitch of the earliest plate appearance identifies the pitcher faced.
    first_pitch = (
        player_df
        .sort_values(['game_pk', 'at_bat_number', 'pitch_number'])
        .drop_duplicates(subset='game_pk', keep='first')
    )
    game_info = first_pitch[
        ['game_pk', 'home_team', 'pitcher', 'p_throws', 'batter_days_since_prev_game']
    ].rename(columns={
        'pitcher': 'starting_pitcher_id',
        'p_throws': 'starting_pitcher_throws',
    })

    # 'events' is only populated on the pitch that ends a plate appearance
    player_df['is_hit'] = player_df['events'].isin(['single', 'double', 'triple', 'home_run']).astype(int)
    player_df['is_home_run'] = player_df['events'].isin(['home_run']).astype(int)

    # Collapse pitches into one row per plate appearance. Grouping on
    # at_bat_number is what makes the later 'count' a real at-bat count.
    per_at_bat_stats = player_df.groupby(
        ['game_date', 'player_name', 'game_pk', 'at_bat_number'], as_index=False
    ).agg(
        is_hit=('is_hit', 'max'),
        is_home_run=('is_home_run', 'max'),
    )

    # Collapse plate appearances into one row per game
    single_game_outcome = per_at_bat_stats.groupby(
        ['game_date', 'player_name', 'game_pk'], as_index=False
    ).agg(
        hits=('is_hit', 'sum'),
        at_bats=('is_hit', 'count'),
        home_runs=('is_home_run', 'sum'),
    )

    # Attach the per-game context (home team, pitcher faced first, rest days)
    single_game_outcome = single_game_outcome.merge(game_info, on='game_pk', how='left')

    print(single_game_outcome)

    if dataframe is None or dataframe.empty:
        # Nothing to append to: the new rows become the result
        result_dataframe = single_game_outcome.copy()
        print(f"Created new dataframe with {len(result_dataframe)} rows")
    else:
        # Append the new rows to a copy of the existing data
        result_dataframe = pd.concat([dataframe, single_game_outcome], ignore_index=True)
        print(f"Appended data. Dataframe now has {len(result_dataframe)} rows")

    return result_dataframe

        # Feature engineering, home team, starting pitcher throws, starting pitcher



This is a large query, it may take a moment to complete


100%|██████████| 1/1 [00:00<00:00, 23.81it/s]

0
No game data found for Romano, Jorda on 2023-06-01 





### Current Data Setup For a Batter's Single Game Outcome

game_date | player_name | game_pk | hits | at_bats | home_runs | home_team (for stadium) | starting_pitcher_id | starting_pitcher_throws | batter_days_since_prev_game
