In [None]:
# Play with PyBaseball - player game data each row is a game
from pybaseball import statcast_batter
from pybaseball import playerid_lookup

import pandas as pd
from datetime import datetime, timedelta


In [249]:
# DATA COLLECTION FOR SINGLE GAME
import pybaseball
from datetime import datetime, timedelta

def _first_pitch_context(game_rows):
    """Collect game-level context from the first pitch the batter saw in one game.

    The "starting pitcher" fields are a proxy: they are read from the pitcher the
    batter faced in his lowest-numbered at-bat, not from an official lineup.
    Every value stays None when no such first-pitch row exists.
    """
    context = {
        'player_name': None,
        'home_team': None,
        'is_home': None,
        'starting_pitcher_id': None,
        'starting_pitcher_throws': None,
        'batter_days_since_prev_game': None,
    }

    first_at_bat = game_rows['at_bat_number'].min()
    first_pitch = game_rows[(game_rows['at_bat_number'] == first_at_bat) &
                            (game_rows['pitch_number'] == 1)]
    if first_pitch.empty:
        print("No row found with min at_bat_number and pitch_number == 1")
        return context

    row = first_pitch.iloc[0]  # Use the first row if several match
    context['player_name'] = row['player_name']
    context['home_team'] = row['home_team']
    # The home side bats in the bottom half of an inning
    context['is_home'] = 1 if row['inning_topbot'] == 'Bot' else 0
    context['starting_pitcher_id'] = row['pitcher']
    context['starting_pitcher_throws'] = row['p_throws']
    context['batter_days_since_prev_game'] = row['batter_days_since_prev_game']
    return context


def get_player_data(player_id, date, dataframe):
    '''
    Description: Fetch one day of Statcast pitch data for a batter, collapse it to
                 one row per game, and add those rows to the given dataframe.
                 The function RETURNS a NEW dataframe; the input is not modified.
    Parameters:
                player_id: id of the batter, passed to statcast_batter as player_id
                date: date you want to search for game data on.
                            format should be a string 'YYYY-MM-DD'
                dataframe: Pandas dataframe to extend. May be None or empty, in which
                           case the result contains only the new rows.

    Returns: result_dataframe: dataframe with one row per game played on `date`
             (columns: game_date, game_pk, hits, at_bats, home_runs, player_name,
             home_team, is_home, starting_pitcher_id, starting_pitcher_throws,
             batter_days_since_prev_game) appended to `dataframe`. When no data is
             found, `dataframe` itself is returned unchanged.
    '''

    # Enable cache for previous requests
    pybaseball.cache.enable()
    # Query a single-day window
    data = statcast_batter(start_dt=date, end_dt=date, player_id=player_id)

    # Check for "no game" before selecting columns so that an empty result without
    # the expected columns does not raise a KeyError
    if data is None or len(data) < 1:
        print(f"No game data found for {player_id} on {date} ")
        return dataframe

    # Relevant columns for batting performance (each listed once)
    batting_cols = [
        'game_date', 'batter', 'player_name', 'game_pk', 'events', 'at_bat_number',
        'pitch_type', 'balls', 'strikes', 'pitcher', 'p_throws',
        'pitch_number', 'home_team', 'batter_days_since_prev_game', 'inning_topbot'
    ]

    batting_data = data[batting_cols].reset_index(drop=True)
    # Guard against duplicate column labels coming from the source frame
    batting_data = batting_data.loc[:, ~batting_data.columns.duplicated()].copy()

    # Flag the pitch rows whose event is a hit / a home run
    batting_data['is_hit'] = batting_data['events'].isin(['single', 'double', 'triple', 'home_run']).astype(int)
    batting_data['is_home_run'] = batting_data['events'].isin(['home_run']).astype(int)

    # One row per game: at_bats is the number of distinct at_bat_number values
    single_game_outcome = batting_data.groupby(['game_date', 'game_pk']).agg(
        hits=('is_hit', 'sum'),
        at_bats=('at_bat_number', 'nunique'),
        home_runs=('is_home_run', 'sum')
    ).reset_index()

    # Resolve context per game so a doubleheader does not reuse the first game's
    # pitcher / home team for the second game
    contexts = [
        _first_pitch_context(batting_data[batting_data['game_pk'] == game_pk])
        for game_pk in single_game_outcome['game_pk']
    ]
    context_frame = pd.DataFrame(contexts, index=single_game_outcome.index)
    single_game_outcome = pd.concat([single_game_outcome, context_frame], axis=1)

    if dataframe is None or dataframe.empty:
        # Nothing to append to: the new rows are the result
        return single_game_outcome.copy()

    # Otherwise append the new rows to the existing data
    return pd.concat([dataframe, single_game_outcome], ignore_index=True)

        # Feature engineering, home team, starting pitcher throws, starting pitcher 
        # Add park/weather information?
        # add if player is home or away
        # starting pitcher era?
        # add confidence level and check for multiple pitchers



### Current Data Setup For a Batter's Single Game Outcome

game_date | player_name | game_pk | hits | at_bats | home_runs | home_team (for stadium) | is_home | starting_pitcher_id | starting_pitcher_throws | batter_days_since_prev_game


In [None]:
import pandas as pd
import numpy as np
from statsmodels.genmod.families import Poisson
from statsmodels.genmod.generalized_linear_model import GLM
from scipy.stats import poisson


# Gather important features and train and run model

def prepare_features(df):
    '''
    Build the rolling-form features used for modeling.

    Returns a tuple (df_clean, features): the sorted frame with the engineered
    columns added and incomplete rows removed, and the list of feature names.
    The frame passed in is left untouched.
    '''

    # Chronological order within each player is required by the rolling windows
    ordered = df.sort_values(by=['player_name', 'game_date'])

    def prior_window_total(column, window):
        # shift(1) drops the current game so only earlier games feed the window
        return ordered.groupby('player_name')[column].transform(
            lambda games: games.shift(1).rolling(window).sum()
        )

    # (window, hits total, at-bats total, hits per game, batting average)
    window_columns = (
        (5, 'total_hits_last5', 'total_at_bats_last5', 'recent_hits_avg5', 'batting_avg_last5'),
        (10, 'total_hits_last10', 'total_at_bats_last10', 'recent_hits_avg10', 'batting_avg_last10'),
    )
    for window, hits_total, at_bats_total, hits_avg, batting_avg in window_columns:
        ordered[hits_total] = prior_window_total('hits', window)
        ordered[at_bats_total] = prior_window_total('at_bats', window)
        ordered[hits_avg] = ordered[hits_total] / window
        ordered[batting_avg] = ordered[hits_total] / ordered[at_bats_total]

    # 1 when the opposing starter throws left-handed, otherwise 0
    ordered['pitcher_left'] = ordered['starting_pitcher_throws'].eq('L').astype(int)

    features = [
        'recent_hits_avg5',
        'batting_avg_last5',
        'recent_hits_avg10',
        'batting_avg_last10',
        'pitcher_left',
        'batter_days_since_prev_game'
    ]

    # Keep only rows where every feature and the target are present
    required = features + ['hits']
    return ordered.dropna(subset=required), features

# TODO: Revisit the model. It only looks at the last 5 and 10 games.
# TODO: Review the model and the features it calculates. This code was AI-generated (free tier,
# not the premium chat model) and it relies solely on recent-game features.
def train_model(df, features):
    """Fit a Poisson GLM of `hits` on the given feature columns and return the fitted result."""
    design_matrix = df[features]
    target = df['hits']
    poisson_glm = GLM(target, design_matrix, family=Poisson())
    return poisson_glm.fit()

def calculate_features_for_prediction(df, player_name, game_date, home_team, player_team, starting_pitcher_throws, batter_days_since_prev_game):
    """
    Calculate features for a specific player and game based on historical data.

    Parameters:
        df: per-game frame with at least player_name, game_date, hits, at_bats
        player_name: value matched against df['player_name']
        game_date: only rows with df['game_date'] < game_date are used; it must be
                   comparable with that column
        home_team, player_team: is_home is 1 when the two are equal
        starting_pitcher_throws: pitcher_left is 1 when this equals 'L'
        batter_days_since_prev_game: copied into the feature row as given

    Returns: dict of feature name -> value. Windows use the most recent 5 / 10
             games, or every available game when fewer exist.

    Raises: ValueError when the player has no rows before game_date.
    """
    # Filter historical data up to the game date
    history = df[(df['player_name'] == player_name) & (df['game_date'] < game_date)]
    if history.empty:
        raise ValueError(f"No historical data for {player_name} before {game_date}")

    # tail() only means "most recent" when the rows are in chronological order
    history = history.sort_values('game_date', kind='stable')

    def _window_stats(n_games):
        # tail(n) returns every row when fewer than n games exist
        recent = history.tail(n_games)
        hits_per_game = recent['hits'].mean()
        total_at_bats = recent['at_bats'].sum()
        # Total hits over total at-bats, so short windows are not mis-scaled by
        # a fixed divisor of 5 or 10
        batting_avg = recent['hits'].sum() / total_at_bats if total_at_bats > 0 else 0
        return hits_per_game, batting_avg

    last_5_hits, batting_avg_last5 = _window_stats(5)
    last_10_hits, batting_avg_last10 = _window_stats(10)

    # Create features dictionary
    features_row = {
        'recent_hits_avg5': last_5_hits,
        'batting_avg_last5': batting_avg_last5,
        'recent_hits_avg10': last_10_hits,
        'batting_avg_last10': batting_avg_last10,
        'is_home': 1 if home_team == player_team else 0,
        'pitcher_left': 1 if starting_pitcher_throws == 'L' else 0,
        'batter_days_since_prev_game': batter_days_since_prev_game
    }

    return features_row


def predict_over_under(model, features_row, L):
    """
    Predict probabilities of going over or under a given line L.

    Parameters:
        model: fitted model exposing predict(); when it also exposes
               model.model.exog_names, the feature row is restricted to and
               ordered by those names
        features_row: dict of feature name -> value for one game
        L: the line. "Under" is P(hits <= floor(L)), so a result exactly equal
           to an integer line counts as under.

    Returns: (P_over, P_under), which sum to 1.
    """
    X_new = pd.DataFrame([features_row], columns=features_row.keys())

    # Match the columns the model was fitted on: extra keys or a different column
    # order would otherwise be passed to predict() as-is
    fitted_names = getattr(getattr(model, 'model', None), 'exog_names', None)
    if fitted_names is not None and all(name in X_new.columns for name in fitted_names):
        X_new = X_new[list(fitted_names)]

    # predict() may return a Series or an array; take the single value by position
    lambda_pred = float(np.asarray(model.predict(X_new)).ravel()[0])
    K = np.floor(L)
    P_under = poisson.cdf(K, lambda_pred)
    P_over = 1 - P_under
    return P_over, P_under


# Run code here:
# GET DF OF GAMES DATA

def predict_hits(df, player_name, game_date, home_team, starting_pitcher_throws, batter_days_since_last_game, line_for_hits, player_team=None):
    '''
    Train the hits model on df and print the over/under call for one player and game.

    Parameters:
        df: per-game dataframe (as produced by get_player_data)
        player_name: player to predict for, matched against df['player_name']
        game_date: date of the game to predict; must be comparable with df['game_date']
        home_team: home team of the game
        starting_pitcher_throws: 'L' or 'R'
        batter_days_since_last_game: days since the batter's previous game
        line_for_hits: the over/under line for hits
        player_team: optional team of the player, used only for the is_home flag

    Returns: (P_over, P_under)
    '''
    # Prepare features
    df_clean, features = prepare_features(df)
    # Train model
    model = train_model(df_clean, features)

    # Get features; keywords keep each value on the intended parameter
    features_row = calculate_features_for_prediction(
        df,
        player_name=player_name,
        game_date=game_date,
        home_team=home_team,
        player_team=player_team,
        starting_pitcher_throws=starting_pitcher_throws,
        batter_days_since_prev_game=batter_days_since_last_game,
    )
    # Keep only the features the model was trained on, in training order
    features_row = {name: features_row[name] for name in features}

    P_over, P_under = predict_over_under(model, features_row, line_for_hits)

    # Accept both datetime-like and string dates for display
    display_date = pd.to_datetime(game_date).date()
    print(f"For {player_name} on {display_date}, with line {line_for_hits}:")
    print(f"Probability of over: {P_over:.4f}")
    print(f"Probability of under: {P_under:.4f}")
    print(f"Prediction: {'Over' if P_over > P_under else 'Under'} with confidence {max(P_over, P_under):.4f}")

    return P_over, P_under



In [None]:
from pybaseball import statcast_batter
from pybaseball import playerid_lookup

# Look up Aaron Judge's player id (the key_mlbam column of the lookup result)
# NOTE(review): the name `id` shadows Python's built-in id() for the rest of the notebook
id = playerid_lookup('judge','aaron')
# Keep only the id from the row labelled 0 of the lookup result
id = id['key_mlbam'][0] 

# Start from an empty frame; get_player_data returns a new frame with the game rows added
df = pd.DataFrame()

# Fetch the per-game summary for a single date
data_df = get_player_data(id,'2025-05-05', df)
print("This is the data")
print(data_df)



Gathering Player Data
pitch_type
game_date
release_speed
release_pos_x
release_pos_z
player_name
batter
pitcher
events
description
spin_dir
spin_rate_deprecated
break_angle_deprecated
break_length_deprecated
zone
des
game_type
stand
p_throws
home_team
away_team
type
hit_location
bb_type
balls
strikes
game_year
pfx_x
pfx_z
plate_x
plate_z
on_3b
on_2b
on_1b
outs_when_up
inning
inning_topbot
hc_x
hc_y
tfs_deprecated
tfs_zulu_deprecated
umpire
sv_id
vx0
vy0
vz0
ax
ay
az
sz_top
sz_bot
hit_distance_sc
launch_speed
launch_angle
effective_speed
release_spin_rate
release_extension
game_pk
fielder_2
fielder_3
fielder_4
fielder_5
fielder_6
fielder_7
fielder_8
fielder_9
release_pos_y
estimated_ba_using_speedangle
estimated_woba_using_speedangle
woba_value
woba_denom
babip_value
iso_value
launch_speed_angle
at_bat_number
pitch_number
pitch_name
home_score
away_score
bat_score
fld_score
post_away_score
post_home_score
post_bat_score
post_fld_score
if_fielding_alignment
of_fielding_alignment
spin_axi

In [251]:
from datetime import datetime, timedelta
# Look up Aaron Judge's player id (key_mlbam column, row labelled 0)
id = playerid_lookup('judge','aaron')
id = id['key_mlbam'][0] 
# Accumulator: each call to get_player_data returns a new frame with that day's rows added
df = pd.DataFrame()
# Get May data
# Dates are kept as 'YYYY-MM-DD' strings; in that format string comparison matches
# chronological order, so the `<` test below ends the loop once June 1st is reached
date = '2025-05-01'
while date < '2025-06-01':
    print(f"Getting data for {date}")
    # Get player data for date (df is returned unchanged when no game is found)
    df = get_player_data(id, date, df)

    # Convert to datetime object
    date_obj = datetime.strptime(date, "%Y-%m-%d")

    # Add one day
    next_day_obj = date_obj + timedelta(days=1)

    # Convert back to string
    date = next_day_obj.strftime("%Y-%m-%d")

print(df)

Getting data for 2025-05-01
Gathering Player Data
No game data found for 592450 on 2025-05-01 
Getting data for 2025-05-02
Gathering Player Data
Getting data for 2025-05-03
Gathering Player Data
Getting data for 2025-05-04
Gathering Player Data
Getting data for 2025-05-05
Gathering Player Data
Getting data for 2025-05-06
Gathering Player Data
Getting data for 2025-05-07
Gathering Player Data
Getting data for 2025-05-08
Gathering Player Data
No game data found for 592450 on 2025-05-08 
Getting data for 2025-05-09
Gathering Player Data
Getting data for 2025-05-10
Gathering Player Data
Getting data for 2025-05-11
Gathering Player Data
Getting data for 2025-05-12
Gathering Player Data
Getting data for 2025-05-13
Gathering Player Data
Getting data for 2025-05-14
Gathering Player Data
Getting data for 2025-05-15
Gathering Player Data
No game data found for 592450 on 2025-05-15 
Getting data for 2025-05-16
Gathering Player Data
Getting data for 2025-05-17
Gathering Player Data
Getting data fo