In [None]:
import pandas as pd

# Remove the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

In [None]:
# Show the home team's name from the first row of `game_info`.
# NOTE(review): within the visible notebook, `game_info` is first assigned inside
# the game-loading loop of a later cell, so this cell raises NameError when the
# notebook is run top-to-bottom on a fresh kernel — consider moving it below that cell.
game_info['teams.home.team.name'].values[0]

In [None]:
from get_game_information import fetch_games, get_game_info

# fetch_games() returns the schedule frame plus two sequences that are paired
# element-wise below: game ids and venue names.
games_df, games_list, venues_list = fetch_games()

game_info_list = []

for game_id, venue in zip(games_list, venues_list):
    # Per-game data from get_game_info, tagged with the venue it is paired with.
    game_data = get_game_info(game_id)
    game_data['venue.name'] = venue

    # Schedule row(s) whose 'gamePk' matches this game; used to look up team names.
    game_info = games_df[games_df['gamePk'] == game_id].copy()
    home_team = game_info['teams.home.team.name'].values[0]
    away_team = game_info['teams.away.team.name'].values[0]

    # Stop after the first game: later cells use the variables bound in this pass.
    break
    ## Add simulation

# print(game_data)

In [None]:
# Preview the first rows of the frame returned by fetch_games().
games_df.head()

In [None]:
# Preview the first rows of the per-game frame from get_game_info()
# (it also carries the 'venue.name' column added in the loading loop).
game_data.head()

In [None]:
def outcomes(game_data, home_or_away):
    """Build the pool of plate-appearance results for one side of a game.

    Parameters
    ----------
    game_data : pandas.DataFrame
        Event rows. The columns read are 'isTopInning', 'eventType',
        'hitData.launchSpeed', 'hitData.launchAngle' and 'venue.name'.
    home_or_away : str
        'home' keeps the rows where 'isTopInning' is False; any other value
        keeps the rows where it is True.

    Returns
    -------
    list
        One ``[launch speed, launch angle, venue name]`` list for every row
        that has a launch speed, followed by one "strikeout" string for every
        'out' event without a launch speed and one "walk" string for every
        'walk' event. The input frame is not modified.
    """
    # The selected side is identified purely by the 'isTopInning' flag.
    bats_in_top = home_or_away != 'home'
    team_events = game_data[game_data['isTopInning'] == bats_in_top]

    # Rows without a launch speed are treated as "no ball in play".
    no_launch_speed = team_events['hitData.launchSpeed'].isna()

    # Outs without a batted ball are counted as automatic outs; they are
    # assumed to stay the same across simulations.
    n_strikeouts = int(((team_events['eventType'] == 'out') & no_launch_speed).sum())
    n_walks = int((team_events['eventType'] == 'walk').sum())

    # Every ball put in play becomes a [speed, angle, venue] triple.
    feature_columns = ['hitData.launchSpeed', 'hitData.launchAngle', 'venue.name']
    balls_in_play = team_events.loc[~no_launch_speed, feature_columns].values.tolist()

    return balls_in_play + ["strikeout"] * n_strikeouts + ["walk"] * n_walks


In [None]:
# Build one outcome pool per side from the same game's event data.
home_outcomes = outcomes(game_data=game_data, home_or_away='home')
away_outcomes = outcomes(game_data=game_data, home_or_away='away')

In [None]:
import pickle
import numpy as np

In [None]:
import random
import pickle
import pandas as pd

# Restore the trained classifier and the preprocessor that was fitted with it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these two files must come from a trusted source.
with open('logistic_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('preprocessor.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

def simulate_game(outcomes):
    """Replay one team's plate-appearance outcomes in random order and count runs.

    Every element of ``outcomes`` is used exactly once, in a random order.
    Outs accumulate until there are three, at which point the out count and
    the bases are cleared before the next outcome is applied.

    Parameters
    ----------
    outcomes : list
        Each element is one of:
        * ``"strikeout"`` or ``"out"`` - an automatic out;
        * ``"walk"`` - the batter takes first and only forced runners move;
        * ``[launch_speed, launch_angle, stadium]`` - a ball in play whose
          result is drawn from ``loaded_model.predict_proba`` after the
          features are passed through ``preprocessor.transform``.
        Anything else is ignored. The list itself is not modified.

    Returns
    -------
    int
        Runs scored over the whole sequence.
    """
    outs = 0
    runs = 0
    bases = [False, False, False]  # first, second, third

    # Drawing without replacement until the pool is empty is the same as
    # walking a shuffled copy once; this also leaves the caller's list alone.
    batting_order = random.sample(outcomes, k=len(outcomes))

    for outcome in batting_order:
        if outs == 3:
            # Three outs end the inning: clear the out count and the bases.
            outs = 0
            bases = [False, False, False]

        if outcome in ("out", "strikeout"):
            # Accept both labels: the outcome builder in this notebook emits
            # "strikeout" for automatic outs.
            outs += 1
        elif outcome == "walk":
            runs += _force_advance_on_walk(bases)
        elif isinstance(outcome, list) and len(outcome) == 3:
            launch_speed, launch_angle, stadium = outcome

            # Single-row frame using the column names the preprocessor is fed.
            new_example = pd.DataFrame({
                'hitData_launchSpeed': [launch_speed],
                'hitData_launchAngle': [launch_angle],
                'venue_name': [stadium]
            })
            new_example_preprocessed = preprocessor.transform(new_example)
            probabilities = loaded_model.predict_proba(new_example_preprocessed)[0]

            # Inverse-CDF draw over the class probabilities.
            # NOTE(review): assumes the model's classes are ordered
            # out, single, double, triple, home run - confirm against training.
            draw = random.random()
            bases_earned = 4  # falls through to a home run, as before
            cumulative = 0.0
            for candidate, probability in enumerate(probabilities[:4]):
                cumulative += probability
                if draw < cumulative:
                    bases_earned = candidate
                    break

            if bases_earned == 0:
                outs += 1
            else:
                runs += advance_runner(bases, bases_earned)
                if bases_earned == 4:
                    bases = [False, False, False]
                else:
                    # Only the batter may occupy the bases he just ran
                    # through: he stands on the one he reached and the
                    # bases behind him are empty.
                    for index in range(bases_earned):
                        bases[index] = index == bases_earned - 1

    return runs


def _force_advance_on_walk(bases):
    """Put the batter on first, moving only forced runners; return runs scored (0 or 1)."""
    for index in range(3):
        if not bases[index]:
            # The first empty base absorbs the chain of forced runners.
            bases[index] = True
            return 0
    # Bases loaded: the runner on third is forced home and the bases stay full.
    return 1

def advance_runner(bases, count=1):
    """Advance the batter and every runner by ``count`` bases, in place.

    Parameters
    ----------
    bases : list of bool
        Occupancy of first, second and third base; modified in place.
    count : int, default 1
        Number of bases everyone moves (1 = single ... 4 = home run).

    Returns
    -------
    int
        Number of runners (including the batter, when ``count`` is 4) who
        crossed home plate.
    """
    runs = 0
    for step in range(count):
        if bases[2]:
            runs += 1
        bases[2] = bases[1]
        bases[1] = bases[0]
        # The batter steps onto first on the first base only; on later steps
        # he has moved on, so first is vacated instead of gaining a new runner.
        bases[0] = step == 0
    return runs

In [None]:
import random

import numpy as np
import matplotlib.pyplot as plt

# simulate_game draws from Python's `random` module, so seed it here to make
# the percentages below reproducible from run to run.
SIMULATION_SEED = 42
random.seed(SIMULATION_SEED)

# Number of simulated games per team. 25 keeps the cell fast; raise it
# (e.g. to 50_000) for stable percentages.
num_simulations = 25
home_runs_scored = np.array([simulate_game(home_outcomes) for _ in range(num_simulations)])
away_runs_scored = np.array([simulate_game(away_outcomes) for _ in range(num_simulations)])

# Pair up the i-th home and away simulations and tally win/loss/tie counts.
home_wins = np.sum(home_runs_scored > away_runs_scored)
away_wins = np.sum(home_runs_scored < away_runs_scored)
ties = np.sum(home_runs_scored == away_runs_scored)

home_win_percentage = home_wins / num_simulations * 100
away_win_percentage = away_wins / num_simulations * 100
tie_percentage = ties / num_simulations * 100

print(f"Home team wins: {home_win_percentage:.2f}%")
print(f"Away team wins: {away_win_percentage:.2f}%")
print(f"Ties: {tie_percentage:.2f}%")

In [None]:
# Overlay the simulated run distributions of both teams on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
for team_label, team_runs in ((f'{home_team}', home_runs_scored), (f'{away_team}', away_runs_scored)):
    # One integer-wide bin per run total, from 0 through that team's maximum.
    ax.hist(team_runs, bins=range(max(team_runs)+2), alpha=0.5, label=team_label)
ax.set_xlabel('Runs Scored')
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of Runs Scored ({num_simulations} Simulations)')
ax.legend()
plt.show()