In [None]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
from pybaseball import batting_stats
import random
import numpy as np
import statistics
import matplotlib.pyplot as plt

In [None]:
def streaks_and_slumps(numbers):
    """Split a sequence of 1's and 0's into the lengths of its consecutive runs.

    A run of consecutive equal values counts as a "streak" when the value is 1
    and as a "slump" otherwise. For this project the streaks are the result
    of primary interest.

    Parameters
    ----------
    numbers : iterable
        Outcomes in chronological order (1 = hit, anything else = no hit).
        Any iterable is accepted; it is consumed once and never copied.

    Returns
    -------
    tuple[list[int], list[int]]
        ``(streaks, slumps)``: the run lengths in order of appearance.
        Both lists are empty when ``numbers`` is empty.
    """
    streaks = []
    slumps = []

    outcomes = iter(numbers)
    try:
        # The first outcome opens the first run.
        current_number = next(outcomes)
    except StopIteration:
        # Empty input (e.g. a player with zero games): there are no runs at all.
        return streaks, slumps
    current_count = 1

    for number in outcomes:
        if number == current_number:
            # Same value as the run in progress: extend it.
            current_count += 1
            continue

        # The run was broken: file it under streaks or slumps, then start a
        # new run of length 1 with the value that broke it.
        if current_number == 1:
            streaks.append(current_count)
        else:
            slumps.append(current_count)
        current_count = 1
        current_number = number

    # The final run is never closed inside the loop, so record it here.
    if current_number == 1:
        streaks.append(current_count)
    else:
        slumps.append(current_count)

    return streaks, slumps

In [None]:
# Download FanGraphs batting data for 1871-2024 via pybaseball and keep only the
# columns the simulation needs: player id, name, games, hits and plate appearances.
# NOTE(review): ind=0 is understood to request one aggregated row per player
# rather than one row per season -- confirm against the pybaseball docs.
dataX = batting_stats(1871, 2024, ind=0)[['IDfg', 'Name', 'G', 'H', 'PA']]
#dataX['IDfg'].nunique()

In [None]:
# Run from this cell down to re-simulate MLB batting history (1,000 simulations took roughly 3 hours for me).

In [None]:
# Number of times the whole of "MLB batting history" is simulated; each
# simulation adds one result column to the data frame.
career_sims = 1_000

In [None]:
# Monte-Carlo simulation of every player's career, repeated `career_sims` times.
#
# Inputs : dataX (one row per player with columns G, H, PA), career_sims,
#          streaks_and_slumps().
# Output : `data`, a copy of dataX with the derived rate columns, one column
#          'career long from MLB sim {i}' per simulation holding each player's
#          longest simulated hit streak, and two summary columns
#          ('player longest long', 'player average long').
data = dataX.reset_index()

#career hit percentage for each player
data['hit%'] = data['H'] / data['PA']

#career plate appearances per game for each player
data['PA rate'] = data['PA'] / data['G']

#plate appearance minimum for each player. If a player averages 4.2 PA per game, the model
#assumes they get 4 PA's in every game and a 5th in 20% of games.
data['PA_m'] = np.floor(data['PA rate'])

#extra PA percentage (the fractional part of the PA rate)
data['hit_e_p'] = data['PA rate'] - data['PA_m']

# Read the per-player inputs once; they do not change between simulations, so
# there is no need to look them up cell-by-cell inside the hot loop.
player_inputs = list(zip(data['hit%'], data['hit_e_p'], data['G'], data['PA_m']))

# Results are collected per simulation and attached to the frame in one go.
# Inserting `career_sims` columns one at a time fragments the DataFrame, and
# writing by hard-coded column position breaks as soon as a column is added.
sim_results = {}

#for each of all the simulations you want..
for i in range(1, career_sims + 1):
    career_longs = []

    #for each player (each row in the existing df)..
    for h_p, h_e_p, c_g, pa_m in player_inputs:
        hit_weights = [1 - h_p, h_p]
        extra_pa_weights = [1 - h_e_p, h_e_p]
        base_pa = int(pa_m)
        game_hits = []

        #do for the length of the player's career games (G is a whole number of games)
        for _ in range(int(c_g)):
            #guaranteed PA's plus a chance at one extra PA
            g_pa = base_pa + random.choices([0, 1], weights=extra_pa_weights, k=1)[0]
            #a game counts as a "hit game" (1) if at least one simulated PA is a hit
            game_hits.append(int(sum(random.choices([0, 1], weights=hit_weights, k=g_pa)) > 0))

        # A career with no games, or with no hit game at all, has no streaks;
        # record a longest streak of 0 instead of crashing on an empty list.
        if game_hits:
            longest = max(streaks_and_slumps(game_hits)[0], default=0)
        else:
            longest = 0
        career_longs.append(longest)

    sim_results[f'career long from MLB sim {i}'] = career_longs

sim_columns = list(sim_results)
data = pd.concat([data, pd.DataFrame(sim_results, index=data.index)], axis=1)

#add two final columns describing each player's longest and average "career long" hit streak
#across all simulations (selected by column name rather than by position)
data['player longest long'] = data[sim_columns].max(axis=1)
data['player average long'] = data[sim_columns].mean(axis=1)
    


In [None]:
# Save the results to a .csv file so the simulation does not have to be re-run.
#data.to_csv('data.csv', index=False)

In [None]:
#data = pd.read_csv("data.csv")