### combine the count datasets

In [2]:
# import packages and datasets
import pandas as pd
import os
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value')


In [3]:
# get the names of the files in the array from
folder_path = 'C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB/pitch_value/data/count'
try:
    file_names = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)  # Use os.path.join!
        if os.path.isfile(full_path):
            file_names.append(entry)
except Exception as e:
    print(f"An error occurred: {e}")

file_names.pop(12)

'count_clean.ipynb'

In [4]:
count_data = pd.DataFrame()
for item in file_names:
    count_path = os.path.join(folder_path, item)
    count = pd.read_csv(count_path)
    countavg = (count['woba'] * count['pitches']).sum() / count['pitches'].sum()
    count['woba'].fillna(countavg)
    count_data = pd.concat([count_data, pd.DataFrame([{'file': item, 
            'woba_avg': countavg, 'pitches': count['pitches'].sum()}])], ignore_index=True)


In [None]:
# clean and finalize the count data
count_data['balls'] = count_data['file'].str.split('-').str[0].str.extract('(\d+)').astype(int)
count_data['strikes'] = count_data['file'].str.split('-').str[1].str.extract('(\d+)').astype(int)

# Get the column names in the desired order
new_order = ['balls', 'strikes'] + [col for col in count_data.columns if col not in ['balls', 'strikes', 'file']]

# Reorder the columns
count_data = count_data[new_order]

### Strikeout wobas

in 0-3 counts 8647 resulted in strikeouts
<br>
in 1-3 counts 13,513 resulted in strikeouts
<br>
in 2-3 counts 11,756 result in strikeouts
<br>
in 3-3 counts 7243 resulted in strikeouts

In [6]:
# Create empty columns for 0-3,3 counts
empty_data = pd.DataFrame({
    'balls': [0, 1, 2, 3], 
    'strikes': [3, 3, 3, 3],
    'woba_avg': [0.0, 0.0, 0.0, 0.0],
    'pitches': [8647, 13513, 11756, 7243]
})
count_data = pd.concat([count_data, empty_data], ignore_index=True)

#### Walk Wobas

in 3-0 2182 resulted in a walk
<br>
in 3-1 4009 resulted in a walk
<br>
in 3-2 8186 resulted in a walk

In [7]:
empty_data = pd.DataFrame({
    'balls': [4, 4, 4], 
    'strikes': [0, 1, 2],
    'woba_avg': [0.689, 0.689, 0.689],
    'pitches': [2182, 4009, 8186]
})
count_data = pd.concat([count_data, empty_data], ignore_index=True)

### Ball and Strike Value

In [8]:
# Group counts by ball count
strike_effects = []
for balls in range(4):
    # Get counts with same number of balls
    same_balls = count_data[count_data['balls'] == balls]
    
    # Compare adjacent strike counts (0->1, 1->2, 2->3)
    for strikes in range(3):
        current = same_balls[same_balls['strikes'] == strikes]
        next_strike = same_balls[same_balls['strikes'] == strikes + 1]
        
        if not current.empty and not next_strike.empty:
            decrease = current['woba_avg'].values[0] - next_strike['woba_avg'].values[0]
            total_pitches = current['pitches'].values[0] + next_strike['pitches'].values[0]
            weighted_decrease = decrease * total_pitches
            strike_effects.append({
                'balls': balls,
                'strike_increase': f"{strikes}->{strikes+1}",
                'woba_decrease': decrease,
                'total_pitches': total_pitches,
                'weighted_decrease': weighted_decrease
            })

strike_effects_df = pd.DataFrame(strike_effects)

# Calculate overall weighted average
total_weighted_decrease = strike_effects_df['weighted_decrease'].sum()
total_affected_pitches = strike_effects_df['total_pitches'].sum()
weighted_avg_decrease = total_weighted_decrease / total_affected_pitches

print("Effect of adding a strike by count:")
print(strike_effects_df)
print(f"\nOverall weighted average decrease in wOBA per strike: {weighted_avg_decrease:.4f}")

Effect of adding a strike by count:
    balls strike_increase  woba_decrease  total_pitches  weighted_decrease
0       0            0->1       0.018733         275061        5152.690647
1       0            1->2       0.200199         141636       28355.452345
2       0            2->3       0.164576          57417        9449.446629
3       1            0->1       0.029369         139534        4098.007785
4       1            1->2       0.181823         141632       25751.977620
5       1            2->3       0.181400          83656       15175.186235
6       2            0->1       0.029663          58968        1749.175626
7       2            1->2       0.196345          95739       18797.878411
8       2            2->3       0.191335          71086       13601.249934
9       3            0->1       0.088370          21790        1925.579991
10      3            1->2       0.188569          50866        9591.761595
11      3            2->3       0.371669          43156       16

In [11]:
# Group counts by strike count
ball_effects = []
for strikes in range(3): 
    # Get counts with same number of strikes
    same_strikes = count_data[count_data['strikes'] == strikes]
    
    # Compare adjacent ball counts (0->1, 1->2, 2->3, 3->4)
    for balls in range(4):  
        current = same_strikes[same_strikes['balls'] == balls]
        next_ball = same_strikes[same_strikes['balls'] == balls + 1]
        
        if not current.empty and not next_ball.empty:
            increase = next_ball['woba_avg'].values[0] - current['woba_avg'].values[0]
            total_pitches = current['pitches'].values[0] + next_ball['pitches'].values[0]
            weighted_increase = increase * total_pitches
            ball_effects.append({
                'strikes': strikes,
                'ball_increase': f"{balls}->{balls+1}",
                'woba_increase': increase,
                'total_pitches': total_pitches,
                'weighted_increase': weighted_increase
            })

ball_effects_df = pd.DataFrame(ball_effects)

# Calculate overall weighted average
total_weighted_increase = ball_effects_df['weighted_increase'].sum()
total_affected_pitches = ball_effects_df['total_pitches'].sum()
weighted_avg_increase = total_weighted_increase / total_affected_pitches

print("Effect of adding a ball by count:")
print(ball_effects_df)
print(f"\nOverall weighted average increase in wOBA per ball: {weighted_avg_increase:.4f}")

Effect of adding a ball by count:
    strikes ball_increase  woba_increase  total_pitches  weighted_increase
0         0          0->1       0.009084         250240        2273.209748
1         0          1->2       0.024751          90604        2242.545659
2         0          2->3       0.231265          29396        6798.265522
3         0          3->4       0.040392           9019         364.292673
4         1          0->1      -0.001552         164355        -255.115439
5         1          1->2       0.024457         107898        2638.880337
6         1          2->3       0.172558          51362        8862.935539
7         1          3->4       0.128762          18962        2441.577199
8         2          0->1       0.016824         118913        2000.603109
9         2          1->2       0.009935         129473        1286.351523
10        2          2->3       0.180334          95243       17175.556700
11        2          3->4       0.317331          44099       1399

manually change the ball effects for 1-1 as it currently doesnt make sense

In [12]:
# Locate the row where strikes=1 and ball_increase='0->1'
mask = (ball_effects_df['strikes'] == 1) & (ball_effects_df['ball_increase'] == '0->1')

# Update the woba_increase value
ball_effects_df.loc[mask, 'woba_increase'] = 0.009084

# Recalculate the weighted_increase for this row
ball_effects_df.loc[mask, 'weighted_increase'] = (ball_effects_df.loc[mask, 'woba_increase'].values[0] * 
                                                  ball_effects_df.loc[mask, 'total_pitches'].values[0])

# Calculate overall weighted average
total_weighted_increase = ball_effects_df['weighted_increase'].sum()
total_affected_pitches = ball_effects_df['total_pitches'].sum()
weighted_avg_increase = total_weighted_increase / total_affected_pitches

print("Effect of adding a ball by count:")
print(ball_effects_df)
print(f"\nOverall weighted average increase in wOBA per ball: {weighted_avg_increase:.4f}")

Effect of adding a ball by count:
    strikes ball_increase  woba_increase  total_pitches  weighted_increase
0         0          0->1       0.009084         250240        2273.209748
1         0          1->2       0.024751          90604        2242.545659
2         0          2->3       0.231265          29396        6798.265522
3         0          3->4       0.040392           9019         364.292673
4         1          0->1       0.009084         164355        1493.000820
5         1          1->2       0.024457         107898        2638.880337
6         1          2->3       0.172558          51362        8862.935539
7         1          3->4       0.128762          18962        2441.577199
8         2          0->1       0.016824         118913        2000.603109
9         2          1->2       0.009935         129473        1286.351523
10        2          2->3       0.180334          95243       17175.556700
11        2          3->4       0.317331          44099       1399

##### woba changes from intial count

In [10]:
ball_increase = ball_effects_df.loc[0, 'woba_increase']
strike_increase = strike_effects_df.loc[0, 'woba_decrease']
print("ball ", ball_increase)
print("stike ", strike_increase)

ball  0.009084118238981587
stike  0.018732901601053453
