In [3]:
import statsapi
import pandas as pd


## Preprocessing

In [4]:
players = pd.read_csv("mlb_players_2021.csv")
pitches = pd.read_csv("regseason.csv")

In [5]:
players.rename(columns={"id" : "batter","fullName" : "batter_fullName" , "birthCountry" : "batter_birthCountry"}, inplace=True)
players.head()

Unnamed: 0,batter,batter_fullName,birthDate,batter_birthCountry,height,weight,primaryPosition,batSide,pitchHand,debutDate,active
0,642758,Domingo Acevedo,1994-03-06,Dominican Republic,"6' 7""",240,P,R,R,2021-06-21,True
1,664119,Austin Allen,1994-01-16,USA,"6' 1""",219,C,L,R,2019-05-11,False
2,462101,Elvis Andrus,1988-08-26,Venezuela,"6' 0""",210,SS,R,R,2009-04-06,False
3,642456,Luis Barrera,1995-11-15,Dominican Republic,"6' 0""",195,OF,L,L,2021-05-19,True
4,605135,Chris Bassitt,1989-02-22,USA,"6' 5""",217,P,R,R,2014-08-30,True


In [6]:
# Merge pitches df with info about the batter
full_df = pd.merge(pitches, players[["batter","batter_fullName","batSide", "batter_birthCountry"]], on="batter")
full_df.head()

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,count,PBhand,pitch_cat,batter_fullName,batSide,batter_birthCountry
0,1,SL,2021-10-03,85.1,2.39,6.05,"Detmers, Reid",641487,672282,single,...,Standard,293.0,0.079,0.78,0_1,bLpL,breaking,J.P. Crawford,L,USA
1,2,FF,2021-10-03,93.8,2.31,6.03,"Detmers, Reid",641487,672282,,...,Standard,150.0,0.0,-0.045,0_0,bLpL,fastball,J.P. Crawford,L,USA
2,3,CU,2021-10-03,73.9,2.12,6.25,"Detmers, Reid",664238,672282,strikeout,...,Standard,329.0,-0.032,-0.315,2_2,bRpL,breaking,Dylan Moore,R,USA
3,4,FF,2021-10-03,94.6,2.43,5.96,"Detmers, Reid",664238,672282,,...,Standard,146.0,0.0,0.047,1_2,bRpL,fastball,Dylan Moore,R,USA
4,5,FF,2021-10-03,94.3,2.08,6.15,"Detmers, Reid",664238,672282,,...,Standard,151.0,0.0,0.0,1_2,bRpL,fastball,Dylan Moore,R,USA


In [7]:
# Remove the pitches that were fouled, in play, or swung at
not_hit = ["called_strike", "ball"]
pitches_not_hit = full_df[full_df['description'].isin(not_hit)].copy()
len(pitches_not_hit)
pitches_not_hit.head()

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,count,PBhand,pitch_cat,batter_fullName,batSide,batter_birthCountry
2,3,CU,2021-10-03,73.9,2.12,6.25,"Detmers, Reid",664238,672282,strikeout,...,Standard,329.0,-0.032,-0.315,2_2,bRpL,breaking,Dylan Moore,R,USA
3,4,FF,2021-10-03,94.6,2.43,5.96,"Detmers, Reid",664238,672282,,...,Standard,146.0,0.0,0.047,1_2,bRpL,fastball,Dylan Moore,R,USA
6,7,FF,2021-10-03,94.2,2.33,5.94,"Detmers, Reid",664238,672282,,...,Standard,145.0,0.0,0.032,0_1,bRpL,fastball,Dylan Moore,R,USA
9,10,CU,2021-10-03,73.9,2.22,6.23,"Detmers, Reid",663728,672282,,...,Standard,322.0,0.0,0.174,2_2,bRpL,breaking,Cal Raleigh,S,USA
12,13,FF,2021-10-03,94.4,2.48,5.97,"Detmers, Reid",663728,672282,,...,Standard,148.0,0.0,0.062,1_2,bRpL,fastball,Cal Raleigh,S,USA


In [8]:
pitches_not_hit["strike_zone"] = pitches_not_hit["zone"] <= 9 # Determine if the pitch was actually in the zone

pitches_not_hit["called_correctly"] = (
    ((pitches_not_hit["zone"] <= 9) & (pitches_not_hit["type"] == "S")) | # Pitch is in the zone and is called a strike  
    ((pitches_not_hit["zone"] > 9) & (pitches_not_hit["type"] == "B"))    # Pitch is not in the zone and is called a ball
) 
pitches_not_hit["called_correctly"] = pitches_not_hit["called_correctly"].astype(int) # This will be are target variable -- whether the pitch was called "correctly"

## New addition

In [9]:
from sklearn.model_selection import train_test_split

# Creating a new variable to see if a game altering event (out or man on base) came as a result of the pitch
pitches_not_hit['has_event'] = pitches_not_hit['events'].notna().astype(int) 
y = pitches_not_hit["called_correctly"]
X= pitches_not_hit.drop(["called_correctly"], axis= 1)

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.head()

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,delta_home_win_exp,delta_run_exp,count,PBhand,pitch_cat,batter_fullName,batSide,batter_birthCountry,strike_zone,has_event
528852,473444,FF,2021-08-10,90.9,2.12,5.5,"Allard, Kolby",647351,663465,,...,0.0,-0.052,1_0,bRpL,fastball,Abraham Toro,S,Canada,True,0
591532,529727,SI,2021-05-23,96.6,-2.57,5.44,"Castillo, Luis",541645,622491,,...,0.0,0.057,0_0,bRpR,fastball,Avisaíl García,R,Venezuela,False,0
431801,386725,FF,2021-06-05,92.1,-1.53,6.27,"Plutko, Adam",514917,592644,,...,0.0,0.038,0_0,bLpR,fastball,César Hernández,S,Venezuela,False,0
531682,475826,CH,2021-07-22,86.3,-2.06,5.78,"Foltynewicz, Mike",668731,592314,,...,0.0,0.038,1_1,bLpR,offspeed,Akil Baddoo,L,USA,False,0
722233,646975,FF,2021-05-26,92.1,-1.97,6.22,"Pineda, Michael",642082,501381,,...,0.0,0.101,2_2,bLpR,fastball,Chance Sisco,L,USA,True,0


### Running loop with all features

In [10]:
all_quant_cols = [  # only numerical features
    'release_speed', 'release_pos_x', 'release_pos_z', 'zone', 'balls', 'strikes',
    'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'outs_when_up', 'inning',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'release_spin_rate', 'release_extension',
    'release_pos_y', 'woba_value', 'woba_denom', 'babip_value', 'iso_value',
    'at_bat_number', 'pitch_number', 'home_score', 'away_score', 'bat_score',
    'fld_score', 'post_away_score', 'post_home_score', 'post_bat_score',
    'post_fld_score', 'spin_axis', 'delta_home_win_exp', 'delta_run_exp', 'has_event'
]

all_qual_cols = [
    'pitch_type', 'game_date', 'player_name', 'events', 'description', 'des', 'game_type',
 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'inning_topbot', 'pitch_name',
 'if_fielding_alignment', 'of_fielding_alignment', 'count', 'PBhand', 'pitch_cat', 
 'batter_fullName', 'batSide', 'batter_birthCountry', 'strike_zone'
]

def encode_categoricals(df, categorical_cols):
    return pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [11]:
unique_woba_denom = X_test['woba_value'].dropna().unique()

# Print the unique values
print(unique_woba_denom)

[0.  0.7]


In [12]:

from itertools import combinations
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

best_score = 0
best_cols = []

min_datapoints = 500
# Total number of combinations
total_combos = len(all_qual_cols) * len(list(combinations(all_quant_cols, 2)))

# Initialize progress bar
with tqdm(total=total_combos, desc="Evaluating combinations") as pbar:
    for qual in all_qual_cols:
        for pair in combinations(all_quant_cols, 2): 
            cols = list(pair) + [qual]  # use just the current qualitative column
            try:
                # Drop rows with NaNs in selected columns only
                X_sub = X_train[cols].dropna()
                y_sub = y_train.loc[X_sub.index]

                if len(X_sub) > min_datapoints:
                    continue

                LR = LogisticRegression()
                LR.fit(X_sub, y_sub)
                score = LR.score(X_sub, y_sub)

                if score > best_score:
                    best_score = score
                    best_cols = cols
                    print(f"\n✅ New best score: {best_score:.4f} with features: {best_cols}")
            except KeyError:
                pass
            except ValueError:
                pass
            finally:
                pbar.update(1)

Evaluating combinations:   1%|          | 138/17043 [00:02<05:52, 47.95it/s]


KeyboardInterrupt: 

In [13]:
print(best_cols)
print(best_score)
print(X_train['woba_value'].count())
print(X_train['woba_denom'].count())
print(X_train['strike_zone'].count())

[]
0
19392
19392
292716


In [14]:
print(best_cols)
print(best_score)

[]
0


Unnamed: 0,woba_value,woba_denom,strike_zone
302738,,,True
144685,,,True
693679,,,False
113273,,,False
113660,,,False


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Subset to best columns
X_train_best = X_train[best_cols]
X_train_best = X_train[best_cols].dropna()
y_train = y_train.loc[X_train_best.index]
X_test_best = X_test[best_cols]
X_test_best = X_test[best_cols].dropna()
y_test = y_test.loc[X_test_best.index]

# Scale them
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_best)
X_test_scaled = scaler.transform(X_test_best)

# Train model
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

# Predict and score
y_pred = clf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

ValueError: at least one array or dtype is required

### Takeaways

I created a very accurate model, but it is rather boring. The next step is to understand why this model accuracy is so good and limit our variables

### Running on Select Features

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.head()

Unnamed: 0.1,Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,...,delta_home_win_exp,delta_run_exp,count,PBhand,pitch_cat,batter_fullName,batSide,batter_birthCountry,strike_zone,has_event
235231,210975,CH,2021-10-02,85.7,1.69,5.78,"Urías, Julio",456715,628711,,...,0.0,0.043,1_2,bRpL,offspeed,Lorenzo Cain,R,USA,False,0
343171,307716,SL,2021-08-21,85.4,-1.21,5.86,"Bender, Anthony",571657,669622,,...,0.0,0.093,0_0,bRpR,breaking,Kyle Farmer,R,USA,False,0
70188,63494,FF,2021-05-26,96.2,-1.29,6.2,"Abreu, Bryan",666158,650556,,...,0.0,0.02,1_1,bLpR,fastball,Gavin Lux,L,USA,False,0
595281,532896,CH,2021-04-30,82.7,1.99,6.23,"Miley, Wade",595879,489119,,...,0.0,0.02,1_1,bRpL,offspeed,Javier Báez,R,Puerto Rico,False,0
258890,232035,FF,2021-04-11,96.6,-1.72,5.82,"Knebel, Corey",516770,608349,,...,0.0,-0.049,1_0,bRpR,fastball,Starlin Castro,R,Dominican Republic,True,0


In [17]:
all_quant_cols = [  # only numerical features
    'release_speed', 'release_pos_x', 'release_pos_z', 'zone', 'balls', 'strikes',
    'outs_when_up', 'inning',
    'at_bat_number', 'pitch_number', 'home_score', 'away_score', 'bat_score',
    'fld_score', 'delta_home_win_exp', 'delta_run_exp', 'has_event'
]

all_qual_cols = [
    'pitch_type', 'game_date', 'player_name', 'events', 'description', 'des', 'game_type',
 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'inning_topbot', 'pitch_name',
 'if_fielding_alignment', 'of_fielding_alignment', 'count', 'PBhand', 'pitch_cat', 
 'batter_fullName', 'batSide', 'batter_birthCountry', 'strike_zone'
]


In [None]:

from itertools import combinations
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

best_score = 0
best_cols = []

min_datapoints = 500
# Total number of combinations
total_combos = len(all_qual_cols) * len(list(combinations(all_quant_cols, 2)))

# Initialize progress bar
with tqdm(total=total_combos, desc="Evaluating combinations") as pbar:
    for qual in all_qual_cols:
        for pair in combinations(all_quant_cols, 2): 
            cols = list(pair) + [qual]  # use just the current qualitative column
            try:
                # Drop rows with NaNs in selected columns only
                X_sub = X_train[cols].dropna()
                y_sub = y_train.loc[X_sub.index]

                if len(X_sub) > min_datapoints:
                    continue

                LR = LogisticRegression()
                LR.fit(X_sub, y_sub)
                score = LR.score(X_sub, y_sub)

                if score > best_score:
                    best_score = score
                    best_cols = cols
                    print(f"\n✅ New best score: {best_score:.4f} with features: {best_cols}")
            except KeyError:
                pass
            except ValueError:
                pass
            finally:
                pbar.update(1)

Evaluating combinations: 100%|██████████| 3128/3128 [00:55<00:00, 56.47it/s] 


In [19]:
print(best_cols)
print(best_score)

[]
0
