# MLB For Data Science - Predicting the Strikezone

# Merging, Cleaning, Missing Values

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style = "dark")

In [2]:
atbats = pd.read_csv("atbats.csv")
atbats_2019 = pd.read_csv("2019_atbats.csv")

FileNotFoundError: [Errno 2] File b'atbats.csv' does not exist: b'atbats.csv'

In [None]:
atbats = pd.concat([atbats, atbats_2019], axis = 0, sort = True)

In [None]:
atbats.head()

In [None]:
games = pd.read_csv("games.csv")[["g_id", "date", "away_team", "home_team", "away_final_score", "home_final_score", "umpire_HP"]]
games_2019 = pd.read_csv("2019_games.csv")[["g_id", "date", "away_team", "home_team", "away_final_score", "home_final_score", "umpire_HP"]]

In [None]:
games = pd.concat([games, games_2019], axis = 0, sort = True)

In [None]:
games.head()

In [None]:
pitches = pd.read_csv("pitches.csv")
pitches_2019 = pd.read_csv("2019_pitches.csv")

In [None]:
pitches = pd.concat([pitches, pitches_2019], axis = 0, sort = True)

In [None]:
len(pitches)

In [None]:
# Turn every cell holding the literal string "placeholder" into a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
pitches = pitches.replace("placeholder", np.nan)

In [None]:
names = pd.read_csv("player_names.csv").rename(columns = {"id": "batter_id"})

In [None]:
atbats = pd.merge(atbats, names, on = "batter_id") #getting batter names for each row

In [None]:
atbats["batter_name"] = atbats["first_name"] + " " + atbats["last_name"]

In [None]:
atbats = atbats.drop(columns = ["first_name", "last_name"])

In [None]:
names = names.rename(columns = {"batter_id": "pitcher_id"})

In [None]:
atbats = pd.merge(atbats, names, on = "pitcher_id") #getting pitcher names for each row

In [None]:
atbats["pitcher_name"] = atbats["first_name"] + " " + atbats["last_name"]

In [None]:
atbats = atbats.drop(columns = ["first_name", "last_name"]).sort_values(by = "ab_id")

In [None]:
atbats["year"] = atbats["ab_id"].astype(str).str[:4] #the year is the first 4 digits of at bat id

In [None]:
atbats_games = pd.merge(atbats, games, on = "g_id")

In [None]:
atbats_games["date"] = pd.to_datetime(atbats_games["date"])

In [None]:
atbats_pitches = pd.merge(pitches, atbats_games, on = "ab_id")

In [None]:
atbats_pitches.head()

In [None]:
atbats_pitches.info()

In [None]:
def missing_props(df):
    """Summarise how much of each column of ``df`` is missing.

    Returns a DataFrame with one row per column of ``df`` and two columns:
    "Var" (the column name) and "Prop_Missing" (share of null entries,
    rounded to three decimals), ordered from most to least missing.
    """
    n_rows = len(df)
    proportions = [round(df[col].isnull().sum() / n_rows, 3) for col in df.columns]
    summary = pd.DataFrame(list(zip(df.columns, proportions)), columns = ["Var", "Prop_Missing"])

    return summary.sort_values(by = "Prop_Missing", ascending = False)

In [None]:
missing_props(atbats_pitches)

# Feature Engineering

## Strikezone Flag

In [None]:
# Flag pitches whose location falls inside a fixed rectangular zone.
# px/pz are multiplied by 12 before comparison (presumably feet -> inches; confirm against the data dictionary).
# Rows with a missing px or pz fail every comparison and are therefore flagged False.
px_inches = atbats_pitches["px"] * 12
pz_inches = atbats_pitches["pz"] * 12
inside_width = np.abs(px_inches) <= 9.97
inside_height = (pz_inches <= 44.08) & (pz_inches >= 18.29)
atbats_pitches["Strikezone?"] = (inside_width & inside_height).values

In [None]:
# Pass the column as x=: since seaborn 0.12 only `data` may be given positionally,
# so a bare Series is no longer interpreted as the variable to count.
sns.countplot(x = atbats_pitches["Strikezone?"])

## Batting and Pitching Teams

In [None]:
# When top == 1 the away team is recorded as batting and the home team as pitching;
# for every other value of `top` the roles are swapped.
is_top_half = atbats_pitches["top"] == 1
away_side, home_side = atbats_pitches["away_team"], atbats_pitches["home_team"]
atbats_pitches["batting_team"] = np.where(is_top_half, away_side, home_side)
atbats_pitches["pitching_team"] = np.where(is_top_half, home_side, away_side)

## Ball Types

In [None]:
# Vertical miss label: "High" above the zone, "Low" below it, the string "nan" otherwise.
# The "nan" sentinel is written out explicitly because the Location cell compares against that
# exact string; the original obtained it only through NumPy coercing np.NaN (an alias removed
# in NumPy 2.0) into a string array.
# Strict inequalities are used so a pitch exactly on the 44.08 / 18.29 edge, which the
# "Strikezone?" flag counts as inside, is not also labelled a miss.
pz_in = atbats_pitches["pz"] * 12
atbats_pitches["Z Call"] = np.where(pz_in > 44.08, "High", np.where(pz_in < 18.29, "Low", "nan"))

In [None]:
# Horizontal miss label relative to the batter's stance ("stand" is "L" or "R"):
#   px beyond -9.97 with an "L" batter, or beyond +9.97 with an "R" batter -> "Outside"
#   the mirrored combinations                                                -> "Inside"
#   everything else                                                          -> the string "nan"
# The "nan" sentinel is written out explicitly because the Location cell compares against that
# exact string; the original obtained it only through NumPy coercing np.NaN (an alias removed
# in NumPy 2.0) into a string array.
# Strict inequalities are used so a pitch exactly on the +/-9.97 edge, which the
# "Strikezone?" flag counts as inside, is not also labelled a miss.
px_in = atbats_pitches["px"] * 12
bats_left = atbats_pitches["stand"] == "L"
bats_right = atbats_pitches["stand"] == "R"
off_negative_side = px_in < -9.97
off_positive_side = px_in > 9.97
atbats_pitches["X Call"] = np.where(
    (off_negative_side & bats_left) | (off_positive_side & bats_right),
    "Outside",
    np.where(
        (off_negative_side & bats_right) | (off_positive_side & bats_left),
        "Inside",
        "nan",
    ),
)
                                    
                                    

In [None]:
# Pass the column as x=: since seaborn 0.12 only `data` may be given positionally,
# so a bare Series is no longer interpreted as the variable to count.
sns.countplot(x = atbats_pitches["Z Call"])

In [None]:
# Pass the column as x=: since seaborn 0.12 only `data` may be given positionally,
# so a bare Series is no longer interpreted as the variable to count.
sns.countplot(x = atbats_pitches["X Call"])

In [None]:
outside = atbats_pitches[atbats_pitches["X Call"] == "Outside"]
inside = atbats_pitches[atbats_pitches["X Call"] == "Inside"]

In [None]:
len(outside[outside["code"] == "P"])

In [None]:
len(inside[inside["code"] == "P"])

In [None]:
# Combine the vertical and horizontal miss labels into one location string.
# Both call columns use the literal string "nan" to mean "no miss on this axis":
#   only a vertical miss   -> the Z label
#   only a horizontal miss -> the X label
#   both                   -> "<Z> and <X>"
#   neither                -> "Strikezone"
x_call = atbats_pitches["X Call"]
z_call = atbats_pitches["Z Call"]
x_is_blank = x_call == "nan"
z_is_blank = z_call == "nan"
atbats_pitches["Location"] = np.where(
    x_is_blank & ~z_is_blank,
    z_call,
    np.where(
        z_is_blank & ~x_is_blank,
        x_call,
        np.where(~x_is_blank & ~z_is_blank, z_call + " and " + x_call, "Strikezone"),
    ),
)



In [None]:
atbats_pitches = atbats_pitches.drop(columns = ["X Call", "Z Call"])

## Cumulative Averages

In [None]:
atbats_pitches["ab_id"] = atbats_pitches["ab_id"].astype(str)

In [None]:
events = atbats_pitches[["ab_id", "date", "pitcher_name", "batter_name", "event"]].drop_duplicates()

### Creating Event Flags

In [None]:
# Derive the season and a set of 1/0 indicator columns from each row's event label.
events["year"] = events["date"].dt.year

event_label = events["event"]

# "PA?" and "AB?" are 1 unless the event is in the exclusion list.
not_a_plate_appearance = ["Runner Out"]
not_an_at_bat = ["Walk", "Intent Walk", "Sac Fly", "Sac Fly DP", "Sac Bunt", "Sacrifice Bunt DP", "Batter Interference", "Catcher Interference", "Runner Out", "Hit By Pitch"]
events["PA?"] = np.where(event_label.isin(not_a_plate_appearance), 0, 1)
events["AB?"] = np.where(event_label.isin(not_an_at_bat), 0, 1)

# The remaining flags are 1 when the event is one of the listed labels.
outcome_flags = [
    ("Hit?", ["Single", "Double", "Triple", "Home Run"]),
    ("Single?", ["Single"]),
    ("Double?", ["Double"]),
    ("Triple?", ["Triple"]),
    ("Home Run?", ["Home Run"]),
    ("Walk?", ["Walk"]),
    ("Intent Walk?", ["Intent Walk"]),
    ("HBP?", ["Hit By Pitch"]),
    ("Sac Fly?", ["Sac Fly", "Sac Fly DP"]),
]
for flag_name, labels in outcome_flags:
    events[flag_name] = np.where(event_label.isin(labels), 1, 0)

# Renumber rows 0..n-1 so later column-wise concatenation lines up by position.
events = events.reset_index(drop = True)

In [None]:
events.head()

### Batter Averages

In [None]:
# Running totals of the event flags within each (year, batter_name) group, in row order.
# Only the flag columns (names ending in "?") are accumulated: pandas 2.x no longer drops
# non-numeric columns (ab_id, date, names, event) from groupby.cumsum() and raises instead.
# NOTE(review): each running total includes the current row's own outcome, so stats derived
# from it already "know" how the at-bat ended; confirm that is acceptable for the model.
flag_cols = [col for col in events.columns if col.endswith("?")]
cum_sum_batters = events.groupby(["year", "batter_name"])[flag_cols].cumsum().reset_index(drop = True)

In [None]:
indices = events[["ab_id", "date", "pitcher_name", "batter_name"]]

In [None]:
events2 = pd.concat([indices, cum_sum_batters], axis = 1)

In [None]:
# Rate statistics for the batter, computed from the running flag totals in events2.
# Rows whose denominator is still 0 come out as NaN rather than raising.
bat_ab = events2["AB?"]
bat_hits = events2["Hit?"]
bat_walks = events2["Walk?"]
bat_int_walks = events2["Intent Walk?"]
bat_hbp = events2["HBP?"]
bat_sac_flies = events2["Sac Fly?"]

events2["Batter AB"] = bat_ab

# AVG = hits / at-bats
events2["Batter AVG"] = (bat_hits / bat_ab).round(3)

# OBP = (H + BB + IBB + HBP) / (AB + BB + HBP + IBB + SF)
bat_times_on_base = bat_hits + bat_walks + bat_int_walks + bat_hbp
bat_obp_chances = bat_ab + bat_walks + bat_hbp + bat_int_walks + bat_sac_flies
events2["Batter OBP"] = (bat_times_on_base / bat_obp_chances).round(3)

# SLG = total bases / at-bats, with total bases = 1B + 2*2B + 3*3B + 4*HR
events2["Batter TB"] = events2["Single?"] + (2*events2["Double?"]) + (3*events2["Triple?"]) + (4*events2["Home Run?"])
events2["Batter SLG"] = (events2["Batter TB"] / bat_ab).round(3)

# OPS = OBP + SLG (sum of the two already-rounded columns)
events2["Batter OPS"] = events2["Batter OBP"] + events2["Batter SLG"]


In [None]:
batter_stats = events2[["ab_id", "date", "pitcher_name", "batter_name", "Batter AB", "Batter AVG", "Batter OBP", "Batter SLG", "Batter OPS"]]

### Pitcher Averages

In [None]:
# Running totals of the event flags within each (year, pitcher_name) group, in row order.
# Only the flag columns (names ending in "?") are accumulated: pandas 2.x no longer drops
# non-numeric columns (ab_id, date, names, event) from groupby.cumsum() and raises instead.
# NOTE(review): each running total includes the current row's own outcome, so stats derived
# from it already "know" how the at-bat ended; confirm that is acceptable for the model.
pitcher_flag_cols = [col for col in events.columns if col.endswith("?")]
cum_sum_pitchers = events.groupby(["year", "pitcher_name"])[pitcher_flag_cols].cumsum().reset_index(drop = True)

In [None]:
events3 = pd.concat([indices, cum_sum_pitchers], axis = 1)

In [None]:
# Rate statistics allowed by the pitcher, computed from the running flag totals in events3.
# Rows whose denominator is still 0 come out as NaN rather than raising.
pit_ab = events3["AB?"]
pit_hits = events3["Hit?"]
pit_walks = events3["Walk?"]
pit_int_walks = events3["Intent Walk?"]
pit_hbp = events3["HBP?"]
pit_sac_flies = events3["Sac Fly?"]

events3["Pitcher AB"] = pit_ab

# AVG = hits / at-bats
events3["Pitcher AVG"] = (pit_hits / pit_ab).round(3)

# OBP = (H + BB + IBB + HBP) / (AB + BB + HBP + IBB + SF)
pit_times_on_base = pit_hits + pit_walks + pit_int_walks + pit_hbp
pit_obp_chances = pit_ab + pit_walks + pit_hbp + pit_int_walks + pit_sac_flies
events3["Pitcher OBP"] = (pit_times_on_base / pit_obp_chances).round(3)

# SLG = total bases / at-bats, with total bases = 1B + 2*2B + 3*3B + 4*HR
events3["Pitcher TB"] = events3["Single?"] + (2*events3["Double?"]) + (3*events3["Triple?"]) + (4*events3["Home Run?"])
events3["Pitcher SLG"] = (events3["Pitcher TB"] / pit_ab).round(3)

# OPS = OBP + SLG (sum of the two already-rounded columns)
events3["Pitcher OPS"] = events3["Pitcher OBP"] + events3["Pitcher SLG"]


In [None]:
pitcher_stats = events3[["ab_id", "date", "pitcher_name", "batter_name", "Pitcher AB", "Pitcher AVG", "Pitcher OBP", "Pitcher SLG", "Pitcher OPS"]]


In [None]:
all_avgs = pd.merge(batter_stats, pitcher_stats, on = ["ab_id", "date", "pitcher_name", "batter_name"])

In [None]:
all_avgs.tail()

In [None]:
all_avgs.info()

## Game Situation Statistics

### Inning, Top or Bottom of Inning, Outs, Teams

In [None]:
game_stats = atbats_pitches[["ab_id", "b_count", "s_count", "outs", "inning", "top", "pitching_team", "batting_team", "on_1b", "on_2b", "on_3b", "p_throws", "stand"]]

In [None]:
game_stats2 = pd.merge(all_avgs, game_stats, on = "ab_id", how = "left")

### Count

In [None]:
game_stats2["b_count"] = game_stats2["b_count"].astype(str)
game_stats2["s_count"] = game_stats2["s_count"].astype(str)


game_stats2["Count"] = game_stats2["b_count"] + "-" + game_stats2["s_count"]

### Same Side?

In [None]:
game_stats2["Same Side?"] = game_stats2["p_throws"] == game_stats2["stand"]

### Bases Occupied

In [None]:
# Translate the three base-runner indicators into one descriptive label.
# A base counts as occupied when its column equals 1 and as empty when it equals 0;
# any row that matches none of the seven patterns below is labelled "Bases Empty".
on_first, on_second, on_third = (game_stats2[col] for col in ("on_1b", "on_2b", "on_3b"))
r1, r2, r3 = on_first == 1, on_second == 1, on_third == 1
e1, e2, e3 = on_first == 0, on_second == 0, on_third == 0

base_states = [
    (r1 & r2 & r3, "Bases Loaded"),
    (r1 & r2 & e3, "1st and 2nd"),
    (r1 & e2 & r3, "1st and 3rd"),
    (e1 & r2 & r3, "2nd and 3rd"),
    (r1 & e2 & e3, "1st"),
    (e1 & r2 & e3, "2nd"),
    (e1 & e2 & r3, "3rd"),
]
game_stats2["Bases Occupied"] = np.select(
    [mask.values for mask, _ in base_states],
    [label for _, label in base_states],
    default = "Bases Empty",
)



### Score Difference

In [None]:
atbats_pitches["Score Diff"] = atbats_pitches["p_score"] - atbats_pitches["b_score"]

In [None]:
# Copy the per-pitch score difference onto game_stats2.
# Series assignment aligns on index labels, not on row position, so each row only receives
# its own pitch's value if game_stats2 and atbats_pitches carry the same index in the same
# pitch order.
# NOTE(review): both frames are merge results, so both should have a fresh 0..n-1 index, but
# identical row order is an assumption here; confirm, e.g. by comparing ab_id row by row.
game_stats2["Score Diff"] = atbats_pitches["Score Diff"]

In [None]:
game_stats3 = game_stats2.drop(columns = ["on_1b", "on_2b", "on_3b", "b_count", "s_count"])

## Previous Pitch Statistics

### Pitch Number of At-Bat

In [None]:
game_stats3["Pitch Num"] = atbats_pitches["pitch_num"]

In [None]:
game_stats3["Pitch Num"] = game_stats3["Pitch Num"].astype(str)

### Previous Pitch Code, Type, Location, Event, and Zone

In [None]:
atbats_pitches["Previous Code"] = atbats_pitches["code"].shift(periods = 1)
atbats_pitches["Previous Location"] = atbats_pitches["Location"].shift(periods = 1)
atbats_pitches["Previous Type"] = atbats_pitches["pitch_type"].shift(periods = 1)

atbats["ab_id"] = atbats["ab_id"].astype(str)
atbats["Previous Event"] = atbats["event"].shift(periods = 1)

In [None]:
atbats_pitches["pitch_id"] = atbats_pitches.index

In [None]:
first_pitch_inning = list(atbats_pitches[["g_id", "pitch_id", "inning", "top"]].groupby(["g_id", "inning", "top"]).first().reset_index()["pitch_id"])

first_event_inning = list(atbats[["g_id", "ab_id", "inning", "top"]].groupby(["g_id", "inning", "top"]).first().reset_index()["ab_id"])


In [None]:
atbats_pitches["Previous Code"] = np.where(atbats_pitches["pitch_id"].isin(first_pitch_inning), "First Pitch", atbats_pitches["Previous Code"])

atbats_pitches["Previous Location"] = np.where(atbats_pitches["pitch_id"].isin(first_pitch_inning), "First Pitch", atbats_pitches["Previous Location"])

atbats_pitches["Previous Type"] = np.where(atbats_pitches["pitch_id"].isin(first_pitch_inning), "First Pitch", atbats_pitches["Previous Type"])

atbats["Previous Event"] = np.where(atbats["ab_id"].isin(first_event_inning), "First Event", atbats["Previous Event"])


In [None]:
game_stats3["Previous Code"] = atbats_pitches["Previous Code"]

game_stats3["Previous Location"] = atbats_pitches["Previous Location"]

game_stats3["Previous Type"] = atbats_pitches["Previous Type"]

previous_events = atbats[["ab_id", "Previous Event"]]
game_stats3 = pd.merge(game_stats3, previous_events, on = "ab_id")

## Other Statistics

### Year and Month

In [None]:
game_stats3["year"] = atbats_pitches["date"].dt.year.astype(str)
game_stats3["month"] = atbats_pitches["date"].dt.month.astype(str)

### Pitch Number of Game

In [None]:
cum_count = list(atbats_pitches[["g_id", "pitcher_name"]].groupby(["g_id", "pitcher_name"]).cumcount())

In [None]:
game_stats3["Pitch Count"] = np.array(cum_count) + 1

### Strikezone Percentage of Game

In [None]:
atbats_pitches["Pitch Count"] = np.array(cum_count) + 1

atbats_pitches["Strikezone?"] = np.where(atbats_pitches["Strikezone?"] == True, 1, 0)

In [None]:
cum_count2 = atbats_pitches[["g_id", "pitcher_name", "Strikezone?"]].groupby(["g_id", "pitcher_name"]).cumsum()

In [None]:
atbats_pitches["Strike Count"] = cum_count2

atbats_pitches["Strike Prop"] = atbats_pitches["Strike Count"] / atbats_pitches["Pitch Count"]

In [None]:
atbats_pitches_sort = atbats_pitches.sort_values(by = ["g_id", "pitcher_name", "Pitch Count"])

atbats_pitches_sort["Strike Prop"] = atbats_pitches_sort["Strike Prop"].shift(periods = 1)

In [None]:
first_pitch_game = list(atbats_pitches[["g_id", "pitcher_name", "pitch_id"]].groupby(["g_id", "pitcher_name"]).first()["pitch_id"])

In [None]:
atbats_pitches_sort["Strike Prop"] = np.where(atbats_pitches_sort["pitch_id"].isin(first_pitch_game), 0, atbats_pitches_sort["Strike Prop"])

game_stats3["Strike Prop"] = atbats_pitches_sort["Strike Prop"]

## Adding Strikezone Flag (Target Variable)

In [None]:
game_stats3["Strikezone"] = atbats_pitches["Strikezone?"]

game_stats3["Strikezone"] = np.where(game_stats3["Strikezone"] == True, 1, 0)

In [None]:
len(game_stats3)

In [None]:
len(atbats_pitches)

## Filtering

### Intentional Walks

In [None]:
events = atbats_pitches[["ab_id", "event"]].drop_duplicates()

In [None]:
game_stats4 = pd.merge(game_stats3, events, on = "ab_id")

game_stats5 = game_stats4[game_stats4["event"] != "Intent Walk"].drop(columns = ["ab_id", "event", "date"])

In [None]:
len(game_stats5)

### Uncommon Categorical Values

In [None]:
cat_columns = game_stats5.select_dtypes("object").columns

In [None]:
# Drop every row whose value in any object-typed column occurs fewer than 1000 times.
# Columns are processed in order, so counts for later columns are taken after earlier drops.
# The counts Series is filtered directly instead of being wrapped in a DataFrame and indexed
# by the column name: since pandas 2.0 value_counts() names its result "count", which made
# the original `flags[i]` lookup raise KeyError.
# NOTE(review): cat_columns is every object column of game_stats5, which presumably still
# includes pitcher_name and batter_name here; confirm dropping low-volume players is intended.
for col in cat_columns:
    level_counts = game_stats5[col].value_counts()
    rare_levels = level_counts[level_counts < 1000].index
    game_stats5 = game_stats5[~game_stats5[col].isin(rare_levels)]

In [None]:
len(game_stats5)

# Modeling 

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import RobustScaler 

from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV

#import xgboost as xgb


import warnings
warnings.filterwarnings("ignore")

## A General Model

In [None]:
# Draw a random sample of up to 500,000 rows for the player-agnostic model.
# The size is capped at the table length because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling is without replacement).
sample_size = min(500000, len(game_stats5))
stats_sample = game_stats5.sample(n = sample_size).drop(columns = ["pitcher_name", "batter_name"])

In [None]:
# Pass the column as x=: since seaborn 0.12 only `data` may be given positionally,
# so a bare Series is no longer interpreted as the variable to count.
sns.countplot(x = stats_sample["Strikezone"])

In [None]:
no_players_dummies = pd.get_dummies(stats_sample)

In [None]:
no_players_dummies.shape

In [None]:
def general_model(no_players_dummies, mod):
    """Fit ``mod`` on random 70/30 splits and report baseline and mean scores.

    Parameters
    ----------
    no_players_dummies : pandas.DataFrame
        Numeric (one-hot encoded) feature table that also contains the
        "Strikezone" target column. Missing feature values are filled with 0.
    mod : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is refit in place
        on every split.

    Returns
    -------
    tuple
        ``(baseline, mean_train_score, mean_test_score)``, each rounded to four
        decimals, where ``baseline`` is the share of rows with
        ``Strikezone == 0``.
    """
    features = no_players_dummies.drop(columns = "Strikezone").fillna(0)
    target = no_players_dummies["Strikezone"].values

    # Logistic regression is averaged over 30 random splits; any other model is fit once.
    # isinstance() is required here: `mod == LogisticRegression()` compares two distinct
    # objects by identity, was always False, and so the 30-split branch never ran.
    n_splits = 30 if isinstance(mod, LogisticRegression) else 1

    train_scores = []
    test_scores = []

    for _ in range(n_splits):

        x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.3)

        # Fit the scaler on the training rows only and reuse it for the test rows, so that
        # test-set means/variances never influence what the model is trained on.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        mod.fit(x_train, y_train)

        train_scores.append(mod.score(x_train, y_train))
        test_scores.append(mod.score(x_test, y_test))

    baseline = len(no_players_dummies[no_players_dummies["Strikezone"] == 0]) / len(no_players_dummies)

    return round(baseline, 4), round(np.mean(train_scores), 4), round(np.mean(test_scores), 4)



### Logistic Regression

In [None]:
general_model(no_players_dummies, LogisticRegression())

In [None]:
x_vals = no_players_dummies.drop(columns = "Strikezone").fillna(0)
y_val = no_players_dummies["Strikezone"].values

In [None]:
c = x_vals.columns

In [None]:
scaler = StandardScaler()
x_vals = scaler.fit_transform(x_vals)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_vals, y_val, test_size = 0.3)

In [None]:
mod = LogisticRegression()

In [None]:
mod.fit(x_train, y_train)

In [None]:
# Rank features by squared logistic-regression coefficient (magnitude, ignoring sign).
# The squaring happens before the sort: the original sorted by the signed coefficient and
# squared afterwards, so strongly negative coefficients were pushed to the bottom and could
# never appear in the top 10.
imp1 = pd.DataFrame({"Importance": mod.coef_[0] ** 2, "Feature": c}).sort_values(by = "Importance", ascending = False)

imp1[["Feature", "Importance"]].head(10)

### Random Forest

In [None]:
general_model(no_players_dummies, RandomForestClassifier())

In [None]:
mod2 = RandomForestClassifier()

In [None]:
mod2.fit(x_train, y_train)

In [None]:
# Pair each feature name with the fitted model's feature_importances_ value and keep the
# 20 largest, then display the top 10.
imp2 = pd.DataFrame({"Importance": mod2.feature_importances_, "Feature": c})
imp2 = imp2.sort_values(by = "Importance", ascending = False).head(20)

imp2[["Feature", "Importance"]].head(10)

### AdaBoost

In [None]:
general_model(no_players_dummies, AdaBoostClassifier())

In [None]:
mod3 = AdaBoostClassifier()

In [None]:
mod3.fit(x_train, y_train)

In [None]:
# Pair each feature name with the fitted model's feature_importances_ value and keep the
# 10 largest for display.
imp3 = pd.DataFrame({"Importance": mod3.feature_importances_, "Feature": c})
imp3 = imp3.sort_values(by = "Importance", ascending = False).head(10)

imp3[["Feature", "Importance"]].head(10)

### Hyperparameter Tuning

In [None]:
#lr = LogisticRegression()

#grid = {'max_iter' : [2000], 'penalty' : ['l1', 'l2'], 'C' : np.logspace(-4, 4, 20), 'solver' : ['liblinear']}

#grid_lr = GridSearchCV(lr, param_grid = grid, verbose = True, n_jobs = -1)

#grid_lr.fit(x_train, y_train)

## Modeling Individual Pitchers

In [None]:
def model_pitcher(game_stats5, pitcher, mod):
    """Fit ``mod`` on one pitcher's rows and report how well it scores.

    The rows of ``game_stats5`` whose "pitcher_name" equals ``pitcher`` are
    one-hot encoded (after removing the two player-name columns), missing
    feature values are filled with 0, and the data is split 60/40 at random
    into train and test sets.

    Returns a tuple ``(pitcher, baseline, train_score, test_score)`` with the
    three numbers rounded to four decimals; ``baseline`` is the share of the
    pitcher's rows with ``Strikezone == 1``.
    """
    pitcher_rows = game_stats5[game_stats5["pitcher_name"] == pitcher]
    encoded = pd.get_dummies(pitcher_rows.drop(columns = ["pitcher_name", "batter_name"]))

    features = encoded.drop(columns = "Strikezone").fillna(0)
    target = encoded["Strikezone"].values

    # A single random split; the scores are still averaged so the return types stay the same.
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.4)
    mod.fit(x_train, y_train)

    train_scores = [mod.score(x_train, y_train)]
    test_scores = [mod.score(x_test, y_test)]

    in_zone_rows = encoded[encoded["Strikezone"] == 1]
    baseline = len(in_zone_rows) / len(encoded)

    return pitcher, round(baseline, 4), round(np.mean(train_scores), 4), round(np.mean(test_scores), 4)

    

In [None]:
# Names of the 100 pitchers with the most rows in game_stats5, busiest first.
rows_per_pitcher = game_stats5[["pitcher_name"]].groupby("pitcher_name").size()
busiest_first = rows_per_pitcher.sort_values(ascending = False)
top100_pitchers = list(busiest_first.index[0:100])

In [None]:
pitcher_models = [model_pitcher(game_stats5, i, LogisticRegression()) for i in top100_pitchers]

In [None]:
model_df = pd.DataFrame(pitcher_models)

model_df[1] = np.where(model_df[1] < 0.5, 1 - model_df[1], model_df[1])

In [None]:
model_df["Spread"] = model_df[3] - model_df[1]

In [None]:
model_df = model_df.rename(columns = {0: "Pitcher", 1: "Baseline", 2: "Train Accuracy", 3: "Test Accuracy"})

In [None]:
model_df_top_acc = model_df.sort_values(by = "Test Accuracy", ascending = False).head(10)
model_df_bot_acc = model_df.sort_values(by = "Test Accuracy").head(10)

model_df_top_spread = model_df.sort_values(by = "Spread", ascending = False).head(10)
model_df_bot_spread = model_df.sort_values(by = "Spread").head(10)

### Plotting Model Effectiveness

In [None]:
# Bar chart of the 10 pitchers with the highest test accuracy.
# x and y are passed by keyword: since seaborn 0.12 only `data` may be positional, so the
# original two-positional-argument call raises TypeError on current releases.
sns.barplot(x = model_df_top_acc["Pitcher"], y = model_df_top_acc["Test Accuracy"])
plt.xticks(rotation = 90)
plt.title("The 10 Most Predictable MLB Pitchers (2015-2019)")
plt.ylim(0.58, 0.65)
plt.show()

In [None]:
# Bar chart of the 10 pitchers with the lowest test accuracy.
# x and y are passed by keyword: since seaborn 0.12 only `data` may be positional, so the
# original two-positional-argument call raises TypeError on current releases.
sns.barplot(x = model_df_bot_acc["Pitcher"], y = model_df_bot_acc["Test Accuracy"])
plt.xticks(rotation = 90)
plt.title("The 10 Least Predictable MLB Pitchers (2015-2019)")
plt.ylim(0.5, 0.58)
plt.show()

In [None]:
# Bar chart of the 10 pitchers with the largest spread (test accuracy minus baseline).
# x and y are passed by keyword: since seaborn 0.12 only `data` may be positional, so the
# original two-positional-argument call raises TypeError on current releases.
# The title's "Effecitvely" typo is corrected.
sns.barplot(x = model_df_top_spread["Pitcher"], y = model_df_top_spread["Spread"])
plt.xticks(rotation = 90)
plt.title("The 10 Most Effectively Modeled MLB Pitchers (2015-2019)")
plt.show()

In [None]:
# Bar chart of the 10 pitchers with the smallest spread (test accuracy minus baseline).
# x and y are passed by keyword: since seaborn 0.12 only `data` may be positional, so the
# original two-positional-argument call raises TypeError on current releases.
# The title's "Effecitvely" typo is corrected.
sns.barplot(x = model_df_bot_spread["Pitcher"], y = model_df_bot_spread["Spread"])
plt.xticks(rotation = 90)
plt.title("The 10 Least Effectively Modeled MLB Pitchers (2015-2019)")
plt.show()