In [24]:
import pandas as pd
import numpy as np
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import joblib

warnings.filterwarnings("ignore")

In [25]:
def data_prep(path):
    
    df = pd.read_csv(path)
    df = df.loc[:,~df.columns.str.contains("Unnamed")]
    df = df[df['pitch_type'].isin(["FF", "SI", "SL", "CH", "CU", "FC", "KC", 'FS', "KN"])]
    pitchers = list(set(df['pitcher_name']))
    
    mapper = {}
    for pitcher in pitchers:
        pitch_type = list(set(df[df['pitcher_name'] == pitcher]['pitch_type']))
        mapper[pitcher] = {x:i for x,i in zip(pitch_type, range(len(pitch_type)))}
    
    return df, pitchers, mapper

In [26]:
df, pitchers, mapper = data_prep("/Users/nickdimmitt/Desktop/dfs_local/sequences/filtered/df.csv")

In [59]:
def feature_eng(df, pitchers, mapper):
    
    model_df = pd.DataFrame()
    
    for pitcher in pitchers:
        temp = df[df['pitcher_name'] == pitcher].replace({'pitch_type':mapper[pitcher]})
        model_df = pd.concat([model_df, temp])
        model_df = model_df.loc[:,~model_df.columns.str.contains('Unnamed')]
    del df
    
    model_df['pitch_type'] = model_df['pitch_type'].astype('int')
    features = ['pitch_type', 'start_speed', 'zone']
    
    for col in features:
        model_df[f"{col}_last_pitch"] = model_df.groupby(['game_pk', 'ab_number', 'pitcher_name'])[col].transform(lambda x:x.shift(1))
        
    return model_df

In [60]:
model_df = feature_eng(df, pitchers, mapper)

In [66]:
model_df.to_csv("model_rdy.csv")

In [68]:
def models(path, pitchers):
    
    df = pd.read_csv(path)
    
    
    for pitcher in pitchers:
        
        train_df = df[df['pitcher_name'] == pitcher]
        
        target = 'pitch_type'
        features = ['stand', 'balls', 'strikes', 'outs', 'runnerOn1B', 'runnerOn2B', 'runnerOn3B', 'pitch_type_last_pitch', 'start_speed_last_pitch', 'zone_last_pitch']
        
        X = train_df[features]
        y = train_df[target]
        
        if len(list(set(y))) <= 1:
            continue
        
        elif len(list(set(y))) == 2:
            model = XGBClassifier(learning_rate=0.01, n_estimators=1000, max_depth=6, objective='binary:logistic')
        
        else:
            model = XGBClassifier(learning_rate=0.01, n_estimators=1000, max_depth=6, objective='multi:softmax')
        
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44, stratify=y)
        except:
            continue
        
        model.fit(X_train, y_train)
        pitcher = "".join(pitcher.split()).lower()
        joblib.dump(model, f"models/{pitcher}.pkl")
        
    return None

In [69]:
models("model_rdy.csv", pitchers)

In [70]:
def predict_pitch(pitches_dict):
    
    pitcher = input("Pitcher Name: ")
    
    pitch_mapper = pitches_dict[pitcher]
    
    pitcher = "".join(pitcher.split()).lower()
    
    stance = input("Batter Stance(r/l): ")
    if stance == 'r':
        stance = 1
    else:
        stance = 0
        
    balls = int(input("Balls: "))
    strikes = int(input("Strikes: "))
    outs = int(input("Outs: "))
    
    runnerOn1B = input("Runner on 1B? (y/n): ")
    if runnerOn1B == 'y':
        runnerOn1B = 1
    else:
        runnerOn1B = 0
    
    runnerOn2B = input("Runner on 2B? (y/n): ")
    if runnerOn2B == 'y':
        runnerOn2B = 1
    else:
        runnerOn2B = 0
        
    runnerOn3B = input("Runner on 3B? (y/n): ")
    if runnerOn3B == 'y':
        runnerOn3B = 1
    else:
        runnerOn3B = 0
        
    print(list(pitch_mapper.keys()))
    last_pitch = input("Last Pitch Type?: ")
    last_pitch = pitch_mapper[last_pitch]
    
    last_pitch_location = int(input("Last Pitch Location: "))
    
    last_pitch_speed = float(input("Last Pitch Speed: "))
    
    model = joblib.load(f"models/{pitcher}.pkl")
    
    sample = pd.DataFrame({
        'stand':[stance],
        'balls':[balls],
        'strikes':[strikes],
        'outs':[outs],
        'runnerOn1B':[runnerOn1B],
        'runnerOn2B':[runnerOn2B],
        'runnerOn3B':[runnerOn3B],
        'pitch_type_last_pitch':[last_pitch],
        'start_speed_last_pitch':[last_pitch_speed],
        'zone_last_pitch':[last_pitch_location]})
    
    preds = np.array(model.predict_proba(sample)).flatten()
    print("\n\n")
    print("Pitch Probabilities: ")
    for x,y in zip(pitch_mapper.keys(), preds):
        print(f'{x}: {round(y*100,2)}%')
    return None

In [71]:
predict_pitch(mapper)

Pitcher Name:  Gerrit Cole
Batter Stance(r/l):  r
Balls:  1
Strikes:  2
Outs:  1
Runner on 1B? (y/n):  y
Runner on 2B? (y/n):  n
Runner on 3B? (y/n):  n


['FC', 'FF', 'SL', 'KC', 'CH', 'CU', 'SI']


Last Pitch Type?:  FF
Last Pitch Location:  3
Last Pitch Speed:  99





Pitch Probabilities: 
FC: 1.99%
FF: 29.73%
SL: 26.39%
KC: 36.03%
CH: 4.65%
CU: 0.28%
SI: 0.92%
