In [None]:
# default_exp inference

# Inference

> Utilities to make an inference (prediction) on an arbitrary pitcher outing.

In [None]:
# exporti
import pickle
import sqlite3
from typing import Tuple

import altair as alt
import pandas as pd
import torch
import torch.nn as nn
from drsa.model import DRSA
from pybaseball import statcast

from pull_the_pitcher.data import processing, pipeline
from pull_the_pitcher.data.processing import add_pitcher_team, add_postouts, outs_per_inning, batters_faced, AL_teams
from pull_the_pitcher.utils import load_model

## Ensuring pitcher is a valid starter

In [None]:
# export

def get_game_df(db_path: str, year: int, game_pk: int) -> pd.DataFrame:
    """
    Load every row of a single game from the `statcast_{year}` table.

    Parameters
    ----------
    db_path : path to the sqlite database file.
    year : season to query; selects the table named `statcast_{year}`.
    game_pk : value matched against the table's `game_pk` column.

    Returns
    -------
    A DataFrame with all columns of the matching rows, or an empty DataFrame
    when the database has no `statcast_{year}` table.

    Raises
    ------
    ValueError / TypeError if `year` cannot be converted to an integer.
    """
    # table names cannot be bound as SQL parameters, so force `year` to an
    # integer before it is interpolated into the statement
    table = f"statcast_{int(year)}"

    # numpy scalars (e.g. values taken from a DataFrame) are not always
    # bindable by sqlite3; unwrap them into plain Python numbers
    if hasattr(game_pk, "item"):
        game_pk = game_pk.item()

    conn = sqlite3.connect(db_path)
    try:
        # making sure year is in db
        cursor = conn.execute(
            "select name from sqlite_master where type='table' and name=?",
            (table,),
        )
        if cursor.fetchone() is None:
            return pd.DataFrame()

        query = f"select * from {table} where game_pk = ?"
        return pd.read_sql_query(query, conn, params=(game_pk,))
    finally:
        # always release the connection, even if the query raises
        conn.close()

In [None]:
# example
# Pull every row of game 565555 from the 2019 table and preview the first three rows.

game_df = get_game_df(db_path="../data/raw/statcast_pitches.db", year=2019, game_pk=565555)
game_df.head(3)

Unnamed: 0,level_0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,343378,4733,SL,2019-07-07 00:00:00,87.4,-2.179672,5.405451,Heath Hembree,622682.0,592390.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Standard,Standard
1,343379,4734,FF,2019-07-07 00:00:00,92.7,-2.310906,5.300275,Heath Hembree,622682.0,592390.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Standard,Standard
2,343380,4738,FF,2019-07-07 00:00:00,93.1,-2.265012,5.288346,Heath Hembree,622682.0,592390.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Standard,Standard


In [None]:
# export

def is_valid_starter(db_path: str, year: int, game_pk: int, pitcher: int) -> bool:
    """
    Decide whether `pitcher` made a valid start in game `game_pk`.

    A start is valid when all of the following hold:
      * `pitcher` threw in the first at bat of the top or the bottom half,
      * the outing was not an "opener" outing (fewer than 7 outs recorded or
        fewer than 10 batters faced),
      * the pitcher's team and the home team are both in `AL_teams`.

    Parameters
    ----------
    db_path : path to the sqlite database file read by `get_game_df`.
    year : season whose `statcast_{year}` table holds the game.
    game_pk : id of the game.
    pitcher : id of the pitcher to check.

    Returns
    -------
    True if every criterion above is met, False otherwise (including when no
    data is found for the game).
    """
    # getting initial game_df
    game_df = get_game_df(db_path=db_path, year=year, game_pk=game_pk)

    # no data for this year / game -> nothing can be validated
    if game_df.empty:
        return False

    # getting sorted (by at bat) df for a specific game
    game_df = game_df.sort_values("at_bat_number", ascending=True)

    def _first_pitcher(half: str):
        """Return the pitcher throwing in the earliest at bat of `half`, or None if that half has no rows."""
        half_at_bats = game_df.loc[game_df["inning_topbot"] == half, "at_bat_number"]
        if half_at_bats.empty:
            return None
        first_ab_pitchers = game_df.loc[game_df["at_bat_number"] == half_at_bats.min(), "pitcher"]
        return first_ab_pitchers.head(1).item()

    # first pitcher for each team is throwing at min(at_bat_number):
    # the home team pitches the top half, the away team the bottom half
    home_team = game_df["home_team"].head(1).item()
    home_pitcher = _first_pitcher("Top")

    away_team = game_df["away_team"].head(1).item()
    away_pitcher = _first_pitcher("Bot")

    if pitcher not in (home_pitcher, away_pitcher):
        print(f"Pitcher {pitcher} was not a starter in game {game_pk}")
        return False

    # adding pitcher_team
    game_df["pitcher_team"] = game_df.apply(add_pitcher_team, axis=1)

    # identifying the team of the pitcher of interest
    team = home_team if pitcher == home_pitcher else away_team

    # check for "openers"

    # adding postouts for entire game; the explicit copy keeps add_postouts
    # from writing into a slice of game_df (SettingWithCopyWarning)
    game_team_df = game_df.loc[game_df["pitcher_team"] == team].copy()
    game_team_df = add_postouts(game_team_df)

    # subsetting to get pitches thrown by the starter
    game_team_pitcher_df = game_team_df.loc[game_team_df["pitcher"] == pitcher]

    # getting criteria to check if opener
    outs = game_team_pitcher_df.groupby(["inning"]).agg({"postouts": outs_per_inning}).sum().item()
    n_batters = batters_faced(game_team_pitcher_df["at_bat_number"])
    opener = outs < 7 or n_batters < 10

    # must not be opener, be from an AL team, and be playing in an AL stadium
    return bool((not opener) and (team in AL_teams) and (home_team in AL_teams))

In [None]:
# example
# Check whether pitcher 596001 counts as a valid starter in game 565367 of the 2019 table.

is_valid_starter(db_path="../data/raw/statcast_pitches.db", year=2019, game_pk=565367, pitcher=596001)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


True

## Transforming `game_df` into a `torch.Tensor` to feed into DRSA

In [None]:
# export

def test_front_pad(feature_engineered_df: pd.DataFrame) -> (torch.Tensor, int):
    X = torch.tensor(feature_engineered_df.values, dtype=torch.double).unsqueeze(0)
    batch_size, seq_len, n_features = X.shape
    diff = 36-seq_len
    X_padded = torch.cat([torch.zeros(1, diff, n_features), X.float()], dim=1)
    return X_padded, diff

In [None]:
# export

def game_df2tensor(game_df: pd.DataFrame, game_pk: int, pitcher: int, mappers, scaler) -> Tuple[torch.Tensor, int]:
    """
    Run the cleaning / feature pipeline on one game and return a padded model input.

    Parameters
    ----------
    game_df : rows of a single game (e.g. the output of `get_game_df`).
    game_pk : id of the game, forwarded to `processing.preliminary_clean`.
    pitcher : id of the pitcher, forwarded to `processing.preliminary_clean`.
    mappers : mapping whose `"pitcher"` entry maps a pitcher id to its
        embedding index; ids missing from it are mapped to index 0.
    scaler : fitted object exposing `transform`, applied to `pipeline.feature_cols`.

    Returns
    -------
    (X_padded, diff) exactly as returned by `test_front_pad`: the front-padded
    tensor and the number of padding rows that were prepended.
    """
    # full data cleaning pipeline
    cleaned_df = processing.preliminary_clean(game_df, game_pk, pitcher)
    agged_df = processing.aggregate_at_bats(cleaned_df, pipeline.at_bat_aggs)
    feature_engineered_df = processing.feature_engineering(agged_df)
    # explicit copy so the column assignments below never write into a slice
    feature_engineered_df = feature_engineered_df[pipeline.cols].copy()

    # scaling new data (as tuned on training data)
    feature_engineered_df[pipeline.feature_cols] = scaler.transform(feature_engineered_df[pipeline.feature_cols])

    # replacing pitcher id with index of associated embedding (0 for unseen pitchers)
    pitcher_to_idx = mappers["pitcher"]
    feature_engineered_df["pitcher"] = [
        pitcher_to_idx.get(pitcher_id, 0) for pitcher_id in feature_engineered_df["pitcher"]
    ]

    # subsetting to only get required cols
    feature_engineered_df = feature_engineered_df[["pitcher"] + pipeline.feature_cols]

    # padding as appropriate
    X_padded, diff = test_front_pad(feature_engineered_df)

    return X_padded, diff

In [None]:
# example of how game_df2tensor works

# constants
db_path = "../data/raw/statcast_pitches.db"
year = 2019
# game_pk = 565761.0  # 2019-08-14 @ STL
game_pk = 565717.0    # 2019-05-11 @ TB
pitcher = 641745.0  # Brad Keller
mappers_path = "../data/processed/mappers_2017_2018_2016_2019.pkl"
scaler_path = "../data/processed/scaler_2017_2018_2016_2019.pkl"

# loading embedding encoder and scaler
# NOTE: pickle.load executes arbitrary code; only load files you trust
with open(mappers_path, "rb") as f:
    mappers = pickle.load(f)
with open(scaler_path, "rb") as f:
    scaler = pickle.load(f)

game_df = get_game_df(db_path=db_path, year=year, game_pk=game_pk)
X, pad_diff = game_df2tensor(game_df, game_pk, pitcher, mappers, scaler)

# one outing, padded to 36 steps, 17 columns (pitcher index + features)
assert X.shape == (1, 36, 17)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher_df["events"] = game_team_pitcher_df["events"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher_df["post_bat_score"] = game_team_pitcher_df["post_bat_score"].shift(-1).fillna(method="ffill")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher

## Loading DRSA model

In [None]:
# export

def load_drsa(model_path,
              n_pitchers: int = 270,
              pitcher_emb_size: int = 20,
              n_features: int = 17,
              hidden_dim: int = 20,
              n_layers: int = 5,
              output_size: int = 1,
              lstm_dropout: float = 0.05,
              linear_dropout: float = 0.1):
    """
    Build a DRSA model and load saved weights into it via `load_model`.

    The defaults reproduce the hyperparameters that used to be hard-coded
    here; override them only when loading a checkpoint that was trained with
    a different configuration.

    Parameters
    ----------
    model_path : location of the saved model, passed to `load_model`.
    n_pitchers : number of rows in the pitcher embedding table.
    pitcher_emb_size : width of each pitcher embedding.
    n_features, hidden_dim, n_layers, output_size : forwarded unchanged to `DRSA`.
    lstm_dropout : forwarded to `DRSA` as `LSTM_dropout`.
    linear_dropout : forwarded to `DRSA` as `Linear_dropout`.

    Returns
    -------
    The DRSA instance after `load_model` has been applied to it. This
    function does not call `.eval()`; callers doing inference should.
    """
    # the pitcher embedding is the only embedding handed to the model
    pitcher_embeddings = nn.Embedding(n_pitchers, pitcher_emb_size)
    ptp_embeddings = [pitcher_embeddings]

    # initializing model
    drsa = DRSA(n_features=n_features,
                hidden_dim=hidden_dim,
                n_layers=n_layers,
                embeddings=ptp_embeddings,
                output_size=output_size,
                LSTM_dropout=lstm_dropout,
                Linear_dropout=linear_dropout)

    # loading up saved model
    load_model(drsa, model_path)

    return drsa

In [None]:
# example of how inference would work

model_path = "../models/07-22-20_DRSA_2016_2017_2018_2019_loss_4.0985.pth"
drsa = load_drsa(model_path)
drsa.eval() # making sure that no dropout is applied
with torch.no_grad():  # inference only: no need to track gradients
    preds = drsa(X)
print(preds)

tensor([[[0.0865],
         [0.0189],
         [0.0126],
         [0.0114],
         [0.0111],
         [0.0110],
         [0.0110],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0109],
         [0.0110],
         [0.0110],
         [0.0110],
         [0.0111],
         [0.0112],
         [0.0114],
         [0.0117],
         [0.0123],
         [0.0134],
         [0.0155],
         [0.0200],
         [0.0310],
         [0.0636],
         [0.1699],
         [0.3756]]], grad_fn=<SigmoidBackward>)


## Simple Visualization

In [None]:
# export

def make_altair_hist(preds: torch.Tensor, diff: int, threshold: float = 0.35):
    """
    Draw a horizontal bar chart of the model's per-step predictions.

    Parameters
    ----------
    preds : model output of shape (1, seq_len, 1).
    diff : number of leading (padded) positions to drop before plotting.
    threshold : predictions strictly above this value are labelled "pulled",
        all others "stayed" (defaults to 0.35, the previously hard-coded value).

    Returns
    -------
    An interactive Altair bar chart with one bar per remaining position,
    numbered from 1 and coloured by the "pulled" / "stayed" label.

    Raises
    ------
    ValueError if `preds` holds more than one sequence or more than one
    output per step.
    """
    # preparing predictions: drop the padding, then squeeze only the batch and
    # output dims so a single-step outing stays 1-D instead of becoming a scalar
    at_bat_preds = preds[:, diff:, :].detach().squeeze(-1).squeeze(0)
    if at_bat_preds.dim() != 1:
        raise ValueError("`preds` must have shape (1, seq_len, 1)")

    preds_df = pd.DataFrame({
        "at_bat_num": range(1, len(at_bat_preds) + 1),
        "pred": at_bat_preds.tolist(),
    })
    preds_df["prediction"] = (preds_df["pred"] > threshold).map({True: "pulled", False: "stayed"})

    # making chart
    chart = alt.Chart(preds_df).mark_bar().encode(
                x='pred:Q',
                y="at_bat_num:O",
                tooltip=[alt.Tooltip('pred:Q', format='.4%'),
                         alt.Tooltip('at_bat_num:O')],
                color=alt.Color('prediction', scale=alt.Scale(domain=["stayed", "pulled"], range=["lightgrey", "lightblue"])),
            ).interactive()
    return chart

In [None]:
# example visualization
# `preds` comes from the inference example and `pad_diff` from `game_df2tensor`;
# passing `pad_diff` makes the chart skip the front-padded positions.

make_altair_hist(preds, pad_diff)