In [None]:
# default_exp inference

# Inference

> Utilities to make an inference (prediction) on an arbitrary pitcher outing.

In [None]:
# exporti
from pull_the_pitcher.data import processing, pipeline
import sqlite3
import pandas as pd
from pybaseball import statcast

In [None]:
# one week of statcast data for team "KC"; used below to look up a sample pitcher id / game_pk
df = statcast(start_dt="2019-07-19", end_dt="2019-07-25", team="KC")

This is a large query, it may take a moment to complete
Completed sub-query from 2019-07-19 to 2019-07-24
Completed sub-query from 2019-07-25 to 2019-07-25


In [None]:
df.loc[(df["player_name"]=="Jakob Junis"), ["pitcher", "game_pk"]]

Unnamed: 0,pitcher,game_pk
1424,596001.0,565367.0
1425,596001.0,565367.0
1426,596001.0,565367.0
1427,596001.0,565367.0
1428,596001.0,565367.0
...,...,...
1583,596001.0,565367.0
1584,596001.0,565367.0
1585,596001.0,565367.0
1586,596001.0,565367.0


### Making sure the pitcher is a valid starter

In [None]:
import sqlite3

def get_game_df(db_path: str, year: int, game_pk: int):
    """
    Load every statcast row for a single game from the sqlite db.

    Parameters
    ----------
    db_path : str
        Path to the sqlite database file.
    year : int
        Season to query; rows are read from the table `statcast_{year}`.
    game_pk : int
        Identifier of the game; must be convertible to `int`.

    Returns
    -------
    pd.DataFrame
        All rows of `statcast_{year}` whose `game_pk` matches, or an empty
        DataFrame when the table `statcast_{year}` does not exist in the db.

    Raises
    ------
    ValueError, TypeError
        If `game_pk` cannot be converted to an int.
    """
    table = f"statcast_{year}"

    # plain python int: numpy integers do not bind cleanly as sqlite parameters,
    # and arbitrary strings are rejected here instead of reaching the SQL text
    game_pk = int(game_pk)

    conn = sqlite3.connect(db_path)
    try:
        # making sure year is in db
        cursor = conn.execute(
            "select name from sqlite_master where type='table' and name=?",
            (table,),
        )
        if cursor.fetchone() is None:
            return pd.DataFrame()

        # table names cannot be bound as parameters, so the (already verified
        # to exist) name is quoted as an identifier; game_pk is bound with `?`
        quoted_table = '"' + table.replace('"', '""') + '"'
        query = f"select * from {quoted_table} where game_pk = ?"
        return pd.read_sql_query(query, conn, params=(game_pk,))
    finally:
        # always release the connection, even if the query raises
        conn.close()

In [None]:
# example: load a single 2019 game from the local statcast db
game_df = get_game_df(db_path="../data/raw/statcast_pitches.db", year=2019, game_pk=565555)

In [None]:
game_df.loc[game_df["player_name"]=="David Price", ["player_name", "pitcher"]]

Unnamed: 0,player_name,pitcher
146,David Price,456034.0
147,David Price,456034.0
148,David Price,456034.0
149,David Price,456034.0
150,David Price,456034.0
...,...,...
320,David Price,456034.0
321,David Price,456034.0
322,David Price,456034.0
323,David Price,456034.0


In [None]:
from pull_the_pitcher.data.processing import add_pitcher_team, add_postouts, outs_per_inning, batters_faced, AL_teams


def is_valid_starter(db_path: str, year: int, game_pk: int, pitcher: int):
    """
    Check whether `pitcher` was a valid starter in game `game_pk`.

    The statcast rows of the game are loaded with `get_game_df`. The first
    pitcher throwing in a "Top" half-inning is taken as the home starter and
    the first pitcher throwing in a "Bot" half-inning as the away starter.

    Parameters
    ----------
    db_path : str
        Path to the sqlite db, passed through to `get_game_df`.
    year : int
        Season of the game, passed through to `get_game_df`.
    game_pk : int
        Identifier of the game to inspect.
    pitcher : int
        Id of the pitcher to validate.

    Returns
    -------
    bool
        True only if `pitcher` was one of the two starters, was not an
        "opener" (`outs >= 7` and `n_batters >= 10`, as computed below), and
        both the pitcher's team and the home team are in `AL_teams`.
        False otherwise, including when no rows are found for the game.
    """
    # getting initial game_df
    game_df = get_game_df(db_path=db_path, year=year, game_pk=game_pk)

    # no rows for this game (or year not in db): nothing to validate
    if game_df.empty:
        return False

    # getting sorted (by at bat) df for a specific game
    game_df = game_df.sort_values("at_bat_number", ascending=True)

    # first pitcher for each team is throwing at min(at_bat_number)
    home_pitcher_first_ab = game_df.loc[(game_df["inning_topbot"]=="Top"), "at_bat_number"].min()
    home_team = game_df["home_team"].head(1).item()
    home_pitcher = game_df.loc[(game_df["at_bat_number"]==home_pitcher_first_ab), "pitcher"].head(1).item()

    away_pitcher_first_ab = game_df.loc[(game_df["inning_topbot"]=="Bot"), "at_bat_number"].min()
    away_team = game_df["away_team"].head(1).item()
    away_pitcher = game_df.loc[(game_df["at_bat_number"]==away_pitcher_first_ab), "pitcher"].head(1).item()

    # a pitcher who did not start cannot be a valid starter; stop here instead
    # of silently evaluating him as if he pitched for the away team
    if pitcher not in (home_pitcher, away_pitcher):
        print(f"Pitcher {pitcher} was not a starter in game {game_pk}")
        return False

    # adding pitcher_team
    game_df.loc[:, "pitcher_team"] = game_df.apply(add_pitcher_team, axis=1)

    # identifying the team of the pitcher of interest
    team = home_team if pitcher == home_pitcher else away_team

    # check for "openers"

    # adding postouts for entire game
    # (explicit copy so columns can be added without a SettingWithCopyWarning)
    game_team_df = game_df.loc[(game_df["pitcher_team"]==team)].copy()
    game_team_df = add_postouts(game_team_df)

    # subsetting to get pitches thrown by the starter
    game_team_pitcher_df = game_team_df.loc[(game_team_df["pitcher"]==pitcher)]

    # getting criteria to check if opener
    outs = game_team_pitcher_df.groupby(["inning"]).agg({"postouts": outs_per_inning}).sum().item()
    n_batters = batters_faced(game_team_pitcher_df["at_bat_number"])
    opener = outs < 7 or n_batters < 10

    # must not be opener, be from an AL team, and be playing in an AL stadium
    return bool(not opener and (team in AL_teams) and (home_team in AL_teams))

In [None]:
# sanity check using the pitcher / game_pk pair from the Jakob Junis lookup above
is_valid_starter(db_path="../data/raw/statcast_pitches.db", year=2019, game_pk=565367, pitcher=596001)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


True

### Transforming game_df into a tensor to feed into DRSA

In [None]:
from pull_the_pitcher.data import processing
from pull_the_pitcher.data import pipeline
import pickle

In [None]:
# inputs for the example inference below
# (same pitcher / game_pk pair as validated with is_valid_starter above)
db_path = "../data/raw/statcast_pitches.db"
year = 2019
game_pk = 565367
pitcher = 596001  # Jakob Junis

In [None]:
# raw statcast rows for the configured game
game_df = get_game_df(db_path=db_path, year=year, game_pk=game_pk)

In [None]:
# chaining the project's processing helpers: clean -> aggregate at bats -> feature engineering
# NOTE(review): what each helper does is inferred from its name — confirm in pull_the_pitcher.data.processing
cleaned_df = processing.preliminary_clean(game_df, game_pk, pitcher)
agged_df = processing.aggregate_at_bats(cleaned_df, pipeline.at_bat_aggs)
feature_engineered_df = processing.feature_engineering(agged_df)
# keeping only the columns listed in pipeline.cols
feature_engineered_df = feature_engineered_df[pipeline.cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher_df["events"] = game_team_pitcher_df["events"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [None]:
! ls ../data/processed/

mappers_2016_2017_2018_2019.pkl  train_2016_2017_2018.csv
mappers_2017_2018_2016_2019.pkl  train_2017_2018_2016.csv
scaler_2016_2017_2018_2019.pkl	 val_2016_2017_2018.csv
scaler_2017_2018_2016_2019.pkl	 val_2017_2018_2016.csv
test_2019.csv


In [None]:
# NOTE: pickle.load can execute arbitrary code — only load artifacts written by this project's own pipeline
# NOTE(review): these artifacts are named "2017_2018_2016_2019" while the model checkpoint loaded
# further down is named "2016_2017_2018_2019" — confirm they come from the same training run
with open("../data/processed/mappers_2017_2018_2016_2019.pkl", "rb") as f:
    mappers = pickle.load(f)
    
with open("../data/processed/scaler_2017_2018_2016_2019.pkl", "rb") as f:
    scaler = pickle.load(f)

In [None]:
# `feature_engineered_df` was produced by column selection, so assigning into it
# directly triggers pandas' SettingWithCopyWarning; work on an explicit copy
feature_engineered_df = feature_engineered_df.copy()

# scaling new data (as tuned on training data)
feature_engineered_df[pipeline.feature_cols] = scaler.transform(feature_engineered_df[pipeline.feature_cols])

# replacing pitcher id with index of associated embedding
feature_engineered_df["pitcher"] = feature_engineered_df["pitcher"].map(mappers["pitcher"])

# ids missing from the mapper become NaN, which cannot index an embedding;
# this also catches re-running the cell on already-mapped data
assert feature_engineered_df["pitcher"].notna().all(), "pitcher id not found in mappers['pitcher']"

# subsetting to only get required cols
feature_engineered_df = feature_engineered_df[["pitcher"] + pipeline.feature_cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
feature_engineered_df.head()

Unnamed: 0,pitcher,post_bat_score,score_diff,end_inning,inning,postouts,cum_sb_ratio,times_thru_order,post_total_runners,tying_run_on,pitch_total,post_opposite_hand,walk,walk_cumsum,strikeout_cumsum,home_run_cumsum,bases_cumsum
0,182,-0.759251,0.209695,-0.509378,-1.358806,-0.976258,0.154678,-1.60084,-0.692645,-0.454233,-1.606069,0.857632,-0.284401,-0.894291,-1.197142,-0.603666,-1.073264
1,182,-0.759251,0.209695,-0.509378,-1.358806,-0.976258,0.154678,-1.46578,-0.692645,-0.454233,-1.429872,0.857632,-0.284401,-0.894291,-1.197142,-0.603666,-1.073264
2,182,-0.759251,0.209695,-0.509378,-1.358806,-0.976258,-0.697344,-1.33072,0.686417,2.201513,-1.253674,-1.166002,3.516159,0.057558,-1.197142,-0.603666,-0.845393
3,182,-0.759251,0.209695,-0.509378,-1.358806,-0.097711,-0.9636,-1.195659,0.686417,2.201513,-1.183196,0.857632,-0.284401,0.057558,-1.197142,-0.603666,-0.845393
4,182,-0.759251,0.209695,-0.509378,-1.358806,-0.097711,-0.9636,-1.060599,2.065479,2.201513,-1.147956,0.857632,-0.284401,0.057558,-1.197142,-0.603666,-0.845393


### Transforming to torch tensor for inference

In [None]:
import torch

In [None]:
# frame values as a float64 tensor; unsqueeze(0) adds a leading dimension of size 1
# NOTE(review): presumably the model expects (batch, sequence, features) — confirm against DRSA
X = torch.tensor(feature_engineered_df.values, dtype=torch.double).unsqueeze(0)

In [None]:
X.shape

torch.Size([1, 23, 17])

### Loading model

In [None]:
# ! pip install drsa

In [None]:
from drsa.model import DRSA
import torch.nn as nn

In [None]:
def save_model(m, p):
    """Write the state dict of model `m` to path `p` using `torch.save`."""
    weights = m.state_dict()
    torch.save(weights, p)
def load_model(m, p):
    """Read the state dict stored at path `p` and load it into model `m` in place."""
    weights = torch.load(p)
    m.load_state_dict(weights)

In [None]:
# loading up saved embeddings
# embeddings = torch.load(f"../models/07-22-20_embeddings_2016_2017_2018_2019_loss_2.6064.pth")

In [None]:
# getting embeddings ready
# NOTE(review): n_pitchers / pitcher_emb_size are hard-coded and must match the checkpoint
# loaded below; n_pitchers is presumably the size of mappers["pitcher"] — confirm
n_pitchers = 270
pitcher_emb_size = 20
pitcher_embeddings = nn.Embedding(n_pitchers, pitcher_emb_size)
# pitcher_embeddings.weight = nn.Parameter(embeddings)

# DRSA takes its embeddings as a list; only the pitcher embedding is used here
ptp_embeddings = [pitcher_embeddings]

# initializing model
# n_features=17 equals the size of the last dimension of X
# NOTE(review): the remaining hyperparameters presumably mirror the training run of the checkpoint — confirm
drsa = DRSA(n_features=17,
            hidden_dim=20,
            n_layers=5,
            embeddings=ptp_embeddings,
            output_size=1,
            LSTM_dropout=0.05,
            Linear_dropout=0.1)

In [None]:
# loading up saved model (state dict is loaded into `drsa` in place)
load_model(drsa, "../models/07-22-20_DRSA_2016_2017_2018_2019_loss_2.6398.pth")


In [None]:
drsa.eval() # making sure that no dropout is applied
# inference only: disable autograd so no graph is built and `preds` carries no grad_fn
with torch.no_grad():
    preds = drsa(X)

In [None]:
preds

tensor([[[0.0199],
         [0.0018],
         [0.0011],
         [0.0011],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0010],
         [0.0011]]], grad_fn=<SigmoidBackward>)

---

In [None]:
# NOTE: overwrites `df` (the statcast query result from the top of the notebook) with only David Price's rows
df = df.loc[(df["player_name"]=="David Price")]

In [None]:
df[["game_date", "player_name", "pitch_type", "release_speed", "events", "at_bat_number"]].sort_values("at_bat_number").iloc[:30, :]

In [None]:
# export

