In [None]:
# default_exp data.pipeline

# Data Pipeline

> Command-line script that uses the `pull_the_pitcher.data.processing` module to turn raw Statcast pitches into train/val/test sets ready for modeling


In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# exporti 
from pull_the_pitcher.data import processing
from pull_the_pitcher.data.acquisition import query_db
from pull_the_pitcher.data.processing import last
from fastscript import *
import pandas as pd
import numpy as np
from typing import List
from sklearn.preprocessing import StandardScaler
import pickle
from scipy.stats import bernoulli

## Processing each observation

In [None]:
# export

# Identifier / categorical columns carried alongside the numeric features.
# `encode_embedding_cols` uses this same list as its default set of columns to encode.
embedding_cols = [
    "game_pk",
    "game_type",
    "pitcher",
    "pitcher_team_year",
]

# Numeric model inputs: cast to float in `stack_into_df` and standardized in `scale`.
feature_cols = [
    "post_bat_score",
    "score_diff",
    "end_inning",
    "inning",
    "postouts",
    "cum_sb_ratio",
    "times_thru_order",
    "post_total_runners",
    "tying_run_on",
    "pitch_total",
    "post_opposite_hand",
    "walk",
    "walk_cumsum",
    "strikeout_cumsum",
    "home_run_cumsum",
    "bases_cumsum",
]

# Full column order of every processed start (identifiers first, then features).
cols = embedding_cols + feature_cols

In [None]:
# export

# adding targets to each
def add_targets(starts: List):
    """Append a binary target column to every start.

    Each element of ``starts`` is a 2-D table (array or DataFrame) with one row
    per observation. A new last column is added that is 0 everywhere except the
    final row, which is set to 1.

    Parameters
    ----------
    starts : list
        2-D array-likes exposing ``.shape``. The list is updated in place:
        every element is replaced by a NumPy array with one extra column.

    Returns
    -------
    list
        The same list object that was passed in.
    """
    for i, start in enumerate(starts):
        y = np.zeros((start.shape[0], 1))
        # An empty start has no "last row"; indexing y[-1] would raise IndexError,
        # so leave the (zero-row) target column untouched in that case.
        if start.shape[0] > 0:
            y[-1, 0] = 1
        starts[i] = np.concatenate([start, y], axis=1)
    return starts

In [None]:
# export

def stack_into_df(starts: List):
    """Stack per-start arrays row-wise into one DataFrame.

    Parameters
    ----------
    starts : list
        2-D arrays whose columns are ``cols`` followed by the target column
        (i.e. the output of ``add_targets``).

    Returns
    -------
    pd.DataFrame
        Columns ``cols + ["pulled"]``; the feature columns and ``"pulled"`` are
        cast to float. An empty ``starts`` list yields an empty frame with the
        same columns instead of raising.
    """
    columns = cols + ["pulled"]

    # np.concatenate raises ValueError on an empty sequence, which happens when a
    # split (e.g. the test set) received no starts at all.
    if len(starts) == 0:
        df = pd.DataFrame(columns=columns)
    else:
        df = pd.DataFrame(np.concatenate(starts, axis=0), columns=columns)

    # Stacking mixed identifier/feature values yields object columns; restore
    # numeric dtypes for the features and the target.
    numeric_cols = feature_cols + ["pulled"]
    return df.astype(dict.fromkeys(numeric_cols, float))

In [None]:
# export

def scale(train: pd.DataFrame, val: pd.DataFrame):
    """Standardize ``feature_cols`` in both frames with a scaler fit on ``train`` only.

    Both frames are modified in place (their ``feature_cols`` are overwritten
    with the transformed values) and also returned.

    Returns
    -------
    tuple
        ``(train, val, scaler)`` where ``scaler`` is the fitted ``StandardScaler``.
    """
    # Fit on the training rows only so validation statistics never influence the scaling.
    scaler = StandardScaler().fit(train[feature_cols])
    for frame in (train, val):
        frame[feature_cols] = scaler.transform(frame[feature_cols])
    return train, val, scaler

In [None]:
# export

def encode_col(train, valid, col="pitcher_id"):
    """Replace the values of ``col`` with contiguous integer indices.

    The mapping is built from the sorted unique values of ``train[col]``.
    ``train[col]`` and ``valid[col]`` are overwritten in place; values of
    ``valid[col]`` that never occur in ``train`` are mapped to -1 and those rows
    are dropped from the returned validation frame.

    Returns
    -------
    tuple
        ``(train, valid, id2idx)`` where ``valid`` is a filtered copy and
        ``id2idx`` maps each original value to its index.
    """
    # Sorted unique ids seen in the training data define the index space.
    train_ids = np.sort(np.unique(train[col].values))
    num_users = len(train_ids)
    print(f"There are {num_users} unique {col}'s in this dataset")

    id2idx = dict(zip(train_ids, range(num_users)))

    train[col] = train[col].apply(lambda raw: id2idx[raw])
    # -1 marks ids that were not present in the training data
    valid[col] = valid[col].apply(lambda raw: id2idx.get(raw, -1))

    # Keep only validation rows whose id is known from training.
    seen_in_train = valid[col] >= 0
    valid = valid[seen_in_train].copy()
    return train, valid, id2idx


def encode_embedding_cols(train, val, cols=["game_pk", "game_type", "pitcher", "pitcher_team_year"]):
    """Prepare categorical columns for embedding lookups.

    A single all-zero row is prepended to both frames to act as the
    "null"/"unknown" entry for zero-padded rows, ``game_type`` and
    ``pitcher_team_year`` are cast to strings, and every column in ``cols`` is
    then index-encoded with ``encode_col``.

    Returns
    -------
    tuple
        ``(train, val, mappers)`` where ``mappers[col]`` is the value -> index
        dictionary produced by ``encode_col`` for that column.
    """
    # One zero-filled row with the training frame's columns, shared by both frames.
    null_row = pd.DataFrame(np.zeros((1, train.shape[1])), columns=train.columns)
    train, val = (pd.concat([null_row, frame], axis=0) for frame in (train, val))

    # String dtype for the categorical columns before encoding.
    for str_col in ("game_type", "pitcher_team_year"):
        for frame in (train, val):
            frame[str_col] = frame[str_col].astype(str)

    # Encode column by column; each pass may drop validation rows unseen in train.
    mappers = {}
    for col in cols:
        train, val, mappers[col] = encode_col(train, val, col=col)

    return train, val, mappers

In [None]:
# export
# Per-column aggregation spec handed to `processing.aggregate_at_bats`.
# Key order is intentionally left untouched because it may determine the column
# order of the aggregated frame — TODO confirm against `aggregate_at_bats`.
# String values ("max") are aggregation names; `last` is the callable imported
# from `pull_the_pitcher.data.processing` (presumably keeps the final value
# within each group — confirm there).
at_bat_aggs = {
    "balls": "max",
    "strikes": "max",
    "pitch_number": "max",
    "post_bat_score": last,
    "post_fld_score": last,
    "events": "max",
    "postouts": last,
    "post_on_1b": last,
    "post_on_2b": last,
    "post_on_3b": last,
    "game_type": last,
    "home_team": last,
    "away_team": last,
    "inning": last,
    "inning_topbot": last,
    "post_opposite_hand": last,
    "game_year": last,
    "pitcher_team": last
}

In [None]:
# export

def add_val_flags(ds_flags: List[int], val_size: float = 0.15, random_state: int = 742):
    """Randomly move a share of the training starts into the validation set.

    Flags follow the convention 0 = train, 1 = val, 2 = test. Every entry that
    is currently 0 is replaced by an independent Bernoulli(``val_size``) draw,
    so roughly ``val_size`` of the training entries become 1. All other
    entries are left as they are.

    Parameters
    ----------
    ds_flags : sequence of int
        Dataset flag per start. The input is copied, not modified.
    val_size : float, default 0.15
        Probability that a training start is reassigned to validation.
    random_state : int, default 742
        Seed for the Bernoulli draws, making the split reproducible.

    Returns
    -------
    np.ndarray
        New array of flags with the same length as ``ds_flags``.
    """
    ds_flags = np.array(ds_flags)
    is_train = ds_flags == 0
    # One draw per training entry, assigned in positional order via the mask.
    ds_flags[is_train] = bernoulli(p=val_size).rvs(
        size=int(is_train.sum()), random_state=random_state
    )
    return ds_flags

In [None]:
# export


@call_parse
def prep_data_for_modeling(
    db_path: Param(
        help="Path to db with statcast data", type=str
    ) = "./data/raw/statcast_pitches.db",
    years: Param(help="Year of statcast data to process", type=str, nargs="+") = [
        "2019"
    ],
    verbose: Param(
        help="Whether to print out updates on processing", type=bool_arg
    ) = True,
    train_test_split_by: Param(
        help="How to split into train/test sets. One of {'start', 'year'}.", type=str
    ) = "start",
    test_size: Param(help="Percent of data to allocate to test set", type=float) = 0.25,
    output_path: Param(
        help="Path to save processed csv files", type=str
    ) = "./data/processed/",
):
    """Build train/val/test CSVs (plus scaler and category mappers) from the statcast db.

    Steps: query every requested year, find the eligible (game, pitcher, year)
    starts, assign each start to train/val/test, clean + aggregate +
    feature-engineer each start, append the binary ``pulled`` target, then
    scale and encode the train/val frames and write everything to ``output_path``.

    Split modes (``train_test_split_by``):
      * ``"start"`` - each start goes to the test set with probability ``test_size``.
      * ``"year"``  - all starts from the latest year found in the data form the test set.
      * anything else - no start goes to the test set and no test CSV is written.

    In every mode a random share of the training starts is moved to validation
    by ``add_val_flags``.
    """
    # getting all dfs from all years into a single df
    df = pd.concat(
        [query_db(db_path, year, verbose=verbose) for year in years], axis=0
    )

    # identifying eligible game-pitcher-year combos
    games_pitchers_years = processing.get_games_pitchers_years(df, verbose)

    # deciding which outings to allocate to train or test set
    # 2 is test, 1 is val, 0 is train
    if train_test_split_by == "start":
        # pre-determining which starts will go into train/test sets
        ds_flags = bernoulli(p=test_size).rvs(
            len(games_pitchers_years), random_state=742
        ) * 2
        train_year = test_year = years
    elif train_test_split_by == "year":
        # the latest year present in the data becomes the test year
        test_year = [str(np.sort(df["game_date"].str[:4].unique())[-1])]
        # sorted so the output file names do not depend on set iteration order
        train_year = sorted(set(years).difference(test_year))
        ds_flags = [
            2 if str(y) == test_year[0] else 0 for (g, p, y) in games_pitchers_years
        ]
    else:
        # no starts go to test set
        ds_flags = np.zeros(len(games_pitchers_years), dtype=int)
        train_year = list(years)
        test_year = []

    # making 15% of training set be val set
    ds_flags = add_val_flags(ds_flags)

    # processing dfs of data from eligible pitchers
    train_starts = []
    val_starts = []
    test_starts = []
    for i, (ds_flag, (g, p, y)) in enumerate(zip(ds_flags, games_pitchers_years)):
        if verbose and i % 100 == 0:
            print(f"Just processed {i}th start.")

        cleaned_df = processing.preliminary_clean(df, g, p)
        agged_df = processing.aggregate_at_bats(cleaned_df, at_bat_aggs)
        feature_engineered_df = processing.feature_engineering(agged_df)

        if ds_flag == 2:
            test_starts.append(feature_engineered_df[cols])
        elif ds_flag == 1:
            val_starts.append(feature_engineered_df[cols])
        else:
            train_starts.append(feature_engineered_df[cols])

    # adding binary targets (pitcher always removed in last at-bat) and
    # stacking starts into dfs for scaling and categorical encoding
    train = stack_into_df(add_targets(train_starts))
    val = stack_into_df(add_targets(val_starts))

    # standard scaling (mean of 0, sd of 1)
    train, val, scaler = scale(train, val)

    # encoding categoricals for embeddings later
    train, val, mappers = encode_embedding_cols(train, val, cols=["pitcher"])

    # saving train, val, test sets, along with categorical mapper and scaler to output path
    train.to_csv(
        f"{output_path}/train_{'_'.join(train_year)}.csv", index=False)
    val.to_csv(f"{output_path}/val_{'_'.join(train_year)}.csv", index=False)

    # NOTE: the test set is written unscaled and unencoded; the pickled scaler
    # and mappers below are what a consumer needs to transform it.
    # It is skipped entirely when no start was assigned to it.
    if test_starts:
        test = stack_into_df(add_targets(test_starts))
        test.to_csv(f"{output_path}/test_{'_'.join(test_year)}.csv", index=False)

    with open(
        f"{output_path}/mappers_{'_'.join(train_year + test_year)}.pkl", "wb"
    ) as f:
        pickle.dump(mappers, f)

    with open(
        f"{output_path}/scaler_{'_'.join(train_year + test_year)}.pkl", "wb"
    ) as f:
        pickle.dump(scaler, f)

    if verbose:
        print(f"{years} data ready for modeling and saved at {output_path}.")

In [None]:
! rm /tmp/*.db /tmp/*.pkl /tmp/*.csv

In [None]:
# pull the same two-week window (July 7-20) for 2018 and 2019 into a db under /tmp
! query_statcast --start_dt 2018-07-07 --end_dt 2018-07-20 --output_type db --output_path /tmp
! query_statcast --start_dt 2019-07-07 --end_dt 2019-07-20 --output_type db --output_path /tmp

This is a large query, it may take a moment to complete
Completed sub-query from 2018-07-07 to 2018-07-12
Completed sub-query from 2018-07-13 to 2018-07-18
Completed sub-query from 2018-07-19 to 2018-07-20
Table named 'statcast_2019' already exists in db saved at `/tmp/statcast_2019`.


In [None]:
# check that the database file was written to /tmp
! ls /tmp

adobesmuoutpJskbsk           [34mcom.apple.launchd.G3SsF3Afuq[m[m
adobesmuoutpV1Fvr8           [34mcom.google.Keystone[m[m
adobesmuoutpXErfzF           [34mpowerlog[m[m
adobesmuoutprupIXQ           statcast_pitches.db


In [None]:
# Smoke test on the two sampled seasons: with train_test_split_by="year" the
# latest year found in the data becomes the test set, the rest is train/val.
prep_data_for_modeling(db_path="/tmp/statcast_pitches.db",
                       years=["2018", "2019"],
                       train_test_split_by="year",
                       output_path="/tmp")

querying db at /tmp/statcast_pitches.db now.
querying db at /tmp/statcast_pitches.db now.
In this dataset, there are 290 total games.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


There are 339 ineligible starts in the dataset (either 'openers' or an NL team).
There are 241 total eligible game-pitcher combinations in this dataset.
Just processed 0th start.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_df["pitcher_team"] = game_df.apply(lambda row: add_pitcher_team(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher_df["events"] = game_team_pitcher_df["events"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_team_pitcher_df["post_bat_score"] = game_t

Just processed 100th start.
Just processed 200th start.
There are 72 unique pitcher's in this dataset
['2018', '2019'] data ready for modeling and saved at /tmp.


In [None]:
# check that the train/val/test CSVs and the mappers/scaler pickles were written
! ls /tmp

adobesmuoutpJskbsk           [34mpowerlog[m[m
adobesmuoutpV1Fvr8           scaler_2018_2019.pkl
adobesmuoutpXErfzF           statcast_pitches.db
adobesmuoutprupIXQ           test_2019.csv
[34mcom.apple.launchd.G3SsF3Afuq[m[m train_2018.csv
[34mcom.google.Keystone[m[m          val_2018.csv
mappers_2018_2019.pkl


In [None]:
! rm /tmp/*.db /tmp/*.pkl /tmp/*.csv

---
## Sanity Check

To make sure there is no obvious data leakage, I check that a baseline logistic regression classifier, fit and evaluated on the training set, gets a recall below 0.5 for the positive (`pulled`) class.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, recall_score

In [None]:
# training split written by the prep_data_for_modeling call above (output_path="/tmp")
train = pd.read_csv("/tmp/train_2018.csv")

In [None]:
# feature_cols is the module-level list from the export cell; "pulled" is the
# target column named in stack_into_df
X = train[feature_cols]
y = train["pulled"]

In [None]:
# baseline with default hyperparameters; the features were already standardized
# by scale() before the CSV was written
log_reg = LogisticRegression()
log_reg.fit(X, y);

In [None]:
# in-sample report: predictions are made on the same rows the model was fit on
print(classification_report(y, log_reg.predict(X)))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      2358
         1.0       0.84      0.25      0.39       104

    accuracy                           0.97      2462
   macro avg       0.90      0.62      0.68      2462
weighted avg       0.96      0.97      0.96      2462



In [None]:
# in-sample recall of the positive class must stay below 0.5
train_recall = recall_score(y, log_reg.predict(X))
assert train_recall < 0.5