In [None]:
# default_exp data.pipeline

# Data Pipeline

> Command-line script that uses the `data.processing` module to turn raw Statcast data into train/test sets ready for modeling


In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# exporti 
from pull_the_pitcher.data import processing
from pull_the_pitcher.data.acquisition import query_db
from pull_the_pitcher.data.processing import last
from fastscript import *
import pandas as pd
import numpy as np
from typing import List
from sklearn.preprocessing import StandardScaler
import pickle
from scipy.stats import bernoulli

## Processing each observation

In [None]:
# export

# identifier / categorical columns carried alongside the numeric features
embedding_cols = [
    "game_pk",
    "game_type",
    "pitcher",
    "pitcher_team_year",
]

# columns that `stack_into_df` casts to float and `scale` standardizes
feature_cols = [
    "post_bat_score",
    "score_diff",
    "end_inning",
    "inning",
    "postouts",
    "cum_sb_ratio",
    "times_thru_order",
    "post_total_runners",
    "tying_run_on",
    "pitch_total",
    "post_opposite_hand",
    "walk",
    "walk_cumsum",
    "strikeout_cumsum",
    "home_run_cumsum",
    "bases_cumsum",
]

# full column order of each processed start: identifiers first, then features
cols = embedding_cols + feature_cols

In [None]:
# export

# adding targets to each
def add_targets(starts: List):
    """Append a binary target column to every start.

    Each element of `starts` is treated as a 2-D array-like. A new last
    column is added that is 0 everywhere except the final row, which is 1.
    The list is updated in place (each element is replaced by the widened
    array) and the same list object is returned.
    """
    for idx in range(len(starts)):
        outing = starts[idx]
        # one target value per row; only the final row of the start is flagged
        pulled = np.zeros((outing.shape[0], 1))
        pulled[-1, 0] = 1
        starts[idx] = np.concatenate([outing, pulled], axis=1)
    return starts

In [None]:
# export

def stack_into_df(starts: List):
    """Stack all starts row-wise into a single DataFrame.

    The result has the columns `cols + ["pulled"]`; every feature column and
    the `pulled` target are cast to float.
    """
    stacked = np.concatenate(starts, axis=0)
    df = pd.DataFrame(stacked, columns=cols + ["pulled"])

    # cast features and target to float in one pass
    float_cols = feature_cols + ["pulled"]
    return df.astype({name: float for name in float_cols})

In [None]:
# export

def scale(train: pd.DataFrame, test: pd.DataFrame):
    """Standardize `feature_cols` in both frames.

    The scaler is fitted on `train` only and then applied to `train` and
    `test`; both frames have their feature columns overwritten in place.

    Returns `(train, test, scaler)`.
    """
    scaler = StandardScaler().fit(train[feature_cols])
    for frame in (train, test):
        frame[feature_cols] = scaler.transform(frame[feature_cols])
    return train, test, scaler

In [None]:
# export

def encode_col(train, valid, col="pitcher_id"):
    """Replace the raw values of `col` with contiguous integer codes.

    Codes follow the sorted order of the unique values found in `train[col]`.
    `col` is overwritten in place on both frames; values of `valid[col]` that
    never occur in `train` are coded as -1, and those rows are dropped from
    the copy of `valid` that is returned.

    Returns `(train, valid, id2idx)`, where `id2idx` maps raw value -> code.
    """
    # np.unique returns the distinct values already sorted
    known_ids = np.unique(train[col].values)

    n_ids = len(known_ids)
    print(f"There are {n_ids} unique {col}'s in this dataset")

    # raw value -> position in the sorted list of training values
    id2idx = {raw: code for code, raw in enumerate(known_ids)}

    train[col] = train[col].apply(lambda raw: id2idx[raw])
    valid[col] = valid[col].apply(lambda raw: id2idx.get(raw, -1))  # -1 marks values unseen in training

    # keep only rows whose value was seen in the training set
    seen_in_train = valid[col] >= 0
    valid = valid[seen_in_train].copy()
    return train, valid, id2idx


def encode_embedding_cols(train, test, cols=["game_pk", "game_type", "pitcher", "pitcher_team_year"]):
    """Integer-encode the requested categorical columns of `train` and `test`.

    A single all-zero row is prepended to both frames first; per the original
    author's note it serves as the "null"/"unknown" entry for zero-padded rows.
    `game_type` and `pitcher_team_year` are cast to str, then every column in
    `cols` is encoded with `encode_col` (codes are fitted on `train`).

    Returns `(train, test, mappers)`, where `mappers[col]` is the
    raw value -> code dict for that column.
    """
    placeholder = pd.DataFrame(np.zeros((1, train.shape[1])), columns=train.columns)
    train = pd.concat([placeholder, train], axis=0)
    test = pd.concat([placeholder, test], axis=0)

    # these two are always handled as strings before encoding
    for str_col in ["game_type", "pitcher_team_year"]:
        train[str_col] = train[str_col].astype(str)
        test[str_col] = test[str_col].astype(str)

    mappers = dict()
    for name in cols:
        # each pass re-binds train/test, so later columns see the filtered test frame
        train, test, mappers[name] = encode_col(train, test, col=name)

    return train, test, mappers

In [None]:
# export
# column -> aggregation, handed to `processing.aggregate_at_bats` when
# collapsing a cleaned start; values are either the string "max" or the
# `last` helper imported from `pull_the_pitcher.data.processing`
at_bat_aggs = {
    "balls": "max",
    "strikes": "max",
    "pitch_number": "max",
    "post_bat_score": last,
    "post_fld_score": last,
    "events": "max",
    "postouts": last,
    "post_on_1b": last,
    "post_on_2b": last,
    "post_on_3b": last,
    "game_type": last,
    "home_team": last,
    "away_team": last,
    "inning": last,
    "inning_topbot": last,
    "post_opposite_hand": last,
    "game_year": last,
}

In [None]:
# export


@call_parse
def prep_data_for_modeling(
    db_path: Param(
        help="Path to db with statcast data", type=str
    ) = "./data/raw/statcast_pitches.db",
    years: Param(help="Year of statcast data to process", type=str, nargs="+") = [
        "2019"
    ],
    verbose: Param(
        help="Whether to print out updates on processing", type=bool_arg
    ) = True,
    train_test_split_by: Param(
        help="How to split into train/test sets. One of {'start', 'year'}.", type=str
    ) = "start",
    test_size: Param(help="Percent of data to allocate to test set", type=float) = 0.25,
    output_path: Param(
        help="Path to save processed csv files", type=str
    ) = "./data/processed/",
):
    """Query statcast data, build per-start features, and save train/test sets.

    Steps: query every requested year from the db, process each eligible
    (game, pitcher, year) start, append the `pulled` target, stack into
    train/test frames, standard-scale the features, integer-encode `pitcher`,
    and write `train_*.csv`, `test_*.csv` and `mappers_*.pkl` to `output_path`.

    Splitting:
      * "start": each start is assigned to the test set with probability
        `test_size` (fixed random_state, so the split is repeatable).
      * "year": every start from the most recent year found in the data goes
        to the test set; all other requested years form the training set.

    Raises:
        ValueError: if `train_test_split_by` is not "start" or "year", or if a
            "year" split would leave no year for training.
    """
    # getting all dfs from all years into a single df
    dfs = []
    for year in years:
        df_year = query_db(db_path, year, verbose=verbose)
        dfs.append(df_year)
    df = pd.concat(dfs, axis=0)

    # identifying eligible game-pitcher-year combos
    games_pitchers_years = processing.get_games_pitchers_years(df, verbose)

    # deciding which outings to allocate to train or test set
    if train_test_split_by == "start":
        # pre-determining which starts will go into train/test sets
        test_flags = bernoulli(p=test_size).rvs(
            len(games_pitchers_years), random_state=742
        )
        train_year = test_year = years
    elif train_test_split_by == "year":
        # the most recent year in the data is held out; it is wrapped in a
        # list explicitly because list("2019") would split it into characters
        latest_year = str(np.sort(df["game_date"].str[:4].unique())[-1])
        test_year = [latest_year]
        # sorted so the output file name does not depend on set ordering
        train_year = sorted(set(years).difference(test_year))
        if not train_year:
            raise ValueError(
                "train_test_split_by='year' needs at least two years of data: "
                f"{latest_year} would be held out for testing, leaving nothing to train on."
            )
        test_flags = [
            1 if str(y) == latest_year else 0 for (_, _, y) in games_pitchers_years
        ]
    else:
        # an empty test set cannot be stacked, scaled or saved by the steps
        # below, so unsupported values are rejected up front
        raise ValueError(
            f"train_test_split_by must be one of {{'start', 'year'}}, got {train_test_split_by!r}."
        )

    # processing dfs of data from eligible pitchers
    train_starts = []
    test_starts = []
    for i, (test_flag, (g, p, y)) in enumerate(zip(test_flags, games_pitchers_years)):
        if verbose and i % 100 == 0:
            print(f"Just processed {i}th start.")

        cleaned_df = processing.preliminary_clean(df, g, p)
        agged_df = processing.aggregate_at_bats(cleaned_df, at_bat_aggs)
        feature_engineered_df = processing.feature_engineering(agged_df)

        # making sure starting pitcher is in AL -> this _should_ no longer be necessary
        if feature_engineered_df.shape[0] > 0:
            if test_flag:
                test_starts.append(feature_engineered_df[cols])
            else:
                train_starts.append(feature_engineered_df[cols])
        else:
            print("empty df")

    # adding binary targets (pitcher always removed in last at-bat)
    train_starts = add_targets(train_starts)
    test_starts = add_targets(test_starts)

    # stacking starts into dfs for scaling and categorical encoding
    train = stack_into_df(train_starts)
    test = stack_into_df(test_starts)

    # standard scaling (mean of 0, sd of 1)
    train, test, scaler = scale(train, test)

    # encoding categoricals for embeddings later
    train, test, mappers = encode_embedding_cols(train, test, cols=["pitcher"])

    # saving train, test sets, along with categorical mapper to output path
    train.to_csv(f"{output_path}/train_{'_'.join(train_year)}.csv", index=False)
    test.to_csv(f"{output_path}/test_{'_'.join(test_year)}.csv", index=False)
    # for the "start" split train_year and test_year are the same list, so the
    # years appear twice in the mapper file name (e.g. mappers_2019_2019.pkl)
    with open(
        f"{output_path}/mappers_{'_'.join(train_year + test_year)}.pkl", "wb"
    ) as f:
        pickle.dump(mappers, f)

    if verbose:
        print(f"{years} data ready for modeling and saved at {output_path}.")


In [None]:
! query_statcast --start_dt 2019-07-07 --end_dt 2019-07-20 --output_type db --output_path /tmp

This is a large query, it may take a moment to complete
Completed sub-query from 2019-07-07 to 2019-07-12
Completed sub-query from 2019-07-13 to 2019-07-18
Completed sub-query from 2019-07-19 to 2019-07-20


In [None]:
! ls /tmp

[34mcom.apple.launchd.82pLsk9OX7[m[m [34mrecsim[m[m
[34mcom.google.Keystone[m[m          statcast_pitches.db
[34mpowerlog[m[m                     t.py


In [None]:
# Example run: process the 2019 data with the per-start split and write the
# train/test CSVs and the mapper pickle to /tmp.
prep_data_for_modeling(db_path="../data/raw/statcast_pitches.db",
                       years=["2019"],
                       train_test_split_by="start",
                       output_path="/tmp")

querying db at ../data/raw/statcast_pitches.db now.
In this dataset, there are 2429 total games.
There are 2804 ineligible starts in the dataset (either 'openers' or an NL team).
There are 2054 total eligible game-pitcher combinations in this dataset.
Just processed 0th start.
Just processed 100th start.
Just processed 200th start.
Just processed 300th start.
Just processed 400th start.
Just processed 500th start.
Just processed 600th start.
Just processed 700th start.
Just processed 800th start.
Just processed 900th start.
Just processed 1000th start.
Just processed 1100th start.
Just processed 1200th start.
Just processed 1300th start.
Just processed 1400th start.
Just processed 1500th start.
Just processed 1600th start.
Just processed 1700th start.
Just processed 1800th start.
Just processed 1900th start.
Just processed 2000th start.
There are 229 unique pitcher's in this dataset
['2019'] data ready for modeling and saved at /tmp.


In [None]:
! ls /tmp

[34mcom.apple.launchd.82pLsk9OX7[m[m statcast_pitches.db
[34mcom.google.Keystone[m[m          t.py
mappers_2019_2019.pkl        test_2019.csv
[34mpowerlog[m[m                     train_2019.csv
[34mrecsim[m[m


---
## testing

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# load the training set written to /tmp by the pipeline run above
train = pd.read_csv("/tmp/train_2019.csv")

In [None]:
# Drop the suspect feature by rebinding to a filtered copy rather than calling
# `.remove()`: the cell can be re-run safely (a second `.remove()` raises
# ValueError) and the original list object is not edited in place.
# NOTE(review): the coefficients recorded further down also omit "end_inning";
# add it to `excluded_features` if that is the run to reproduce.
excluded_features = {"postouts"}
feature_cols = [col for col in feature_cols if col not in excluded_features]

In [None]:
# design matrix: the remaining feature columns; target: the binary `pulled` flag
X = train[feature_cols]
y = train["pulled"]

In [None]:
# baseline: logistic regression with scikit-learn's default settings
log_reg = LogisticRegression()
log_reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# in-sample report: predictions are made on the same rows the model was fit on
print(classification_report(y, log_reg.predict(X)))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     33873
         1.0       0.66      0.21      0.32      1529

    accuracy                           0.96     35402
   macro avg       0.81      0.60      0.65     35402
weighted avg       0.95      0.96      0.95     35402



In [None]:
# pair each feature name with its fitted coefficient; this relies on
# `feature_cols` still being the exact column list (and order) used to build X
for col, coef in zip(feature_cols, log_reg.coef_[0]):
    print(f"{col}: {round(coef, 4)}")

post_bat_score: -0.2191
score_diff: -0.0956
inning: -1.6752
cum_sb_ratio: -0.1784
times_thru_order: 2.6153
post_total_runners: 0.3159
tying_run_on: -0.0041
pitch_total: 2.2531
post_opposite_hand: -0.0808
walk: -0.0201
walk_cumsum: -0.0988
strikeout_cumsum: -0.0804
home_run_cumsum: 0.3591
bases_cumsum: -0.3573


## problem appears to be with `postouts`/`end_inning`!

TODO: check how many co-occurrences there are of `postouts==3` and `pulled==1`.