## Imports

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import patsy
import requests
from typing import List, Tuple, Dict
import subprocess
from joblib import parallel_config, Parallel, delayed
from functools import partial
import time

from mlb_predictions.utilities.data_utils import (
    Schedule,
    Game,
    Player,
    batting_stats_range,
    get_hitting_soup,
    sanitize_date_range,
    Player,
    HittingStats,
    PitchingStats,
)

from sklearn.model_selection import train_test_split

## Global Vars

In [2]:
N_TRAIN = 30
N_VALID = 30
N_TEST = 30

## Training/Validation Data Pull

In [3]:
games_full = Schedule(year=2021).get_games(
    game_type="Regular Season", game_status="Final"
) + Schedule(year=2022).get_games(game_type="Regular Season", game_status="Final")

In [4]:
games_train, games_val = train_test_split(
    games_full, train_size=N_TRAIN, test_size=N_TEST, random_state=5
)

In [5]:
def get_lag_date(date: str, lag_days: int) -> str:
    return str((pd.to_datetime(date) + pd.Timedelta(days=lag_days)).date())


game = games_train[0]

stat_end_date = get_lag_date(game.game_date, -1)
stat_start_date = get_lag_date(game.game_date, -1 - 365 * 5)  # 5 years back

In [6]:
ROSTER_TYPE = "active"


def get_game_features(
    game: Game,
    hitting_features: List,
    pitching_features: List,
    fielding_features: List,
    stat_end_date: str,
    stat_start_date: str,
    roster_type="active",
    verbose: int = 0,
):
    """
    Get dictionary of features for a single game
    {
        "home": {
            "hitting": np.array,
            "starting_pitching": np.array,
            "relief_pitching": np.array,
            "fielding": np.array
        },
        "away": ...
    }
    """

    if verbose == 1:
        print(
            f"getting feats for {game.game_date} {game.home_name} v {game.away_name}..."
        )
    # rosters
    if verbose == 2:
        print("fetching home pitching...")
    home_roster_pitching_stats = game.get_home_team().get_roster_stats(
        date=game.game_date,
        stat_type="pitching",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching away pitching...")
    away_roster_pitching_stats = game.get_away_team().get_roster_stats(
        date=game.game_date,
        stat_type="pitching",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching home relief pitching...")
    home_roster_relief_pitching_stats = game.get_home_team().get_roster_stats(
        date=game.game_date,
        stat_type="relief_pitching",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching away relief pitching...")
    away_roster_relief_pitching_stats = game.get_away_team().get_roster_stats(
        date=game.game_date,
        stat_type="relief_pitching",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching home hitting...")
    home_roster_hitting_stats = game.get_home_team().get_roster_stats(
        date=game.game_date,
        stat_type="hitting",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching away hitting...")
    away_roster_hitting_stats = game.get_away_team().get_roster_stats(
        date=game.game_date,
        stat_type="hitting",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching home fielding...")
    home_roster_fielding_stats = game.get_home_team().get_roster_stats(
        date=game.game_date,
        stat_type="fielding",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )
    if verbose == 2:
        print("fetching away fielding...")
    away_roster_fielding_stats = game.get_away_team().get_roster_stats(
        date=game.game_date,
        stat_type="fielding",
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        roster_type=roster_type,
    )

    # lineups
    if verbose == 2:
        print("getting lineups...")
    lineups = game.get_lineup("home/away")
    home_lineup = [l[0] for l in lineups["home"]]
    away_lineup = [l[0] for l in lineups["away"]]

    home_lineup_hitting_stats = home_roster_hitting_stats.loc[
        home_roster_hitting_stats["key_mlbam"].isin([p.id for p in home_lineup]),
        hitting_features,
    ]

    away_lineup_hitting_stats = away_roster_hitting_stats.loc[
        away_roster_hitting_stats["key_mlbam"].isin([p.id for p in away_lineup]),
        hitting_features,
    ]

    home_hitting_feats = home_lineup_hitting_stats.median().values
    away_hitting_feats = away_lineup_hitting_stats.median().values

    # probable pitchers
    if verbose == 2:
        print("getting probable pitchers...")
    probable_pitcher = game.get_probable_pitcher("home/away")
    home_probable_pitcher = probable_pitcher["home"]
    away_probable_pitcher = probable_pitcher["away"]

    home_probable_pitcher_stats = home_roster_pitching_stats.loc[
        home_roster_pitching_stats["key_mlbam"] == home_probable_pitcher.id,
        pitching_features,
    ]
    away_probable_pitcher_stats = away_roster_pitching_stats.loc[
        away_roster_pitching_stats["key_mlbam"] == away_probable_pitcher.id,
        pitching_features,
    ]

    home_probable_pitcher_feats = home_probable_pitcher_stats.values
    away_probable_pitcher_feats = away_probable_pitcher_stats.values

    # Relievers

    home_reliever_pitching_stats = home_roster_relief_pitching_stats.loc[
        home_roster_relief_pitching_stats["position"].isin(["P", "TWP"]),
        pitching_features,
    ]

    away_reliever_pitching_stats = away_roster_relief_pitching_stats.loc[
        away_roster_relief_pitching_stats["position"].isin(["P", "TWP"]),
        pitching_features,
    ]

    home_reliever_pitching_feats = home_reliever_pitching_stats.median().values
    away_reliever_pitching_feats = away_reliever_pitching_stats.median().values

    # fielders

    home_lineup_fielding_stats = pd.DataFrame(
        [
            row
            for i, row in home_roster_fielding_stats.iterrows()
            if (  # only get fielding stats for those in the lineup
                (
                    (row["key_mlbam"], row["Pos"])
                    in [(l[0].id, l[1]) for l in lineups["home"]]
                )
                and row["Pos"] not in ("1B", "P", "DH")  # unimportant defense
            )
        ]
    )[fielding_features]

    away_lineup_fielding_stats = pd.DataFrame(
        [
            row
            for i, row in away_roster_fielding_stats.iterrows()
            if (  # only get fielding stats for those in the lineup
                (
                    (row["key_mlbam"], row["Pos"])
                    in [(l[0].id, l[1]) for l in lineups["away"]]
                )
                and row["Pos"] not in ("1B", "P", "DH")  # unimportant defense
            )
        ]
    )[fielding_features]

    home_fielding_feats = home_lineup_fielding_stats.median().values
    away_fielding_feats = away_lineup_fielding_stats.median().values

    return {
        "home": {
            "hitting": home_hitting_feats,
            "starting_pitching": home_probable_pitcher_feats,
            "relief_pitching": home_reliever_pitching_feats,
            "fielding": home_fielding_feats,
        },
        "away": {
            "hitting": away_hitting_feats,
            "starting_pitching": away_probable_pitcher_feats,
            "relief_pitching": away_reliever_pitching_feats,
            "fielding": away_fielding_feats,
        },
    }

In [7]:
subprocess.run("caffeinate -d -i -w $$ &", shell=True)  # don't go to sleep!

start = time.time()

hitting_features = ["BB%", "K%", "BB/K", "BABIP", "wOBA", "wRC+"]
pitching_features = [
    "Throws",
    "K/9",
    "BB/9",
    "K/BB",
    "HR/9",
    "K%",
    "BB%",
    "K-BB%",
    "BA",
    "WHIP",
    "BABIP",
    "LOB%",
    "ERA-",
    "FIP-",
    "FIP",
    "Soft%",
    "Med%",
    "Hard%",
]
fielding_features = ["FP", "UZR/150"]

train_jobs = [
    delayed(get_game_features)(
        game=game_train,
        hitting_features=hitting_features,
        pitching_features=pitching_features,
        fielding_features=fielding_features,
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        verbose=1,
    )
    for game_train in games_train
]
val_jobs = [
    delayed(get_game_features)(
        game=game_val,
        hitting_features=hitting_features,
        pitching_features=pitching_features,
        fielding_features=fielding_features,
        stat_start_date=stat_start_date,
        stat_end_date=stat_end_date,
        verbose=1,
    )
    for game_val in games_val
]

print("GETTING TRAINING DATA")
with parallel_config(backend="threading", n_jobs=10, verbose=99):
    train_feats = Parallel()(train_jobs)

print("GETTING VALIDATION DATA")
with parallel_config(backend="threading", n_jobs=10, verbose=99):
    val_feats = Parallel()(val_jobs)


end = time.time()
print(f"{end - start} seconds")

GETTING TRAINING DATA
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
getting feats for 2021-04-18 Milwaukee Brewers v Pittsburgh Pirates...
getting feats for 2021-08-10 Minnesota Twins v Chicago White Sox...
getting feats for 2022-06-01 Boston Red Sox v Cincinnati Reds...
getting feats for 2021-08-19 Los Angeles Dodgers v New York Mets...
getting feats for 2021-07-29 New York Mets v Atlanta Braves...
getting feats for 2022-08-24 Tampa Bay Rays v Los Angeles Angels...
getting feats for 2021-06-24 Toronto Blue Jays v Baltimore Orioles...
getting feats for 2022-09-15 Cleveland Guardians v Chicago White Sox...
getting feats for 2022-06-17 New York Mets v Miami Marlins...
getting feats for 2022-08-04 Texas Rangers v Chicago White Sox...
getting feats for 2021-05-14 Detroit Tigers v Chicago Cubs...[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:  2.3min

getting feats for 2021-08-28 Atlanta Braves v San Francisco Giants...[Parallel(n_jobs=10)]: Done   

In [23]:
X_train_home_hitting = np.stack([d["home"]["hitting"] for d in train_feats], axis=0)
X_train_home_starting_pitching = np.stack(
    [d["home"]["starting_pitching"] for d in train_feats], axis=0
)
X_train_home_relief_pitching = np.stack(
    [d["home"]["relief_pitching"] for d in train_feats], axis=0
)
X_train_home_fielding = np.stack([d["home"]["fielding"] for d in train_feats], axis=0)

ValueError: all input arrays must have the same shape