In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlb-player-salaries/mlbSalaries.csv
/kaggle/input/d/drehero/modeltest/model_lag1_300bs_v1/saved_model.pb
/kaggle/input/d/drehero/modeltest/model_lag1_300bs_v1/variables/variables.index
/kaggle/input/d/drehero/modeltest/model_lag1_300bs_v1/variables/variables.data-00000-of-00001
/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/teams.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/seasons.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/example_test.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/train_updated.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/awards.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/mlb/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/mlb-player-digital-en

In [2]:
from mlb_preprocessing import *
from mlb_config import *

import gc
import pathlib

from tensorflow import keras

In [3]:
# Root of the read-only Kaggle input mount; every dataset lives beneath it.
BASE_DIR = pathlib.Path("/kaggle/input")
# Competition data: train/test CSVs and the static lookup tables.
PATH_TO_MLB_DATA = BASE_DIR.joinpath("mlb-player-digital-engagement-forecasting")
# Directory containing the saved Keras model folder(s) loaded for inference.
PATH_TO_MODELS = pathlib.Path("/kaggle/input/d/drehero/modeltest")

In [4]:
# Number of models whose predictions would be averaged. The ensemble code in the
# prediction cells below is currently commented out, so a single model is used.
ENSEMBLE_SIZE = 1

In [5]:
def get_lagged_targets_test(sample_submission, historic_targets, player_target_stats, lag):
    """Build the ``lag``-day lagged target features for one test batch.

    Args:
        sample_submission: Frame with a ``date_playerId`` column holding
            ``"<date>_<playerId>"`` strings. Its index is expected to be named
            ``date`` and to contain ``%Y%m%d`` integers; it becomes the
            ``dailyDataDate`` column of the result.
        historic_targets: Frame with ``dailyDataDate`` (``%Y%m%d`` integers),
            ``playerId`` and ``target1``..``target4``.
        player_target_stats: Frame with ``playerId`` and the median columns
            ``target1_50%``..``target4_50%``; the medians are the fallback for
            rows without a matching historic target.
        lag: Number of days by which the historic targets are shifted forward.

    Returns:
        DataFrame with ``dailyDataDate``, ``playerId`` and
        ``target1Lag{lag}``..``target4Lag{lag}``, left-joined onto the
        submission rows.
    """
    date_format = "%Y%m%d"
    target_names = [f"target{i}" for i in range(1, 5)]
    join_keys = ["dailyDataDate", "playerId"]

    # One (dailyDataDate, playerId) key row per submission row.
    keys = pd.DataFrame(sample_submission.loc[:, "date_playerId"])
    keys["playerId"] = keys["date_playerId"].apply(lambda key: key.split("_")[1]).astype(int)
    keys = (
        keys.reset_index()
        .rename(columns={"date": "dailyDataDate"})
        .drop(["date_playerId"], axis=1)
    )

    # Oldest historic date that can still line up with a submission date once shifted.
    first_date = pd.to_datetime(keys["dailyDataDate"].min(), format=date_format)
    earliest_date = int((first_date - pd.Timedelta(days=lag)).strftime(date_format))

    # Move the recent history `lag` days forward so it joins on the prediction date.
    recent = historic_targets[historic_targets["dailyDataDate"] >= earliest_date].copy()
    shifted_dates = pd.to_datetime(recent["dailyDataDate"], format=date_format) + pd.Timedelta(days=lag)
    recent["dailyDataDate"] = shifted_dates.dt.strftime(date_format).astype(int)

    merged = pd.merge(keys, recent, on=join_keys, how="left")

    # Per-player medians serve as the default wherever no historic target matched.
    # TODO: update after changing to yearly stats
    median_columns = {f"{name}_50%": name for name in target_names}
    medians = player_target_stats[["playerId", *median_columns]].rename(columns=median_columns)
    fallback = merged.loc[:, join_keys].merge(medians, on=["playerId"], how="left")

    # Keep every present value; take the fallback only in the cells that are missing.
    missing = merged.isna()
    merged = merged.where(~missing, other=fallback.where(missing))

    return merged.rename(columns={name: f"{name}Lag{lag}" for name in target_names})

In [6]:
# Read the training data once to seed (a) the fallback snapshots of rosters and
# Twitter followers, used when a test batch does not contain them, and (b) the
# target history and per-player target statistics.
train = pd.read_csv(PATH_TO_MLB_DATA / "train.csv")

# For each nested column, unpack only the last row in which that column is present.
LATEST_ROSTERS, LATEST_PLAYER_TWITTER_FOLLOWERS, LATEST_TEAM_TWITTER_FOLLOWERS = (
    unpack_data(pd.DataFrame(train[train[column].notna()].iloc[-1, :]).T, [column])[column]
    for column in ("rosters", "playerTwitterFollowers", "teamTwitterFollowers")
)

next_day_player_engagement = unpack_data(train, ["nextDayPlayerEngagement"])["nextDayPlayerEngagement"]
player_target_stats = get_player_target_stats(next_day_player_engagement)
historic_targets = next_day_player_engagement.loc[
    :, ["dailyDataDate", "playerId", "target1", "target2", "target3", "target4"]
]

# Drop the large intermediates; only the frames derived above are used later.
del train, next_day_player_engagement
gc.collect()

0

In [7]:
# Supplementary datasets: team social-media data and player salaries.
mlb_social_media = pd.read_csv(BASE_DIR.joinpath("mlb-social-media-dataset", "teams.csv"))
player_salaries = pd.read_csv(BASE_DIR.joinpath("mlb-player-salaries", "mlbSalaries.csv"))

In [8]:
def _placeholder_to_merge(df, columns, keys):
    """Return an all-NaN stand-in for a missing data source, one row per distinct key.

    The key columns are de-duplicated before use: several rows of ``df`` can
    share the same key (every player of a team shares ``dailyDataDate`` and
    ``teamId``, and NaN keys match each other in a merge). Copying the keys row
    for row would turn the subsequent left merge into a many-to-many join and
    multiply the rows of ``df``.
    """
    placeholder = pd.DataFrame(columns=columns)
    placeholder[keys] = df[keys].drop_duplicates().reset_index(drop=True)
    return placeholder


def prepare_test(unpacked_data, sample_submission, seasons, teams, players, player_target_stats, mlb_social_media, player_salaries, historic_targets):
    """Assemble the model input features for one batch of the test iterator.

    Starting from the rows of ``sample_submission``, the individual data
    sources are prepared with the ``prepare_*`` helpers and left-merged one
    after another. A source that is empty in this batch is replaced by an
    all-NaN placeholder so the resulting column set stays the same; remaining
    gaps are filled from ``COLUMN_DEFAULTS``.

    Args:
        unpacked_data: Mapping from table name to DataFrame. The keys read here
            are ``rosters``, ``games``, ``playerBoxScores``, ``transactions``,
            ``awards``, ``events``, ``standings``, ``playerTwitterFollowers``
            and ``teamTwitterFollowers``.
        sample_submission: Frame with a ``date_playerId`` column
            (``"<date>_<playerId>"``); its index is used as ``dailyDataDate``.
        seasons, teams, players: Static lookup tables passed to the
            corresponding ``prepare_*`` / ``get_dates_with_info`` helpers.
        player_target_stats: Per-player target statistics, used as fallback
            values for the lagged targets.
        mlb_social_media: Raw team social-media table.
        player_salaries: Raw player salary table.
        historic_targets: Target history from which the lag features are built.

    Returns:
        DataFrame restricted to ``FEATURE_NAMES``, with the columns listed in
        ``CATEGORICAL_FEATURE_NAMES`` cast to ``str``.

    Side effects:
        Rebinds the module-level ``LATEST_ROSTERS``,
        ``LATEST_PLAYER_TWITTER_FOLLOWERS`` and ``LATEST_TEAM_TWITTER_FOLLOWERS``
        whenever the batch carries newer data than the stored snapshot.
    """
    global LATEST_PLAYER_TWITTER_FOLLOWERS
    global LATEST_TEAM_TWITTER_FOLLOWERS
    global LATEST_ROSTERS

    player_keys = ["dailyDataDate", "playerId"]
    team_keys = ["dailyDataDate", "teamId"]

    df = sample_submission.copy()
    # date
    df["playerId"] = df["date_playerId"].apply(lambda x: int(x.split("_")[1]))
    df["dailyDataDate"] = sample_submission.index
    dates_with_info = get_dates_with_info(df, seasons)
    df = df.merge(dates_with_info, on="dailyDataDate", how="left")
    # players
    players_to_merge = prepare_players(players)
    df = df.merge(players_to_merge, on="playerId", how="left")
    # rosters: fall back to the last known rosters, re-dated to this batch
    rosters_to_merge = prepare_rosters(unpacked_data["rosters"])
    if rosters_to_merge.empty:
        rosters_to_merge = prepare_rosters(LATEST_ROSTERS)
        rosters_to_merge["dailyDataDate"] = df["dailyDataDate"].max()
    elif LATEST_ROSTERS["dailyDataDate"].max() < unpacked_data["rosters"]["dailyDataDate"].max():
        LATEST_ROSTERS = unpacked_data["rosters"]
    df = df.merge(rosters_to_merge, on=player_keys, how="left")
    # teams
    teams_to_merge = prepare_teams(teams)
    df = df.merge(teams_to_merge, on=["teamId"], how="left")
    # games
    if unpacked_data["games"].empty:
        games_to_merge = _placeholder_to_merge(df, GAMES_TO_MERGE_COLUMNS, team_keys)
    else:
        games_to_merge = prepare_games(unpacked_data["games"])
    df = df.merge(games_to_merge, on=team_keys, how="left")
    # team box scores
    #if unpacked_data["teamBoxScores"].empty:
    #    team_box_scores_to_merge = pd.DataFrame(columns=TEAM_BOX_SCORES_TO_MERGE_COLUMNS)
    #    team_box_scores_to_merge[["dailyDataDate", "teamId"]] = df[["dailyDataDate", "teamId"]]
    #else:
    #    team_box_scores_to_merge = prepare_team_box_scores(unpacked_data["teamBoxScores"])
    #df = df.merge(team_box_scores_to_merge, on=["dailyDataDate", "teamId"], how="left")
    # player box scores
    if unpacked_data["playerBoxScores"].empty:
        player_box_scores_to_merge = _placeholder_to_merge(df, PLAYER_BOX_SCORES_TO_MERGE_COLUMNS, player_keys)
    else:
        player_box_scores_to_merge = prepare_player_box_scores(unpacked_data["playerBoxScores"])
    df = df.merge(player_box_scores_to_merge, on=player_keys, how="left")
    # transactions
    if unpacked_data["transactions"].empty:
        transactions_to_merge = _placeholder_to_merge(df, TRANSACTIONS_TO_MERGE_COLUMNS, player_keys)
    else:
        transactions_to_merge = prepare_transactions(unpacked_data["transactions"])
    df = df.merge(transactions_to_merge, on=player_keys, how="left")
    # awards
    if unpacked_data["awards"].empty:
        awards_to_merge = _placeholder_to_merge(df, AWARDS_TO_MERGE_COLUMNS, player_keys)
    else:
        awards_to_merge = prepare_awards(unpacked_data["awards"])
    df = df.merge(awards_to_merge, on=player_keys, how="left")
    # events (prepare_events also needs the games table, so both must be present)
    if unpacked_data["events"].empty or unpacked_data["games"].empty:
        events_to_merge = _placeholder_to_merge(df, EVENTS_TO_MERGE_COLUMNS, player_keys)
    else:
        events_to_merge = prepare_events(unpacked_data["events"], unpacked_data["games"])
    df = df.merge(events_to_merge, on=player_keys, how="left")
    # standings
    if unpacked_data["standings"].empty:
        standings_to_merge = _placeholder_to_merge(df, STANDINGS_TO_MERGE_COLUMNS, team_keys)
    else:
        standings_to_merge = prepare_standings(unpacked_data["standings"])
    df = df.merge(standings_to_merge, on=team_keys, how="left")
    # twitter: fall back to the last known follower counts, re-dated to this batch
    # NOTE(review): relies on df having a "date" column at this point, presumably
    # added via get_dates_with_info - confirm.
    player_twitter_followers = unpacked_data["playerTwitterFollowers"].copy()
    if player_twitter_followers.empty:
        player_twitter_followers = LATEST_PLAYER_TWITTER_FOLLOWERS.copy()
        player_twitter_followers["date"] = df["date"].max()
    elif LATEST_PLAYER_TWITTER_FOLLOWERS["date"].max() < player_twitter_followers["date"].max():
        LATEST_PLAYER_TWITTER_FOLLOWERS = player_twitter_followers.copy()
    df = pd.merge_asof(df, player_twitter_followers.loc[:, ('date', 'playerId', 'numberOfFollowers')].rename(
        columns={"numberOfFollowers": "numberOfFollowersPlayer"}), on="date", by="playerId", direction="backward")
    team_twitter_followers = unpacked_data["teamTwitterFollowers"].copy()
    if team_twitter_followers.empty:
        team_twitter_followers = LATEST_TEAM_TWITTER_FOLLOWERS.copy()
        team_twitter_followers["date"] = df["date"].max()
    elif LATEST_TEAM_TWITTER_FOLLOWERS["date"].max() < team_twitter_followers["date"].max():
        LATEST_TEAM_TWITTER_FOLLOWERS = team_twitter_followers.copy()
    # merge_asof needs matching dtypes for the `by` column, hence the cast to str
    team_twitter_followers["teamId"] = team_twitter_followers["teamId"].astype(str)
    df = pd.merge_asof(df, team_twitter_followers.loc[:, ("date", "teamId", "numberOfFollowers")].rename(
        columns={"numberOfFollowers": "numberOfFollowersTeam"}), on="date", by="teamId", direction="backward")
    # player target stats
    #df = df.merge(player_target_stats, on="playerId", how="left")
    # mlb social media
    mlb_social_media = prepare_mlb_social_media(mlb_social_media)
    df = df.merge(mlb_social_media, on="teamId", how="left")
    # player salaries
    player_salaries_to_merge = prepare_player_salaries(player_salaries, players)
    df = df.merge(player_salaries_to_merge, on=["playerId", "year"], how="left")
    # lagged targets
    for lag in range(1, N_LAGS+1):
        lagged_targets = get_lagged_targets_test(sample_submission, historic_targets, player_target_stats, lag)
        df = df.merge(lagged_targets, on=player_keys, how="left")

    df.fillna(COLUMN_DEFAULTS, inplace=True)
    for feature_name in df.columns:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            df[feature_name] = df[feature_name].astype(str)
    return df[FEATURE_NAMES]

In [9]:
# Static lookup tables shipped with the competition data.
seasons, players, teams = (
    pd.read_csv(PATH_TO_MLB_DATA / file_name)
    for file_name in ("seasons.csv", "players.csv", "teams.csv")
)

In [10]:
def update_historic_targets(historic_targets, submission):
    """Fold a batch of predictions into the target history used for lag features.

    Args:
        historic_targets: Frame with ``dailyDataDate``, ``playerId`` and the
            columns named in ``TARGET_FEATURE_NAMES``.
        submission: Prediction frame whose index holds the date and whose
            ``date_playerId`` column holds ``"<date>_<playerId>"`` strings,
            alongside the ``TARGET_FEATURE_NAMES`` columns. It is not modified.

    Returns:
        A new DataFrame containing the history plus the submitted rows. When a
        ``(dailyDataDate, playerId)`` pair occurs in both, the submitted row
        wins.
    """
    predicted_targets = submission.copy()
    predicted_targets["dailyDataDate"] = predicted_targets.index
    predicted_targets["playerId"] = predicted_targets["date_playerId"].apply(lambda x: x.split("_")[1]).astype(int)
    new_rows = predicted_targets[["dailyDataDate", "playerId"] + TARGET_FEATURE_NAMES]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent and keeps the same row order.
    combined = pd.concat([historic_targets, new_rows])
    # keep="last" lets the freshly submitted rows override older entries.
    return combined.drop_duplicates(subset=["dailyDataDate", "playerId"], keep="last")

In [11]:
# Disabled local dry run against the example test files. The code is wrapped in a
# triple-quoted string, so none of it executes and the cell only echoes the text.
# To enable it, delete the opening triple quote: the closing one sits behind a `#`,
# so it then becomes an ordinary comment and no other change is needed.
"""
example_test = pd.read_csv(PATH_TO_MLB_DATA / "example_test.csv")
example_sample_submission = pd.read_csv(PATH_TO_MLB_DATA / "example_sample_submission.csv")

example_test.set_index("date", inplace=True)
example_sample_submission.set_index("date", inplace=True)

unpacked_test = unpack_data(example_test, TEST_FEATURE_NAMES, is_test=True)
test_df = prepare_test(unpacked_test, example_sample_submission, seasons, teams, players,
                       player_target_stats, mlb_social_media, player_salaries, historic_targets)
X = get_dataset_from_df(test_df, batch_size=BATCH_SIZE, is_test=True)

model = keras.models.load_model(PATH_TO_MODELS / "model_lag1_300bs_v1")
y_pred = model.predict(X)
#for i in range(2, ENSEMBLE_SIZE+1):
#    model = keras.models.load_model(PATH_TO_MODELS / f"model_{i}")
#    y_pred += model.predict(X)
    
example_sample_submission[TARGET_FEATURE_NAMES] = y_pred #/ ENSEMBLE_SIZE
example_sample_submission
#"""

'\nexample_test = pd.read_csv(PATH_TO_MLB_DATA / "example_test.csv")\nexample_sample_submission = pd.read_csv(PATH_TO_MLB_DATA / "example_sample_submission.csv")\n\nexample_test.set_index("date", inplace=True)\nexample_sample_submission.set_index("date", inplace=True)\n\nunpacked_test = unpack_data(example_test, TEST_FEATURE_NAMES, is_test=True)\ntest_df = prepare_test(unpacked_test, example_sample_submission, seasons, teams, players,\n                       player_target_stats, mlb_social_media, player_salaries, historic_targets)\nX = get_dataset_from_df(test_df, batch_size=BATCH_SIZE, is_test=True)\n\nmodel = keras.models.load_model(PATH_TO_MODELS / "model_lag1_300bs_v1")\ny_pred = model.predict(X)\n#for i in range(2, ENSEMBLE_SIZE+1):\n#    model = keras.models.load_model(PATH_TO_MODELS / f"model_{i}")\n#    y_pred += model.predict(X)\n    \nexample_sample_submission[TARGET_FEATURE_NAMES] = y_pred #/ ENSEMBLE_SIZE\nexample_sample_submission\n#'

In [12]:
import mlb

# Load the model once, before iterating: it does not change between batches, so
# reloading it inside the loop only added latency to every iteration.
model = keras.models.load_model(PATH_TO_MODELS / "model_lag1_300bs_v1")

env = mlb.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    # Turn the raw batch into the model's input dataset.
    unpacked_test = unpack_data(test_df, TEST_FEATURE_NAMES, is_test=True)
    X_df = prepare_test(unpacked_test, sample_prediction_df, seasons, teams, players, player_target_stats,
                        mlb_social_media, player_salaries, historic_targets)
    X = get_dataset_from_df(X_df, batch_size=BATCH_SIZE, is_test=True)

    y_pred = model.predict(X)
    # Ensemble averaging is currently disabled. If it is re-enabled, load the extra
    # models once before the loop under their own names instead of rebinding `model`:
    #for i in range(2, ENSEMBLE_SIZE+1):
    #    y_pred += keras.models.load_model(PATH_TO_MODELS / f"model_{i}").predict(X)
    sample_prediction_df[TARGET_FEATURE_NAMES] = y_pred #/ ENSEMBLE_SIZE
    # Clamp predictions to [0, 100] (presumably the valid target range - confirm)
    # and replace any remaining missing values with 0.
    sample_prediction_df[TARGET_FEATURE_NAMES] = sample_prediction_df[TARGET_FEATURE_NAMES].clip(0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0)

    # Feed the predictions back into the history so later batches can use them as lag features.
    # TODO make sure to use different historic targets in ensemble
    historic_targets = update_historic_targets(historic_targets, sample_prediction_df)

    # Submit predictions
    env.predict(sample_prediction_df)  # constructs submissions.csv

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
