In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
import os
import pathlib
import itertools
package_path = os.path.join(pathlib.Path.home(), "mkvchain")
sys.path.append(package_path)
from model import FeatureDependentMarkovChain

In [2]:
state_to_id = {}
id_to_state = {}
i = 0
for outs in [0, 1, 2]:
    for r3 in [0, 1]:
        for r2 in [0, 1]:
            for r1 in [0, 1]:
                state_to_id[(outs, r1, r2, r3)] = i
                id_to_state[i] = (outs, r1, r2, r3)
                i += 1
state_to_id["INNING OVER"] = i
id_to_state[i] = "INNING OVER"

def func(x):
    outs = x.OUTS_CT
    r1 = 0 if isinstance(x.BASE1_RUN_ID, float) else 1
    r2 = 0 if isinstance(x.BASE2_RUN_ID, float) else 1
    r3 = 0 if isinstance(x.BASE3_RUN_ID, float) else 1
    return state_to_id[(outs, r1, r2, r3)]

n = len(state_to_id)

In [3]:
state_to_id

{(0, 0, 0, 0): 0,
 (0, 1, 0, 0): 1,
 (0, 0, 1, 0): 2,
 (0, 1, 1, 0): 3,
 (0, 0, 0, 1): 4,
 (0, 1, 0, 1): 5,
 (0, 0, 1, 1): 6,
 (0, 1, 1, 1): 7,
 (1, 0, 0, 0): 8,
 (1, 1, 0, 0): 9,
 (1, 0, 1, 0): 10,
 (1, 1, 1, 0): 11,
 (1, 0, 0, 1): 12,
 (1, 1, 0, 1): 13,
 (1, 0, 1, 1): 14,
 (1, 1, 1, 1): 15,
 (2, 0, 0, 0): 16,
 (2, 1, 0, 0): 17,
 (2, 0, 1, 0): 18,
 (2, 1, 1, 0): 19,
 (2, 0, 0, 1): 20,
 (2, 1, 0, 1): 21,
 (2, 0, 1, 1): 22,
 (2, 1, 1, 1): 23,
 'INNING OVER': 24}

In [5]:
headtohead = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain", "data", "headtohead-2017.csv"))
batters = headtohead.groupby("RESP_BAT_ID").sum()

In [49]:

myvals = batters.B_PA.values.reshape(-1, 1)
divisor = np.hstack([myvals for k in range(17)])
batter_features = batters.iloc[:, 3:]/ divisor


pitchers = headtohead.groupby("RESP_PIT_ID").sum()
myvals = pitchers.B_PA.values.reshape(-1, 1)
divisor = np.hstack([myvals for k in range(17)])
pitcher_features = pitchers.iloc[:,3:] / divisor

In [50]:
df = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain","data", "all2018.csv"))
df = df.join(batter_features, on='BAT_ID').join(pitcher_features, on='PIT_ID', lsuffix='_PIT')
df["STATE"] = df.apply(func, axis=1)
df["BAT_PIT_SAME"] = df.BAT_HAND_CD == df.PIT_HAND_CD
df["SCORE_DIFF"] = (df["BAT_HOME_ID"] * 2 - 1) * (df.HOME_SCORE_CT - df.AWAY_SCORE_CT)
df["DATE"] = pd.to_datetime(df.GAME_ID.str[3:-1], format="%Y%m%d")
df["STADIUM"] = df.GAME_ID.str[:3]
df["DOUBLEHEADER"] = df.GAME_ID.str[-1]
df.sort_values(by=["DATE", "STADIUM", "DOUBLEHEADER", "EVENT_ID"], inplace=True)
df["Inning_over"] = (df[["INN_CT", "BAT_HOME_ID"]].shift(1) != df[["INN_CT", "BAT_HOME_ID"]]).any(axis=1)
df.iloc[0, -1] = False
df.head(5)

  df = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain","data", "all2018.csv"))


Unnamed: 0,GAME_ID,AWAY_TEAM_ID,INN_CT,BAT_HOME_ID,OUTS_CT,BALLS_CT,STRIKES_CT,PITCH_SEQ_TX,AWAY_SCORE_CT,HOME_SCORE_CT,...,B_SH,B_SF,B_XI,STATE,BAT_PIT_SAME,SCORE_DIFF,DATE,STADIUM,DOUBLEHEADER,Inning_over
6267,ARI201803290,COL,1,0,0,2,2,BBFCS,0,0,...,0.004843,0.006053,0.0,0,True,0,2018-03-29,ARI,0,False
6268,ARI201803290,COL,1,0,1,2,2,CFBBX,0,0,...,0.004843,0.006053,0.0,8,False,0,2018-03-29,ARI,0,False
6269,ARI201803290,COL,1,0,1,2,0,BBX,1,0,...,0.004843,0.006053,0.0,8,False,1,2018-03-29,ARI,0,False
6270,ARI201803290,COL,1,0,1,1,2,SBCFC,1,0,...,0.004843,0.006053,0.0,9,False,1,2018-03-29,ARI,0,False
6271,ARI201803290,COL,1,0,2,0,2,SS>S,1,0,...,0.004843,0.006053,0.0,17,True,1,2018-03-29,ARI,0,False


In [63]:
features_df = df[['SCORE_DIFF', 'BAT_LINEUP_ID', 'BAT_HOME_ID', 'INN_CT', 'B_AB_PIT',
       'B_H_PIT', 'B_TB_PIT', 'B_2B_PIT', 'B_3B_PIT', 'B_HR_PIT', 'B_HR4_PIT',
       'B_RBI_PIT', 'B_BB_PIT', 'B_IBB_PIT', 'B_SO_PIT', 'B_GDP_PIT',
       'B_HP_PIT', 'B_SH_PIT', 'B_SF_PIT', 'B_XI_PIT', 'B_AB', 'B_H', 'B_TB',
       'B_2B', 'B_3B', 'B_HR', 'B_HR4', 'B_RBI', 'B_BB', 'B_IBB', 'B_SO',
       'B_GDP', 'B_HP', 'B_SH', 'B_SF', 'B_XI', 'BAT_HAND_CD', 'PIT_HAND_CD',
       'BAT_PIT_SAME']].copy()
features_df.INN_CT[features_df.INN_CT >= 10] = 10
features_df.SCORE_DIFF[features_df.SCORE_DIFF >= 3] = 3
features_df.SCORE_DIFF[features_df.SCORE_DIFF <= -3] = -3
dummy_cols = ["BAT_HAND_CD", "PIT_HAND_CD", "INN_CT", "BAT_LINEUP_ID", "SCORE_DIFF"]
for c in dummy_cols:
    features_df = pd.concat([features_df, pd.get_dummies(features_df[c], prefix=c)], axis=1)
features_df.drop(dummy_cols, axis=1, inplace=True)
features = features_df.values.astype(float)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  features_df.INN_CT[features_df.INN_CT >= 10] = 10
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df.I

In [66]:
features_df

Unnamed: 0,BAT_HOME_ID,B_AB_PIT,B_H_PIT,B_TB_PIT,B_2B_PIT,B_3B_PIT,B_HR_PIT,B_HR4_PIT,B_RBI_PIT,B_BB_PIT,...,BAT_LINEUP_ID_7,BAT_LINEUP_ID_8,BAT_LINEUP_ID_9,SCORE_DIFF_-3,SCORE_DIFF_-2,SCORE_DIFF_-1,SCORE_DIFF_0,SCORE_DIFF_1,SCORE_DIFF_2,SCORE_DIFF_3
6267,0,0.887671,0.291781,0.530137,0.047945,0.019178,0.050685,0.000000,0.143836,0.089041,...,False,False,False,False,False,False,True,False,False,False
6268,0,0.893741,0.275109,0.362445,0.040757,0.005822,0.011645,0.000000,0.093159,0.085881,...,False,False,False,False,False,False,True,False,False,False
6269,0,0.891971,0.274453,0.524088,0.062774,0.010219,0.055474,0.000000,0.191241,0.090511,...,False,False,False,False,False,False,False,True,False,False
6270,0,0.906977,0.218247,0.420394,0.057245,0.005367,0.044723,0.001789,0.148479,0.087657,...,False,False,False,False,False,False,False,True,False,False
6271,0,0.881262,0.231911,0.372913,0.063080,0.000000,0.025974,0.000000,0.107607,0.103896,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89405,0,0.891971,0.274453,0.524088,0.062774,0.010219,0.055474,0.000000,0.191241,0.090511,...,False,False,False,True,False,False,False,False,False,False
89406,0,0.906977,0.218247,0.420394,0.057245,0.005367,0.044723,0.001789,0.148479,0.087657,...,False,False,False,True,False,False,False,False,False,False
89407,0,0.881262,0.231911,0.372913,0.063080,0.000000,0.025974,0.000000,0.107607,0.103896,...,False,False,False,True,False,False,False,False,False,False
89408,0,0.909333,0.250667,0.341333,0.029333,0.002667,0.018667,0.002667,0.106667,0.064000,...,True,False,False,True,False,False,False,False,False,False


In [65]:
features.shape

(191051, 64)

In [52]:
M = np.zeros((n, n))
R = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if i == n-1:
            M[i, j] = 0 if j != n-1 else 1
        else:
            o_start, r1_start, r2_start, r3_start = id_to_state[i]
            if j == n-1:
                M[i, j] = 1 if o_start + r1_start + r2_start + r3_start + 1 >= 3 else 0
            else:
                o_end, r1_end, r2_end, r3_end = id_to_state[j]
                o = o_end - o_start
                r_start = r1_start + r2_start + r3_start
                r_end = r1_end + r2_end + r3_end
                runs = r_start - r_end + 1 - o
                runner_backwards = r3_start == 1 and r2_start == 0 and r1_start == 0 and r1_end == 1 and r2_end == 1
                M[i, j] = 1 if runs >= 0 and runs + o <= r_start + 1 and o >= 0 and not runner_backwards else 0
                R[i, j] = 0 if M[i, j] == 0 else runs
M.sum()

np.float64(294.0)

In [54]:
R

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [2., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [2., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [3., 2., 2., 1., 2., 1., 1., 0., 2., 1., 1., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [2., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [3., 2., 2., 1., 2., 1., 1., 0., 2., 1., 1., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [3., 2., 2., 1., 2., 1., 1., 0., 2., 1., 1., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [4., 3., 3., 2., 3., 2., 2., 1., 3., 2., 2., 1., 2., 1., 1., 0.,
        2., 1., 1., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 

In [55]:
states = df["STATE"].values.copy()
lengths = np.diff(np.append(np.append(0, np.where(df["Inning_over"])[0]), df.shape[0])).copy()
i = 0
states_new = []
features_new = []
lengths_new = []
for l in lengths:
    if M[states[i:i+l][-1], 24] == 0: # game over early
        states_new += [states[i:i+l]]
        features_new += [features[i:i+l]]
        lengths_new += [l]
    else:
        states_new += [np.append(states[i:i+l], state_to_id["INNING OVER"])]
        features_new += [np.vstack([features[i:i+l], np.zeros((1, features.shape[1]))])]
        lengths_new += [l + 1]
    i += l
states = np.concatenate(states_new)
features = np.vstack(features_new)
lengths = np.array(lengths_new)

In [61]:
features_new[0].shape

(6, 64)

In [62]:
states_new[0]

array([ 0,  8,  8,  9, 17, 24])

In [68]:
lengths

array([6, 8, 5, ..., 5, 4, 6], shape=(43627,))