In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys
import os
import pathlib
import itertools
package_path = os.path.join(pathlib.Path.home(), "mkvchain")
sys.path.append(package_path)
from model import FeatureDependentMarkovChain

In [3]:
state_to_id = {}
id_to_state = {}
i = 0
for outs in [0, 1, 2]:
    for r3 in [0, 1]:
        for r2 in [0, 1]:
            for r1 in [0, 1]:
                state_to_id[(outs, r1, r2, r3)] = i
                id_to_state[i] = (outs, r1, r2, r3)
                i += 1
state_to_id["INNING OVER"] = i
id_to_state[i] = "INNING OVER"

def func(x):
    outs = x.OUTS_CT
    r1 = 0 if isinstance(x.BASE1_RUN_ID, float) else 1
    r2 = 0 if isinstance(x.BASE2_RUN_ID, float) else 1
    r3 = 0 if isinstance(x.BASE3_RUN_ID, float) else 1
    return state_to_id[(outs, r1, r2, r3)]

n = len(state_to_id)

In [4]:
headtohead = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain", "data", "headtohead-2017.csv"))
batters = headtohead.groupby("RESP_BAT_ID").sum()

In [5]:
starting_state = state_to_id[(0,0,0,0)]
ending_state = state_to_id[(0,0,0,0)]

In [6]:
starting_state

0

In [4]:

myvals = batters.B_PA.values.reshape(-1, 1)
divisor = np.hstack([myvals for k in range(17)])
batter_features = batters.iloc[:, 3:]/ divisor


pitchers = headtohead.groupby("RESP_PIT_ID").sum()
myvals = pitchers.B_PA.values.reshape(-1, 1)
divisor = np.hstack([myvals for k in range(17)])
pitcher_features = pitchers.iloc[:,3:] / divisor

In [5]:
df = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain","data", "all2018.csv"))
df = df.join(batter_features, on='BAT_ID').join(pitcher_features, on='PIT_ID', lsuffix='_PIT')
df["STATE"] = df.apply(func, axis=1)
df["BAT_PIT_SAME"] = df.BAT_HAND_CD == df.PIT_HAND_CD
df["SCORE_DIFF"] = (df["BAT_HOME_ID"] * 2 - 1) * (df.HOME_SCORE_CT - df.AWAY_SCORE_CT)
df["DATE"] = pd.to_datetime(df.GAME_ID.str[3:-1], format="%Y%m%d")
df["STADIUM"] = df.GAME_ID.str[:3]
df["DOUBLEHEADER"] = df.GAME_ID.str[-1]
df.sort_values(by=["DATE", "STADIUM", "DOUBLEHEADER", "EVENT_ID"], inplace=True)
df["Inning_over"] = (df[["INN_CT", "BAT_HOME_ID"]].shift(1) != df[["INN_CT", "BAT_HOME_ID"]]).any(axis=1)
df.iloc[0, -1] = False

  df = pd.read_csv(os.path.join(pathlib.Path.home(), "mkvchain","data", "all2018.csv"))


In [6]:
features_df = df[['SCORE_DIFF', 'BAT_LINEUP_ID', 'BAT_HOME_ID', 'INN_CT', 'B_AB_PIT',
       'B_H_PIT', 'B_TB_PIT', 'B_2B_PIT', 'B_3B_PIT', 'B_HR_PIT', 'B_HR4_PIT',
       'B_RBI_PIT', 'B_BB_PIT', 'B_IBB_PIT', 'B_SO_PIT', 'B_GDP_PIT',
       'B_HP_PIT', 'B_SH_PIT', 'B_SF_PIT', 'B_XI_PIT', 'B_AB', 'B_H', 'B_TB',
       'B_2B', 'B_3B', 'B_HR', 'B_HR4', 'B_RBI', 'B_BB', 'B_IBB', 'B_SO',
       'B_GDP', 'B_HP', 'B_SH', 'B_SF', 'B_XI', 'BAT_HAND_CD', 'PIT_HAND_CD',
       'BAT_PIT_SAME']].copy()
features_df.INN_CT[features_df.INN_CT >= 10] = 10
features_df.SCORE_DIFF[features_df.SCORE_DIFF >= 3] = 3
features_df.SCORE_DIFF[features_df.SCORE_DIFF <= -3] = -3
dummy_cols = ["BAT_HAND_CD", "PIT_HAND_CD", "INN_CT", "BAT_LINEUP_ID", "SCORE_DIFF"]
for c in dummy_cols:
    features_df = pd.concat([features_df, pd.get_dummies(features_df[c], prefix=c)], axis=1)
features_df.drop(dummy_cols, axis=1, inplace=True)
features = features_df.values.astype(float)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  features_df.INN_CT[features_df.INN_CT >= 10] = 10
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df.I

In [7]:
features_df

Unnamed: 0,BAT_HOME_ID,B_AB_PIT,B_H_PIT,B_TB_PIT,B_2B_PIT,B_3B_PIT,B_HR_PIT,B_HR4_PIT,B_RBI_PIT,B_BB_PIT,...,BAT_LINEUP_ID_7,BAT_LINEUP_ID_8,BAT_LINEUP_ID_9,SCORE_DIFF_-3,SCORE_DIFF_-2,SCORE_DIFF_-1,SCORE_DIFF_0,SCORE_DIFF_1,SCORE_DIFF_2,SCORE_DIFF_3
6267,0,0.887671,0.291781,0.530137,0.047945,0.019178,0.050685,0.000000,0.143836,0.089041,...,False,False,False,False,False,False,True,False,False,False
6268,0,0.893741,0.275109,0.362445,0.040757,0.005822,0.011645,0.000000,0.093159,0.085881,...,False,False,False,False,False,False,True,False,False,False
6269,0,0.891971,0.274453,0.524088,0.062774,0.010219,0.055474,0.000000,0.191241,0.090511,...,False,False,False,False,False,False,False,True,False,False
6270,0,0.906977,0.218247,0.420394,0.057245,0.005367,0.044723,0.001789,0.148479,0.087657,...,False,False,False,False,False,False,False,True,False,False
6271,0,0.881262,0.231911,0.372913,0.063080,0.000000,0.025974,0.000000,0.107607,0.103896,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89405,0,0.891971,0.274453,0.524088,0.062774,0.010219,0.055474,0.000000,0.191241,0.090511,...,False,False,False,True,False,False,False,False,False,False
89406,0,0.906977,0.218247,0.420394,0.057245,0.005367,0.044723,0.001789,0.148479,0.087657,...,False,False,False,True,False,False,False,False,False,False
89407,0,0.881262,0.231911,0.372913,0.063080,0.000000,0.025974,0.000000,0.107607,0.103896,...,False,False,False,True,False,False,False,False,False,False
89408,0,0.909333,0.250667,0.341333,0.029333,0.002667,0.018667,0.002667,0.106667,0.064000,...,True,False,False,True,False,False,False,False,False,False


In [11]:
features.shape

(191051, 64)

In [8]:
M = np.zeros((n, n))
R = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        if i == n-1:
            M[i, j] = 0 if j != n-1 else 1
        else:
            o_start, r1_start, r2_start, r3_start = id_to_state[i]
            if j == n-1:
                M[i, j] = 1 if o_start + r1_start + r2_start + r3_start + 1 >= 3 else 0
            else:
                o_end, r1_end, r2_end, r3_end = id_to_state[j]
                o = o_end - o_start
                r_start = r1_start + r2_start + r3_start
                r_end = r1_end + r2_end + r3_end
                runs = r_start - r_end + 1 - o
                runner_backwards = r3_start == 1 and r2_start == 0 and r1_start == 0 and r1_end == 1 and r2_end == 1
                M[i, j] = 1 if runs >= 0 and runs + o <= r_start + 1 and o >= 0 and not runner_backwards else 0
                R[i, j] = 0 if M[i, j] == 0 else runs
M.sum()

np.float64(294.0)

In [9]:
states = df["STATE"].values.copy()
lengths = np.diff(np.append(np.append(0, np.where(df["Inning_over"])[0]), df.shape[0])).copy()
i = 0
states_new = []
features_new = []
lengths_new = []
for l in lengths:
    if M[states[i:i+l][-1], 24] == 0: # game over early
        states_new += [states[i:i+l]]
        features_new += [features[i:i+l]]
        lengths_new += [l]
    else:
        states_new += [np.append(states[i:i+l], state_to_id["INNING OVER"])]
        features_new += [np.vstack([features[i:i+l], np.zeros((1, features.shape[1]))])]
        lengths_new += [l + 1]
    i += l
states = np.concatenate(states_new)
features = np.vstack(features_new)
lengths = np.array(lengths_new)

In [10]:
train_idx = int(lengths.size * .8)
val_idx = int(lengths.size * .9)

lengths_train = lengths[:train_idx]
lengths_val = lengths[train_idx:val_idx]
lengths_test = lengths[val_idx:]

states_train = states[:lengths_train.sum()]
states_val = states[lengths_train.sum():lengths_train.sum()+lengths_val.sum()]
states_test = states[lengths_train.sum()+lengths_val.sum():]

features_train = features[:lengths_train.sum()]
features_val = features[lengths_train.sum():lengths_train.sum()+lengths_val.sum()]
features_test = features[lengths_train.sum()+lengths_val.sum():]

# lengths_train = lengths_train[:5000]
# states_train = states_train[:sum(lengths_train)]
# features_train = features_train[:sum(lengths_train)]

# append some inning over sequences
states_train = np.append(states_train, [state_to_id["INNING OVER"]]*20)
lengths_train = np.append(lengths_train, 20)
features_train = np.vstack([features_train, np.zeros((20, features.shape[1]))])

In [17]:
features_train.shape, features_val.shape, features_test.shape

((187612, 64), (23482, 64), (23545, 64))

In [11]:
from sklearn.preprocessing import QuantileTransformer, StandardScaler
ss = StandardScaler()

features_train = ss.fit_transform(features_train)
features_val = ss.transform(features_val)
features_test = ss.transform(features_test)

In [12]:
model1 = FeatureDependentMarkovChain(n, lam_frob=0, mask=M, n_iter=1)
model1.fit(states_train, features_train*0, lengths_train, verbose=False)
train1, val1, test1 = model1.score(states_train, features_train*0, lengths_train, average=False), \
    model1.score(states_val, features_val*0, lengths_val, average=False), \
    model1.score(states_test, features_test*0, lengths_test, average=False)
train1, val1, test1

  z = np.log(Ps[:, i, :])


(np.float64(-147840.7765547269),
 np.float64(-15071.131835870518),
 np.float64(-14488.13832877862))

In [13]:
predictions = model1.predict(features_test)
for i, j in itertools.product(range(n), range(n)):
    y = predictions[:, i, j]
    val = len(np.unique(y))
    print(f"{i} -> {j} : {val}")

0 -> 0 : 2
0 -> 1 : 2
0 -> 2 : 2
0 -> 3 : 1
0 -> 4 : 2
0 -> 5 : 1
0 -> 6 : 1
0 -> 7 : 1
0 -> 8 : 2
0 -> 9 : 1
0 -> 10 : 1
0 -> 11 : 1
0 -> 12 : 1
0 -> 13 : 1
0 -> 14 : 1
0 -> 15 : 1
0 -> 16 : 1
0 -> 17 : 1
0 -> 18 : 1
0 -> 19 : 1
0 -> 20 : 1
0 -> 21 : 1
0 -> 22 : 1
0 -> 23 : 1
0 -> 24 : 1
1 -> 0 : 2
1 -> 1 : 2
1 -> 2 : 2
1 -> 3 : 2
1 -> 4 : 2
1 -> 5 : 2
1 -> 6 : 2
1 -> 7 : 1
1 -> 8 : 2
1 -> 9 : 2
1 -> 10 : 2
1 -> 11 : 1
1 -> 12 : 2
1 -> 13 : 1
1 -> 14 : 1
1 -> 15 : 1
1 -> 16 : 2
1 -> 17 : 1
1 -> 18 : 1
1 -> 19 : 1
1 -> 20 : 1
1 -> 21 : 1
1 -> 22 : 1
1 -> 23 : 1
1 -> 24 : 1
2 -> 0 : 2
2 -> 1 : 2
2 -> 2 : 2
2 -> 3 : 2
2 -> 4 : 2
2 -> 5 : 2
2 -> 6 : 2
2 -> 7 : 1
2 -> 8 : 2
2 -> 9 : 2
2 -> 10 : 2
2 -> 11 : 1
2 -> 12 : 2
2 -> 13 : 1
2 -> 14 : 1
2 -> 15 : 1
2 -> 16 : 2
2 -> 17 : 1
2 -> 18 : 1
2 -> 19 : 1
2 -> 20 : 1
2 -> 21 : 1
2 -> 22 : 1
2 -> 23 : 1
2 -> 24 : 1
3 -> 0 : 2
3 -> 1 : 2
3 -> 2 : 2
3 -> 3 : 2
3 -> 4 : 2
3 -> 5 : 2
3 -> 6 : 2
3 -> 7 : 2
3 -> 8 : 2
3 -> 9 : 2
3 -> 10 : 2
3 -> 11 

In [15]:
from copy import deepcopy
train2, val2, test2 = -np.inf, -np.inf, -np.inf
best_lam = None
model2 = None
for lam in np.logspace(-3,-1,10):
    model = FeatureDependentMarkovChain(n, lam_frob=lam, mask=M, n_iter=1)
    model.As = deepcopy(model1.As)
    model.bs = deepcopy(model1.bs)
    model.fit(states_train, features_train, lengths_train, verbose=False, warm_start=True)
    traini, vali, testi = model.score(states_train, features_train, lengths_train, average=False), \
          model.score(states_val, features_val, lengths_val, average=False), \
          model.score(states_test, features_test, lengths_test, average=False)
    if vali > val2:
        train2 = traini
        val2 = vali
        test2 = testi
        best_lam = lam
        model2 = model
train2, val2, test2

  z = np.log(Ps[:, i, :])


(np.float64(-145456.81762464228),
 np.float64(-15013.006866982518),
 np.float64(-14412.517347597694))

In [19]:
best_lam

np.float64(0.004641588833612777)

In [17]:
model = FeatureDependentMarkovChain(n, lam_frob=best_lam, mask=M, n_iter=1)
model.As = deepcopy(model1.As)
model.bs = deepcopy(model1.bs)
model.fit(states_train, features_train, lengths_train, verbose=False, warm_start=True)

In [18]:
predictions = model.predict(features_test)
for i, j in itertools.product(range(n), range(n)):
    y = predictions[:, i, j]
    val = len(np.unique(y.round(decimals=4)))
    print(f"{i} -> {j} : {val}")

0 -> 0 : 616
0 -> 1 : 1711
0 -> 2 : 688
0 -> 3 : 1
0 -> 4 : 61
0 -> 5 : 1
0 -> 6 : 1
0 -> 7 : 1
0 -> 8 : 1932
0 -> 9 : 1
0 -> 10 : 1
0 -> 11 : 1
0 -> 12 : 1
0 -> 13 : 1
0 -> 14 : 1
0 -> 15 : 1
0 -> 16 : 1
0 -> 17 : 1
0 -> 18 : 1
0 -> 19 : 1
0 -> 20 : 1
0 -> 21 : 1
0 -> 22 : 1
0 -> 23 : 1
0 -> 24 : 1
1 -> 0 : 285
1 -> 1 : 3
1 -> 2 : 751
1 -> 3 : 1265
1 -> 4 : 92
1 -> 5 : 527
1 -> 6 : 271
1 -> 7 : 1
1 -> 8 : 164
1 -> 9 : 1909
1 -> 10 : 1303
1 -> 11 : 1
1 -> 12 : 20
1 -> 13 : 1
1 -> 14 : 1
1 -> 15 : 1
1 -> 16 : 1011
1 -> 17 : 1
1 -> 18 : 1
1 -> 19 : 1
1 -> 20 : 1
1 -> 21 : 1
1 -> 22 : 1
1 -> 23 : 1
1 -> 24 : 1
2 -> 0 : 179
2 -> 1 : 433
2 -> 2 : 298
2 -> 3 : 752
2 -> 4 : 293
2 -> 5 : 466
2 -> 6 : 25
2 -> 7 : 1
2 -> 8 : 64
2 -> 9 : 110
2 -> 10 : 2073
2 -> 11 : 1
2 -> 12 : 1813
2 -> 13 : 1
2 -> 14 : 1
2 -> 15 : 1
2 -> 16 : 40
2 -> 17 : 1
2 -> 18 : 1
2 -> 19 : 1
2 -> 20 : 1
2 -> 21 : 1
2 -> 22 : 1
2 -> 23 : 1
2 -> 24 : 1
3 -> 0 : 144
3 -> 1 : 2
3 -> 2 : 138
3 -> 3 : 169
3 -> 4 : 34
3 -> 5 : 2

In [1]:
starting_state = state_to_id[(0,0,0,0)]
ending_state = state_to_id[(0,0,0,0)]

NameError: name 'state_to_id' is not defined

In [20]:
model = FeatureDependentMarkovChain(n, lam_frob=best_lam, mask=M, n_iter=1, lam_col_norm=0.0045)
model.As = deepcopy(model1.As)
model.bs = deepcopy(model1.bs)
model.fit(states_train, features_train, lengths_train, verbose=False, warm_start=True)
train3, val3, test3 = model.score(states_train, features_train, lengths_train, average=False), \
    model.score(states_val, features_val, lengths_val, average=False), \
    model.score(states_test, features_test, lengths_test, average=False)
nnz = []
for i in range(model.As[0].shape[0]):
    col_norm = np.linalg.norm(np.concatenate([A[i,:] for A in model.As]))
    if col_norm > 0:
        nnz.append(i)

0 1.0 1.197634365013504
1 1.2 1.1975774555829646
2 1.44 1.1975203278234774
3 1.728 1.1974642350222275
4 2.0736 1.1974103694015998
5 2.48832 1.1973597391530193
6 2.9859839999999997 1.1973131029890742
7 3.5831807999999996 1.1972709579975962
8 4.299816959999999 1.1972335815522959
9 5.159780351999999 1.197201099902845
10 6.191736422399999 1.197173545631401
11 7.430083706879999 1.1971508757653866
12 8.916100448255998 1.1971329461920115
13 10.699320537907196 1.197119462395994
14 12.839184645488634 1.197109963336695
15 15.407021574586361 1.1971037532379807
16 18.48842588950363 1.197100053789075
17 22.186111067404358 1.1970980894089962
18 26.62333328088523 1.1970971808353092
19 31.947999937062274 1.1970968190189855
20 38.33759992447473 1.1970966990888547
21 11.501279977342419 1.1970966708594255
22 13.801535972810901 1.1970966529992273
23 16.56184316737308 1.197096637654184
24 19.874211800847696 1.1970966253641138
25 23.849054161017236 1.197096624998138
26 14.309432496610341 1.1970966117509925


In [23]:
nnz

array([ 6,  9, 11, 14, 21, 46, 48, 56, 57])

Using nnz as index

In [22]:
nnz = np.array(nnz)
train4, val4, test4 = -float("inf"), -float("inf"), -float("inf")
for lam in np.logspace(-3,-1,10):
    model = FeatureDependentMarkovChain(n, lam_frob=0.0046, mask=M, n_iter=1)
    model.fit(states_train, features_train[:,nnz], lengths_train, verbose=False, warm_start=False)
    traini, vali, testi = model.score(states_train, features_train[:,nnz], lengths_train, average=False), \
        model.score(states_val, features_val[:,nnz], lengths_val, average=False), \
        model.score(states_test, features_test[:,nnz], lengths_test, average=False)
    if vali > val4:
        train4 = traini
        val4 = vali
        test4 = testi
train4, val4, test4

(np.float64(-146948.36662437546),
 np.float64(-15012.35770425032),
 np.float64(-14415.378254983798))

In [None]:
predictions = model.predict(features_test)
for i, j in itertools.product(range(n), range(n)):
    y = predictions[:, i, j]
    val = len(np.unique(y.round(decimals=4)))
    print(f"{i} -> {j} : {val}")