In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Activation, Dense, Dropout, Embedding, concatenate, Reshape
from tensorflow.keras import Model



In [2]:
df = pd.read_csv('..\\data\\processed2\\processed_data2.csv', index_col=0)

In [3]:
inds = df.index[df['inning']<9]

In [4]:
df['event_final'] = df['event_final'].astype('category')
inning = df['inning'].values
df['inning_diff'] = np.maximum(0, np.minimum(1, inning[1:] - inning[:-1])).astype(int).tolist() + [0]
df['inning_diff'] = df['inning_diff'].astype('category')
outs = df['outs'].values
df['outs_change'] = np.maximum(-2, np.minimum(3, outs[1:] - outs[:-1])).astype(int).tolist() + [0]
df['outs_diff'] = df['outs_change']
df.loc[df['outs_change']==-2, 'outs_diff'] = 1
df.loc[df['outs_change']==-1, 'outs_diff'] = 2
df['outs_diff'] = df['outs_diff'].astype('category')
home_score = df['home_score'].values
visiting_score = df['visiting_score'].values
df['home_score_diff'] = np.maximum(0,np.minimum(4,home_score[1:] - home_score[:-1])).astype(int).tolist() + [0]
df['home_score_diff'] = df['home_score_diff'].astype('category')
df['visiting_score'] = df['visiting_score'].astype('category')
df['visiting_score_diff'] = np.maximum(0,np.minimum(4,visiting_score[1:]-visiting_score[:-1])).astype(int).tolist()+[0]
df['visiting_score_diff'] = df['visiting_score_diff'].astype('category')
df['outs'] = df['outs'].astype('category')

In [5]:
df['event_final_code'] = df['event_final'].cat.codes
df['home_score_code'] = df['home_score_diff'].cat.codes
df['visiting_score_code'] = df['visiting_score_diff'].cat.codes
df['outs_code'] = df['outs'].cat.codes
df['outs_diff_code'] = df['outs_diff'].cat.codes

In [6]:
situation = df[['inning', 'home_score', 'visiting_score', 'batting team', 'outs', 'first', 'second', 'third']].values
batter = df['res batter'].values
pitcher = df['res pitcher'].values
balls = df['balls'].values
strikes = df['strikes'].values
fouls = df['fouls'].values
outcome = df['event_final_code'].values
outcome_onehot = pd.get_dummies(df['event_final_code']).values
inning_diff = df['inning_diff'].values#pd.get_dummies(df['inning_code']).values
home_score_onehot = pd.get_dummies(df['home_score_code']).values
visiting_score_onehot = pd.get_dummies(df['visiting_score_code']).values
outs_onehot = pd.get_dummies(df['outs_code']).values
outs_diff_onehot = pd.get_dummies(df['outs_diff_code']).values
first = df['first'].values
second = df['second'].values
third = df['third'].values

In [7]:
df.iloc[40000:40060]

Unnamed: 0,gameid,opp,inning,batting team,outs,balls,strikes,pitch sequence,visiting_score,home_score,...,inning_diff,outs_change,outs_diff,home_score_diff,visiting_score_diff,event_final_code,home_score_code,visiting_score_code,outs_code,outs_diff_code
40000,CHN201006200,ANA,1,1,2,3,2,CBFBBX,0,2,...,1,-2,1,0,0,19,0,0,2,1
40001,CHN201006200,ANA,2,0,0,0,1,CX,0,2,...,0,0,0,0,0,21,0,0,0,0
40002,CHN201006200,ANA,2,0,0,3,2,BBCBCT,0,2,...,0,2,2,0,0,72,0,0,0,2
40003,CHN201006200,ANA,2,0,2,3,2,BFBBCC,0,2,...,0,-2,1,0,0,39,0,0,2,1
40004,CHN201006200,ANA,2,1,0,3,2,CFBBBFFX,0,2,...,0,0,0,0,0,90,0,0,0,0
40005,CHN201006200,ANA,2,1,0,1,2,LLBX,0,2,...,0,1,1,0,0,21,0,0,0,1
40006,CHN201006200,ANA,2,1,1,2,2,SFBFBX,0,2,...,0,0,0,0,0,63,0,0,1,0
40007,CHN201006200,ANA,2,1,1,0,0,X,0,2,...,0,0,0,1,0,52,1,0,1,0
40008,CHN201006200,ANA,2,1,1,0,0,X,0,3,...,0,1,1,1,0,38,1,0,1,1
40009,CHN201006200,ANA,2,1,2,1,2,FBFS,0,4,...,1,-2,1,0,0,72,0,0,2,1


In [8]:
samples = inds.shape[0]-1

p = np.random.permutation(samples)
train_inds = inds[p[:samples//5 * 4]]
test_inds = inds[p[samples//5 * 4:]]

In [9]:
situation_train = situation[train_inds, 3:]
# outcome_train = outcome_onehot[train_inds]
outcome_train = df['event_final_code'].values[train_inds]
y_train = [outs_diff_onehot[train_inds].astype(float), 
           visiting_score_onehot[train_inds].astype(float), 
           home_score_onehot[train_inds].astype(float),
           first[train_inds+1].astype(float),
           second[train_inds+1].astype(float),
           third[train_inds+1].astype(float)]

situation_test = situation[test_inds, 3:]
# outcome_test = outcome_onehot[test_inds]
outcome_test = df['event_final_code'].values[test_inds]
y_test = [outs_diff_onehot[test_inds].astype(float), 
          visiting_score_onehot[test_inds].astype(float), 
          home_score_onehot[test_inds].astype(float),
          first[test_inds+1].astype(float),
          second[test_inds+1].astype(float),
          third[test_inds+1].astype(float)]

In [10]:
X_train = [situation_train.astype(float), np.expand_dims(outcome_train, axis=1).astype(np.int32)]
X_test = [situation_test.astype(float), np.expand_dims(outcome_test, axis=1).astype(np.int32)]

In [11]:
def build_model(sit_shape_in, outcome_size, hidden, activations):
    
    situation = Input(shape=(sit_shape_in,))

    outcome = Input(shape=(1,))

    out_emb = Embedding(outcome_size, 8)(outcome)
    
    out_emb = Reshape((8,))(out_emb)

    X = concatenate([situation, out_emb], axis=-1)
    
    X_int = Dense(hidden, activation=activations)(X)

    X_int = Dense(hidden, activation=activations)(X_int)

    #X_int = Dropout(0.25)(X_int)
    
    X_int = Dense(hidden, activation=activations)(X_int)

    inning_diff = Dense(1, activation='sigmoid')(X_int)

    half_inning = Dense(1, activation='sigmoid')(X_int)

    outs = Dense(3, activation='softmax')(X_int)

    visiting_score = Dense(5, activation='softmax')(X_int)

    home_score = Dense(5, activation='softmax')(X_int)

    first = Dense(1, activation='sigmoid')(X_int)

    second = Dense(1, activation='sigmoid')(X_int)

    third = Dense(1, activation='sigmoid')(X_int)

    return Model(inputs=[situation, outcome], 
                 outputs=[outs, visiting_score, home_score, first, second, third])

In [12]:
model = build_model(situation_train.shape[1], np.max(outcome_train)+1, 64, 'tanh')

In [13]:
def compile_model(model, LR):
    adam = tf.keras.optimizers.Adam(lr=LR)
    model.compile(
        optimizer=adam,
        loss=['categorical_crossentropy',
            'categorical_crossentropy', 'categorical_crossentropy', 'binary_crossentropy',
            'binary_crossentropy', 'binary_crossentropy'],
        metrics=['accuracy'])
    return model

In [14]:
# model = load_model('..\\models\\situation_prediction_relu5')

In [15]:
model = compile_model(model, 1e-2)
for i in range(90):
    if i == 30:
        model = compile_model(model, 1e-3)
    elif i == 60:
        model = compile_model(model, 1e-4)
    print('epoch: '+str(i+1))
    model.fit(x=X_train,
              y=y_train, 
              batch_size=situation_train.shape[0]//20,
              epochs=1,
              shuffle=True,
              verbose=1)

on 1323296 samples
epoch: 44
Train on 1323296 samples
epoch: 45
Train on 1323296 samples
epoch: 46
Train on 1323296 samples
epoch: 47
Train on 1323296 samples
epoch: 48
Train on 1323296 samples
epoch: 49
Train on 1323296 samples
epoch: 50
Train on 1323296 samples
epoch: 51
Train on 1323296 samples
epoch: 52
Train on 1323296 samples
epoch: 53
Train on 1323296 samples
epoch: 54
Train on 1323296 samples
epoch: 55
Train on 1323296 samples
epoch: 56
Train on 1323296 samples
epoch: 57
Train on 1323296 samples
epoch: 58
Train on 1323296 samples
epoch: 59
Train on 1323296 samples
epoch: 60
Train on 1323296 samples
epoch: 61
Train on 1323296 samples
epoch: 62
Train on 1323296 samples
epoch: 63
Train on 1323296 samples
epoch: 64
Train on 1323296 samples
epoch: 65
Train on 1323296 samples
epoch: 66
Train on 1323296 samples
epoch: 67
Train on 1323296 samples
epoch: 68
Train on 1323296 samples
epoch: 69
Train on 1323296 samples
epoch: 70
Train on 1323296 samples
epoch: 71
Train on 1323296 samples
e

In [16]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 8)         904         input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
reshape (Reshape)               (None, 8)            0           embedding[0][0]                  
______________________________________________________________________________________________

In [17]:
[loss, l3, l4, l5, l6, l7, l8, 
 outs_acc, 
 visiting_score_acc, home_score_acc, 
 first_acc, second_acc, third_acc] = model.evaluate(X_test, y_test, batch_size=situation_test.shape[0]//20)



In [18]:
#print('inning accuracy: '+str(inning_acc))
#print('half inning accuracy: '+str(half_inning_acc))
print('outs accuracy: '+str(outs_acc))
print('visiting score accuracy: '+str(visiting_score_acc))
print('home score accuracy: '+str(home_score_acc))
print('first accuracy: '+str(first_acc))
print('second accuracy: '+str(second_acc))
print('third accuracy: '+str(third_acc))

outs accuracy: 0.96034473
visiting score accuracy: 0.9889428
home score accuracy: 0.98809946
first accuracy: 0.94584465
second accuracy: 0.94121987
third accuracy: 0.9528695


In [34]:
# print('inning accuracy: '+str(inning_acc))
# print('half inning accuracy: '+str(half_inning_acc))
print('outs accuracy: '+str(outs_acc))
print('visiting score accuracy: '+str(visiting_score_acc))
print('home score accuracy: '+str(home_score_acc))
print('first accuracy: '+str(first_acc))
print('second accuracy: '+str(second_acc))
print('third accuracy: '+str(third_acc))

outs accuracy: 0.9601941
visiting score accuracy: 0.988743
home score accuracy: 0.9875422
first accuracy: 0.9456364
second accuracy: 0.94013107
third accuracy: 0.952595


In [42]:
print('outs accuracy: '+str(l3))
print('visiting score accuracy: '+str(l4))
print('home score accuracy: '+str(l5))
print('first accuracy: '+str(l6))
print('second accuracy: '+str(l7))
print('third accuracy: '+str(l8))

outs accuracy: 0.12330812
visiting score accuracy: 0.027783116
home score accuracy: 0.030062685
first accuracy: 0.15806599
second accuracy: 0.15753484
third accuracy: 0.10959541


In [35]:
print('outs accuracy: '+str(l3))
print('visiting score accuracy: '+str(l4))
print('home score accuracy: '+str(l5))
print('first accuracy: '+str(l6))
print('second accuracy: '+str(l7))
print('third accuracy: '+str(l8))

outs accuracy: 0.12205936
visiting score accuracy: 0.026567861
home score accuracy: 0.028697474
first accuracy: 0.15731363
second accuracy: 0.15605888
third accuracy: 0.10687726


In [19]:
y_pred = model(X_test)

In [20]:
y_pred = np.concatenate((y_pred), axis=1)

In [21]:
model.save('..\\models\\situation_prediction_tanh3')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ..\models\situation_prediction_tanh3\assets
INFO:tensorflow:Assets written to: ..\models\situation_prediction_tanh3\assets


In [22]:
model2 = tf.keras.models.load_model('..\\models\\situation_prediction_tanh3')

In [23]:
cats = df['event_final'].astype('category')

outcome_dict = {}
for i, cat in enumerate(cats.cat.categories):
    outcome_dict[cat] = i
outcome_dict_rev = dict(enumerate(cats.cat.categories))
N_outcomes = len(outcome_dict)

In [24]:
outcome_dict

{'1B': 0,
 '1G': 1,
 '1L': 2,
 '1P': 3,
 '1SH': 4,
 '2B': 5,
 '2G': 6,
 '2P': 7,
 '2SH': 8,
 '34G': 9,
 '3B': 10,
 '3EG': 11,
 '3G': 12,
 '3L': 13,
 '3P': 14,
 '3SH': 15,
 '4B': 16,
 '4G': 17,
 '4L': 18,
 '4P': 19,
 '5B': 20,
 '5G': 21,
 '5L': 22,
 '5P': 23,
 '5SH': 24,
 '6G': 25,
 '6L': 26,
 '6P': 27,
 '6SF': 28,
 '7F': 29,
 '7L': 30,
 '7SF': 31,
 '8F': 32,
 '8L': 33,
 '8SF': 34,
 '9F': 35,
 '9G': 36,
 '9L': 37,
 '9SF': 38,
 'C': 39,
 'D1G': 40,
 'D3G': 41,
 'D4G': 42,
 'D5G': 43,
 'D7F': 44,
 'D7G': 45,
 'D7L': 46,
 'D8F': 47,
 'D8G': 48,
 'D8L': 49,
 'D9F': 50,
 'D9G': 51,
 'D9L': 52,
 'DG': 53,
 'DG7': 54,
 'DG8': 55,
 'DG89': 56,
 'DG9': 57,
 'FC1': 58,
 'FC2': 59,
 'FC3': 60,
 'FC4': 61,
 'FC5': 62,
 'FC6': 63,
 'H': 64,
 'H7': 65,
 'H78': 66,
 'H8': 67,
 'H89': 68,
 'H9': 69,
 'HF': 70,
 'IW': 71,
 'K': 72,
 'S1B': 73,
 'S1G': 74,
 'S1L': 75,
 'S2B': 76,
 'S2G': 77,
 'S3B': 78,
 'S3G': 79,
 'S3L': 80,
 'S3P': 81,
 'S4B': 82,
 'S4G': 83,
 'S4L': 84,
 'S4P': 85,
 'S5B': 86,
 'S5G'

In [25]:
def update_situation(situation, outcome, model=model2):
    vals = np.array(list(situation.values()))[3:].astype(np.float32)
    pred = model([vals.reshape(1, -1), outcome.astype(np.int32)])
    pred = [np.squeeze(p.numpy()) for p in pred]
    valid = False
    count = 0
    while not valid and count < 20:
        count += 1
        inning = np.random.choice(2, p=[1-pred[0], pred[0]])
        half_inning = np.random.choice(2, p=[1-pred[1], pred[1]])
        outs = np.random.choice(pred[2].shape[0], p=pred[2])
        visiting_score = np.random.choice(pred[3].shape[0], p=pred[3])
        home_score = np.random.choice(pred[4].shape[0], p=pred[4])
        first = np.random.choice(2, p=[1-pred[5], pred[5]])
        second = np.random.choice(2, p=[1-pred[6], pred[6]])
        third = np.random.choice(2, p=[1-pred[7], pred[7]])
        situation_out = {'inning diff' : inning,
                         'home score diff' : home_score,
                         'visiting score diff' : visiting_score,
                         'half inning' : half_inning,
                         'outs' : outs,
                         'first' : first,
                         'second' : second,
                         'third' : third}
        valid = compare_situation(situation, situation_out)
        if not valid:
            print('reject')
    return situation_out, pred

def compare_situation(situation, update):
    base_runners = situation['first'] + situation['second'] + situation['third']
    base_runners_update = update['first'] + update['second'] + update['third']
    base_runners_diff = base_runners_update - base_runners
    if update['half inning'] != situation['half inning']:
        if update['outs'] != 0:
            print('reset outs')
            return False
        elif (not update['half inning']) & (update['inning diff'] != 1):
            print('reset inning')
            return False
        elif update['first'] or update['second'] or update['third']:
            print('reset base runners')
            return False
        else:
            return True
    elif (update['half inning'] == situation['half inning']) & (update['outs'] < situation['outs']):
        print('outs decreasing')
        return False
    elif update['half inning'] & \
        (update['outs'] - situation['outs'] + update['home score diff'] - 1 != -base_runners_diff):
        print('more outs than base runner change')
        return False
    elif (not update['half inning']) & \
        (update['outs'] - situation['outs'] + update['visiting score diff'] - 1 != -base_runners_diff):
        print('more outs than base runner change')
        return False
    else:
        return True


In [26]:
def get_sit_feed(sit, out, outcome_dict):
    sit_feed = {'inning' : sit[0],
                'home_score' : sit[1], 
                'visiting_score' : sit[2],
                'half inning' : sit[3],
                'outs' : sit[4],
                'first' : sit[5],
                'second' : sit[6],
                'third' : sit[7]}
    return sit_feed, outcome_dict[out]

def get_sit_out_test(y_test, ind):
    test = [t[ind] for t in y_test]
    sit_out = {'inning_diff' : int(round(test[0])),
               'home score diff' : np.where(test[4])[0][0],
               'visiting score diff' : np.where(test[3])[0][0],
               'half inning' : int(round(test[1])),
               'outs' : np.where(test[2])[0][0],
               'first' : int(round(test[5])),
               'second' : int(round(test[6])),
               'third' : int(round(test[7]))}
    return sit_out

In [27]:
ind = 13040

In [28]:
ind += 1

In [29]:
sit_feed, outcome = get_sit_feed(situation[test_inds[ind]], outcome_test[ind], outcome_dict_rev)
print(sit_feed)
print(outcome)

{'inning': 2, 'home_score': 0, 'visiting_score': 1, 'half inning': 1, 'outs': 0, 'first': 1, 'second': 0, 'third': 0}
6G


In [30]:
sit_out = get_sit_out_test(y_test, ind)
sit_pred, pred = update_situation(sit_feed, np.array([[outcome_dict[outcome]]]), model)
print(outcome)
print(sit_out)
print(sit_pred)

TypeError: type numpy.ndarray doesn't define __round__ method

In [31]:
plt.bar(np.arange(3), [pred[5], pred[6], pred[7]])
plt.ylim([0, 1])
plt.xticks(np.arange(3), ['1st', '2nd', '3rd'])
plt.title('Probability of base runners')
plt.show()

plt.bar(np.arange(3), pred[2])
plt.ylim([0, 1])
plt.xticks(np.arange(3), ['0', '1', '2'])
plt.title('Probability of outs')
plt.show()

NameError: name 'pred' is not defined