In [65]:
import numpy as np
import pandas as pd
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [66]:
#reading in team data
data = pd.read_csv("team_data.csv")

#changing binary outcome to 0 and 1
data.LgWin = data.LgWin.replace(['N', 'Y'], [0, 1])

#data2 is all the data from 1995-2015 and NL only
data2 = data[(data.yearID >= 1995) & (data.yearID <= 2015) & (data.lgID == 'NL')]

#columns that I want to use to predict LgWin: I want to use fielding percentage, stolen bases, walks, hits 
cols = ['LgWin', 'FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R', 'X2B', 'X3B']

#storing all relevant columns into df, then splitting into training and validation sets
df = data2[cols]

df_train = df.sample(frac=0.75, random_state=0)
df_valid = df.drop(df_train.index)

X_train = df_train.drop('LgWin', axis=1)
X_valid = df_valid.drop('LgWin', axis=1)
y_train = df_train['LgWin']
y_valid = df_valid['LgWin']

In [67]:
#Neural network with 4 hidden layers, batch normalization, and dropout

model = keras.Sequential([
    layers.BatchNormalization(input_shape = [9]),
    layers.Dense(10, activation='relu', input_shape=[9]),  
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

#compiling using adam optimizer

opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(
    optimizer = opt,
    loss ='binary_crossentropy',
    metrics =['binary_accuracy'],
)

#early stopping

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)



history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=300,
    verbose = False
)


In [68]:
cols2 = ['FP', 'SB', 'BB', 'H',  'HR', 'ERA', 'R', 'X2B', 'X3B']
cols3 = ['teamID', 'yearID', 'FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R','X2B', 'X3B', 'LgWin']
Z = data[(data.yearID >= 2016) & (data.yearID <= 2020) & (data.lgID == 'NL')]
predict_these = Z[cols3]

In [69]:
predictions = pd.DataFrame(model.predict(Z[cols2]))

new_index = [i for i in range(75)]

predictions

predict_these['new_index'] = new_index
predict_these.set_index('new_index', inplace = True)

combined = pd.concat((predictions, predict_these), axis = 1)

In [70]:
combined.rename(columns = {0: "probability_Lg_Win"}, inplace=True)
pd. set_option("display.max_rows", None) 

In [71]:
combined

Unnamed: 0,probability_Lg_Win,teamID,yearID,FP,SB,BB,H,HR,ERA,R,X2B,X3B,LgWin
0,0.039499,ARI,2016,0.983,137,463,1479,190,5.09,752,285,56,0.0
1,0.029169,ATL,2016,0.983,75,502,1404,122,4.51,649,295,27,0.0
2,0.121899,CHN,2016,0.983,66,656,1409,199,3.15,808,293,30,1.0
3,0.022278,CIN,2016,0.983,139,452,1403,164,4.91,716,277,33,0.0
4,0.051713,COL,2016,0.982,66,494,1544,204,4.91,845,318,47,0.0
5,0.057406,LAN,2016,0.986,45,525,1376,189,3.7,725,272,21,0.0
6,0.048637,MIA,2016,0.985,71,447,1460,128,4.05,655,259,42,0.0
7,0.01841,MIL,2016,0.978,181,599,1299,194,4.08,671,249,19,0.0
8,0.073223,NYN,2016,0.985,42,517,1342,218,3.57,671,240,19,0.0
9,0.026217,PHI,2016,0.984,96,424,1305,161,4.63,610,231,35,0.0
