In [120]:
import numpy as np
import pandas as pd
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [121]:
#reading in team data
data = pd.read_csv("team_data.csv")

#changing binary outcome to 0 and 1
data.LgWin = data.LgWin.replace(['N', 'Y'], [0, 1])

#data2 is all the data from 1995-2015 and NL only
data2 = data[(data.yearID >= 1995) & (data.yearID <= 2015) & (data.lgID == 'NL')]

#columns that I want to use to predict LgWin: I want to use fielding percentage, stolen bases, walks, hits 
cols = ['LgWin', 'FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R', 'X2B', 'X3B']

#storing all relevant columns into df, then splitting into training and validation sets
df = data2[cols]

df_train = df.sample(frac=0.75, random_state=0)
df_valid = df.drop(df_train.index)

X_train = df_train.drop('LgWin', axis=1)
X_valid = df_valid.drop('LgWin', axis=1)
y_train = df_train['LgWin']
y_valid = df_valid['LgWin']

In [None]:
#Neural network with 4 hidden layers, batch normalization, and dropout

model = keras.Sequential([
    layers.BatchNormalization(input_shape = [9]),
    layers.Dense(10, activation='relu', input_shape=[9]),  
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

#compiling using adam optimizer

opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(
    optimizer = opt,
    loss ='binary_crossentropy',
    metrics =['binary_accuracy'],
)

#early stopping

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)


history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=300,
    verbose = False
)

In [None]:
# variables that I trained the neural network on
predict_cols = ['FP', 'SB', 'BB', 'H',  'HR', 'ERA', 'R', 'X2B', 'X3B']
# same as above, but with important information like team, year, and whether the National League was won.
all_relevant_cols = ['FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R','X2B', 'X3B', 'teamID', 'yearID',  'LgWin']
Z = data[(data.yearID >= 2016) & (data.yearID <= 2020) & (data.lgID == 'NL')]
predict_these = Z[all_relevant_cols]
predict_these_inputs = Z[predict_cols]

In [117]:
#Converting model predictions from numpy array to dataframe
predictions = pd.DataFrame(model.predict(predict_these_inputs))

#Attaching new index to predict_these and predictions starting at 1 and going to 75
new_index = [i for i in range(1, 76)]

predict_these['new_index'] = new_index
predictions['new_index'] = new_index
predict_these.set_index('new_index', inplace = True)
predictions.set_index('new_index', inplace = True)

#combining the predictions with the data used for the predictions
combined = pd.concat((predict_these, predictions), axis = 1)

#rename column with predictions to probability_Lg_Win, display all 75 rows (5 years * 15 teams)

combined.rename(columns = {0: "probability_Lg_Win"}, inplace=True)
pd.set_option("display.max_rows", None) 

In [118]:
#changing LgWin column back to ['N', 'Y']; it was changed to [0,1] for classification
final_data = combined.copy()
final_data.LgWin = final_data.LgWin.replace([0, 1], ['N', 'Y'])
final_data

Unnamed: 0_level_0,FP,SB,BB,H,HR,ERA,R,X2B,X3B,teamID,yearID,LgWin,probability_Lg_Win
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.983,137,463,1479,190,5.09,752,285,56,ARI,2016,N,0.0002062917
2,0.983,75,502,1404,122,4.51,649,295,27,ATL,2016,N,0.0001684725
3,0.983,66,656,1409,199,3.15,808,293,30,CHN,2016,Y,0.09472162
4,0.983,139,452,1403,164,4.91,716,277,33,CIN,2016,N,6.352732e-05
5,0.982,66,494,1544,204,4.91,845,318,47,COL,2016,N,0.00776276
6,0.986,45,525,1376,189,3.7,725,272,21,LAN,2016,N,0.04712927
7,0.985,71,447,1460,128,4.05,655,259,42,MIA,2016,N,0.004623801
8,0.978,181,599,1299,194,4.08,671,249,19,MIL,2016,N,3.85151e-05
9,0.985,42,517,1342,218,3.57,671,240,19,NYN,2016,N,0.04791355
10,0.984,96,424,1305,161,4.63,610,231,35,PHI,2016,N,0.0001376867
