In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras import layers
from tensorflow.keras import callbacks

In [2]:
#reading in team data
data = pd.read_csv("team_data.csv")

#changing binary outcome to 0 and 1
data.LgWin = data.LgWin.replace(['N', 'Y'], [0, 1])

#data2 is all the data from 1995-2015 and NL only (this is the longest possible range that contains all the variables of interest)
data2 = data[(data.yearID >= 1995) & (data.yearID <= 2015) & (data.lgID == 'NL')]

#columns that I want to use to predict LgWin: I want to use fielding percentage, stolen bases, walks, hits 
cols = ['LgWin', 'FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R', 'X2B', 'X3B']

#storing all relevant columns into df, then splitting into training and validation sets
df = data2[cols]

df_train = df.sample(frac=0.75, random_state=0)
df_valid = df.drop(df_train.index)

X_train = df_train.drop('LgWin', axis=1)
X_valid = df_valid.drop('LgWin', axis=1)
y_train = df_train['LgWin']
y_valid = df_valid['LgWin']

In [3]:
#Neural network with 4 hidden layers, batch normalization, and dropout

model = keras.Sequential([
    layers.BatchNormalization(input_shape = [9]),
    layers.Dense(10, activation='relu', input_shape=[9]),  
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(10, activation='relu'), 
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1, activation='sigmoid'),
])

#compiling using adam optimizer

opt = keras.optimizers.Adam(learning_rate=0.01)

model.compile(
    optimizer = opt,
    loss ='binary_crossentropy',
    metrics =['binary_accuracy'],
)

#early stopping

early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=500,
    verbose = False
)

In [4]:
# variables that I trained the neural network on
predict_cols = ['FP', 'SB', 'BB', 'H',  'HR', 'ERA', 'R', 'X2B', 'X3B']

# same as above, but with important information like team, year, and whether the National League was won. Mainly to compare predictions against actual results

all_relevant_cols = ['FP', 'SB', 'BB', 'H', 'HR', 'ERA', 'R','X2B', 'X3B', 'teamID', 'yearID',  'LgWin']
Z = data[(data.yearID >= 2016) & (data.yearID <= 2020) & (data.lgID == 'NL')]
predict_these = Z[all_relevant_cols]
predict_these_inputs = Z[predict_cols]

In [5]:
#Converting model predictions from numpy array to dataframe
predictions = pd.DataFrame(model.predict(predict_these_inputs))

#Attaching new index to predict_these and predictions starting at 1 and going to 75
new_index = [i for i in range(1, 76)]

predict_these['new_index'] = new_index
predictions['new_index'] = new_index
predict_these.set_index('new_index', inplace = True)
predictions.set_index('new_index', inplace = True)

#combining the predictions with the data used for the predictions
combined = pd.concat((predict_these, predictions), axis = 1)

#rename column with predictions to probability_Lg_Win, display all 75 rows (5 years * 15 teams)

combined.rename(columns = {0: "probability_Lg_Win"}, inplace=True)

combined
#pd.set_option("display.max_rows", None) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_these['new_index'] = new_index


Unnamed: 0_level_0,FP,SB,BB,H,HR,ERA,R,X2B,X3B,teamID,yearID,LgWin,probability_Lg_Win
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.983,137,463,1479,190,5.09,752,285,56,ARI,2016,0.0,0.002860
2,0.983,75,502,1404,122,4.51,649,295,27,ATL,2016,0.0,0.002853
3,0.983,66,656,1409,199,3.15,808,293,30,CHN,2016,1.0,0.191406
4,0.983,139,452,1403,164,4.91,716,277,33,CIN,2016,0.0,0.001774
5,0.982,66,494,1544,204,4.91,845,318,47,COL,2016,0.0,0.012850
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.978,16,167,425,59,4.68,219,76,6,PIT,2020,0.0,0.000047
72,0.985,55,204,506,95,3.86,325,103,12,SDN,2020,0.0,0.154161
73,0.980,19,195,532,81,4.64,299,107,14,SFN,2020,0.0,0.000306
74,0.983,18,205,410,51,3.90,240,73,7,SLN,2020,0.0,0.005095


In [6]:
final_data = combined.copy()

#changing LgWin column back to ['N', 'Y']; it was changed to [0,1] for classification

final_data.LgWin = final_data.LgWin.replace([0, 1], ['N', 'Y'])

#getting the index of final_data containing the maximum league win probability that my model assigned to each team

highest_probability_each_year = final_data.groupby('yearID').probability_Lg_Win.idxmax()
highest_probability_each_year

yearID
2016     3
2017    21
2018    36
2019    51
2020    66
Name: probability_Lg_Win, dtype: int64

In [7]:
#The team my model predicts to win the national league wins each year from 2016-2020 except for the 2019 Washington Nationals.

final_data.loc[highest_probability_each_year]

Unnamed: 0_level_0,FP,SB,BB,H,HR,ERA,R,X2B,X3B,teamID,yearID,LgWin,probability_Lg_Win
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3,0.983,66,656,1409,199,3.15,808,293,30,CHN,2016,Y,0.191406
21,0.985,77,649,1347,221,3.38,770,312,20,LAN,2017,Y,0.202654
36,0.983,75,647,1394,235,3.38,804,296,33,LAN,2018,Y,0.206411
51,0.982,57,607,1414,279,3.37,886,302,20,LAN,2019,N,0.273965
66,0.982,29,228,523,118,3.02,349,97,6,LAN,2020,Y,0.248405
