In [37]:
import requests
import urllib.request
import numpy as np
import time
from bs4 import BeautifulSoup
import re
import pandas as pd
import pickle
from sklearn.metrics import mean_squared_error, mean_squared_log_error

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)



In [38]:
with open('phillies_2019.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    phillies_2019 = pickle.load(f)

In [39]:
with open('phils_prior.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    phils_prior = pickle.load(f)

# Combine and Re-Organize Data

In [40]:
phils_total_sum = pd.concat([phils_prior, phillies_2019])

In [41]:
phils_total_sum.reset_index(drop=True, inplace = True)

Y = phils_total_sum.opp_Win

phils_total_sum = phils_total_sum.drop(['opp_Win'], axis=1)
phils_total_sum = phils_total_sum.drop(['Game', 'opp_Game', 'IP', 'opp_IP'], axis=1)


In [42]:
int_columns = phils_total_sum.columns[:-2]

In [43]:
for x in int_columns:
    phils_total_sum[x] = phils_total_sum[x].astype(str).astype(int)

In [44]:
phils_total_sum.insert(loc=12, column = 'BA', value = phils_total_sum['H']/phils_total_sum['AB'])

In [45]:
phils_total_sum.insert(loc=34, column = 'opp_BA', value = phils_total_sum['opp_H']/phils_total_sum['opp_AB'])

In [46]:
phils_total_sum.insert(loc=13, column = 'OBP', value = (phils_total_sum['H'] + phils_total_sum['BB'] + phils_total_sum['opp_HBP'])/ phils_total_sum['PA'])

In [47]:
phils_total_sum.insert(loc=36, column = 'opp_OBP', value = (phils_total_sum['opp_H'] + phils_total_sum['opp_BB'] + phils_total_sum['HBP'])/ phils_total_sum['opp_PA'])

In [48]:
phils_total_sum.insert(loc=14, column = 'SLG', value = (((phils_total_sum['H']-(phils_total_sum['2B']+phils_total_sum['3B']+phils_total_sum['HR']))+
                                                        (phils_total_sum['2B']*2) + (phils_total_sum['3B'] *3) + (phils_total_sum['HR']*4))/ 
                                                        phils_total_sum['AB']))





In [49]:
phils_total_sum.insert(loc=38, column = 'opp_SLG', value = (((phils_total_sum['opp_H']-(phils_total_sum['opp_2B']+phils_total_sum['opp_3B']+phils_total_sum['opp_HR']))+
                                                        (phils_total_sum['opp_2B']*2) + (phils_total_sum['opp_3B'] *3) + (phils_total_sum['opp_HR']*4))/ 
                                                        phils_total_sum['opp_AB']))





In [50]:
phils_total_sum.insert(loc=15, column = 'OPS', value = phils_total_sum['OBP'] + phils_total_sum['SLG'])




In [51]:
phils_total_sum.insert(loc=40, column = 'opp_OPS', value = phils_total_sum['opp_OBP'] + phils_total_sum['opp_SLG'])




In [52]:
features_team = [col for col in phils_total_sum.columns if phils_total_sum[col].dtype in [np.object]]
X_team = phils_total_sum[features_team]

np.shape(X_team)

(753, 2)

In [53]:
# Make dummies
X_team = pd.get_dummies(X_team, drop_first=True)
np.shape(X_team)

(753, 54)

# Logistic Regression

In [54]:
averages = pd.DataFrame(columns = phils_total_sum.columns[:-2])

for i in range(1,len(phils_total_sum)):
    temp = phils_total_sum.loc[:i, :].mean()
    temp = pd.DataFrame(temp).T
    temp['opp_Win'] = Y[i]
    averages = pd.concat([averages,temp], ignore_index=True)

In [55]:
X = averages.drop(columns = ('opp_Win'))
Y = averages['opp_Win']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [56]:
# y_train = y_train.astype(int)

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(fit_intercept = False, C = 1e12, solver='liblinear')
model_log = logreg.fit(X_train, y_train)
# model_log

In [57]:
y_hat_test = logreg.predict(X_test)
y_hat_train = logreg.predict(X_train)

In [58]:
import numpy as np
#We could subtract the two columns. If values or equal, difference will be zero. Then count number of zeros.
residuals = np.abs(y_train - y_hat_train)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

0.0    357
1.0    207
Name: opp_Win, dtype: int64
0.0    0.632979
1.0    0.367021
Name: opp_Win, dtype: float64


In [59]:
print('Training r^2:', logreg.score(X_train.astype(int), y_train.astype(int)))
print('Testing r^2:', logreg.score(X_test.astype(int), y_test.astype(int)))
print('Training MSE:', mean_squared_error(y_train.astype(int), logreg.predict(X_train.astype(int))))
print('Testing MSE:', mean_squared_error(y_test.astype(int), logreg.predict(X_test.astype(int))))

Training r^2: 0.49645390070921985
Testing r^2: 0.5851063829787234
Training MSE: 0.5035460992907801
Testing MSE: 0.4148936170212766


In [60]:
from sklearn import preprocessing

# scale the data and perform train test split
X_scaled = preprocessing.scale(X)

# X_train, X_test, y_train, y_test = train_test_split(X_scaled,Y)

In [61]:
logreg = LogisticRegression(fit_intercept = False, C = 1e0, solver='liblinear')
model_log = logreg.fit(X_train.astype(int), y_train.astype(int))
print('Training r^2:', logreg.score(X_train.astype(int), y_train.astype(int)))
print('Testing r^2:', logreg.score(X_test.astype(int), y_test.astype(int)))
print('Training MSE:', mean_squared_error(y_train.astype(int), logreg.predict(X_train.astype(int))))
print('Testing MSE:', mean_squared_error(y_test.astype(int), logreg.predict(X_test.astype(int))))

Training r^2: 0.5921985815602837
Testing r^2: 0.5159574468085106
Training MSE: 0.4078014184397163
Testing MSE: 0.48404255319148937


In [62]:
X_all = pd.concat([pd.DataFrame(X_scaled), X_team], axis = 1)
# X_train, X_test, y_train, y_test = train_test_split(X_scaled,Y)
X_train = X_scaled[:600,:]
X_test = X_scaled[600:, :]
y_train = Y.loc[:599]
y_test = Y.loc[600:]

In [63]:
logreg_all = LogisticRegression(fit_intercept = True, C = 1e0, penalty='l1')
logreg_all.fit(X_train.astype(int), y_train.astype(int))
print('Training r^2:', logreg_all.score(X_train.astype(int), y_train.astype(int)))
print('Testing r^2:', logreg_all.score(X_test.astype(int), y_test.astype(int)))
print('Training MSE:', mean_squared_error(y_train.astype(int), logreg_all.predict(X_train.astype(int))))
print('Testing MSE:', mean_squared_error(y_test.astype(int), logreg_all.predict(X_test.astype(int))))

Training r^2: 0.5933333333333334
Testing r^2: 0.45394736842105265
Training MSE: 0.4066666666666667
Testing MSE: 0.5460526315789473


In [64]:
#X_test[-1, :]

In [65]:
# corr = phils_total_sum.corr()
# corr[corr<-0.5]

# Evaluating Logistic Regression

In [66]:
def precision(y_hat, y):
    #Could also use confusion matrix
    y_y_hat = list(zip(y, y_hat))
    tp = sum([1 for i in y_y_hat if i[0]==1 and i[1]==1])
    fp = sum([1 for i in y_y_hat if i[0]==0 and i[1]==1])
    return tp/float(tp+fp)

def recall(y_hat, y):
    #Could also use confusion matrix
    y_y_hat = list(zip(y, y_hat))
    tp = sum([1 for i in y_y_hat if i[0]==1 and i[1]==1])
    fn = sum([1 for i in y_y_hat if i[0]==1 and i[1]==0])
    return tp/float(tp+fn)

def accuracy(y_hat, y):
    #Could also use confusion matrix
    y_y_hat = list(zip(y, y_hat))
    tp = sum([1 for i in y_y_hat if i[0]==1 and i[1]==1])
    tn = sum([1 for i in y_y_hat if i[0]==0 and i[1]==0])
    return (tp+tn)/float(len(y_hat))

def f1(y_hat,y):
    precision_score = precision(y_hat,y)
    recall_score = recall(y_hat,y)
    numerator = precision_score * recall_score
    denominator = precision_score + recall_score
    return 2 * (numerator / denominator)

y_hat_test = logreg.predict(X_test)
y_hat_train = logreg.predict(X_train)

#print('Training Precision: ', precision(y_hat_train, y_train))
#print('Testing Precision: ', precision(y_hat_test, y_test))
#print('\n\n')

#print('Training Recall: ', recall(y_hat_train, y_train))
#print('Testing Recall: ', recall(y_hat_test, y_test))
#print('\n\n')

print('Training Accuracy: ', accuracy(y_hat_train, y_train))
print('Testing Accuracy: ', accuracy(y_hat_test, y_test))
print('\n\n')

#print('Training F1-Score: ',f1(y_hat_train,y_train))
#print('Testing F1-Score: ',f1(y_hat_test,y_test))

Training Accuracy:  0.5633333333333334
Testing Accuracy:  0.46710526315789475





# XG Boost

In [31]:
import xgboost as xgb
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn import preprocessing
warnings.filterwarnings('ignore')
%matplotlib inline

In [32]:
#labels = Y
#labels_removed_df = X
#scaler = StandardScaler()
#scaled_df = scaler.fit_transform(labels_removed_df)
X = averages.drop(columns = ('opp_Win'))
X_scaled = preprocessing.scale(X)

X_train = X_scaled[:600,:]
X_test = X_scaled[600:, :]
y_train = Y.loc[:599]
y_test = Y.loc[600:]

In [33]:
X_test.shape, y_test.shape

((152, 50), (152,))

In [34]:
clf = xgb.XGBClassifier(max_depth = 7)
clf.fit(X_train, y_train)
training_preds = clf.predict(X_train)
val_preds = clf.predict(X_test)
training_accuracy = accuracy_score(y_train.values.astype(int), training_preds.astype(int))
val_accuracy = accuracy_score(y_test.values.astype(int), val_preds.astype(int))

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

Training Accuracy: 100.0%
Validation accuracy: 56.58%


In [35]:
def prob_odds(game):
    prob_phils_win = game[0]
    prob_other_win = game[1]
    if prob_phils_win >0.5:
        x = ((-prob_phils_win)*100)/(1-prob_phils_win)
        y = (100-(prob_other_win*100))/prob_other_win
    else:
        x = (100-(prob_phils_win*100))/prob_phils_win
        y = ((-prob_other_win)*100)/(1-prob_other_win)
    return x, y

In [35]:
game = clf.predict_proba(X_test[0,:].reshape(1,-1)).tolist()

In [36]:
(100 - game[0][0]*100) / game[0][0]

185.38154938566447

In [37]:
for x in range(12,20):
    game = clf.predict_proba(X_test[-x,:].reshape(1,-1)).tolist()[0]
    print(prob_odds(game))

(-155.083449543784, 155.08346893543134)
(-178.57662576766575, 178.576648895741)
(-119.35348233715335, 119.35348233715335)
(172.55938857286284, -172.55938857286284)
(164.68530983101505, -164.68530983101505)
(-167.97154366360957, 167.97156506428593)
(-156.12928400749158, 156.12926445651092)
(-127.37810145456685, 127.3780860465285)
