In [81]:
# normal imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns',500)

import warnings
warnings.filterwarnings('ignore')

# all the modeling imports
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB

# scoring, feature selection, and gridsearch
from sklearn.metrics import mean_squared_error,accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score

from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

In [15]:
import os
import sys
# Absolute path of 'baseline_model.py', resolved against the current working directory.
module_path = os.path.abspath('baseline_model.py')

In [16]:
# Per-game ("normal") dataset is examined first.
df_norm = pd.read_csv('data/pbp_data_mvp.csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved -- not a feature.
del df_norm['Unnamed: 0']

In [17]:
# Identifying columns and outcomes set aside before df_norm is trimmed, so they can be
# merged onto the aggregate data later. .copy() makes this an independent frame:
# it is renamed afterwards and must not behave like a slice of df_norm.
df_norm_copy_for_attaching = df_norm[['Date','home_team','away_team','home_outcome','away_outcome','game_number_of_season']].copy()
# Season-to-date aggregate statistics
df_agg = pd.read_csv('data/aggregate_data.csv')
df_agg.drop(columns='Unnamed: 0',inplace=True)

# Base Modeling

In [18]:
def baseline_prediction(X,y):
    '''
    Fit four untuned classifiers on a single train/test split and print their accuracy.

    Parameters
    ----------
    X : feature set
    y : target

    The data is split with train_test_split(random_state=99). Logistic regression,
    a decision tree (max_depth=15), a random forest and XGBoost are each fitted on
    the training part, and the train and test accuracy (rounded to 3 decimals) are
    printed per model. Nothing is returned.
    '''
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=99)

    # Every model is fitted on the unscaled features. A StandardScaler used to be
    # fitted here but its output was never passed to any model, so it was removed.
    # All estimators that accept a seed get one so repeated runs print the same scores.
    models = [
        ('Logistic Regression', LogisticRegression(random_state=99)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=15, random_state=99)),
        ('Random Forest', RandomForestClassifier(random_state=99)),
        # keyword must be spelled 'objective'; a misspelled keyword is not the objective setting
        ('XGBoost', xgb.XGBClassifier(objective='binary:logistic')),
    ]

    for name, model in models:
        model.fit(X_train,y_train)
        train_score = accuracy_score(y_train,model.predict(X_train))
        test_score = accuracy_score(y_test,model.predict(X_test))
        print('{} Accuracy\nTrain={} Test={}'.format(name,round(train_score,3),round(test_score,3)))

### Normal Data (game per game stats)

Each row in this data is the stats from one game. The target is the outcome of the next game.

In [19]:
# Columns that will not be used for modelling
cols_to_drop = [
    'forfeit_info', 'lf_ump_id', 'rf_ump_id', 'protest_info',
    'date_game_completed', 'additional_info', 'save_pitch_id',
    'game_win_rbi_batter_id', 'game_in_series', 'away_catch_interference',
    'home_catch_interference', 'away_pitch_balks',
    'home_pitch_balks', 'day_of_week', 'away_league',
    'away_team_game_number', 'home_league',
    'home_team_game_number', 'day_or_night', 'park_id',
    'attendance', 'time_of_game', 'away_line_scores',
    'home_line_scores', 'year', 'id', 'outcome',
    'Date', 'away_team', 'home_team',
]
df_norm.drop(labels=cols_to_drop, axis=1, inplace=True)
# Then remove the contiguous run of columns from 'hb_ump_id' through 'acquisition_info'
# (evaluated after the drop above, so the order of these two statements matters).
df_norm.drop(columns=df_norm.loc[:, 'hb_ump_id':'acquisition_info'].columns, inplace=True)

In [20]:
# PASO can be infinite: turn +/-inf into NaN, then discard every row with a missing value
df_norm.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df_norm.dropna(axis=0, how='any', inplace=True)

In [21]:
# target vector and feature matrix for the per-game data
y_norm = df_norm['target']
X_norm = df_norm.drop(columns=['target'])

In [22]:
# Baseline scores of the four untuned models on the per-game data
baseline_prediction(X=X_norm, y=y_norm)

Logistic Regression Accuracy
Train=0.536 Test=0.531
Decision Tree Accuracy
Train=0.633 Test=0.509
Random Forest Accuracy
Train=1.0 Test=0.522
XGBoost Accuracy
Train=0.566 Test=0.526


The game-by-game statistics (using only the last game to predict the next game) perform pretty well. Logistic regression performs the best, and its overfitting is very low. Decision tree and random forest overfit heavily, with very poor results on the test set. XGBoost overfits slightly and performs about the same as random forest on the test set. I will be doing more testing on logistic regression.

### Aggregate Stats

In [23]:
# changing the name of these because the aggregate data already uses these names for the teams' win %
# (rename returns a new frame instead of mutating a column slice in place)
df_norm_copy_for_attaching = df_norm_copy_for_attaching.rename(
    columns={'home_outcome':'home_outcome_nonagg','away_outcome':'away_outcome_nonagg'})
# merging to df
# NOTE(review): assumes ('Date','home_team','away_team') identifies one game; two games
# sharing all three keys would be cross-joined and duplicate rows -- confirm.
df_agg = df_agg.merge(df_norm_copy_for_attaching,how='left',on=['Date','home_team','away_team'])
# taking out the first 10 games due to them not having enough data;
# .copy() because the following cells drop/replace on this frame in place
df_agg_cutt_early = df_agg[(df_agg.game_number_of_season > 10)].copy()

In [24]:
# identifier / bookkeeping columns that are not needed past this point
cols_to_drop_agg_cutt = [
    'home_team',
    'away_team',
    'Date',
    'away_outcome_nonagg',
    'game_number_of_season',
]
df_agg_cutt_early.drop(labels=cols_to_drop_agg_cutt, axis=1, inplace=True)

In [25]:
# PA/SO can be infinite: turn +/-inf into NaN, then discard every row with a missing value
df_agg_cutt_early.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df_agg_cutt_early.dropna(axis=0, how='any', inplace=True)

In [26]:
# the target is the outcome of the current game (home_outcome_nonagg); everything else is a feature
y_agg_cutt = df_agg_cutt_early['home_outcome_nonagg']
X_agg_cutt = df_agg_cutt_early.drop(columns=['home_outcome_nonagg'])

In [27]:
# Baseline scores of the four untuned models on the aggregate data
baseline_prediction(X=X_agg_cutt, y=y_agg_cutt)

Logistic Regression Accuracy
Train=0.564 Test=0.569
Decision Tree Accuracy
Train=0.799 Test=0.543
Random Forest Accuracy
Train=0.991 Test=0.526
XGBoost Accuracy
Train=0.608 Test=0.569


From this we can see that the aggregate data performs better than the normal per-game statistics data. The reason for this improvement is that each team is not being assessed on only its previous game but on all previous games that season. A team is not how well it did last game but how well it has been doing that season; one off game can throw off predictions. Logistic regression and XGBoost are going to be further improved due to their accuracy in these initial tests.

## Feature Selection

Feature selection can be very useful by reducing the amount of features used in the model while also maintaining/increasing the accuracy score. A filter method of select k-best is going to be used first and then a wrapper method of recursive feature elimination after.

#### Filter Method

In [88]:
# Training split defined in this cell so it does not depend on cells further down;
# same random_state=99 as baseline_prediction, so features are chosen on training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_agg_cutt,y_agg_cutt,random_state=99)

# instantiating a feature selector object (k must be passed by keyword in current scikit-learn)
feature_selector = SelectKBest(f_classif,k=50)

# fitting to our data
feature_selector.fit(X_train,y_train)

# features that we keep
selected_filter = X_train.columns[feature_selector.get_support()]

In [89]:
# Baseline models restricted to the 50 features kept by the filter method
baseline_prediction(X=X_agg_cutt[selected_filter], y=y_agg_cutt)

Logistic Regression Accuracy
Train=0.56 Test=0.565
Decision Tree Accuracy
Train=0.78 Test=0.536
Random Forest Accuracy
Train=0.991 Test=0.53
XGBoost Accuracy
Train=0.605 Test=0.565


#### Wrapper Method

In [79]:
# Hold-out split with the same seed used everywhere else in the notebook
(X_train, X_test,
 y_train, y_test) = train_test_split(X_agg_cutt, y_agg_cutt, random_state=99)

# Logistic regression drives the elimination because it was the best performer in the baseline tests
estimator = LogisticRegression()

# Recursive feature elimination: one feature removed per step, 10-fold CV, all cores
feature_selector = RFECV(estimator=estimator, step=1, cv=10, n_jobs=-1)
feature_selector.fit(X_train, y_train)

# Names of the columns the selector kept
selected_wrapper = X_train.columns[feature_selector.get_support()]

print('# of feature pre feature selection: {}'.format(X_train.shape[1]))
print('# of feature post feature selection: {}'.format(len(selected_wrapper)))

# of feature pre feature selection: 76
# of feature post feature selection: 50


In [80]:
# Baseline models restricted to the features kept by recursive feature elimination
baseline_prediction(X=X_agg_cutt[selected_wrapper], y=y_agg_cutt)

Logistic Regression Accuracy
Train=0.564 Test=0.568
Decision Tree Accuracy
Train=0.779 Test=0.531
Random Forest Accuracy
Train=0.991 Test=0.531
XGBoost Accuracy
Train=0.605 Test=0.566


The recursive feature elimination produces slightly better results than the filter method. This shows that using 26 fewer features (50 instead of 76) only decreased prediction accuracy by 0.001 for logistic regression and 0.003 for XGBoost. I am going to continue with the full dataset, as computation time is not an issue in this project. I also believe that allowing as much information as possible into the models will increase accuracy as much as possible.

# In Depth Modeling

Now that we have identified the data we are going to use, it is time to find the most optimal version of logistic regression and XGBoost, as they were the best performers. I am going to use grid search to find the best parameters for both models.

### What are the aggregated statistics

The way the aggregated stats are calculated is completely different from the stats of the normal non-aggregated table. For aggregated stats, the variables are the average of the team's statistics up until that game, not including the current game. This means we do not have to predict the outcome of the next game (the next game the home team plays) but can predict the outcome of the game in that row. There is no data leakage, because the stats from that game have not yet been added to the aggregated stats and are therefore not included in the team's average up until that game.

## Logistic Regression Grid Search

In [55]:
# Hold-out split of the aggregate data used by the grid searches below
(X_train, X_test,
 y_train, y_test) = train_test_split(X_agg_cutt, y_agg_cutt, random_state=99)

In [64]:
# Collects one (best_params, test accuracy) tuple per grid-search run
param_grid_iterations = list()

In [74]:
# LogisticRegression's default 'lbfgs' solver only supports the l2 penalty, so every
# l1 candidate failed to fit. The l1 candidates get their own sub-grid with a solver
# that supports them; the total number of candidates (18) is unchanged.
param_grid_lr = [
    {
        'penalty': ['l2'],
        'C': [1e10,1e9,1e8],
        'max_iter':[60,80,100]
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear'],
        'C': [1e10,1e9,1e8],
        'max_iter':[60,80,100]
    },
]

# 10-fold cross-validated search over the grid, scored by accuracy, on all cores
grid_lr=GridSearchCV(LogisticRegression(),
                         param_grid_lr, 
                         cv=10, 
                         scoring='accuracy', 
                         verbose=1, 
                         n_jobs=-1)

grid_lr.fit(X_train,y_train)

# Single best score achieved across all params
print(grid_lr.best_score_)

# Dictionary containing the parameters used to generate that score
print(grid_lr.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_lr.best_estimator_)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   16.6s finished


0.5608405148387396
{'C': 1000000000.0, 'max_iter': 80, 'penalty': 'l2'}
LogisticRegression(C=1000000000.0, max_iter=80)


In [75]:
# Predictions of the refit best estimator on both splits
y_pred_test = grid_lr.best_estimator_.predict(X_test)
y_pred_train = grid_lr.best_estimator_.predict(X_train)

# Accuracy on each split
test_acc = accuracy_score(y_test, y_pred_test)
train_acc = accuracy_score(y_train, y_pred_train)
print("Train Accuracy: %f" % (train_acc))
print("Test Accuracy: %f" % (test_acc))

# Record this run; note that re-running the cell appends another entry
param_grid_iterations.append((grid_lr.best_params_, test_acc))

Train Accuracy: 0.563604
Test Accuracy: 0.570783


In [76]:
# One line per recorded grid-search run: (best params, test accuracy)
for item in iter(param_grid_iterations):
    print(item)

({'C': 1000000000.0, 'max_iter': 100, 'penalty': 'l2'}, 0.5707825650043964)
({'C': 1000000000.0, 'max_iter': 80, 'penalty': 'l2'}, 0.5707825650043964)
({'C': 1000000000.0, 'max_iter': 80, 'penalty': 'l2'}, 0.5707825650043964)


From testing different parameter grids on this problem, we can see that it favors very weak regularization (a very large C). L2 is also the best-performing penalty. The best number of max iterations settles at 80. All of these runs have the same test score, so I will go with 80, as it uses the fewest iterations for the same score.

From this we can see that the best logistic regression achieved a score of 57%. This is very good, as it is 7 percentage points above being completely random, which can help inform which team to choose.

#### Obtaining Feature Importances

In [95]:
# Final logistic regression with the grid-search winner's settings, fitted on the training split
lr_final = LogisticRegression(penalty='l2', C=1e9, max_iter=80).fit(X_train, y_train)

In [97]:
# Coefficient vector of the first (only, for a binary target) row of coef_.
# NOTE(review): the model was fitted on unscaled features, so coefficient magnitudes
# also reflect each feature's units -- confirm before reading them as importance.
feature_importance = lr_final.coef_[0, :]

In [98]:
# Column labels of the aggregate feature matrix, to be paired with the coefficients
features = X_agg_cutt.columns

In [99]:
# (feature, coefficient) pairs ordered by absolute coefficient, largest first
feature_importance_list = sorted(
    zip(features, feature_importance),
    key=lambda pair: np.abs(pair[1]),
    reverse=True,
)

In [100]:
# ten entries with the largest absolute coefficients
feature_importance_list[:10]

[('away_at_bats', -0.04345678209051356),
 ('home_at_bats', -0.04260631266103364),
 ('away_def_putouts', -0.03404399300307917),
 ('home_def_putouts', -0.031812435006491196),
 ('home_total_bases', -0.01812287965758287),
 ('away_total_bases', -0.013503773002808774),
 ('away_def_assists', -0.011995252966007584),
 ('away_so', -0.010932023522150374),
 ('home_hits', -0.01016044205761474),
 ('away_hits', -0.01008158397118806)]

## XG Boost Grid Search

Going to use XGBoost for now, as it performed the best out of all combinations. This will be using the aggregated data.

In [None]:
# Base XGBoost classifier whose hyperparameters are searched below
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
# 2 * 3 * 3 * 3 = 54 candidate combinations
param_dist = {'n_estimators': [300,500],
              'learning_rate': [1,0.1,0.01],
              'max_depth': [5, 7, 10],
              'min_child_weight': [1, 2, 3]
             }
# 5-fold cross-validated grid search scored by accuracy.
# The 'iid' argument no longer exists in GridSearchCV and raises a TypeError if passed,
# so it is omitted.
gsearch1 = GridSearchCV(
    estimator = clf_xgb,
    param_grid = param_dist, 
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    cv=5)

In [None]:
# Fit on the aggregate-data training split (X_train / y_train), as this section states;
# X_train_norm / y_train_norm are not defined anywhere in the notebook.
gsearch1.fit(X_train,y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated accuracy
gsearch1.best_params_

In [None]:
# Mean cross-validated accuracy achieved by the best hyperparameter combination
gsearch1.best_score_

### Testing grid search

In [None]:
# Hold-out split of the aggregate data. The frames are named X_agg_cutt / y_agg_cutt;
# X_agg / y_agg are not defined anywhere in the notebook.
X_train_agg, X_test_agg, y_train_agg, y_test_agg = train_test_split(X_agg_cutt,y_agg_cutt,random_state=99)

In [None]:
#XG Boost with hand-set hyperparameters
# (keyword must be spelled 'objective'; the misspelled 'objecteve' is not the objective setting)
xg_gridv1 = xgb.XGBClassifier(objective='binary:logistic',
                           colsample_bytree=0.4,
                           learning_rate=0.2,
                           max_depth=7,
                           min_child_weight=3,
                            n_estimators=300)
xg_gridv1.fit(X_train_agg,y_train_agg)
# accuracy on the training split
pred_xg_gridv1 = xg_gridv1.predict(X_train_agg)
score_xg_gridv1 = accuracy_score(y_train_agg,pred_xg_gridv1)
print('XGBoost Accuracy: {}'.format(score_xg_gridv1))

# accuracy on the held-out split
pred_xg_gridv1_test = xg_gridv1.predict(X_test_agg)
score_xg_gridv1_test = accuracy_score(y_test_agg,pred_xg_gridv1_test)
print('XGBoost Test Accuracy: {}'.format(score_xg_gridv1_test))

#### Logistic Regression Feature Weights

## Aggregate Data cutting first 10 games

## Feature Selection

Recursive Feature Elimination

In [None]:
# Hold-out split with the same seed used everywhere else in the notebook
X_train, X_test, y_train, y_test = train_test_split(X_agg_cutt,y_agg_cutt,random_state=99)

estimator = LogisticRegression()

# Recursive feature elimination (10-fold CV) that never goes below 15 features
feature_selector = RFECV(estimator=estimator, step=1, cv=10,n_jobs=-1,min_features_to_select=15)

feature_selector.fit(X_train,y_train)

selected_wrapper = X_train.columns[feature_selector.support_]

print('# of feature pre feature selection: {}'.format(len(X_train.columns)))
print('# of feature post feature selection: {}'.format(len(selected_wrapper)))

# Evaluate on the same frames the selector was built from
# (X_agg / y_agg are not defined anywhere in the notebook)
baseline_prediction(X_agg_cutt[selected_wrapper],y_agg_cutt)

Feature selection using the wrapper method actually saw a loss in performance in almost every metric.

## Logistic Regression Grid Search

In [None]:
# Each penalty is paired with a solver that supports it: the default 'lbfgs' solver
# only handles l2, and 'elasticnet' needs 'saga' plus an l1_ratio. In a single flat
# grid the l1 and elasticnet candidates would all fail to fit.
param_grid_lr = [
    {
        'penalty': ['l2'],
        'C': [0.1,0.05,0.01],
        'max_iter':[1000],
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear'],
        'C': [0.1,0.05,0.01],
        'max_iter':[1000],
    },
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],
        'C': [0.1,0.05,0.01],
        'max_iter':[1000],
    },
]

# 10-fold cross-validated search over the grid, scored by accuracy, on all cores
grid_lr=GridSearchCV(LogisticRegression(),
                         param_grid_lr, 
                         cv=10, 
                         scoring='accuracy', 
                         verbose=1, 
                         n_jobs=-1)

grid_lr.fit(X_train,y_train)

# Single best score achieved across all params
print(grid_lr.best_score_)

# Dictionary containing the parameters used to generate that score
print(grid_lr.best_params_)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid_lr.best_estimator_)

In [None]:
# Predictions of the refit best estimator on the training and held-out splits
y_pred_train = grid_lr.best_estimator_.predict(X_train)
y_pred_test = grid_lr.best_estimator_.predict(X_test)

# Accuracy on each split
test_acc = accuracy_score(y_test, y_pred_test)
train_acc = accuracy_score(y_train, y_pred_train)

print("Train Accuracy: %f" % (train_acc))
print("Test Accuracy: %f" % (test_acc))


## XG Boost

In [None]:
#XG Boost
# The *_agg_cutt split used below was never created in the notebook; build it here
# with the same seed as every other split.
X_train_agg_cutt, X_test_agg_cutt, y_train_agg_cutt, y_test_agg_cutt = train_test_split(
    X_agg_cutt,y_agg_cutt,random_state=99)

# keyword must be spelled 'objective'; the misspelled 'objecteve' is not the objective setting
xg_cutt = xgb.XGBClassifier(objective='binary:logistic')
xg_cutt.fit(X_train_agg_cutt,y_train_agg_cutt)
# accuracy on the training split
pred_xg_cutt = xg_cutt.predict(X_train_agg_cutt)
score_xg_cutt = accuracy_score(y_train_agg_cutt,pred_xg_cutt)
print('XGBoost Accuracy: {}'.format(score_xg_cutt))

# accuracy on the held-out split
pred_xg_cutt_test = xg_cutt.predict(X_test_agg_cutt)
score_xg_cutt_test = accuracy_score(y_test_agg_cutt,pred_xg_cutt_test)
print('XGBoost Test Accuracy: {}'.format(score_xg_cutt_test))

In [None]:
# The figure size has to be set before the figure is created; setting it after
# plot_importance only affected figures drawn later.
plt.rcParams['figure.figsize'] = [5, 5]
# Top 15 features by XGBoost's importance measure
xgb.plot_importance(xg_cutt,max_num_features = 15)
plt.show()

In [None]:
# Frequency of each distinct value in the away_int_walk column
df_norm['away_int_walk'].value_counts()

## Gaussian

In [None]:
# Gaussian naive Bayes on the aggregate-data split
gnb = GaussianNB()
gnb.fit(X_train,y_train)
# accuracy on the training split
pred_gnb = gnb.predict(X_train)
score_gnb = accuracy_score(y_train,pred_gnb)
print(score_gnb)

# accuracy on the held-out split, so the model can be compared with the others
# (training accuracy alone says nothing about generalisation)
pred_gnb_test = gnb.predict(X_test)
score_gnb_test = accuracy_score(y_test,pred_gnb_test)
print(score_gnb_test)

## Voting Classifier

In [None]:
# Three base learners combined by majority ("hard") vote
clf1 = LogisticRegression(C=0.05,penalty='l2',max_iter=1000)
clf2 = xgb.XGBClassifier()
# seeded like the rest of the notebook so the ensemble's scores are reproducible
clf3 = DecisionTreeClassifier(random_state=99)
eclf = VotingClassifier(estimators=[('lr',clf1),('xg',clf2),('dt',clf3)],
                        voting='hard')
eclf.fit(X_train,y_train)
# accuracy on the training split
eclf_preds = eclf.predict(X_train)
eclf_score = accuracy_score(y_train,eclf_preds)
print(eclf_score)

# accuracy on the held-out split
eclf_preds_test = eclf.predict(X_test)
eclf_score_test = accuracy_score(y_test,eclf_preds_test)
print(eclf_score_test)