In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns',500)

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

from sklearn.metrics import mean_squared_error,accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

In [29]:
df_agg = pd.read_csv('data/aggregate_data.csv')
df_agg.drop(columns='Unnamed: 0',inplace=True)
df_norm = pd.read_csv('data/pbp_data_mvp.csv')
df_norm.drop(columns='Unnamed: 0',inplace=True)

In [30]:
df_norm.target.value_counts()

 1    25777
 0    22787
-1        4
Name: target, dtype: int64

### Attaching Target to aggregate Data

In [31]:
info_for_attaching_to_agg = df_norm[['Date','home_team','away_team','target']]

In [32]:
df_agg = df_agg.merge(info_for_attaching_to_agg,how='left',on=['Date','home_team','away_team'])

# Base Modeling

## Normal (non-aggregate) data

In [33]:
def baseline_prediction(X,y):
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=99)

    # Scaling is Needed for Knn
    scaler = StandardScaler()  
    scaler.fit(X_train)

    X_train_scaled = scaler.transform(X_train)  
    X_test_scaled = scaler.transform(X_test)

    # Logistic Regression
    lr_base = LogisticRegression(random_state=99)
    lr_base.fit(X_train,y_train)
    pred_lr_base = lr_base.predict(X_train)
    score_lr_base = accuracy_score(y_train,pred_lr_base)
    print('Logistic Regression Accuracy: {}'.format(score_lr_base))

    pred_lr_base_test = lr_base.predict(X_test)
    score_lr_base_test = accuracy_score(y_test,pred_lr_base_test)
    print('Logistic Regression Test Accuracy: {}'.format(score_lr_base_test))
    
    # KNN
    knn_base = KNeighborsClassifier()
    knn_base.fit(X_train_scaled,y_train)
    pred_knn_base = knn_base.predict(X_train_scaled)
    score_knn_base = accuracy_score(y_train,pred_knn_base)
    print('KNN Accuracy: {}'.format(score_knn_base))

    pred_knn_base_test = knn_base.predict(X_test_scaled)
    score_knn_base_test = accuracy_score(y_test,pred_knn_base_test)
    print('KNN Test Accuracy: {}'.format(score_knn_base_test))
    
    # Decision Tree
    tree_base = DecisionTreeClassifier(max_depth=15)
    tree_base.fit(X_train,y_train)
    pred_tree_base = tree_base.predict(X_train)
    score_tree_base = accuracy_score(y_train,pred_tree_base)
    print('Tree Accuracy: {}'.format(score_tree_base))

    pred_tree_base_test = tree_base.predict(X_test)
    score_tree_base_test = accuracy_score(y_test,pred_tree_base_test)
    print('Tree Test Accuracy: {}'.format(score_tree_base_test))
    
    # Random Forest
    rand_base = RandomForestClassifier()
    rand_base.fit(X_train,y_train)
    pred_rand_base = rand_base.predict(X_train)
    score_rand_base = accuracy_score(y_train,pred_rand_base)
    print('Random Forest Accuracy: {}'.format(score_tree_base))

    pred_rand_base_test = rand_base.predict(X_test)
    score_rand_base_test = accuracy_score(y_test,pred_rand_base_test)
    print('Random Forest Test Accuracy: {}'.format(score_rand_base_test))
    
    #XG Boost
    xg_base = xgb.XGBClassifier(objecteve='binary:logistic')
    xg_base.fit(X_train,y_train)
    pred_xg_base = xg_base.predict(X_train)
    score_xg_base = accuracy_score(y_train,pred_xg_base)
    print('XGBoost Accuracy: {}'.format(score_xg_base))

    pred_xg_base_test = xg_base.predict(X_test)
    score_xg_base_test = accuracy_score(y_test,pred_xg_base_test)
    print('XGBoost Test Accuracy: {}'.format(score_xg_base_test))

## Normal Data

#### Dropping non-numeric columns

In [34]:
cols_to_drop = ['forfeit_info','lf_ump_id','rf_ump_id','protest_info',
                'date_game_completed','additional_info','save_pitch_id',
                'game_win_rbi_batter_id','game_in_series','away_catch_interference',
                'home_catch_interference','away_pitch_balks',
                'home_pitch_balks','day_of_week','away_league',
                'away_team_game_number','home_league',
                'home_team_game_number','day_or_night','park_id',
                'attendance','time_of_game','away_line_scores',
                'home_line_scores','year','id','outcome',
               'Date','away_team','home_team']
df_norm.drop(columns=cols_to_drop,inplace=True)
df_norm.drop(df_norm.loc[:,'hb_ump_id':'acquisition_info'],axis=1,inplace=True)

In [35]:
df_norm.replace([np.inf,-np.inf],np.nan,inplace=True)

In [36]:
df_norm.dropna(inplace=True)

In [37]:
X_norm = df_norm.drop(columns='target')
y_norm = df_norm.target

In [41]:
baseline_prediction(X_norm,y_norm)

Logistic Regression Accuracy: 0.5362338807450678
Logistic Regression Test Accuracy: 0.5305009092411969
KNN Accuracy: 0.6925768764465998
KNN Test Accuracy: 0.5055381054719789
Tree Accuracy: 0.6330871817480437
Tree Test Accuracy: 0.5081005124814019
Random Forest Accuracy: 0.6330871817480437
Random Forest Test Accuracy: 0.5213258389816499
XGBoost Accuracy: 0.5655240824424116
XGBoost Test Accuracy: 0.5260373615473632


## Aggregate data

#### Dropping cols for prediction

In [38]:
cols_to_drop_agg = ['home_team','away_team','Date']
df_agg.drop(columns=cols_to_drop_agg,inplace=True)

In [39]:
df_agg.replace([np.inf,-np.inf],np.nan,inplace=True)
df_agg.dropna(inplace=True)

In [40]:
X_agg = df_agg.drop(columns='target')
y_agg = df_agg.target

In [43]:
baseline_prediction(X_agg,y_agg)

Logistic Regression Accuracy: 0.5471378091872792
Logistic Regression Test Accuracy: 0.5436718253126988
KNN Accuracy: 0.7037455830388692
KNN Test Accuracy: 0.5328598685605258
Tree Accuracy: 0.7872791519434629
Tree Test Accuracy: 0.5370998516005936
Random Forest Accuracy: 0.7872791519434629
Random Forest Test Accuracy: 0.5328598685605258
XGBoost Accuracy: 0.6055830388692579
XGBoost Test Accuracy: 0.5534237863048548


# Feature Selection

In [25]:
estimator = LogisticRegression()

feature_selector = RFECV(estimator=estimator, step=1, cv=10,n_jobs=-1,min_features_to_select=15)

feature_selector.fit(X_train,y_train)

selected_wrapper = X_train.columns[feature_selector.support_]

In [45]:
baseline_prediction(X_agg[selected_wrapper],y_agg)

Logistic Regression Accuracy: 0.5481625441696113
Logistic Regression Test Accuracy: 0.543247827008692
KNN Accuracy: 0.7068904593639576
KNN Test Accuracy: 0.5307398770404919
Tree Accuracy: 0.7998939929328622
Tree Test Accuracy: 0.5359338562645749
Random Forest Accuracy: 0.7998939929328622
Random Forest Test Accuracy: 0.5383718465126139
XGBoost Accuracy: 0.6054416961130742
XGBoost Test Accuracy: 0.5534237863048548


# In Depth Modeling

Going to use XG boost for now as it perfomed the best out of all combinations. This will be using the aggregated data.

In [None]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': [100,300,500],
              'learning_rate': [1,0.1,0.05,0.01],
              'max_depth': [3, 5, 7, 10],
              'colsample_bytree': [0.5,0.45,0.4],
              'min_child_weight': [1, 2, 3]
             }
gsearch1 = GridSearchCV(
    estimator = clf_xgb,
    param_grid = param_dist, 
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    iid=False, 
    cv=5)

In [None]:
gsearch1.fit(X_train,y_train)

In [26]:
selected_wrapper

Index(['home_team_score', 'home_at_bats', 'home_hits', 'home_doubles',
       'home_triples', 'home_hrs', 'home_rbi', 'home_sh', 'home_sf',
       'home_hbp', 'home_walk', 'home_int_walk', 'home_so', 'home_sb',
       'home_cs', 'home_gidp', 'home_left_on_base', 'home_pitchers_used',
       'home_pitch_earned_runs', 'home_team_earned_runs',
       'home_pitch_wild_pitches', 'home_def_putouts', 'home_def_assists',
       'home_def_errors', 'home_def_double_plays', 'home_singles', 'home_PASO',
       'home_total_bases', 'home_runs_created', 'home_outcome',
       'away_team_score', 'away_at_bats', 'away_hits', 'away_doubles',
       'away_triples', 'away_hrs', 'away_rbi', 'away_sh', 'away_sf',
       'away_hbp', 'away_walk', 'away_int_walk', 'away_so', 'away_sb',
       'away_cs', 'away_gidp', 'away_left_on_base', 'away_pitchers_used',
       'away_pitch_earned_runs', 'away_team_earned_runs',
       'away_pitch_wild_pitches', 'away_def_putouts', 'away_def_assists',
       'away_def_error