In [188]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

In [169]:
import warnings
warnings.filterwarnings('ignore')

In [282]:
training = pd.read_csv('training.csv')
training

Unnamed: 0,InPlay,Velo,SpinRate,HorzBreak,InducedVertBreak
0,0,95.33,2893.0,10.68,21.33
1,0,94.41,2038.0,17.13,5.77
2,0,90.48,2183.0,6.61,15.39
3,0,93.04,2279.0,9.33,14.57
4,0,95.17,2384.0,6.99,17.62
...,...,...,...,...,...
9995,0,93.61,2074.0,13.08,7.39
9996,1,90.72,1928.0,14.10,6.08
9997,1,94.19,2694.0,0.98,14.95
9998,0,92.65,2176.0,9.28,17.62


In [283]:
training.dropna(axis = 0, inplace=True)
training

Unnamed: 0,InPlay,Velo,SpinRate,HorzBreak,InducedVertBreak
0,0,95.33,2893.0,10.68,21.33
1,0,94.41,2038.0,17.13,5.77
2,0,90.48,2183.0,6.61,15.39
3,0,93.04,2279.0,9.33,14.57
4,0,95.17,2384.0,6.99,17.62
...,...,...,...,...,...
9995,0,93.61,2074.0,13.08,7.39
9996,1,90.72,1928.0,14.10,6.08
9997,1,94.19,2694.0,0.98,14.95
9998,0,92.65,2176.0,9.28,17.62


In [284]:
x = training.drop('InPlay', axis=1)
y = training.InPlay
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2)

Predicting probabilities - logistic regression?

In [159]:
log_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

In [181]:
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'none', 'elasticnet'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [182]:
grid_search = GridSearchCV(log_pipe, param_grid, cv=5, scoring='neg_log_loss')

In [183]:
grid_search.fit(x, y)
best_params = grid_search.best_params_
best_score = -grid_search.best_score_
best_params, best_score

({'classifier__C': 0.01,
  'classifier__penalty': 'l2',
  'classifier__solver': 'sag'},
 0.5799009933857702)

Random Forest

In [171]:
rf_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

param_grid_rf = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

In [173]:
grid_search_rf = GridSearchCV(rf_pipe, param_grid_rf, cv=3, scoring='neg_log_loss', verbose=1)
grid_search_rf.fit(x,y)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [174]:
best_params_rf = grid_search_rf.best_params_
best_score_rf = -grid_search_rf.best_score_
best_params_rf, best_score_rf

({'classifier__max_depth': 10,
  'classifier__min_samples_leaf': 4,
  'classifier__min_samples_split': 2,
  'classifier__n_estimators': 150},
 0.5835634334088041)

Try boosting this

In [175]:
xgb_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'))
])

param_grid_xgb = {
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 6, 9],
    'classifier__n_estimators': [50, 100, 150]
}

grid_search_xgb = GridSearchCV(xgb_pipe, param_grid_xgb, cv=3, scoring='neg_log_loss', verbose=1)

grid_search_xgb.fit(x,y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [176]:
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = -grid_search_xgb.best_score_
best_params_xgb, best_score_rf

({'classifier__learning_rate': 0.1,
  'classifier__max_depth': 3,
  'classifier__n_estimators': 50},
 0.5835634334088041)

In [179]:
mlp_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', MLPClassifier(max_iter=500))
])

param_grid_mlp = {
    'classifier__hidden_layer_sizes': [(10,), (20,), (10, 10), (100,)],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__solver': ['adam', 'sgd']
}
mlp_gs = GridSearchCV(mlp_pipe, param_grid_mlp, cv=3, scoring='neg_log_loss', verbose=1)
mlp_gs.fit(x,y)
best_params_mlp = mlp_gs.best_params_
best_score_mlp = -mlp_gs.best_score_
best_params_mlp, best_score_mlp

Fitting 3 folds for each of 24 candidates, totalling 72 fits


({'classifier__activation': 'tanh',
  'classifier__hidden_layer_sizes': (100,),
  'classifier__solver': 'adam'},
 0.5795724550741278)

Going back to logistic regression - not good enough on others to account for lack of interpretability

In [221]:
best_params

{'classifier__C': 0.01,
 'classifier__penalty': 'l2',
 'classifier__solver': 'sag'}

In [285]:
best_log = LogisticRegression(C=.01, penalty='l2', solver='sag')
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
best_log.fit(x_train_scaled, y_train)

yh = best_log.predict_proba(x_val_scaled)[:, 1]

In [286]:
loss_log = log_loss(y_val, yh)
loss_log

0.5774640208885886

In order to show the effect of each metric, we want to take the exp of the coefficients. The coefficients are currently in log form. This will give us coefficients similar to linear regression, where each coefficient will correspond to the change in probability of InPlay with a 1 unit increase in the feature, holding all other features constant. The change is the percentage of the previous probability.

In [287]:
coefs = best_log.coef_[0]

ratio_coefs = np.exp(coefs)

feature_names = x.columns
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Log Coefficient': coefs,
    'Coefficient Ratio': ratio_coefs
})

coef_df

Unnamed: 0,Feature,Log Coefficient,Coefficient Ratio
0,Velo,-0.069715,0.932659
1,SpinRate,-0.037556,0.96314
2,HorzBreak,0.06123,1.063144
3,InducedVertBreak,-0.133838,0.874732


Again, the coefficient ratio is the multiplier we apply to the probability given a 1-unit increase in the feature. We see that an increase in horizontal break leads to an increase in in-play probability. This could be bias with vertical break. Let's fit a linear regression between the two to see if there is a correlation.

In [288]:
vert = x_train_scaled[:, 3].reshape(-1, 1)
horiz = x_train_scaled[:, 2].reshape(-1, 1)
vert_val = x_val_scaled[:, 3].reshape(-1, 1)
horiz_val = x_val_scaled[:, 2].reshape(-1, 1)
lin_reg = LinearRegression()
lin_reg.fit(vert, horiz)
lin_reg.coef_, lin_reg.score(vert_val, horiz_val)

(array([[-0.58085568]]), 0.31483254909111613)

We can see through a quick and basic linear model that vertical break has a negative effect on horizontal break with a decent r-square, which may be leaking into the results of our logistic regression model. We are already using regularization and cross-validation, so there may not be much else we can do. We can try an interaction term.

In [289]:
best_log_int = LogisticRegression(C=.01, penalty='l2', solver='sag')
x_train['interaction_term'] = x_train.InducedVertBreak * x_train.HorzBreak
x_val['interaction_term'] = x_val.InducedVertBreak * x_val.HorzBreak
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
best_log_int.fit(x_train_scaled, y_train)

yh = best_log_int.predict_proba(x_val_scaled)[:, 1]
loss_log_int = log_loss(y_val, yh)
loss_log_int

0.577808322997021

Basically the same loss

In [290]:
coefs_int = best_log_int.coef_[0]

ratio_coefs_int = np.exp(coefs_int)

feature_names = x_train.columns
coef_df_int = pd.DataFrame({
    'Feature': feature_names,
    'Log Coefficient': coefs_int,
    'Coefficient Ratio': ratio_coefs_int
})

coef_df_int

Unnamed: 0,Feature,Log Coefficient,Coefficient Ratio
0,Velo,-0.071946,0.930581
1,SpinRate,-0.039097,0.961658
2,HorzBreak,-0.068376,0.933909
3,InducedVertBreak,-0.230553,0.794094
4,interaction_term,0.132893,1.142127


With the interaction term, we now see that horizontal break has a negative effect on the probability of the ball being put into play. This aligns better with our assumptions. Additionally, the effect of vertical break increased.

It is also assumed that a bigger difference in horizontal break and vertical break is important

In [291]:
best_log_int_diff = LogisticRegression(C=.01, penalty='l2', solver='sag')
x_train['break_diff'] = np.abs(x_train.InducedVertBreak - x_train.HorzBreak)
x_val['break_diff'] = np.abs(x_val.InducedVertBreak - x_val.HorzBreak)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
best_log_int_diff.fit(x_train_scaled, y_train)

yh = best_log_int_diff.predict_proba(x_val_scaled)[:, 1]
loss_log_int_diff = log_loss(y_val, yh)
loss_log_int_diff

0.5784705894584745

In [292]:
coefs_int_diff = best_log_int_diff.coef_[0]

ratio_coefs_int_diff = np.exp(coefs_int_diff)

feature_names = x_train.columns
coefs_int_diff = pd.DataFrame({
    'Feature': feature_names,
    'Log Coefficient': coefs_int_diff,
    'Coefficient Ratio': ratio_coefs_int_diff
})

coefs_int_diff

Unnamed: 0,Feature,Log Coefficient,Coefficient Ratio
0,Velo,-0.073588,0.929055
1,SpinRate,-0.028084,0.972306
2,HorzBreak,-0.011276,0.988787
3,InducedVertBreak,-0.153871,0.857382
4,interaction_term,0.012926,1.01301
5,break_diff,-0.118702,0.888073


In [304]:
x['break_diff'] = np.abs(x.InducedVertBreak - x.HorzBreak)
x['interaction_term'] = x.InducedVertBreak * x.HorzBreak

In [305]:
final_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2', 'none', 'elasticnet'],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
log_gs = GridSearchCV(final_pipe, param_grid, cv=3, scoring='neg_log_loss', verbose=1)
log_gs.fit(x,y)
best_params_log = log_gs.best_params_
best_score_log = -log_gs.best_score_
best_params_log, best_score_log

Fitting 3 folds for each of 120 candidates, totalling 360 fits


({'classifier__C': 0.01,
  'classifier__penalty': 'l2',
  'classifier__solver': 'saga'},
 0.5795180171791857)

In [306]:
final_log = LogisticRegression(C=.01, penalty='l2', solver='sag')
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
final_log.fit(x_train_scaled, y_train)

yh = final_log.predict_proba(x_val_scaled)[:, 1]
loss_log_int_diff = log_loss(y_val, yh)
loss_log_int_diff

0.5784708037826994

In [307]:
final_coefs = final_log.coef_[0]

ratio_coefs_final = np.exp(final_coefs)

feature_names = x_train.columns
final_coefs_df = pd.DataFrame({
    'Feature': feature_names,
    'Log Coefficient': final_coefs,
    'Coefficient Ratio': ratio_coefs_final
})

final_coefs_df

Unnamed: 0,Feature,Log Coefficient,Coefficient Ratio
0,Velo,-0.073579,0.929063
1,SpinRate,-0.028074,0.972316
2,HorzBreak,-0.011278,0.988785
3,InducedVertBreak,-0.153855,0.857396
4,interaction_term,0.012924,1.013008
5,break_diff,-0.118688,0.888085


In [315]:
deploy = pd.read_csv('deploy.csv')
deploy

Unnamed: 0,Velo,SpinRate,HorzBreak,InducedVertBreak
0,94.72,2375.0,3.10,18.15
1,95.25,2033.0,11.26,14.50
2,92.61,2389.0,11.00,21.93
3,94.94,2360.0,6.84,18.11
4,97.42,2214.0,16.70,13.38
...,...,...,...,...
9995,92.32,2148.0,9.72,16.70
9996,94.96,2420.0,-4.57,14.13
9997,92.83,2132.0,8.55,18.40
9998,97.12,2436.0,7.80,15.87


In [316]:
deploy.fillna(deploy.median(), inplace=True)

In [317]:
deploy['interaction_term'] = deploy.HorzBreak * deploy.InducedVertBreak
deploy['break_diff'] = np.abs(deploy.InducedVertBreak - deploy.HorzBreak)
deploy

Unnamed: 0,Velo,SpinRate,HorzBreak,InducedVertBreak,interaction_term,break_diff
0,94.72,2375.0,3.10,18.15,56.2650,15.05
1,95.25,2033.0,11.26,14.50,163.2700,3.24
2,92.61,2389.0,11.00,21.93,241.2300,10.93
3,94.94,2360.0,6.84,18.11,123.8724,11.27
4,97.42,2214.0,16.70,13.38,223.4460,3.32
...,...,...,...,...,...,...
9995,92.32,2148.0,9.72,16.70,162.3240,6.98
9996,94.96,2420.0,-4.57,14.13,-64.5741,18.70
9997,92.83,2132.0,8.55,18.40,157.3200,9.85
9998,97.12,2436.0,7.80,15.87,123.7860,8.07


In [318]:
deploy_scaled = scaler.transform(deploy)

In [319]:
deploy_yh = final_log.predict_proba(deploy_scaled)[:, 1]

In [320]:
deploy['Probability_InPlay'] = deploy_yh

In [321]:
deploy.drop('interaction_term', axis=1, inplace=True)

In [322]:
deploy

Unnamed: 0,Velo,SpinRate,HorzBreak,InducedVertBreak,break_diff,Probability_InPlay
0,94.72,2375.0,3.10,18.15,15.05,0.208525
1,95.25,2033.0,11.26,14.50,3.24,0.292691
2,92.61,2389.0,11.00,21.93,10.93,0.217061
3,94.94,2360.0,6.84,18.11,11.27,0.224867
4,97.42,2214.0,16.70,13.38,3.32,0.282747
...,...,...,...,...,...,...
9995,92.32,2148.0,9.72,16.70,6.98,0.272503
9996,94.96,2420.0,-4.57,14.13,18.70,0.212470
9997,92.83,2132.0,8.55,18.40,9.85,0.246125
9998,97.12,2436.0,7.80,15.87,8.07,0.239423


In [323]:
deploy.to_csv('predictions.csv')