In [472]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.special import logit
from sklearn.metrics import mean_squared_error
from sklearn.metrics import brier_score_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
pd.options.display.max_columns = 100

In [34]:
def getBS(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  brier = brier_score_loss(y_test, y_pred)
  return brier

def forward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    best_features = []
    best = 'none'
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        for new_column in remaining_features:
            temp_list = best_features+[new_column]
            brier = getBS(model, X_train[temp_list], X_test[temp_list], y_train, y_test)
            if brier<temp:
                temp = brier
                best = new_column
        if best not in best_features:
            best_features.append(best)
        else:
            break
    return brier, best_features

def backward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    worst_features = []
    worst = 'none'
    while (len(worst_features)<len(initial_features)):
        subset = list(set(initial_features)-set(worst_features))
        for new_column in subset:
            remaining_features = list(set(subset)-set([new_column]))
            brier = getBS(model, X_train[remaining_features], X_test[remaining_features], y_train, y_test)
            if brier<temp:
                temp = brier
                worst = new_column
        if worst not in worst_features:
            worst_features.append(worst)
        else:
            break
    
    best_features = list(set(initial_features)-set(worst_features))
    
    brier = getBS(model, X_train[best_features], X_test[best_features], y_train, y_test)
    return brier, best_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

rf = RandomForestClassifier(criterion = 'entropy')
lr = LogisticRegression(C=1000)

In [3]:
df = pd.read_parquet('sc19.parquet')
df = df.drop(columns = 'des').sort_values(['game_date','game_pk','at_bat_number','pitch_number'])

In [4]:
df = df.loc[df.game_type == 'R']

### Question 1 - McCracken

In [44]:
# df.head()

In [215]:
df1 = df[['pitcher','player_name','events','game_year']]
df1 = df1.dropna().sort_values(['pitcher','game_year'])
df1 = df1.groupby(['pitcher','player_name','game_year','events']).events.count().reset_index(name ='count')
df1  = pd.pivot_table(df1, index=['pitcher', 'player_name','game_year'], columns = 'events', values='count', aggfunc='sum').reset_index()
df1 = df1.fillna(0)
df1 = df1[['pitcher', 'game_year', 'player_name', 'batter_interference',
       'catcher_interf','double', 'double_play',
       'field_error', 'field_out', 'fielders_choice', 'fielders_choice_out',
       'force_out', 'grounded_into_double_play',
       'hit_by_pitch', 'home_run', 'intent_walk', 'other_advance', 'other_out',
       'passed_ball', 'runner_double_play', 'sac_bunt', 'sac_bunt_double_play', 'sac_fly',
       'sac_fly_double_play', 'single', 'stolen_base_2b', 'stolen_base_3b',
       'stolen_base_home', 'strikeout', 'strikeout_double_play', 'triple',
       'triple_play', 'walk']]
df1['bf'] = df1.iloc[:, 3:].sum(axis=1)
df1['bb'] = df1['walk'] + df1['intent_walk']
df1['k'] = df1['strikeout'] + df1['strikeout_double_play']



In [216]:
df1.head()

events,pitcher,game_year,player_name,batter_interference,catcher_interf,double,double_play,field_error,field_out,fielders_choice,fielders_choice_out,force_out,grounded_into_double_play,hit_by_pitch,home_run,intent_walk,other_advance,other_out,passed_ball,runner_double_play,sac_bunt,sac_bunt_double_play,sac_fly,sac_fly_double_play,single,stolen_base_2b,stolen_base_3b,stolen_base_home,strikeout,strikeout_double_play,triple,triple_play,walk,bf,bb,k
0,110683,2012,"Batista, Miguel",0.0,0.0,12.0,0.0,1.0,99.0,1.0,1.0,6.0,3.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,40.0,0.0,0.0,0.0,35.0,1.0,0.0,0.0,32.0,244.0,33.0,36.0
1,112020,2012,"Carpenter, Chris",0.0,0.0,4.0,0.0,1.0,34.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,12.0,0.0,1.0,0.0,3.0,72.0,3.0,12.0
2,112526,2012,"Colon, Bartolo",0.0,0.0,24.0,1.0,9.0,313.0,1.0,0.0,15.0,14.0,1.0,17.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,117.0,0.0,0.0,0.0,91.0,0.0,3.0,0.0,20.0,636.0,23.0,91.0
3,112526,2013,"Colon, Bartolo",0.0,0.0,27.0,1.0,3.0,374.0,1.0,2.0,19.0,21.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,6.0,0.0,148.0,0.0,0.0,0.0,116.0,1.0,4.0,0.0,29.0,769.0,29.0,117.0
4,112526,2014,"Colon, Bartolo",0.0,0.0,41.0,1.0,7.0,391.0,2.0,1.0,13.0,14.0,5.0,22.0,3.0,0.0,0.0,0.0,0.0,8.0,0.0,4.0,0.0,152.0,0.0,0.0,0.0,150.0,1.0,3.0,1.0,27.0,846.0,30.0,151.0


In [217]:
df1['bbrate'] = df1.bb/df1.bf
df1['krate'] = df1.k/(df1.bf-df1.bb)

In [231]:
len(df1)

5896

In [190]:
from itertools import cycle

years = cycle([2012,2013,2014,2015,2016,2017,2018,2019])

df1a = pd.DataFrame(df1.pitcher.unique())
df1a = pd.concat([df1a]*8, ignore_index=True)
df1a.columns = ['pitcher']
df1a = df1a.sort_values(by='pitcher')
df1a['game_year'] = [next(years) for i in range(len(df1a))]

In [191]:
df1full = pd.merge(df1a, df1, on = ['pitcher','game_year'], how = 'left')
df1full['player_name'] = df1full['player_name'].fillna(method = 'ffill')
df1full = df1full.fillna(0)
df1full['bbrate_prev'] = df1full.groupby('pitcher').bbrate.transform(lambda x : x.shift(1, fill_value = 0))
df1full['krate_prev'] = df1full.groupby('pitcher').krate.transform(lambda x : x.shift(1, fill_value = 0))
df1full['bf_prev'] = df1full.groupby('pitcher').bf.transform(lambda x : x.shift(1, fill_value = 0))
df1full.head(5)

Unnamed: 0,pitcher,game_year,player_name,batter_interference,catcher_interf,double,double_play,field_error,field_out,fielders_choice,fielders_choice_out,force_out,grounded_into_double_play,hit_by_pitch,home_run,intent_walk,other_advance,other_out,passed_ball,runner_double_play,sac_bunt,sac_bunt_double_play,sac_fly,sac_fly_double_play,single,stolen_base_2b,stolen_base_3b,stolen_base_home,strikeout,strikeout_double_play,triple,triple_play,walk,bf,bb,k,bbrate,krate,bbrate_prev,krate_prev,bf_prev
0,110683,2012,"Batista, Miguel",0.0,0.0,12.0,0.0,1.0,99.0,1.0,1.0,6.0,3.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,40.0,0.0,0.0,0.0,35.0,1.0,0.0,0.0,32.0,244.0,33.0,36.0,0.135246,0.170616,0.0,0.0,0.0
1,110683,2013,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135246,0.170616,244.0
2,110683,2014,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110683,2015,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110683,2016,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [210]:
df1a = df1full.loc[(df1full.game_year != 2012) & (df1full.game_year != 2019) & (df1full.bf >=200) & (df1full.bf_prev >= 200)]
df1test = df1full.loc[(df1full.game_year == 2019) & (df1full.bf >=200) & (df1full.bf_prev >= 200)]

In [211]:
# df1.head()

In [212]:
# df1.game_year.value_counts()

In [213]:
res1a = smf.ols('bbrate ~ bbrate_prev' ,df1a).fit()
print(mean_squared_error(df1test.bbrate, res1a.predict(df1test)))
print(res1a.params)

0.0004598137497809986
Intercept      0.036375
bbrate_prev    0.526399
dtype: float64


In [214]:
res1b = smf.ols('krate ~ krate_prev' ,df1a).fit()
print(mean_squared_error(df1test.krate, res1b.predict(df1test)))
print(res1b.params)

0.002001949418894451
Intercept     0.060182
krate_prev    0.744610
dtype: float64


In [17]:
# df.events.value_counts()

In [18]:
# df.description.value_counts()

### Question 2 - Called Strike

In [53]:
# From Lecture 9
def HighMiss (z, top, threshold):
    return np.maximum (0, z - (top-threshold))
def LowMiss (z, bot, threshold):
    return np.maximum (bot+threshold-z,0)
def LeftMiss (x, threshold):
    return np.maximum (0, -x-threshold)
def RightMiss (x, threshold):
    return np.maximum (0, x-threshold)

In [144]:
df2 = df[['pitch_type','pitch_name','release_pos_x','release_pos_z','zone','stand','p_throws','balls','strikes','pfx_x','pfx_z',
         'plate_x','plate_z','vx0','vy0','sz_top','sz_bot','vz0','ax','ay','az','outs_when_up','on_1b','on_2b', 'on_3b',
         'release_speed','release_spin_rate','release_extension','description',
          'game_date','game_pk','at_bat_number','pitch_number','game_year','home_team','away_team']]
# df2[df2['release_extension'].isna()]
df2 = df2.dropna(subset=['pitch_type','release_speed']).sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
df2 = df2.astype({'zone': 'int64', 'balls':'int64','strikes':'int64','outs_when_up':'int64'})
df2 = df2[df2.pitch_name != 'Intentional Ball']
df2.loc[:,'Runners'] = np.where(df2[['on_1b']].isnull(), 0, 1) + np.where(df2[['on_2b']].isnull(), 0, 1) + np.where(df2[['on_3b']].isnull(), 0, 1)
df2.loc[:, 'PA'] = np.where(df2['stand'] == df2['p_throws'], 1, 0)
df2.loc[:, 'High'] = np.maximum (0, df2.plate_z - (df2.sz_top-0.25))
df2.loc[:, 'Low'] = np.maximum (0, df2.plate_z + 0.25-df2.sz_top)
df2.loc[:, 'Left'] = np.maximum (0, -df2.plate_x - 0.6)
df2.loc[:, 'Right'] = np.maximum (0, df2.plate_x - 0.6)

# df2.loc[:, 'Bottom_Edge'] = df2.plate_z/df2.sz_bot
# df2.loc[:, 'Top_Edge'] = df2.plate_z/df2.sz_top
df2.head()

Unnamed: 0,pitch_type,pitch_name,release_pos_x,release_pos_z,zone,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,sz_top,sz_bot,vz0,ax,ay,az,outs_when_up,on_1b,on_2b,on_3b,release_speed,release_spin_rate,release_extension,description,game_date,game_pk,at_bat_number,pitch_number,game_year,home_team,away_team,Runners,PA,High,Low,Left,Right
5796860,FF,4-Seam Fastball,-2.11,6.17,11,L,R,0,0,-0.37,1.19,-0.95,2.51,3.69,-135.82,3.37,1.56,-6.28,-3.17,28.73,-19.09,0,,,,93.3,,,called_strike,2012-04-04,317733,1,1,2012,MIA,STL,0,0,0.0,0.0,0.35,0.0
5796859,FF,4-Seam Fastball,-2.24,6.14,11,L,R,0,1,-0.63,1.46,-0.83,2.67,5.07,-137.35,3.33,1.61,-6.57,-6.89,31.46,-15.36,0,,,,94.5,,,called_strike,2012-04-04,317733,1,2,2012,MIA,STL,0,0,0.0,0.0,0.23,0.0
5796858,CU,Curveball,-2.36,6.1,13,L,R,0,2,0.25,-0.46,-1.01,1.14,2.34,-118.3,3.25,1.49,-3.34,3.85,21.88,-38.6,0,,,,81.3,,,ball,2012-04-04,317733,1,3,2012,MIA,STL,0,0,0.0,0.0,0.41,0.0
5796857,FF,4-Seam Fastball,-2.03,5.98,5,L,R,1,2,-0.67,1.55,-0.22,2.36,6.36,-139.54,3.35,1.61,-7.48,-7.59,32.38,-13.62,0,,,,96.1,,,hit_into_play,2012-04-04,317733,1,4,2012,MIA,STL,0,0,0.0,0.0,0.0,0.0
5796856,FF,4-Seam Fastball,-1.9,6.02,9,L,R,0,0,-0.72,1.58,0.32,2.03,7.58,-138.67,3.47,1.67,-8.5,-8.22,30.63,-13.26,1,,,,95.6,,,hit_into_play,2012-04-04,317733,2,1,2012,MIA,STL,0,0,0.0,0.0,0.0,0.0


In [160]:
df2.pitch_name.value_counts()

4-Seam Fastball    1987175
Slider              883881
2-Seam Fastball     646787
Changeup            585483
Sinker              528033
Curveball           464182
Cutter              314357
Knuckle Curve       128865
Split-Finger         86286
Knuckleball          20442
Eephus                1832
Pitch Out             1719
Forkball              1660
Screwball              334
Fastball                34
Name: pitch_name, dtype: int64

In [154]:
dfcs = df2.loc[df2.description.isin(['ball', 'blocked_ball', 'called_strike'])]
dfcs.loc[:,'called_k'] = 1*(dfcs.description == 'called_strike');

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [134]:
df27 = dfcs.loc[(dfcs.game_year < 2018)]
df8 = dfcs.loc[(dfcs.game_year == 2018)]

In [515]:
model_cs = smf.logit('called_k ~ zone + HighMiss(plate_z, sz_top, 0.25) + LowMiss(plate_z, sz_bot, 0.25) \
+ LeftMiss(plate_x, 0.6) + RightMiss(plate_x, 0.6) + strikes + balls + Runners + stand \
+ plate_x + plate_z + sz_top + sz_bot \
+ release_pos_x + release_speed + pitch_type',df27).fit()
print(brier_score_loss(df8.called_k, model_cs.predict(df8)))
print(model_cs.summary())

Optimization terminated successfully.
         Current function value: 0.213384
         Iterations 10
0.05753603844233004
                           Logit Regression Results                           
Dep. Variable:               called_k   No. Observations:              2234439
Model:                          Logit   Df Residuals:                  2234410
Method:                           MLE   Df Model:                           28
Date:                Thu, 15 Apr 2021   Pseudo R-squ.:                  0.6619
Time:                        19:24:48   Log-Likelihood:            -4.7679e+05
converged:                       True   LL-Null:                   -1.4100e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                           6.2332   

In [479]:
df27s = dfcs.loc[(dfcs.game_year < 2018)].sample(frac=0.1)
df8s = dfcs.loc[(dfcs.game_year == 2018)].sample(frac=0.1)
X_train = df27s[['release_pos_x', 'release_pos_z', 'zone',
       'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'vx0', 'vy0', 'sz_top', 'sz_bot', 'vz0', 'ax', 'ay', 'az',
       'outs_when_up', 'release_speed','Runners', 'PA', 'High','Low','Left','Right']]
y_train = df27s.called_k
X_test = df8s[['release_pos_x', 'release_pos_z', 'zone',
       'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'vx0', 'vy0', 'sz_top', 'sz_bot', 'vz0', 'ax', 'ay', 'az',
       'outs_when_up', 'release_speed','Runners', 'PA', 'High','Low','Left','Right']]
y_test = df8s.called_k

In [480]:
brier_lr, flr = forward_selection(lr, X_train, X_test, y_train, y_test)
print(brier_lr, flr)

0.0888050248073472 ['zone', 'Low', 'plate_z']


In [481]:
brier_lr, flr = backward_selection(lr, X_train, X_test, y_train, y_test);
print(brier_lr, flr);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.08964953024385094 ['zone', 'balls', 'az', 'release_speed', 'PA', 'plate_x', 'pfx_x', 'release_pos_z', 'ay', 'sz_top', 'sz_bot', 'Low', 'plate_z', 'ax', 'strikes', 'Left', 'outs_when_up', 'pfx_z', 'High', 'Right', 'release_pos_x', 'Runners', 'vx0']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [482]:
brier_rf, frf = forward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)

0.12852317111791406 ['zone', 'Low', 'Right']


In [483]:
brier_rf, frf = backward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)

0.07249551356486858 ['zone', 'balls', 'az', 'release_speed', 'PA', 'plate_x', 'pfx_x', 'release_pos_z', 'ay', 'sz_top', 'sz_bot', 'plate_z', 'ax', 'strikes', 'outs_when_up', 'pfx_z', 'High', 'Right', 'release_pos_x', 'vy0', 'Runners', 'vx0', 'vz0']


In [484]:
rf.fit(X_train,y_train)
lr.fit(X_train,y_train)

print(brier_score_loss(y_test, rf.predict(X_test)))
print(brier_score_loss(y_test, lr.predict(X_test)));

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.07302332946268342
0.09421513775994933


### Question 3 - Swinging Strike

In [487]:
dfss = df2.copy(deep=True)
dfss.loc[:,'swinging_k'] = 1*(dfss.description == 'swinging_strike') + 1*(dfss.description == 'swinging_strike_blocked') \
                         + 1*(dfss.description == 'swinging_pitchout')
# dfss = dfss[dfss.pitch_name != 'Pitch Out']
df27ss = dfss.loc[(dfss.game_year < 2018)]
df8ss = dfss.loc[(dfss.game_year == 2018)]
dfss.head()

Unnamed: 0,pitch_type,pitch_name,release_pos_x,release_pos_z,zone,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,sz_top,sz_bot,vz0,ax,ay,az,outs_when_up,on_1b,on_2b,on_3b,release_speed,release_spin_rate,release_extension,description,game_date,game_pk,at_bat_number,pitch_number,game_year,home_team,away_team,Runners,PA,High,Low,Left,Right,swinging_k
5796860,FF,4-Seam Fastball,-2.11,6.17,11,L,R,0,0,-0.37,1.19,-0.95,2.51,3.69,-135.82,3.37,1.56,-6.28,-3.17,28.73,-19.09,0,,,,93.3,,,called_strike,2012-04-04,317733,1,1,2012,MIA,STL,0,0,0.0,0.0,0.35,0.0,0
5796859,FF,4-Seam Fastball,-2.24,6.14,11,L,R,0,1,-0.63,1.46,-0.83,2.67,5.07,-137.35,3.33,1.61,-6.57,-6.89,31.46,-15.36,0,,,,94.5,,,called_strike,2012-04-04,317733,1,2,2012,MIA,STL,0,0,0.0,0.0,0.23,0.0,0
5796858,CU,Curveball,-2.36,6.1,13,L,R,0,2,0.25,-0.46,-1.01,1.14,2.34,-118.3,3.25,1.49,-3.34,3.85,21.88,-38.6,0,,,,81.3,,,ball,2012-04-04,317733,1,3,2012,MIA,STL,0,0,0.0,0.0,0.41,0.0,0
5796857,FF,4-Seam Fastball,-2.03,5.98,5,L,R,1,2,-0.67,1.55,-0.22,2.36,6.36,-139.54,3.35,1.61,-7.48,-7.59,32.38,-13.62,0,,,,96.1,,,hit_into_play,2012-04-04,317733,1,4,2012,MIA,STL,0,0,0.0,0.0,0.0,0.0,0
5796856,FF,4-Seam Fastball,-1.9,6.02,9,L,R,0,0,-0.72,1.58,0.32,2.03,7.58,-138.67,3.47,1.67,-8.5,-8.22,30.63,-13.26,1,,,,95.6,,,hit_into_play,2012-04-04,317733,2,1,2012,MIA,STL,0,0,0.0,0.0,0.0,0.0,0


In [None]:
modeltest = smf.glm('swinging_k ~ 1', df27ss).fit()
print(brier_score_loss(df8ss.swinging_k, modeltest.predict(df8ss)))
print(modeltest.summary())

In [517]:
model_ss = smf.logit('swinging_k ~ zone + HighMiss(plate_z, sz_top, 0.25) + LowMiss(plate_z, sz_bot, 0.25) \
+ LeftMiss(plate_x, 0.6) + RightMiss(plate_x, 0.6) + strikes  + Runners + PA \
+ plate_x + plate_z + sz_top + sz_bot \
+ release_pos_x + release_speed + pitch_type +az + pfx_x + release_pos_z + ay',df27ss).fit()
print(brier_score_loss(df8ss.swinging_k, model_ss.predict(df8ss)))
print(model_ss.summary())

Optimization terminated successfully.
         Current function value: 0.301297
         Iterations 9
0.09111536097806695
                           Logit Regression Results                           
Dep. Variable:             swinging_k   No. Observations:              4208747
Model:                          Logit   Df Residuals:                  4208714
Method:                           MLE   Df Model:                           32
Date:                Thu, 15 Apr 2021   Pseudo R-squ.:                 0.06156
Time:                        19:50:33   Log-Likelihood:            -1.2681e+06
converged:                       True   LL-Null:                   -1.3513e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept                          -4.7852    

In [488]:
df27sss = dfss.loc[(dfss.game_year < 2018)].sample(frac=0.1)
df8sss = dfss.loc[(dfss.game_year == 2018)].sample(frac=0.1)
X_train = df27sss[['release_pos_x', 'release_pos_z', 'zone',
       'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'vx0', 'vy0', 'sz_top', 'sz_bot', 'vz0', 'ax', 'ay', 'az',
       'outs_when_up', 'release_speed','Runners', 'PA', 'High','Low','Left','Right']]
y_train = df27sss.swinging_k
X_test = df8sss[['release_pos_x', 'release_pos_z', 'zone',
       'balls', 'strikes', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'vx0', 'vy0', 'sz_top', 'sz_bot', 'vz0', 'ax', 'ay', 'az',
       'outs_when_up', 'release_speed','Runners', 'PA', 'High','Low','Left','Right']]
y_test = df8sss.swinging_k

In [489]:
brier_lr, flr = forward_selection(lr, X_train, X_test, y_train, y_test)
print(brier_lr, flr)
brier_lr, flr = backward_selection(lr, X_train, X_test, y_train, y_test);
print(brier_lr, flr);
lr.fit(X_train,y_train)
print(brier_score_loss(y_test, lr.predict(X_test)));

0.10809944088900043 ['zone']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.10809944088900043 ['balls', 'az', 'release_speed', 'PA', 'plate_x', 'pfx_x', 'release_pos_z', 'ay', 'sz_top', 'sz_bot', 'Low', 'plate_z', 'ax', 'strikes', 'Left', 'outs_when_up', 'pfx_z', 'High', 'Right', 'release_pos_x', 'vy0', 'Runners', 'vx0', 'vz0']
0.10809944088900043


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [490]:
brier_rf, frf = forward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)
brier_rf, frf = backward_selection(rf, X_train, X_test, y_train, y_test)
print(brier_rf, frf)
rf.fit(X_train,y_train)
print(brier_score_loss(y_test, rf.predict(X_test)))

0.1741888707630959 ['zone']
0.10798789754744077 ['zone', 'az', 'release_speed', 'PA', 'plate_x', 'pfx_x', 'release_pos_z', 'ay', 'sz_top', 'sz_bot', 'plate_z', 'strikes', 'Left', 'outs_when_up', 'pfx_z', 'High', 'Right', 'release_pos_x', 'vy0', 'Runners', 'vx0', 'vz0']
0.10765326752276182


### Question 4 - Improved McCracken

In [238]:
df4 = df[['pitcher','player_name','events','pitch_type','pitch_name','release_pos_x','release_pos_z','zone','stand','p_throws','balls','strikes','pfx_x','pfx_z',
         'plate_x','plate_z','vx0','vy0','sz_top','sz_bot','vz0','ax','ay','az','outs_when_up','on_1b','on_2b', 'on_3b',
         'release_speed','release_spin_rate','release_extension','description',
          'game_date','game_pk','at_bat_number','pitch_number','game_year','home_team','away_team']]
df4 = df4.dropna(subset=['pitch_type','release_speed']).sort_values(['game_date','game_pk','at_bat_number','pitch_number'])
df4 = df4.astype({'zone': 'int64', 'balls':'int64','strikes':'int64','outs_when_up':'int64'})

In [244]:
df4.head()

Unnamed: 0,pitcher,player_name,events,pitch_type,pitch_name,release_pos_x,release_pos_z,zone,stand,p_throws,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,vx0,vy0,sz_top,sz_bot,vz0,ax,ay,az,outs_when_up,on_1b,on_2b,on_3b,release_speed,release_spin_rate,release_extension,description,game_date,game_pk,at_bat_number,pitch_number,game_year,home_team,away_team
5796860,435178,"Johnson, Josh",,FF,4-Seam Fastball,-2.11,6.17,11,L,R,0,0,-0.37,1.19,-0.95,2.51,3.69,-135.82,3.37,1.56,-6.28,-3.17,28.73,-19.09,0,,,,93.3,,,called_strike,2012-04-04,317733,1,1,2012,MIA,STL
5796859,435178,"Johnson, Josh",,FF,4-Seam Fastball,-2.24,6.14,11,L,R,0,1,-0.63,1.46,-0.83,2.67,5.07,-137.35,3.33,1.61,-6.57,-6.89,31.46,-15.36,0,,,,94.5,,,called_strike,2012-04-04,317733,1,2,2012,MIA,STL
5796858,435178,"Johnson, Josh",,CU,Curveball,-2.36,6.1,13,L,R,0,2,0.25,-0.46,-1.01,1.14,2.34,-118.3,3.25,1.49,-3.34,3.85,21.88,-38.6,0,,,,81.3,,,ball,2012-04-04,317733,1,3,2012,MIA,STL
5796857,435178,"Johnson, Josh",field_out,FF,4-Seam Fastball,-2.03,5.98,5,L,R,1,2,-0.67,1.55,-0.22,2.36,6.36,-139.54,3.35,1.61,-7.48,-7.59,32.38,-13.62,0,,,,96.1,,,hit_into_play,2012-04-04,317733,1,4,2012,MIA,STL
5796856,435178,"Johnson, Josh",single,FF,4-Seam Fastball,-1.9,6.02,9,L,R,0,0,-0.72,1.58,0.32,2.03,7.58,-138.67,3.47,1.67,-8.5,-8.22,30.63,-13.26,1,,,,95.6,,,hit_into_play,2012-04-04,317733,2,1,2012,MIA,STL


In [376]:
##Pitcher Variety
dfv = df4.groupby(['pitcher','pitch_name','game_year']).pitch_name.count().reset_index(name='pitches')
dfv = dfv.loc[dfv.pitches >= 20]
dfv = dfv.groupby(['pitcher','game_year']).pitch_name.count().reset_index(name='variety')
# dfv.head()

In [377]:
## Swinging Strikes
dfs = df4[df4['description'].isin(['called_strike','swinging_strike','swinging_strike_blocked'])].replace('swinging_strike_blocked', 'swinging_strike')
dfs = dfs.groupby(['pitcher','game_year','description']).description.count()
dfs = dfs / dfs.groupby(level=[0,1]).transform("sum")
dfs = dfs.reset_index(name='ss_prop')
dfs = dfs.loc[dfs.description == 'swinging_strike']
dfs = dfs[['pitcher','game_year','ss_prop']]
# dfs.head()

In [378]:
## Called Strike Rate
dfc = df4[df4['description'].isin(['ball','called_strike','blocked_ball'])]
dfc = dfc.groupby(['pitcher','game_year','description']).description.count()
dfc = dfc / dfc.groupby(level=[0,1]).transform("sum")
dfc = dfc.reset_index(name='cs_prop')
dfc = dfc.loc[dfc.description == 'called_strike']
dfc = dfc[['pitcher','game_year','cs_prop']]
# dfc.head()

In [388]:
## Max/Min
dfmax = df4.groupby(['pitcher','game_year']).release_speed.max().reset_index(name='max_speed')
dfmin = df4.groupby(['pitcher','game_year']).release_speed.min().reset_index(name='min_speed')
dfavg = df4.groupby(['pitcher','game_year']).release_speed.mean().reset_index(name='avg_speed')

In [426]:
## Pitcher Type
dfr = df4.groupby(['pitcher','game_year']).game_pk.nunique().reset_index(name='gp')

In [443]:
##First Pitch Strike
dffps = df4.copy(deep = True)
dffps['p_count'] = (dffps.balls).astype(str) + (dffps.strikes).astype(str)
dffps = dffps[dffps['p_count'].isin(['01','10'])]
dffps = dffps.groupby(['pitcher','game_year','p_count']).p_count.count()
dffps = dffps / dffps.groupby(level=[0,1]).transform("sum")
dffps = dffps.reset_index(name='fps_prop')
dffps = dffps.loc[dffps.p_count == '01']
dffps = dffps[['pitcher','game_year','fps_prop']]

In [491]:
## Set up Pitcher Table
dfp = df1.copy(deep = True)
dfp['hits'] = dfp.single + dfp.double + dfp.triple + dfp.home_run
dfp['hrrate'] = (dfp.home_run)/(dfp.bf-dfp.bb-dfp.k)
dfp['hrate'] = (dfp.hits - dfp.home_run)/(dfp.bf-dfp.bb-dfp.k-dfp.home_run)
dfp = dfp[['pitcher', 'game_year', 'player_name','bf','hits','home_run', 'bb', 'k', 'bbrate', 'krate', 'hrate', 'hrrate']]
years = cycle([2012,2013,2014,2015,2016,2017,2018,2019])
dfcycle = pd.DataFrame(dfp.pitcher.unique())
dfcycle = pd.concat([dfcycle]*8, ignore_index=True)
dfcycle.columns = ['pitcher']
dfcycle = dfcycle.sort_values(by='pitcher')
dfcycle['game_year'] = [next(years) for i in range(len(dfcycle))]
dfp = pd.merge(dfcycle, dfp, on = ['pitcher','game_year'], how = 'left')
dfp['player_name'] = dfp['player_name'].fillna(method = 'ffill')
dfp = dfp.fillna(0)
# dfp.head()

In [492]:
## Merge All
dfp = pd.merge(dfp, dfv, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfs, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfc, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfmax, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfmin, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfavg, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dfr, on = ['pitcher','game_year'], how = 'left')
dfp = pd.merge(dfp, dffps, on = ['pitcher','game_year'], how = 'left')
dfp['bfp'] = dfp.bf/dfp.gp
dfp = dfp.fillna(0)

In [493]:
dfp.head()

Unnamed: 0,pitcher,game_year,player_name,bf,hits,home_run,bb,k,bbrate,krate,hrate,hrrate,variety,ss_prop,cs_prop,max_speed,min_speed,avg_speed,gp,fps_prop,bfp
0,110683,2012,"Batista, Miguel",244.0,58.0,6.0,33.0,36.0,0.135246,0.170616,0.307692,0.034286,4.0,0.316742,0.266784,95.9,47.8,89.224021,35.0,0.431718,6.971429
1,110683,2013,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,110683,2014,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110683,2015,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110683,2016,"Batista, Miguel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [494]:
## Create Previous
dfp['bbrate_prev'] = dfp.groupby('pitcher').bbrate.transform(lambda x : x.shift(1, fill_value = 0))
dfp['krate_prev'] = dfp.groupby('pitcher').krate.transform(lambda x : x.shift(1, fill_value = 0))
dfp['hrate_prev'] = dfp.groupby('pitcher').hrate.transform(lambda x : x.shift(1, fill_value = 0))
dfp['hrrate_prev'] = dfp.groupby('pitcher').hrrate.transform(lambda x : x.shift(1, fill_value = 0))
dfp['bf_prev'] = dfp.groupby('pitcher').bf.transform(lambda x : x.shift(1, fill_value = 0))
dfp['variety_prev'] = dfp.groupby('pitcher').variety.transform(lambda x : x.shift(1, fill_value = 0))
dfp['ssrate_prev'] = dfp.groupby('pitcher').ss_prop.transform(lambda x : x.shift(1, fill_value = 0))
dfp['csrate_prev'] = dfp.groupby('pitcher').cs_prop.transform(lambda x : x.shift(1, fill_value = 0))
dfp['maxspeed_prev'] = dfp.groupby('pitcher').max_speed.transform(lambda x : x.shift(1, fill_value = 0))
dfp['minspeed_prev'] = dfp.groupby('pitcher').min_speed.transform(lambda x : x.shift(1, fill_value = 0))
dfp['avgspeed_prev'] = dfp.groupby('pitcher').avg_speed.transform(lambda x : x.shift(1, fill_value = 0))
dfp['speedrange_prev'] = dfp.maxspeed_prev - dfp.minspeed_prev
dfp['bfp_prev'] = dfp.groupby('pitcher').bfp.transform(lambda x : x.shift(1, fill_value = 0))
dfp['fps_prev'] = dfp.groupby('pitcher').fps_prop.transform(lambda x : x.shift(1, fill_value = 0))

In [495]:
dftrain = dfp.loc[(dfp.game_year != 2012) & (dfp.game_year != 2019) & (dfp.bf >=200) & (dfp.bf_prev >= 200)]
dftest = dfp.loc[(dfp.game_year == 2019) & (dfp.bf >=200) & (dfp.bf_prev >= 200)]

In [500]:
res4a = smf.ols('bbrate ~ bbrate_prev+bfp_prev+fps_prev+maxspeed_prev' ,dftrain).fit()
print(mean_squared_error(dftest.bbrate, res4a.predict(dftest)))
print(res4a.params)
print(res4a.rsquared)
print(res4a.summary())
print('--------')
res4b = smf.ols('krate ~ krate_prev+variety_prev+maxspeed_prev' ,dftrain).fit()
print(mean_squared_error(dftest.krate, res4b.predict(dftest)))
print(res4b.params)
print(res4b.rsquared)
print(res4b.summary())

0.0004293711096339291
Intercept        0.008783
bbrate_prev      0.399797
bfp_prev        -0.000159
fps_prev        -0.084800
maxspeed_prev    0.000907
dtype: float64
0.3010750712988194
                            OLS Regression Results                            
Dep. Variable:                 bbrate   R-squared:                       0.301
Model:                            OLS   Adj. R-squared:                  0.299
Method:                 Least Squares   F-statistic:                     153.7
Date:                Wed, 14 Apr 2021   Prob (F-statistic):          2.16e-109
Time:                        23:20:42   Log-Likelihood:                 3620.7
No. Observations:                1432   AIC:                            -7231.
Df Residuals:                    1427   BIC:                            -7205.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef

In [436]:
res4a = smf.ols('bbrate ~ bbrate_prev' ,dftrain).fit()
print(mean_squared_error(dftest.bbrate, res4a.predict(dftest)))
print(res4a.params)
print(res4a.rsquared)
print('--------')
res4b = smf.ols('krate ~ krate_prev' ,dftrain).fit()
print(mean_squared_error(dftest.krate, res4b.predict(dftest)))
print(res4b.params)
print(res4b.rsquared)

0.0004598137497809986
Intercept      0.036375
bbrate_prev    0.526399
dtype: float64
0.2693856613373722
--------
0.002001949418894451
Intercept     0.060182
krate_prev    0.744610
dtype: float64
0.5497857934030911


In [511]:
def getMSE(model, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  return mse

def forward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    best_features = []
    best = 'none'
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        for new_column in remaining_features:
            temp_list = best_features+[new_column]
            mse = getMSE(model, X_train[temp_list], X_test[temp_list], y_train, y_test)
            if mse<temp:
                temp = mse
                best = new_column
        if best not in best_features:
            best_features.append(best)
        else:
            break
    return mse, best_features

def backward_selection(model, X_train, X_test, y_train, y_test):
    initial_features = X_train.columns
    temp = 1
    worst_features = []
    worst = 'none'
    while (len(worst_features)<len(initial_features)):
        subset = list(set(initial_features)-set(worst_features))
        for new_column in subset:
            remaining_features = list(set(subset)-set([new_column]))
            mse = getMSE(model, X_train[remaining_features], X_test[remaining_features], y_train, y_test)
            if mse<temp:
                temp = mse
                worst = new_column
        if worst not in worst_features:
            worst_features.append(worst)
        else:
            break
    
    best_features = list(set(initial_features)-set(worst_features))
    
    mse = getMSE(model, X_train[best_features], X_test[best_features], y_train, y_test)
    return mse, best_features


In [512]:
X_train = dftrain[['bbrate_prev', 'krate_prev', 'bf_prev', 'variety_prev',
       'ssrate_prev', 'csrate_prev', 'maxspeed_prev', 'minspeed_prev',
       'avgspeed_prev', 'bfp_prev', 'fps_prev', 'hrate_prev', 'hrrate_prev']]
y_train = dftrain.bbrate
X_test = dftest[['bbrate_prev', 'krate_prev', 'bf_prev', 'variety_prev',
       'ssrate_prev', 'csrate_prev', 'maxspeed_prev', 'minspeed_prev',
       'avgspeed_prev', 'bfp_prev', 'fps_prev', 'hrate_prev', 'hrrate_prev']]
y_test = dftest.bbrate

In [513]:
lin = LinearRegression()
mse_lin, flr = forward_selection(lin, X_train, X_test, y_train, y_test)
print(mse_lin, flr)
mse_lin, flr = backward_selection(lin, X_train, X_test, y_train, y_test);
print(mse_lin, flr)
lin.fit(X_train,y_train)
print(mean_squared_error(y_test, lin.predict(X_test)))
print('-------')
mse_rfr, flr = forward_selection(rfr, X_train, X_test, y_train, y_test)
print(mse_rfr, flr)
mse_rfr, flr = backward_selection(rfr, X_train, X_test, y_train, y_test);
print(mse_rfr, flr)
rfr.fit(X_train,y_train)
print(mean_squared_error(y_test, rfr.predict(X_test)))

0.0004227389452995973 ['bbrate_prev', 'bfp_prev', 'fps_prev', 'maxspeed_prev', 'avgspeed_prev', 'minspeed_prev', 'variety_prev', 'bf_prev', 'ssrate_prev']
0.00042107025013294567 ['bbrate_prev', 'bfp_prev', 'bf_prev', 'maxspeed_prev', 'avgspeed_prev', 'ssrate_prev', 'variety_prev', 'minspeed_prev', 'fps_prev']
0.00043336777391823135
-------
0.0004698247798623034 ['maxspeed_prev', 'bbrate_prev', 'bf_prev', 'hrate_prev', 'minspeed_prev', 'krate_prev']
0.0004458566271705252 ['bbrate_prev', 'krate_prev', 'bfp_prev', 'bf_prev', 'maxspeed_prev', 'hrrate_prev', 'avgspeed_prev', 'csrate_prev', 'variety_prev', 'hrate_prev', 'minspeed_prev', 'fps_prev']
0.00044456596076369295
