In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from modules.objects import DataframeColumnSelector
import statsmodels.api as sm
from sklearn.compose import make_column_transformer,ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import set_config
import pickle

df_train = pd.read_csv('raw_data/bb_train.csv')
# Drop the null bb_type entries as there are only six
df_train = df_train[df_train.bb_type.notnull()].copy()
df_dimension = pd.read_csv('raw_data/park_dimensions.csv')

In [6]:
features_to_use =  [
                'is_batter_lefty',
                'is_pitcher_lefty',
                'bb_type',
                'bearing',
                'pitch_name',
                'inning',
                'outs_when_up',
                'balls',
                'strikes',
                'plate_x',
                'plate_z',
                'pitch_mph',
                'launch_speed',
                'launch_angle',
                'is_home_run']

continuous_features = ['launch_angle',
                       'pitch_mph',
                       'plate_x',
                       'plate_z',
                       'launch_speed']

categorical_features = [i for i in features_to_use if (i != "is_home_run" and i not in continuous_features)]

categorical_transformers = []
continuous_transformers = []

for categorical_feature in categorical_features:

    ohe = OneHotEncoder()

    categorical_feature_pipeline = Pipeline(
        steps = [
            ('one_hot_encoder',ohe)
                ]
                                            )       
    categorical_transformers.append((
            f"{categorical_feature}_pipeline",
            categorical_feature_pipeline,
            [categorical_feature]))   

for continuous_feature in continuous_features:

    if continuous_feature not in ['launch_angle','launch_speed']:

        scaler = StandardScaler()

        numeric_feature_pipeline = Pipeline(
            steps = [
                ('scaler',scaler)
            ]
        )

        continuous_transformers.append((
            f"{continuous_feature}_pipeline",
            numeric_feature_pipeline,
            [continuous_feature]))
    
    else:
    
        scaler = StandardScaler()
        imputer = IterativeImputer()

        numeric_feature_pipeline = Pipeline(
            steps = [
                ('scaler',scaler),
                ('imputer',imputer)])

        continuous_transformers.append((
            f"{continuous_feature}_pipeline",
            numeric_feature_pipeline,
            [continuous_feature]))

all_transformers = categorical_transformers + continuous_transformers
data_preprocessor = ColumnTransformer(all_transformers,remainder='passthrough')

transformation_pipeline = Pipeline(
    steps = [
        ('column_selector',DataframeColumnSelector(col_list=features_to_use)),
        ('preprocessor',data_preprocessor)
            ])

set_config(display="diagram")

# We want to save this pre-processing pipeline for use in future ML processes
with open('pickle_files/transformation_pipeline.pickle','wb') as f:
    pickle.dump(transformation_pipeline,f)

transformation_pipeline

In [3]:
# Transform data
to_logreg_data = transformation_pipeline.fit_transform(df_train)

# Instantiate an empty list to extract column names in to
all_column_names = []

# Iterate through steps of the pipeline
for i in range(len(transformation_pipeline.named_steps['preprocessor'].transformers_)):

    try: # If the "step" is a transformer, then get the feature names
        all_column_names.extend([j for j in transformation_pipeline.named_steps['preprocessor'].transformers_[i][1].get_feature_names_out()])
    except AttributeError as e: 
        try:# Otherwise if it's a pipeline (imputer first), grab the second element's names
            all_column_names.extend(
                [i for i in transformation_pipeline.named_steps['preprocessor'].transformers_[i][1][0].get_feature_names_out()]
                )
        except AttributeError as e: # We're dealing with a string at this point
            all_column_names.append('is_home_run')

# Coerce from CSR matrix to pd.DataFrame
transformed_data = pd.DataFrame(to_logreg_data,columns=all_column_names)

# Manually coerce columns to integers where applicable
for column_name in [i for i in transformed_data.columns if i not in continuous_features]:

    transformed_data[column_name] = transformed_data[column_name].astype(int)

transformed_data

Unnamed: 0,is_batter_lefty_0,is_batter_lefty_1,is_pitcher_lefty_0,is_pitcher_lefty_1,bb_type_fly_ball,bb_type_ground_ball,bb_type_line_drive,bb_type_popup,bearing_center,bearing_left,...,balls_3,strikes_0,strikes_1,strikes_2,launch_angle,pitch_mph,plate_x,plate_z,launch_speed,is_home_run
0,1,0,0,1,0,0,1,0,0,1,...,0,0,0,1,-1.203632e-01,-0.179029,-0.265597,-0.618638,1.410355e+00,0
1,1,0,0,1,0,0,1,0,0,1,...,1,0,1,0,1.911520e-01,1.070091,-1.297363,0.973815,5.443491e-16,0
2,1,0,0,1,0,0,0,1,0,1,...,0,0,0,1,1.264149e+00,-0.398469,-0.984149,-1.003627,-8.814336e-01,0
3,0,1,0,1,1,0,0,0,0,0,...,0,0,0,1,1.437213e+00,-0.364709,-0.763056,1.446302,1.731950e-01,0
4,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,-2.242017e-01,1.222011,0.342408,1.656295,5.443491e-16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46233,0,1,1,0,0,0,1,0,0,0,...,0,0,0,1,4.680545e-01,-1.343749,0.471378,0.851319,7.343115e-01,0
46234,0,1,1,0,1,0,0,0,0,1,...,0,0,1,0,7.795698e-01,-2.778549,-1.020997,-0.496142,3.895290e-01,0
46235,1,0,0,1,0,0,1,0,1,0,...,0,1,0,0,2.887211e-17,1.492091,0.305559,1.358804,1.187261e+00,0
46236,0,1,0,1,0,0,1,0,0,1,...,0,0,0,1,5.270080e-02,-0.837349,0.489803,0.921317,7.545928e-01,0


In [11]:
X = transformed_data.drop('is_home_run',axis = 1)
y = transformed_data.is_home_run
logit_model = sm.Logit(y,X)
logit_model.fit(maxiter=100000)

         Current function value: 0.135383
         Iterations: 100000




<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x2047c7ee700>

In [13]:
result = logit_model.fit()

         Current function value: 0.135383
         Iterations: 35




In [14]:
result.summary()

0,1,2,3
Dep. Variable:,is_home_run,No. Observations:,46238.0
Model:,Logit,Df Residuals:,46197.0
Method:,MLE,Df Model:,40.0
Date:,"Sat, 03 Jun 2023",Pseudo R-squ.:,0.3461
Time:,20:13:52,Log-Likelihood:,-6259.9
converged:,False,LL-Null:,-9572.7
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
is_batter_lefty_0,-3.3059,,,,,
is_batter_lefty_1,-3.3893,,,,,
is_pitcher_lefty_0,-3.3481,5.83e+06,-5.74e-07,1.000,-1.14e+07,1.14e+07
is_pitcher_lefty_1,-3.3471,6.31e+06,-5.31e-07,1.000,-1.24e+07,1.24e+07
bb_type_fly_ball,10.2869,1.55e+07,6.63e-07,1.000,-3.04e+07,3.04e+07
bb_type_ground_ball,-14.6753,1.46e+07,-1.01e-06,1.000,-2.85e+07,2.85e+07
bb_type_line_drive,9.3343,1.25e+07,7.47e-07,1.000,-2.45e+07,2.45e+07
bb_type_popup,-11.6411,1.26e+07,-9.26e-07,1.000,-2.46e+07,2.46e+07
bearing_center,-1.7194,,,,,
