In [1]:
import sys
sys.path.append('../')

from Datasets.BaseballDataset import BaseballDataset

import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
from torch.utils.data import DataLoader
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler


In [2]:
# Paths and settings for this validation run.
data_config_path = "../data/config.json"  # handed to BaseballDataset together with the frame
valid_path = "../data/statcast_2015-2023_cleaned.csv"
sequence_length = 200  # third argument to BaseballDataset; presumably pitches per input sequence — confirm in the dataset class
# Load the whole cleaned CSV into memory as a DataFrame.
valid_data = pd.read_csv(valid_path)

In [3]:
valid_data.columns

Index(['launch_speed', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'batter', 'pitcher', 'pfx_x', 'pfx_z', 'plate_x',
       'plate_z', 'hc_x', 'hc_y', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top',
       'sz_bot', 'launch_angle', 'release_spin_rate', 'release_extension',
       'game_pk', 'release_pos_y', 'at_bat_number', 'batter_name',
       'pitcher_name', 'events_B', 'events_S', 'events_double',
       'events_field_out', 'events_hit_by_pitch', 'events_home_run',
       'events_single', 'events_strikeout', 'events_triple', 'events_walk',
       'pitch_type_CH', 'pitch_type_CS', 'pitch_type_CU', 'pitch_type_EP',
       'pitch_type_FA', 'pitch_type_FC', 'pitch_type_FF', 'pitch_type_FO',
       'pitch_type_FS', 'pitch_type_KC', 'pitch_type_KN', 'pitch_type_PO',
       'pitch_type_SC', 'pitch_type_SI', 'pitch_type_SL', 'pitch_type_ST',
       'pitch_type_SV', 'stand_L', 'stand_R', 'p_throws_L', 'p_throws_R',
       'hit_location_0.0', 'hit_location_1.0', 'hit_loc

In [4]:

# Wrap the frame in the project's BaseballDataset. Indexing it yields a
# (sequence, continuous_target, categorical_targets) triple, as unpacked in a later cell.
valid_dataset = BaseballDataset(valid_data,data_config_path,sequence_length)

In [5]:
valid_dataset.label_columns

['events', 'launch_speed', 'hit_location', 'hc_x', 'hc_y', 'launch_angle']

In [6]:
# Pull a single example (index 2) to inspect what the dataset returns.
# Note: this binds `cont_target` and `cat_targets` as notebook-level globals.
seq, cont_target, cat_targets = valid_dataset[2]

In [7]:
seq[-1]

tensor([-3.7555e-17,  7.2403e-01, -1.6664e-01,  3.9323e-01, -8.3054e-01,
        -5.8089e-02,  1.1461e+00,  3.0425e-01, -3.3902e-16,  1.9012e-16,
        -7.1635e-01, -1.5105e-01, -9.2301e-01,  9.6360e-01, -5.2170e-02,
         1.9702e+00,  2.7812e+00, -2.3654e-17, -1.5236e+00, -9.9139e-02,
         5.6065e-02,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
         0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0

In [8]:
valid_dataset.pitch_col_names

['launch_speed',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'hc_x',
 'hc_y',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'launch_angle',
 'release_spin_rate',
 'release_extension',
 'release_pos_y',
 'events_B',
 'events_S',
 'events_double',
 'events_field_out',
 'events_hit_by_pitch',
 'events_home_run',
 'events_single',
 'events_strikeout',
 'events_triple',
 'events_walk',
 'pitch_type_CH',
 'pitch_type_CS',
 'pitch_type_CU',
 'pitch_type_EP',
 'pitch_type_FA',
 'pitch_type_FC',
 'pitch_type_FF',
 'pitch_type_FO',
 'pitch_type_FS',
 'pitch_type_KC',
 'pitch_type_KN',
 'pitch_type_PO',
 'pitch_type_SC',
 'pitch_type_SI',
 'pitch_type_SL',
 'pitch_type_ST',
 'pitch_type_SV',
 'stand_L',
 'stand_R',
 'p_throws_L',
 'p_throws_R',
 'hit_location_0.0',
 'hit_location_1.0',
 'hit_location_2.0',
 'hit_location_3.0',
 'hit_location_4.0',
 'hit_location_5.0',
 'hit_location_6.0',
 'hit_location_7.0',
 'hit_location_8.0'

In [9]:
cont_target

tensor([-0.6863, -0.4316, -0.4295, -0.6591])

In [10]:
cat_targets

[tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]

In [11]:
valid_dataset.continuous_label_indices

tensor([ 0,  8,  9, 17])

In [12]:
valid_dataset.continuous_label_names

['launch_speed', 'hc_x', 'hc_y', 'launch_angle']

In [13]:
valid_dataset.categorical_label_indices

[tensor([21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
 tensor([52, 53, 54, 55, 56, 57, 58, 59, 60, 61])]

In [14]:
valid_dataset.categorical_label_names

[['events_B',
  'events_S',
  'events_double',
  'events_field_out',
  'events_hit_by_pitch',
  'events_home_run',
  'events_single',
  'events_strikeout',
  'events_triple',
  'events_walk'],
 ['hit_location_0.0',
  'hit_location_1.0',
  'hit_location_2.0',
  'hit_location_3.0',
  'hit_location_4.0',
  'hit_location_5.0',
  'hit_location_6.0',
  'hit_location_7.0',
  'hit_location_8.0',
  'hit_location_9.0']]

In [15]:
valid_dataset.mask_indices

tensor([73, 74])

In [16]:
valid_dataset.mask

tensor([-3.7555e-17,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00, -3.3902e-16,  1.9012e-16,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00, -2.3654e-17,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0

In [124]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence, then apply dropout.

    Arguments:
        d_model: size of the embedding dimension (odd values are supported).
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions pre-computed; inputs must not be longer than this.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # The buffer keeps its (max_len, 1, d_model) layout: it is stored in the
        # state_dict, so changing the shape would break loading saved checkpoints.
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine slot than sine slots, so
        # trim the cosine columns to fit (no-op when d_model is even).
        pe[:, 0, 1::2] = torch.cos(angles)[:, : d_model // 2]
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x`` with position encodings added
            and dropout applied.
        """
        # pe[:seq_len] is (seq_len, 1, d_model); transpose to (1, seq_len, d_model)
        # so it broadcasts across the batch dimension.
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Transformer encoder that maps a batch-first sequence to one output vector per sequence.

    Pipeline: linear embedding -> sinusoidal positional encoding -> stacked
    encoder layers -> take the final time step -> two-layer MLP head.
    """

    def __init__(self, input_dim, num_heads, num_encoder_layers, hidden_dim, output_dim, sequence_length, dropout=0.1):
        super().__init__()

        # Stored for reference only; the forward pass does not read them.
        self.input_dim = input_dim
        self.sequence_length = sequence_length

        # Project raw features into the model width, then inject position information.
        self.embedding = nn.Linear(input_dim, hidden_dim)
        self.positional_encoding = PositionalEncoding(hidden_dim, dropout)

        # The feed-forward width is tied to hidden_dim rather than being a separate setting.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=hidden_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim,
                dropout=dropout,
                batch_first=True,
            ),
            num_encoder_layers,
        )

        # Prediction head applied to the representation of the final time step.
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.fc_layers = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor for input ``x`` of shape ``(batch, seq_len, input_dim)``."""
        embedded = self.positional_encoding(self.embedding(x))
        encoded = self.transformer_encoder(embedded)
        last_step = encoded[:, -1, :]  # representation of the last pitch in the sequence
        return self.fc_layers(last_step)


def load_model(model_path, config_path):
    """Build a TransformerModel from a JSON config and load trained weights into it.

    Arguments:
        model_path: path to a state_dict checkpoint readable by ``torch.load``.
        config_path: path to a JSON file with the keys ``input_dim``,
            ``num_heads``, ``num_encoder_layers``, ``hidden_dim``,
            ``output_dim``, ``sequence_length`` and optionally ``dropout``.

    Returns:
        The model in evaluation mode, with its parameters on the CPU. Move it
        with ``model.to(device)`` if inference should run elsewhere.
    """
    with open(config_path, 'r') as file:
        config = json.load(file)

    model = TransformerModel(
        input_dim=config['input_dim'],
        num_heads=config['num_heads'],
        num_encoder_layers=config['num_encoder_layers'],
        hidden_dim=config['hidden_dim'],
        output_dim=config['output_dim'],
        sequence_length=config['sequence_length'],
        dropout=config.get('dropout', 0.1)  # Optional: provide a default value for dropout if not in config
    )

    # map_location="cpu" lets a checkpoint that was saved from a GPU run be read
    # on a machine without CUDA. The freshly built model lives on the CPU and
    # load_state_dict copies into its parameters, so the result is unchanged on
    # GPU machines.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()  # Set the model to evaluation mode
    return model



def make_preds(model, dataset, scaler_path, device, batch_size):
    """Run batched inference over ``dataset`` and return the predictions as a DataFrame.

    The model output is laid out as: the continuous predictions first, then one
    contiguous slice of logits per categorical target, in the same order as the
    targets the dataset yields. Each categorical slice gets its own softmax.

    Arguments:
        model: module mapping a ``(batch, seq_len, features)`` tensor to a
            ``(batch, n_outputs)`` tensor. It is expected to already be on ``device``.
        dataset: dataset yielding ``(sequence, continuous_target, categorical_targets)``
            and exposing ``continuous_label_names`` (list of str) and
            ``categorical_label_names`` (list of lists of str).
        scaler_path: path to a pickled dict mapping column name to a fitted scaler
            that exposes ``scale_`` and ``mean_``.
        device: device the input batches are moved to before the forward pass.
        batch_size: number of sequences per forward pass.

    Returns:
        pandas.DataFrame with one row per dataset item. Continuous columns are
        mapped back to their original units; categorical columns hold
        probabilities that sum to 1 within each group.
    """
    # Column order mirrors the output layout: continuous first, then each categorical group.
    flat_cat_names = [name for names in dataset.categorical_label_names for name in names]
    col_names = dataset.continuous_label_names + flat_cat_names

    # shuffle=False keeps the prediction rows aligned with the dataset order.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    model.eval()

    batch_preds = []  # one (batch, n_columns) array per batch
    with torch.no_grad():
        for sequence_tensor, cont_target_tensor, cat_target_tensors in loader:
            output = model(sequence_tensor.to(device))

            # Slice sizes are taken from this batch's own targets; only the shapes
            # are needed, so the targets themselves never move to the device.
            n_cont = cont_target_tensor.size(1)

            # Keep the batch axis (no squeeze): a final batch with a single row
            # must stay 2-D for the axis=1 concatenation below.
            pieces = [output[:, :n_cont].detach().cpu().numpy()]

            # There can be several categorical targets, each with its own width.
            # The first block of logits after the continuous outputs belongs to
            # the first categorical target, the next block to the second, and so
            # on, so each block needs a separate softmax.
            start_idx = n_cont
            for cat_target in cat_target_tensors:
                end_idx = start_idx + cat_target.size(1)
                block_probs = nn.functional.softmax(output[:, start_idx:end_idx], dim=1)
                pieces.append(block_probs.detach().cpu().numpy())
                start_idx = end_idx

            batch_preds.append(np.concatenate(pieces, axis=1))

    # Combine all batches; an empty dataset yields an empty, correctly labelled frame.
    if batch_preds:
        preds_array = np.vstack(batch_preds)
    else:
        preds_array = np.empty((0, len(col_names)), dtype=np.float32)
    preds_pd = pd.DataFrame(preds_array, columns=col_names)

    # Scale continuous outputs back to real values.
    # SECURITY: unpickling can execute arbitrary code; only load scaler files you trust.
    with open(scaler_path, "rb") as file:
        scalers = pickle.load(file)

    # NOTE(review): in the saved notebook run every continuous column ends up near
    # the same value (~54), which suggests the scalers in the pickle share one set
    # of fitted parameters; verify how the scalers file was produced.
    for column, scaler in scalers.items():
        if column in preds_pd:
            preds_pd[column] = (preds_pd[column] * scaler.scale_) + scaler.mean_

    return preds_pd

    


In [121]:
# Load the trained checkpoint together with the config describing its architecture.
model_path = "small_200_output_updated/transformer_model.pth"
model_config_path = "small_200_output_updated/model_config.json"
model = load_model(model_path, model_config_path)
# Loader used by the step-by-step inspection cells further down.
# NOTE(review): the saved inspection output shows batches of 2 rows, so it was
# produced with a different batch_size than the 500 configured here.
val_loader = DataLoader(valid_dataset, batch_size=500, shuffle=False, num_workers=0)
# Prefer the GPU when one is available; fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


TransformerModel(
  (embedding): Linear(in_features=75, out_features=48, bias=True)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
        )
        (linear1): Linear(in_features=48, out_features=48, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=48, out_features=48, bias=True)
        (norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=48, out_features=48, bias=True)
    (1): ReLU()
    (2): L

In [125]:
# Predict over the full validation dataset and map continuous columns back to real units.
# NOTE(review): the scaler file is named 2023-2024 while the data file is 2015-2023;
# confirm these scalers are the ones the data was standardised with.
predictions = make_preds(model,valid_dataset,"../data/statcast_2023-2024_cleaned_scalers.pkl",device, batch_size=500)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [126]:
predictions

Unnamed: 0,launch_speed,hc_x,hc_y,launch_angle,events_B,events_S,events_double,events_field_out,events_hit_by_pitch,events_home_run,...,hit_location_0.0,hit_location_1.0,hit_location_2.0,hit_location_3.0,hit_location_4.0,hit_location_5.0,hit_location_6.0,hit_location_7.0,hit_location_8.0,hit_location_9.0
0,55.195280,54.788218,54.600249,54.880720,0.019787,5.017832e-02,6.485818e-03,1.799739e-02,5.794608e-02,3.554757e-01,...,5.595204e-04,0.001775,5.137882e-02,0.714609,1.625907e-05,1.754272e-02,2.124547e-01,1.477010e-10,1.659736e-03,3.692037e-06
1,53.858239,53.793873,53.804926,53.845402,0.999985,8.176662e-07,2.022520e-06,5.579308e-07,4.342257e-07,2.261670e-06,...,1.175859e-06,0.999993,1.704208e-07,0.000004,9.101946e-10,1.910401e-08,9.538027e-07,1.451720e-07,3.980730e-11,1.097615e-09
2,53.852127,53.756397,53.766201,53.846532,0.999959,1.976771e-06,7.538604e-06,1.179387e-06,3.715934e-06,5.094395e-06,...,9.999474e-01,0.000020,4.652072e-07,0.000021,1.396548e-06,2.532247e-07,5.949435e-06,1.740486e-06,1.522685e-12,1.589241e-06
3,54.569995,54.449159,54.489308,54.501275,0.043222,1.217628e-02,4.021120e-01,3.995695e-02,4.916657e-02,7.232252e-02,...,1.007699e-04,0.000523,4.164403e-02,0.441495,1.672046e-04,4.098929e-02,1.013434e-01,3.709433e-01,2.781464e-03,1.290617e-05
4,53.848622,53.759195,53.768970,53.846261,0.999986,6.689947e-07,2.570798e-06,3.680438e-07,1.091018e-06,1.722585e-06,...,9.999592e-01,0.000023,2.482460e-07,0.000012,4.471449e-07,1.203748e-07,3.191357e-06,1.168365e-06,3.562819e-13,6.884530e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974348,53.851655,53.933842,53.947628,53.850139,0.999995,2.298961e-07,1.814704e-07,3.032916e-07,1.968819e-07,5.896700e-07,...,2.547291e-07,0.999996,1.436607e-07,0.000003,1.859711e-10,1.269688e-08,5.246843e-07,1.412034e-08,2.291185e-11,1.630705e-10
974349,54.872140,54.571771,54.436434,54.846194,0.007341,4.474056e-02,2.037758e-01,1.428795e-01,2.336669e-01,2.583505e-02,...,7.274951e-05,0.000301,2.544834e-02,0.583955,3.339057e-07,5.072358e-03,1.680454e-01,2.131726e-01,3.929613e-03,3.125766e-06
974350,53.853017,54.507747,54.635478,53.851903,0.999999,1.449466e-08,2.456509e-08,3.933961e-08,2.242465e-08,1.631256e-07,...,4.614255e-07,0.999998,2.245798e-08,0.000001,6.531252e-11,2.145065e-09,1.547066e-07,7.052350e-09,2.906094e-13,5.936721e-11
974351,53.848924,53.759940,53.770783,53.846814,0.999985,7.384831e-07,2.468105e-06,4.104339e-07,1.244035e-06,1.978380e-06,...,9.999619e-01,0.000020,2.665767e-07,0.000012,5.187881e-07,1.311362e-07,3.343655e-06,7.450520e-07,4.090229e-13,9.378867e-07


In [61]:
# Take only the first batch from val_loader for the step-by-step walkthrough below.
# This rebinds the notebook-level names `cont_target` and `cat_targets` to batched tensors.
for ex in val_loader:
    sequence, cont_target, cat_targets = ex
    print(sequence.shape,cont_target.shape)
    break

torch.Size([2, 200, 75]) torch.Size([2, 4])


In [62]:
# Forward pass on the sample batch. The batch is moved to `device` first because
# the model was sent there when it was loaded; feeding it a CPU tensor fails on
# a GPU machine. no_grad skips building an autograd graph for pure inference.
with torch.no_grad():
    output = model(sequence.to(device)).cpu()
output

tensor([[  2.4514,   1.5771,   1.1734,   1.7758,  -1.6404,  -0.7099,  -2.7559,
          -1.7352,  -0.5660,   1.2480,   0.7633,   0.5508,  -0.4030,  -1.2941,
          -5.1634,  -4.0089,  -0.6435,   1.9890,  -8.7019,  -1.7181,   0.7760,
         -20.3108,  -4.0761, -10.1843],
        [ -0.4202,  -0.5585,  -0.5347,  -0.4478,   3.0059, -11.0109, -10.1052,
         -11.3931, -11.6437,  -9.9935,  -9.8292,  -9.7752, -10.1078, -10.1953,
          -7.8852,   5.7683,  -9.8167,  -6.6162, -15.0491, -12.0051,  -8.0945,
          -9.9771, -18.1787, -14.8619]], grad_fn=<AddmmBackward0>)

In [63]:
# The first k outputs are the continuous predictions, where k is the number of
# continuous targets. The batch axis is kept (no squeeze) so a batch of one row
# stays 2-D for the axis=1 concatenation done later.
cont_output = output[:, :cont_target.size(1)].detach().cpu().numpy()
cont_output

array([[ 2.4513645 ,  1.5771134 ,  1.1734091 ,  1.7757804 ],
       [-0.42021593, -0.5584552 , -0.5347163 , -0.44778544]],
      dtype=float32)

In [64]:
valid_dataset.continuous_label_indices

tensor([ 0,  8,  9, 17])

In [65]:
valid_dataset.continuous_label_names

['launch_speed', 'hc_x', 'hc_y', 'launch_angle']

In [74]:
cat_targets

[tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])]

In [99]:
# Each categorical target owns a contiguous slice of logits that follows the
# continuous outputs, so softmax is applied per slice rather than over all the
# categorical logits at once.
cat_probs = []
start_idx = cont_target.size(1)
for cat_target in cat_targets:
    end_idx = start_idx + cat_target.size(1)
    # Keep the batch axis (no squeeze) so every array stays 2-D, even for a batch of one.
    cat_probs.append(nn.functional.softmax(output[:, start_idx:end_idx], dim=1).detach().cpu().numpy())
    start_idx = end_idx

cat_probs

[array([[1.9787213e-02, 5.0178301e-02, 6.4858207e-03, 1.7997392e-02,
         5.7946078e-02, 3.5547572e-01, 2.1892799e-01, 1.7702083e-01,
         6.8204962e-02, 2.7975665e-02],
        [9.9998450e-01, 8.1766859e-07, 2.0225259e-06, 5.5793294e-07,
         4.3422696e-07, 2.2616766e-06, 2.6654120e-06, 2.8132768e-06,
         2.0172190e-06, 1.8483646e-06]], dtype=float32),
 array([[5.5952009e-04, 1.7751115e-03, 5.1378813e-02, 7.1460938e-01,
         1.6259080e-05, 1.7542724e-02, 2.1245477e-01, 1.4770152e-10,
         1.6597356e-03, 3.6920435e-06],
        [1.1758632e-06, 9.9999344e-01, 1.7042129e-07, 4.1829985e-06,
         9.1019808e-10, 1.9104078e-08, 9.5380631e-07, 1.4517224e-07,
         3.9807452e-11, 1.0976196e-09]], dtype=float32)]

In [100]:
np.concatenate((cat_probs[0],cat_probs[1]),axis=1).shape

(2, 20)

In [93]:
cont_output

array([[ 2.4513645 ,  1.5771134 ,  1.1734091 ,  1.7757804 ],
       [-0.42021593, -0.5584552 , -0.5347163 , -0.44778544]],
      dtype=float32)

In [101]:
import numpy as np

# Place the continuous outputs and every categorical probability block side by
# side, giving one row per example.
preds = np.concatenate([cont_output, *cat_probs], axis=1)

preds

array([[ 2.4513645e+00,  1.5771134e+00,  1.1734091e+00,  1.7757804e+00,
         1.9787213e-02,  5.0178301e-02,  6.4858207e-03,  1.7997392e-02,
         5.7946078e-02,  3.5547572e-01,  2.1892799e-01,  1.7702083e-01,
         6.8204962e-02,  2.7975665e-02,  5.5952009e-04,  1.7751115e-03,
         5.1378813e-02,  7.1460938e-01,  1.6259080e-05,  1.7542724e-02,
         2.1245477e-01,  1.4770152e-10,  1.6597356e-03,  3.6920435e-06],
       [-4.2021593e-01, -5.5845523e-01, -5.3471631e-01, -4.4778544e-01,
         9.9998450e-01,  8.1766859e-07,  2.0225259e-06,  5.5793294e-07,
         4.3422696e-07,  2.2616766e-06,  2.6654120e-06,  2.8132768e-06,
         2.0172190e-06,  1.8483646e-06,  1.1758632e-06,  9.9999344e-01,
         1.7042129e-07,  4.1829985e-06,  9.1019808e-10,  1.9104078e-08,
         9.5380631e-07,  1.4517224e-07,  3.9807452e-11,  1.0976196e-09]],
      dtype=float32)

In [95]:
valid_dataset.continuous_label_names

['launch_speed', 'hc_x', 'hc_y', 'launch_angle']

In [96]:
# Flatten the per-target lists of one-hot column names into a single ordered list.
flat_cat_names = [
    name
    for group in valid_dataset.categorical_label_names
    for name in group
]
flat_cat_names

['events_B',
 'events_S',
 'events_double',
 'events_field_out',
 'events_hit_by_pitch',
 'events_home_run',
 'events_single',
 'events_strikeout',
 'events_triple',
 'events_walk',
 'hit_location_0.0',
 'hit_location_1.0',
 'hit_location_2.0',
 'hit_location_3.0',
 'hit_location_4.0',
 'hit_location_5.0',
 'hit_location_6.0',
 'hit_location_7.0',
 'hit_location_8.0',
 'hit_location_9.0']

In [97]:
# Continuous names first, then the flattened categorical names, matching the
# order in which the prediction columns were concatenated.
col_names = valid_dataset.continuous_label_names + flat_cat_names
len(col_names)

24

In [106]:
# Label the raw prediction array. Continuous columns are still in standardised units here.
preds_pd = pd.DataFrame(preds, columns=col_names)
preds_pd

Unnamed: 0,launch_speed,hc_x,hc_y,launch_angle,events_B,events_S,events_double,events_field_out,events_hit_by_pitch,events_home_run,...,hit_location_0.0,hit_location_1.0,hit_location_2.0,hit_location_3.0,hit_location_4.0,hit_location_5.0,hit_location_6.0,hit_location_7.0,hit_location_8.0,hit_location_9.0
0,2.451365,1.577113,1.173409,1.77578,0.019787,0.0501783,0.006486,0.01799739,0.05794608,0.355476,...,0.00056,0.001775,0.05137881,0.714609,1.625908e-05,0.01754272,0.2124548,1.477015e-10,0.001659736,3.692043e-06
1,-0.420216,-0.558455,-0.534716,-0.447785,0.999985,8.176686e-07,2e-06,5.579329e-07,4.34227e-07,2e-06,...,1e-06,0.999993,1.704213e-07,4e-06,9.101981e-10,1.910408e-08,9.538063e-07,1.451722e-07,3.980745e-11,1.09762e-09


In [103]:
import pickle as pkl
from sklearn.preprocessing import StandardScaler

# SECURITY: unpickling can execute arbitrary code; only load scaler files you trust.
with open("../data/statcast_2023-2024_cleaned_scalers.pkl", "rb") as file:

    # The unpickled object is iterated with .items() as column name -> scaler further down.
    scalers = pkl.load(file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [104]:
scalers

{'launch_speed': StandardScaler(),
 'release_speed': StandardScaler(),
 'release_pos_x': StandardScaler(),
 'release_pos_z': StandardScaler(),
 'pfx_x': StandardScaler(),
 'pfx_z': StandardScaler(),
 'plate_x': StandardScaler(),
 'plate_z': StandardScaler(),
 'hc_x': StandardScaler(),
 'hc_y': StandardScaler(),
 'vy0': StandardScaler(),
 'vz0': StandardScaler(),
 'ax': StandardScaler(),
 'ay': StandardScaler(),
 'az': StandardScaler(),
 'sz_top': StandardScaler(),
 'sz_bot': StandardScaler(),
 'launch_angle': StandardScaler(),
 'release_spin_rate': StandardScaler(),
 'release_extension': StandardScaler(),
 'release_pos_y': StandardScaler()}

In [107]:


# Undo the standardisation on every column that has a scaler: x = z * scale + mean.
# This overwrites preds_pd in place, so re-running the cell rescales the values twice.
# NOTE(review): in the saved outputs all four continuous columns are transformed with
# the same scale (~0.466) and mean (~54.05), which suggests the scalers in the pickle
# share one set of fitted parameters; verify how the scalers file was generated.
for column, scaler in scalers.items():
    if column in preds_pd:
        preds_pd[column] = (preds_pd[column] * scaler.scale_) + scaler.mean_


In [108]:
preds_pd

Unnamed: 0,launch_speed,hc_x,hc_y,launch_angle,events_B,events_S,events_double,events_field_out,events_hit_by_pitch,events_home_run,...,hit_location_0.0,hit_location_1.0,hit_location_2.0,hit_location_3.0,hit_location_4.0,hit_location_5.0,hit_location_6.0,hit_location_7.0,hit_location_8.0,hit_location_9.0
0,55.19528,54.788218,54.600249,54.88072,0.019787,0.0501783,0.006486,0.01799739,0.05794608,0.355476,...,0.00056,0.001775,0.05137881,0.714609,1.625908e-05,0.01754272,0.2124548,1.477015e-10,0.001659736,3.692043e-06
1,53.858239,53.793873,53.804926,53.845402,0.999985,8.176686e-07,2e-06,5.579329e-07,4.34227e-07,2e-06,...,1e-06,0.999993,1.704213e-07,4e-06,9.101981e-10,1.910408e-08,9.538063e-07,1.451722e-07,3.980745e-11,1.09762e-09
