# Predict Pitch Experiment

Starting with the most simplicistic prediction, the goal with this experiment is to predict whether a pitcher will pitch a ball or a strike, given a basic game scenario.

Looking at the problem at its most fundamental components:
- Pitcher
- R or L hand batter

In [1]:
import psycopg
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
DB_CONNECTION_STRING = "postgresql://baseball_app:baseball123@ocpwork/baseball_db"

DATASET_SIZE = 1000

RANDOM_NUMBER_SEED = 42

BATTING_HAND_LEFT = 0
BATTING_HAND_RIGHT = 1

PITCH_BALL = 0.0
PITCH_STRIKE = 1.0

In [3]:
sql = """
        select game_play_atbat.player_code as player_code, pitch_index, pitch_type_cd, home_team_flag, game_play_atbat.score_home as score_home, game_play_atbat.score_visitor as score_visitor, sky, night_flag, temperature, wind_direction, wind_speed, precipitation, field_condition, roster.batting_hand as batting_hand
        from game, game_play_atbat, game_play_atbat_pitch, roster
        where game.id = game_play_atbat.id
        and game_play_atbat.id = game_play_atbat_pitch.id
        and game_play_atbat.play_index = game_play_atbat_pitch.play_index
        and roster.player_code = game_play_atbat.player_code
        order by game_play_atbat.player_code, pitch_index
      """

if DATASET_SIZE is not None:
    sql += "limit " + str(DATASET_SIZE)

df = None

with psycopg.connect(DB_CONNECTION_STRING) as sql_connection:
    with sql_connection.cursor() as sql_cursor:
        sql_cursor.execute(sql) #, [])

        results = sql_cursor.fetchall()
        df = pd.DataFrame(results, columns=[desc[0] for desc in sql_cursor.description])


In [4]:
df.head()

Unnamed: 0,player_code,pitch_index,pitch_type_cd,home_team_flag,score_home,score_visitor,sky,night_flag,temperature,wind_direction,wind_speed,precipitation,field_condition,batting_hand
0,aardd001,1,C,True,6,3,cloudy,True,81,rtol,8,unknown,unknown,R
1,aardd001,1,C,True,6,3,cloudy,True,81,rtol,8,unknown,unknown,R
2,aardd001,1,C,True,6,3,cloudy,True,81,rtol,8,unknown,unknown,R
3,aardd001,1,C,True,6,3,cloudy,True,81,rtol,8,unknown,unknown,R
4,aardd001,1,C,True,6,3,cloudy,True,81,rtol,8,unknown,unknown,R


In [5]:

df = df[["pitch_index", "pitch_type_cd", "batting_hand"]]
print ("DF Shape: " + str(df.shape))

scaled_df = pd.DataFrame()

# Scale Pitch Type
scaled_df["pitch_type"] = df["pitch_type_cd"].apply(lambda x: PITCH_BALL if x in ["B", "H", "I", "V"] else None if x in ["+", "*", ".", "1", "2", "3", ">", "N", "U"] else PITCH_STRIKE)
print ("Scaled DF Shape: " + str(scaled_df.shape))

# Scale Batting Hand
scaled_df["batting_hand"] = df["batting_hand"].apply(lambda x: BATTING_HAND_LEFT if x == 'L' else BATTING_HAND_RIGHT)
print ("Scaled DF Shape: " + str(scaled_df.shape))

# Scale Pitch Index
std_scaler = StandardScaler()
pitch_index_df = df[["pitch_index"]]
std_scaler.fit(pitch_index_df)
scaled_df["pitch_index"] = std_scaler.transform(pitch_index_df)
print ("Scaled DF Shape: " + str(scaled_df.shape))


DF Shape: (1000, 3)
Scaled DF Shape: (1000, 1)
Scaled DF Shape: (1000, 2)
Scaled DF Shape: (1000, 3)


In [6]:
df_train, df_test = train_test_split(scaled_df, random_state=RANDOM_NUMBER_SEED, test_size=0.2)
print ("Training Shape: " + str(df_train.shape) + " Test Shape: " + str(df_test.shape))

df_train_X = df_train[["pitch_index", "batting_hand"]]
df_train_Y = df_train[["pitch_type"]]
print ("Training X Shape: " + str(df_train_X.shape) + " Test X Shape: " + str(df_train_Y.shape))

df_test_X = df_test[["pitch_index", "batting_hand"]]
df_test_Y = df_test[["pitch_type"]]
print ("Test X Shape: " + str(df_test_X.shape) + " Test Y Shape: " + str(df_test_Y.shape))

Training Shape: (800, 3) Test Shape: (200, 3)
Training X Shape: (800, 2) Test X Shape: (800, 1)
Test X Shape: (200, 2) Test Y Shape: (200, 1)


In [7]:
df_train_X_tensor = torch.tensor(df_train_X.values, dtype=torch.float32)
df_train_Y_tensor = torch.tensor(df_train_Y.values, dtype=torch.float32)
train_dataset = TensorDataset(df_train_X_tensor, df_train_Y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=2)

df_test_X_tensor = torch.tensor(df_test_X.values, dtype=torch.float32)
df_test_Y_tensor = torch.tensor(df_test_Y.values, dtype=torch.float32)
test_dataset = TensorDataset(df_test_X_tensor, df_test_Y_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=2)


In [8]:

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(2, 2)

    def forward(self, x):
        return self.fc(x)

# Initialize the model, loss function, and optimizer
model = SimpleModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Train the model
for epoch in range(100):
    for i, data in enumerate(train_dataloader):
        inputs, labels = data

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


  return F.mse_loss(input, target, reduction=self.reduction)


In [9]:
total_correct = 0
total_samples = 0

model.eval()

with torch.no_grad():
    for x, y in test_dataloader:
        scores = model(x)
        _, predictions = scores.max(1)
        total_correct += (predictions == y).sum()
        total_samples += predictions.size(0)
        
    print(f'Got {total_correct} / {total_samples} with accuracy {float(total_correct)/float(total_samples)*100:.2f}') 
    
    model.train()


Got 138 / 200 with accuracy 69.00
