In [1]:
import sys
sys.path.append('../')
sys.path.append('../../')

from Datasets.BaseballDataset import BaseballDataset
from BaselineModel.BaselineModel import BaselineModel
from TransformerModel.TransformerModelRedisual import *

import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F
from torch.utils.data import DataLoader
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

In [2]:
data_config_path = "../../data/configv3.json"
full_data_path = "../../data/full_cleaned_94.csv"
full_data = pd.read_csv(full_data_path)

In [15]:
sequence_length = 200


In [4]:
scaler_path = "../../data/full_scalers_94.pkl"
with open(scaler_path, "rb") as file:
    scalers = pickle.load(file)

In [5]:
m_path = "../fixed_94_200/h6_e12_h96_d0_lp0.5_lr1e-05_ep50/transformer_model.pth"
c_path = "../fixed_94_200/h6_e12_h96_d0_lp0.5_lr1e-05_ep50/model_config.json"

transformer_model = TransformerHelper(m_path,c_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
#transformer model trained on 2017 data, so baseline model will use that as well
from datetime import datetime, timedelta

start_date = pd.to_datetime("2017-01-01")
end_date = pd.to_datetime("2018-01-01")
full_data['game_date'] = pd.to_datetime(full_data['game_date'])
total_days = (end_date - start_date).days
split_date = pd.to_datetime(start_date) + timedelta(days=int(total_days * 0.6))

# Split data into train and validation sets
train = full_data[(full_data['game_date'] > start_date) & (full_data['game_date'] < split_date)].reset_index(drop=True)
valid = full_data[(full_data['game_date'] > split_date) & (full_data['game_date'] < end_date)].reset_index(drop=True)



In [12]:
train_dataset = BaseballDataset(train,data_config_path,sequence_length)
test_dataset = BaseballDataset(valid,data_config_path,sequence_length)

In [17]:
#train baseline model, uses logistic regression for categorical preds and linear regression for continuous preds
baseline_model = BaselineModel(train_dataset, scaler_path, max_iters=100, pred_mode=False, pred_mean=False)
baseline_model.train(batch_size=2000)

Creating matrices
Processing batch 0
Concatenating and appending data
Processing batch 10
Concatenating and appending data
Processing batch 20
Concatenating and appending data
Processing batch 30
Concatenating and appending data
Processing batch 40
Concatenating and appending data
Processing batch 50
Concatenating and appending data
Processing batch 60
Concatenating and appending data
Processing batch 70
Concatenating and appending data
Processing batch 80
Concatenating and appending data
Processing batch 90
Concatenating and appending data
Processing batch 100
Concatenating and appending data
Processing batch 110
Concatenating and appending data
Processing batch 120
Concatenating and appending data
Processing batch 130
Concatenating and appending data
Processing batch 140
Concatenating and appending data
Processing batch 150
Concatenating and appending data
Processing batch 160
Concatenating and appending data
Processing batch 170
Concatenating and appending data
Processing batch 180


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
# make preds with transformer model
trans_preds, true = transformer_model.make_preds(test_dataset, scaler_path, device, 2000, scale=True)


#make preds with baseline model
loader = DataLoader(test_dataset, batch_size=2000, shuffle=False, num_workers=0)
base_preds = []
for seq, _, _ in loader:

    base_preds.append(baseline_model.predict(seq, scale=True))

baseline_preds = pd.concat(base_preds, ignore_index=True)


#both models with share the same true values


# First 4 columns are continuous preds/labels
trans_cont_preds = trans_preds.iloc[:, 0:4]
base_cont_preds = baseline_preds.iloc[:,0:4]
cont_true = true.iloc[:, 0:4]

# Next 10 columns are probabilities for the events categorical feature
trans_events_preds = trans_preds.iloc[:, 4:14]
base_events_preds = baseline_preds.iloc[:, 4:14]
events_true = true.iloc[:, 4:14]


# Last 10 columns are probabilities for the hit_location categorical feature
trans_loc_preds = trans_preds.iloc[:, 14:]
base_loc_preds = baseline_preds.iloc[:,14:]
loc_true = true.iloc[:, 14:]


trans_cont_error = np.mean(np.abs(cont_true - trans_cont_preds))
base_cont_error = np.mean(np.abs(cont_true - base_cont_preds))



# # Summing the probability distributions for events and hit_location
# pred_events_sum = events_preds.sum(axis=0)
# true_events_sum = events_true.sum(axis=0)


# all_event_preds.append(pred_events_sum)
# all_event_true.append(true_events_sum)

# pred_loc_sum = loc_preds.sum(axis=0)
# true_loc_sum = loc_true.sum(axis=0)

# all_loc_preds.append(pred_loc_sum)
# all_loc_true.append(true_loc_sum)


# events_sum_abs_errors = np.abs(pred_events_sum - true_events_sum).values
# loc_sum_abs_errors = np.abs(pred_loc_sum - true_loc_sum).values

# if verbose:
#     print(f"Sum of absolute errors for Events for {name}: {events_sum_abs_errors}")
#     print(f"Sum of absolute errors for Hit Location for {name}: {loc_sum_abs_errors}")

# events_errors.append(events_sum_abs_errors)
# loc_errors.append(loc_sum_abs_errors)


#calculate top k precision
top_k = 3

# Get top-k predictions for each sample, returning the numeric index instead of column names
trans_events_top_k_preds = trans_events_preds.apply(lambda x: x.nlargest(top_k).index.map(lambda name: trans_events_preds.columns.get_loc(name)), axis=1)
trans_loc_top_k_preds = trans_loc_preds.apply(lambda x: x.nlargest(top_k).index.map(lambda name: trans_loc_preds.columns.get_loc(name)), axis=1)

base_events_top_k_preds = base_events_preds.apply(lambda x: x.nlargest(top_k).index.map(lambda name: base_events_preds.columns.get_loc(name)), axis=1)
base_loc_top_k_preds = base_loc_preds.apply(lambda x: x.nlargest(top_k).index.map(lambda name: base_loc_preds.columns.get_loc(name)), axis=1)

# Now the predictions are stored as the numeric indices corresponding to the classes

# Compute precision for each class
trans_events_class_precisions = []
trans_loc_class_precisions = []

base_events_class_precisions = []
base_loc_class_precisions = []

# For each class in events and hit_location, calculate top-k precision
for class_idx in range(10):
    # For events precision
    true_class_mask = events_true.iloc[:, class_idx] == 1  # Find where this class is the true class
    true_class_indices = events_true.index[true_class_mask]
    
    # Check if this class is in the top-k predictions when it's the true class
    trans_event_precision = np.mean([1 if class_idx in trans_events_top_k_preds.iloc[i] else 0 for i in true_class_indices])
    trans_events_class_precisions.append(trans_event_precision)

    base_event_precision = np.mean([1 if class_idx in base_events_top_k_preds.iloc[i] else 0 for i in true_class_indices])
    base_events_class_precisions.append(base_event_precision)

    # For hit location precision
    true_class_mask = loc_true.iloc[:, class_idx] == 1  # Find where this class is the true class
    true_class_indices = loc_true.index[true_class_mask]

    # Check if this class is in the top-k predictions when it's the true class
    trans_loc_precision = np.mean([1 if class_idx in trans_loc_top_k_preds.iloc[i] else 0 for i in true_class_indices])
    trans_loc_class_precisions.append(trans_loc_precision)

    base_loc_precision = np.mean([1 if class_idx in base_loc_top_k_preds.iloc[i] else 0 for i in true_class_indices])
    base_loc_class_precisions.append(base_loc_precision)



Starting Batch: 10
Starting Batch: 20
Starting Batch: 30
Starting Batch: 40
Starting Batch: 50
Starting Batch: 60


In [35]:
event_class_names = test_dataset.categorical_label_names[0]
loc_class_names = test_dataset.categorical_label_names[1]


# Create DataFrames for the results
trans_event_summary_df = pd.DataFrame({
    'Class': event_class_names,
    f'Event Precision (Top K {top_k})': trans_events_class_precisions

})

# Create DataFrames for the results
base_event_summary_df = pd.DataFrame({
    'Class': event_class_names,
    f'Event Precision (Top K {top_k})': base_events_class_precisions

})



display(trans_event_summary_df)
display(base_event_summary_df)

Unnamed: 0,Class,Event Precision (Top K 3)
0,events_B,0.989063
1,events_S,0.998288
2,events_double,0.000627
3,events_field_out,0.980328
4,events_hit_by_pitch,0.902821
5,events_home_run,0.054054
6,events_single,0.626337
7,events_strikeout,0.943473
8,events_triple,0.0
9,events_walk,0.969288


Unnamed: 0,Class,Event Precision (Top K 3)
0,events_B,0.97537
1,events_S,0.999508
2,events_double,0.0
3,events_field_out,0.913519
4,events_hit_by_pitch,0.034483
5,events_home_run,0.002457
6,events_single,0.505743
7,events_strikeout,0.941304
8,events_triple,0.0
9,events_walk,0.935538


In [36]:

trans_loc_summary_df = pd.DataFrame({
    'Class': loc_class_names,
    f'Hit Loc Precision (Top K {top_k})': trans_loc_class_precisions

})

# Create DataFrames for the results
base_loc_summary_df = pd.DataFrame({
    'Class': loc_class_names,
    f'Hit Loc Precision (Top K {top_k})': base_loc_class_precisions

})

display(trans_loc_summary_df)
display(base_loc_summary_df)

Unnamed: 0,Class,Hit Loc Precision (Top K 3)
0,hit_location_0.0,0.999895
1,hit_location_1.0,0.011567
2,hit_location_2.0,0.104027
3,hit_location_3.0,0.159664
4,hit_location_4.0,0.260981
5,hit_location_5.0,0.297308
6,hit_location_6.0,0.33387
7,hit_location_7.0,0.689617
8,hit_location_8.0,0.682989
9,hit_location_9.0,0.547699


Unnamed: 0,Class,Hit Loc Precision (Top K 3)
0,hit_location_0.0,0.999939
1,hit_location_1.0,0.006309
2,hit_location_2.0,0.0
3,hit_location_3.0,0.036615
4,hit_location_4.0,0.210835
5,hit_location_5.0,0.19793
6,hit_location_6.0,0.287186
7,hit_location_7.0,0.543443
8,hit_location_8.0,0.776514
9,hit_location_9.0,0.619809


In [31]:
display(trans_cont_error)
display(base_cont_error)

6.793103057630511

334.3855928995598