#MLB Data

## Getting Data

In [None]:
# Installing baseball scraper to get data from baseball reference and baseball savant
!pip install pybaseball
!pip install scikit-learn

# Importing needed libraries
import os
import pandas as pd
import numpy as np
from pybaseball import pitching_stats

# Creating start and end variables to take he range of data from
START=2001
END= 2024

# getting the data from baseball reference
pitching=pitching_stats(START,END,qual=20)

# creating .csv file so it does not take as long to recall the data
pitching. to_csv('pitching_data2.csv')

In [None]:
# Conect and get the needed data
import pandas as pd

filename = '/content/pitching_data2.csv'

MLB_stats = pd.read_csv(filename)

## Explore and Visualize data

In [None]:
MLB_stats.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,...,Pit+ FC,Stf+ FS,Loc+ FS,Pit+ FS,Stuff+,Location+,Pitching+,Stf+ FO,Loc+ FO,Pit+ FO
0,86,60,2001,Randy Johnson,ARI,37,21,6,10.4,2.49,...,,,,,,,,,,
1,122,60,2004,Randy Johnson,ARI,40,16,14,9.6,2.6,...,,,,,,,,,,
2,501,73,2002,Curt Schilling,ARI,35,23,7,9.3,3.23,...,,,,,,,,,,
3,4,10954,2018,Jacob deGrom,NYM,30,10,9,9.0,1.7,...,,,,,,,,,,
4,53,1303,2011,Roy Halladay,PHI,34,19,6,8.7,2.35,...,,,,,,,,,,


In [None]:
MLB_stats['IP'].describe()

Unnamed: 0,IP
count,3222.0
mean,164.214991
std,37.189135
min,100.0
25%,131.2
50%,166.2
75%,194.2
max,266.0


# Clean Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# First, identify and display columns with null values
null_counts = MLB_stats.isnull().sum()
columns_with_nulls = null_counts[null_counts > 0]

print("Columns with null values (before removal):")
print(columns_with_nulls)

if len(columns_with_nulls) > 0:
    # Calculate percentage of nulls
    null_percentage = (columns_with_nulls / len(MLB_stats)) * 100
    print("\nPercentage of null values in these columns:")
    print(null_percentage)

    # Store original shape for comparison
    original_shape = MLB_stats.shape

    # Remove all columns that have any null values
    stats_cleaned = MLB_stats.drop(columns=columns_with_nulls.index)

    # Report what was done
    print(f"\nOriginal DataFrame: {original_shape[0]} rows x {original_shape[1]} columns")
    print(f"Cleaned DataFrame: {stats_cleaned.shape[0]} rows x {stats_cleaned.shape[1]} columns")
    print(f"Removed {original_shape[1] - stats_cleaned.shape[1]} columns with null values")

    # Verify no nulls remain
    remaining_nulls = stats_cleaned.isnull().sum().sum()
    print(f"\nRemaining null values in dataset: {remaining_nulls}")

else:
    print("No columns with null values found.")
    stats_cleaned = MLB_stats.copy()

MLB_stats=stats_cleaned
# The cleaned DataFrame is now stored in 'stats_cleaned'
# You can assign it back to the original variable if needed:
# stats = stats_cleaned

Columns with null values (before removal):
BS            153
GB            153
FB            153
LD            153
IFFB          153
             ... 
Location+    2700
Pitching+    2700
Stf+ FO      3221
Loc+ FO      3221
Pit+ FO      3221
Length: 317, dtype: int64

Percentage of null values in these columns:
BS            4.748603
GB            4.748603
FB            4.748603
LD            4.748603
IFFB          4.748603
               ...    
Location+    83.798883
Pitching+    83.798883
Stf+ FO      99.968963
Loc+ FO      99.968963
Pit+ FO      99.968963
Length: 317, dtype: float64

Original DataFrame: 3222 rows x 394 columns
Cleaned DataFrame: 3222 rows x 77 columns
Removed 317 columns with null values

Remaining null values in dataset: 0


In [None]:
MLB_stats.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,...,K/BB+,H/9+,HR/9+,AVG+,WHIP+,BABIP+,LOB%+,K%+,BB%+,Events
0,86,60,2001,Randy Johnson,ARI,37,21,6,10.4,2.49,...,249,73,59,78,74,108,112,208,83,0
1,122,60,2004,Randy Johnson,ARI,40,16,14,9.6,2.6,...,331,71,59,75,65,91,96,174,53,0
2,501,73,2002,Curt Schilling,ARI,35,23,7,9.3,3.23,...,492,86,99,88,71,103,103,178,36,0
3,4,10954,2018,Jacob deGrom,NYM,30,10,9,9.0,1.7,...,226,75,38,80,70,96,112,144,64,515
4,53,1303,2011,Roy Halladay,PHI,34,19,6,8.7,2.35,...,269,93,43,94,79,102,107,124,46,0


In [None]:
# Creating the Next_WAR value to be used as our target
def next_season(player):
  player = player.sort_values('Season')
  player["Next_WAR"] = player['WAR'].shift(-1)
  return player

stats = MLB_stats.groupby('IDfg', group_keys=False).apply(next_season)

  stats = MLB_stats.groupby('IDfg', group_keys=False).apply(next_season)


In [None]:
stats.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,...,H/9+,HR/9+,AVG+,WHIP+,BABIP+,LOB%+,K%+,BB%+,Events,Next_WAR
694,889,3,2001,Kevin Appier,NYM,33,11,10,3.4,3.57,...,88,83,91,87,94,104,111,87,0,2.6
1139,1374,3,2002,Kevin Appier,ANA,34,14,12,2.6,3.92,...,99,100,101,98,102,107,103,96,0,0.1
3036,2884,3,2003,Kevin Appier,- - -,35,8,9,0.1,5.4,...,104,152,102,105,92,103,70,106,0,
1355,1927,27,2001,Ramon Ortiz,ANA,28,13,11,2.3,4.36,...,104,98,103,103,101,101,89,100,0,1.6
1924,1166,27,2002,Ramon Ortiz,ANA,29,15,9,1.6,3.77,...,85,151,88,85,82,113,112,91,0,0.8


In [None]:
# finding columns with object datatypes
stats.dtypes[stats.dtypes=='object']

Unnamed: 0,0
Name,object
Team,object
Age Rng,object


In [None]:
del stats['Age Rng']
stats.dtypes[stats.dtypes=='object']

Unnamed: 0,0
Name,object
Team,object


In [None]:
# Turning team name into a number
stats['team_code'] = stats['Team'].astype('category').cat.codes
stats

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,...,HR/9+,AVG+,WHIP+,BABIP+,LOB%+,K%+,BB%+,Events,Next_WAR,team_code
694,889,3,2001,Kevin Appier,NYM,33,11,10,3.4,3.57,...,83,91,87,94,104,111,87,0,2.6,21
1139,1374,3,2002,Kevin Appier,ANA,34,14,12,2.6,3.92,...,100,101,98,102,107,103,96,0,0.1,1
3036,2884,3,2003,Kevin Appier,- - -,35,8,9,0.1,5.40,...,152,102,105,92,103,70,106,0,,0
1355,1927,27,2001,Ramon Ortiz,ANA,28,13,11,2.3,4.36,...,98,103,103,101,101,89,100,0,1.6,1
1924,1166,27,2002,Ramon Ortiz,ANA,29,15,9,1.6,3.77,...,151,88,85,82,113,112,91,0,0.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,2981,1005686,2001,Gil Heredia,OAK,35,7,8,-0.7,5.58,...,200,119,113,103,103,59,71,0,,23
2644,3052,1005983,2001,Chris Holt,DET,29,7,9,0.8,5.77,...,97,119,121,114,96,70,99,0,,11
3102,3195,1009771,2001,Omar Olivares,PIT,33,6,9,-0.1,6.55,...,120,109,110,102,81,77,99,0,,25
2340,2410,1010700,2001,Pat Rapp,ANA,33,5,12,1.2,4.76,...,96,98,102,91,97,68,117,0,,1


In [None]:
# dropping rows where next war is NaN
stats = stats.dropna()
stats

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,...,HR/9+,AVG+,WHIP+,BABIP+,LOB%+,K%+,BB%+,Events,Next_WAR,team_code
694,889,3,2001,Kevin Appier,NYM,33,11,10,3.4,3.57,...,83,91,87,94,104,111,87,0,2.6,21
1139,1374,3,2002,Kevin Appier,ANA,34,14,12,2.6,3.92,...,100,101,98,102,107,103,96,0,0.1,1
1355,1927,27,2001,Ramon Ortiz,ANA,28,13,11,2.3,4.36,...,98,103,103,101,101,89,100,0,1.6,1
1924,1166,27,2002,Ramon Ortiz,ANA,29,15,9,1.6,3.77,...,151,88,85,82,113,112,91,0,0.8,1
2600,2783,27,2003,Ramon Ortiz,ANA,30,16,13,0.8,5.20,...,126,108,109,101,95,74,95,0,0.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,144,27498,2022,Spencer Strider,ATL,23,11,5,4.9,2.67,...,43,74,78,100,103,169,103,278,5.5,3
2167,2529,27552,2022,Graham Ashcraft,CIN,24,5,6,1.4,4.89,...,86,116,111,108,94,68,78,355,1.5,8
1713,1876,29837,2023,Bryce Miller,SEA,24,8,7,1.9,4.32,...,101,101,88,98,98,96,57,385,2.8,27
1495,1310,29911,2023,Andrew Abbott,CIN,24,8,6,2.2,3.87,...,107,97,99,102,111,117,111,294,1.1,8


In [None]:
predictors = stats[['SO','FIP','ERA','W']]

## Training Data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# Display basic information about the dataset
print("Baseball dataset shape:", stats.shape)
print("\nFirst few rows of the dataset:")
print(stats.head())
print("\nData types:")
print(stats.dtypes)
print("\nSummary statistics:")
print(stats.describe())

# Check for missing values
print("\nMissing values per column:")
print(stats.isnull().sum())

Baseball dataset shape: (2355, 78)

First few rows of the dataset:
      Unnamed: 0  IDfg  Season          Name Team  Age   W   L  WAR   ERA  \
694          889     3    2001  Kevin Appier  NYM   33  11  10  3.4  3.57   
1139        1374     3    2002  Kevin Appier  ANA   34  14  12  2.6  3.92   
1355        1927    27    2001   Ramon Ortiz  ANA   28  13  11  2.3  4.36   
1924        1166    27    2002   Ramon Ortiz  ANA   29  15   9  1.6  3.77   
2600        2783    27    2003   Ramon Ortiz  ANA   30  16  13  0.8  5.20   

      ...  HR/9+  AVG+  WHIP+  BABIP+  LOB%+  K%+  BB%+  Events  Next_WAR  \
694   ...     83    91     87      94    104  111    87       0       2.6   
1139  ...    100   101     98     102    107  103    96       0       0.1   
1355  ...     98   103    103     101    101   89   100       0       1.6   
1924  ...    151    88     85      82    113  112    91       0       0.8   
2600  ...    126   108    109     101     95   74    95       0       0.8   

      t

In [None]:
# Create a numerical target variable - let's use WAR as our target
baseball = stats.copy()

# Convert category variables to appropriate types
baseball['Name'] = baseball['Name'].astype('category')
baseball['Team'] = baseball['Team'].astype('category')

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(baseball, test_size=0.3, random_state=42)
train_set.info()
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1648 entries, 1076 to 3124
Data columns (total 78 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Unnamed: 0  1648 non-null   int64   
 1   IDfg        1648 non-null   int64   
 2   Season      1648 non-null   int64   
 3   Name        1648 non-null   category
 4   Team        1648 non-null   category
 5   Age         1648 non-null   int64   
 6   W           1648 non-null   int64   
 7   L           1648 non-null   int64   
 8   WAR         1648 non-null   float64 
 9   ERA         1648 non-null   float64 
 10  G           1648 non-null   int64   
 11  GS          1648 non-null   int64   
 12  CG          1648 non-null   int64   
 13  ShO         1648 non-null   int64   
 14  SV          1648 non-null   int64   
 15  IP          1648 non-null   float64 
 16  TBF         1648 non-null   int64   
 17  H           1648 non-null   int64   
 18  R           1648 non-null   int64   
 19  ER      

In [None]:
baseball_train = train_set.drop('Next_WAR', axis=1)
baseball_labels = train_set['Next_WAR'].copy()

In [None]:
# # Define the ClusterSimilarity class from the original code
# class ClusterSimilarity(BaseEstimator, TransformerMixin):
#     def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
#         self.n_clusters = n_clusters
#         self.gamma = gamma
#         self.random_state = random_state

#     def fit(self, X, y=None, sample_weight=None):
#         self.kmeans_ = KMeans(self.n_clusters, n_init=10,
#                              random_state=self.random_state)
#         self.kmeans_.fit(X, sample_weight=sample_weight)
#         return self

#     def transform(self, X):
#         return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

#     def get_feature_names_out(self, names=None):
#         return [f'Cluster {i} similarity' for i in range(self.n_clusters)]

# # Define functions for creating ratio features
# def column_ratio(X):
#     return X[:, [0]] / X[:, [1]]

# def ratio_name(function_transformer, feature_names_in_):
#     return ['ratio']  # feature names out

# def ratio_pipeline():
#     return make_pipeline(
#         SimpleImputer(strategy='median'),
#         FunctionTransformer(column_ratio, feature_names_out=ratio_name),
#         StandardScaler()
#     )

# # Create a pipeline for categorical features
# cat_pipeline = make_pipeline(
#     SimpleImputer(strategy='most_frequent'),
#     OneHotEncoder(handle_unknown='ignore')
# )

# # Create a pipeline for features that should be log-transformed
# log_pipeline = make_pipeline(
#     SimpleImputer(strategy='median'),
#     FunctionTransformer(np.log1p, feature_names_out='one-to-one'),  # Using log1p to handle zeros
#     StandardScaler()
# )

# # Create a cluster similarity pipeline for appropriate features
# cluster_simil = ClusterSimilarity(n_clusters=5, gamma=0.1, random_state=42)

# # Default pipeline for remaining numerical features
# default_num_pipeline = make_pipeline(
#     SimpleImputer(strategy='median'),
#     StandardScaler()
# )

# # Identify numerical and categorical columns
# numerical_cols = baseball_train.select_dtypes(include=['number']).columns.tolist()
# categorical_cols = baseball_train.select_dtypes(include=['category', 'object']).columns.tolist()

# # Create the full preprocessing pipeline
# # Adjust the specific columns and transformations to match baseball data

# preprocessing = ColumnTransformer([
#     # Create ratio of strikeouts to walks (KBB)
#     ('k_per_ip', ratio_pipeline(), ['K/9', 'IP']),

#     # Log transform appropriate statistics
#     ('log', log_pipeline, ['IP', 'ERA', 'WHIP']),

#     # Apply cluster similarity to appropriate numerical data
#     ('performance_cluster', cluster_simil, ['ERA', 'FIP', 'WHIP', 'K/9']),

#     # Handle categorical data
#     ('cat', cat_pipeline, categorical_cols),
# ], remainder=default_num_pipeline)

# # Apply the preprocessing pipeline to the baseball data
# baseball_prepared = preprocessing.fit_transform(baseball_train)

# print("\nPreprocessed data shape:", baseball_prepared.shape)

# # Now we have a prepared dataset ready for machine learning models
# # Let's test on a simple model to confirm everything works

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create and train a simple linear regression model
model = LinearRegression()
model.fit(baseball_prepared, baseball_labels)

# Prepare the test data
baseball_test = test_set.drop('Next_WAR', axis=1)
baseball_test_labels = test_set['Next_WAR'].copy()
baseball_test_prepared = preprocessing.transform(baseball_test)

# Make predictions and evaluate
predictions = model.predict(baseball_test_prepared)
mse = mean_squared_error(baseball_test_labels, predictions)
rmse = np.sqrt(mse)

print("\nModel Test RMSE:", rmse)
print("Mean WAR in test set:", baseball_test_labels.mean())
print("Standard deviation of WAR in test set:", baseball_test_labels.std())


Model Test RMSE: 1.4845843466239224
Mean WAR in test set: 2.541725601131542
Standard deviation of WAR in test set: 1.7626431570521783
