<a href="https://colab.research.google.com/github/grmanjar-cmyk/mlb_wrc_plus_model/blob/main/swinging_strike_model_with__samping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pybaseball
!pip install scikit-learn

# --- 1b. Import Libraries ---
import pandas as pd
from pybaseball import statcast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading pygithub-2.8.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading pynacl-1.6.0-cp38-abi3-manylinux_2_34_x86_64.whl.metadata (9.4 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pygithub-2.8.1-py3-none-any.whl (432 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.7/432.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynacl-1.6.0-cp38-abi3-manylinux_2_34_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, pygithub, pybaseball
Successfully installed pybaseball-2.2.7 pygithub-2

In [5]:
# --- 2. Fetch Statcast Data ---
# Let's grab a week of data from a recent season.
print("Fetching Statcast data...")
start_dt = '2025-06-01'
end_dt = '2025-06-15'
sc_data = statcast(start_dt=start_dt, end_dt=end_dt)

print(f"Successfully fetched {len(sc_data)} pitches.")

Fetching Statcast data...
This is a large query, it may take a moment to complete


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_cop

Successfully fetched 58226 pitches.


In [10]:
# --- 3. Prepare Data for Modeling ---

# 3a. Define the Target Variable
sc_data['is_swinging_strike'] = (sc_data['description'] == 'swinging_strike').astype(int)

# 3b. Select Feature and Target Columns
feature_columns = [
    'release_speed', 'release_spin_rate',
    'pfx_x', 'pfx_z', 'plate_x', 'plate_z',
    'balls', 'strikes', 'pitch_type',
    'p_throws', 'stand']
target_column = 'is_swinging_strike'

In [11]:
# 3c. Create the initial modeling DataFrame
model_df = sc_data[feature_columns + [target_column]].copy()

In [12]:
# 3d. Clean the data by dropping rows with missing values in our selected columns
rows_before = len(model_df)
model_df.dropna(inplace=True)
rows_after = len(model_df)

print(f"Data cleaned. Removed {rows_before - rows_after} rows with missing values.")

Data cleaned. Removed 24 rows with missing values.


In [13]:
# --- 4. Handle Categorical Features ---
model_df_encoded = pd.get_dummies(model_df, columns=['pitch_type', 'p_throws', 'stand'], drop_first=True)

print("Categorical features have been one-hot encoded.")
print("Shape of final model-ready data:", model_df_encoded.shape)

Categorical features have been one-hot encoded.
Shape of final model-ready data: (58202, 26)


In [14]:
# --- 5. Split Data into Training and Testing Sets ---

# 5a. Separate features (X) and target (y)
X = model_df_encoded.drop(target_column, axis=1)
y = model_df_encoded[target_column]

# 5b. Perform the split (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")
# Note: Using 'stratify=y' is good practice for imbalanced datasets.
# It ensures the proportion of swinging strikes is the same in both the train and test sets.```

Data split into 46561 training samples and 11641 testing samples.


In [15]:
### Step 6: Train and Evaluate the BALANCED Model

#This is the final step. We will initialize, train, and evaluate our new model, making sure to include the crucial `class_weight='balanced'` parameter
#We are doing this to prevent it from "cheating" by guessing not a swing strike every time as a short cut


# --- 6. Train and Evaluate the Balanced Model ---

# 6a. Initialize the model with the balancing parameter
balanced_model = RandomForestClassifier(n_estimators=100,
                                      random_state=42,
                                      n_jobs=-1,
                                      class_weight='balanced')

# 6b. Train the model on the training data
print("\nTraining the BALANCED Random Forest model...")
balanced_model.fit(X_train, y_train)
print("Training complete!")


Training the BALANCED Random Forest model...
Training complete!


In [16]:
# 6c. Make predictions on the unseen test data
predictions = balanced_model.predict(X_test)

In [17]:
# 6d. Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"\nOverall Accuracy of the Balanced Model: {accuracy:.4f}")

print("\n--- Classification Report for BALANCED Model ---")
print(classification_report(y_test, predictions, target_names=['Not a Swinging Strike', 'Swinging Strike']))


Overall Accuracy of the Balanced Model: 0.8944

--- Classification Report for BALANCED Model ---
                       precision    recall  f1-score   support

Not a Swinging Strike       0.89      1.00      0.94     10412
      Swinging Strike       0.50      0.00      0.01      1229

             accuracy                           0.89     11641
            macro avg       0.70      0.50      0.48     11641
         weighted avg       0.85      0.89      0.85     11641



In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# --- Let's re-do this step cleanly ---

# 1. Initialize the BALANCED model
# We are certain we are including class_weight='balanced'
balanced_model = RandomForestClassifier(n_estimators=100,
                                      random_state=42,
                                      n_jobs=-1,
                                      class_weight='balanced')

# 2. Train this specific model
print("Training the BALANCED model again to be sure...")
balanced_model.fit(X_train, y_train)
print("Training complete!")

# 3. Use THIS balanced_model to make predictions
# We create a new variable for these predictions to avoid confusion
balanced_predictions = balanced_model.predict(X_test)

# 4. Evaluate the predictions from the balanced model
print("\n--- Corrected Classification Report for BALANCED Model ---")
print(classification_report(y_test, balanced_predictions, target_names=['Not a Swinging Strike', 'Swinging Strike']))

Training the BALANCED model again to be sure...
Training complete!

--- Corrected Classification Report for BALANCED Model ---
                       precision    recall  f1-score   support

Not a Swinging Strike       0.89      1.00      0.94     10412
      Swinging Strike       0.50      0.00      0.01      1229

             accuracy                           0.89     11641
            macro avg       0.70      0.50      0.48     11641
         weighted avg       0.85      0.89      0.85     11641



In [19]:
# --- Let's implement Undersampling ---

# First, let's combine our training data back together for easy sampling
X_y_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes
majority_class = X_y_train[X_y_train['is_swinging_strike'] == 0]
minority_class = X_y_train[X_y_train['is_swinging_strike'] == 1]

# Get the number of samples in the minority class
n_minority = len(minority_class)
print(f"Number of 'Swinging Strike' samples in training data: {n_minority}")

# Randomly sample the majority class to match the minority class size
majority_class_undersampled = majority_class.sample(n=n_minority, random_state=42)

# Combine the undersampled majority class with the original minority class
balanced_train_df = pd.concat([majority_class_undersampled, minority_class])

# Shuffle the new balanced DataFrame
balanced_train_df = balanced_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"New balanced training set has {len(balanced_train_df)} samples.")

# --- Now, create new X_train and y_train from this balanced set ---
X_train_balanced = balanced_train_df.drop('is_swinging_strike', axis=1)
y_train_balanced = balanced_train_df['is_swinging_strike']


# --- Train a NEW model on this balanced data ---
# Note: We do NOT need class_weight='balanced' anymore because the data itself is balanced.
final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

print("\nTraining the FINAL model on the undersampled data...")
final_model.fit(X_train_balanced, y_train_balanced)
print("Training complete!")

# --- Evaluate on the ORIGINAL, UNCHANGED test set ---
# This is crucial. We test on data that reflects the real world.
final_predictions = final_model.predict(X_test)

print("\n--- Classification Report for FINAL Undersampled Model ---")
print(classification_report(y_test, final_predictions, target_names=['Not a Swinging Strike', 'Swinging Strike']))

Number of 'Swinging Strike' samples in training data: 4915
New balanced training set has 9830 samples.

Training the FINAL model on the undersampled data...
Training complete!

--- Classification Report for FINAL Undersampled Model ---
                       precision    recall  f1-score   support

Not a Swinging Strike       0.94      0.64      0.76     10412
      Swinging Strike       0.18      0.66      0.28      1229

             accuracy                           0.65     11641
            macro avg       0.56      0.65      0.52     11641
         weighted avg       0.86      0.65      0.71     11641



In [20]:
# Create a new DataFrame containing ONLY the swinging strikes
swinging_strikes_only_df = model_df[model_df['is_swinging_strike'] == 1]

# Now, get the summary statistics for this group
print("--- Profile of a Typical Swinging Strike ---")
print(swinging_strikes_only_df.describe())

--- Profile of a Typical Swinging Strike ---
       release_speed  release_spin_rate    pfx_x     pfx_z   plate_x  \
count         6144.0             6144.0   6144.0    6144.0    6144.0   
mean       88.586035        2293.712728 -0.04825  0.480335  0.070868   
std         6.100155         406.054449  0.87238  0.732738  0.691563   
min             36.1              272.0    -2.02     -1.77     -2.12   
25%             84.3             2138.0     -0.8       0.0     -0.41   
50%             88.3             2346.5    -0.03      0.45      0.07   
75%             93.9             2535.0     0.64      1.12      0.56   
max            103.1             3424.0     1.99      2.06      2.48   

        plate_z     balls   strikes  is_swinging_strike  
count    6144.0    6144.0    6144.0              6144.0  
mean   2.233372  0.890951   1.04248                 1.0  
std    0.907007  0.930862  0.803492                 0.0  
min       -0.47       0.0       0.0                 1.0  
25%        1.48 