## XGBoost Model

- Loads Filtered Dataset
- Converts object types to categorical types
- Splits Data
- Performs Hyperparameter Tuning
- Builds, Trains, Evaluates Model
- Extracts Feature Importance

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [29]:
# Read the encoded dataset from CSV, using the file's first column as the index.
print("Loading CSV (this may take a while for large files)...")
nfl_encoded = pd.read_csv(
    "../dataset/nfl_encoded_v2.csv",
    index_col=0,
    low_memory=False,
)
print(f"Dataset shape: {nfl_encoded.shape}")

Loading CSV (this may take a while for large files)...
Dataset shape: (318668, 20)


## Encode object-typed columns as categories for XGBoost and CatBoost models

In [30]:
# Cast the string-valued columns to pandas' category dtype so that the
# XGBoost and CatBoost models can consume them as categorical features.
categorical_cols = ["posteam", "defteam", "posteam_type", "game_half", "side_of_field"]

# One astype call with a {column: dtype} mapping converts every listed column.
nfl_encoded = nfl_encoded.astype(dict.fromkeys(categorical_cols, "category"))

In [31]:
# Show the dtype of every column under a header line (preceded by a blank line).
print("\nData types:", nfl_encoded.dtypes, sep="\n")


Data types:
posteam                       category
defteam                       category
posteam_type                  category
yardline_100                   float64
qtr                              int64
down                             int64
ydstogo                          int64
goal_to_go                       int64
score_differential             float64
game_half                     category
drive                            int64
posteam_timeouts_remaining       int64
defteam_timeouts_remaining       int64
shotgun                          int64
no_huddle                        int64
quarter_seconds_remaining      float64
half_seconds_remaining         float64
game_seconds_remaining         float64
side_of_field                 category
play_type                        int64
dtype: object


## Split Data

In [32]:
# Separate the play_type target column from the remaining feature columns.
y = nfl_encoded["play_type"]
X = nfl_encoded.drop(columns="play_type")

print(f"Features: {len(X.columns)}")
print(f"Target distribution: {y.value_counts()}")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Features: 19
Target distribution: play_type
1    186163
0    132505
Name: count, dtype: int64


## Hyperparameter Tuning

In [33]:
# Candidate values for each tuned hyperparameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[4, 6, 8],
    learning_rate=[0.01, 0.1, 0.2],
)

# Ratio of class-0 rows to class-1 rows in the training split, handed to
# XGBoost as scale_pos_weight.
n_class0 = (y_train == 0).sum()
n_class1 = (y_train == 1).sum()

base_estimator = XGBClassifier(
    enable_categorical=True,
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=n_class0 / n_class1,
)

# 5-fold cross-validated search over the grid, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

# Keep the winning estimator for the training and evaluation cells below.
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found:
{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}


## Build and Train Model

In [34]:
# Fit the selected estimator on the training split, printing progress
# messages before and after.
# NOTE(review): grid_search is constructed without a `refit` argument; with
# scikit-learn's default (refit=True) best_estimator_ is presumably already
# fitted on X_train, so this call likely repeats that work — confirm before
# removing it.
print("Training XGBoost model...")
best_model.fit(X_train, y_train)
print("Model trained successfully.")

Training XGBoost model...
Model trained successfully.


## Evaluate Model

In [38]:
# Make predictions on the held-out test split
y_pred = best_model.predict(X_test)

# Score the predictions against the true labels: overall accuracy, the
# confusion matrix, and the per-class text report (in that order).
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)


In [39]:
# Emit the three evaluation results, one section after another.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)


Accuracy: 0.7253
Confusion Matrix:
[[20524  5977]
 [11533 25700]]
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.77      0.70     26501
           1       0.81      0.69      0.75     37233

    accuracy                           0.73     63734
   macro avg       0.73      0.73      0.72     63734
weighted avg       0.74      0.73      0.73     63734



## Explore Feature Importance

In [40]:
# Pair every feature column with the importance score reported by the fitted
# model and rank them from most to least important.
# `best_model` is the grid-search winner that was trained and evaluated in the
# cells above; using it (rather than a name no cell defines) keeps this cell
# runnable after Restart Kernel -> Run All.
feature_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# head(20) caps the listing at 20 rows; with fewer features, all are shown.
print("Top 20 Most Important Features:")
print(feature_importance.head(20))

Top 20 Most Important Features:
                       Feature  Importance
13                     shotgun    0.832046
5                         down    0.036884
6                      ydstogo    0.020675
17      game_seconds_remaining    0.017255
16      half_seconds_remaining    0.014700
8           score_differential    0.014255
4                          qtr    0.010994
0                      posteam    0.007021
9                    game_half    0.006738
3                 yardline_100    0.006222
11  posteam_timeouts_remaining    0.005680
12  defteam_timeouts_remaining    0.005360
7                   goal_to_go    0.004893
15   quarter_seconds_remaining    0.003879
1                      defteam    0.003617
18               side_of_field    0.003554
14                   no_huddle    0.002573
10                       drive    0.002059
2                 posteam_type    0.001596
