# Clutch^2 Model for NHL Shots

###### By Trevor Rowland ([dbCooper2](<https://github.com/dBCooper2>))



In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Load train and test data
df_train = pd.read_csv("final_datasets/combined_shots_dataset_2015-2024_with_dates.csv")
df_test = pd.read_csv("final_datasets/combined_shots_dataset_2023-2024_with_dates.csv")


In [21]:
print(df_train.columns)

Index(['shotID', 'homeTeamCode', 'awayTeamCode', 'season', 'isPlayoffGame',
       'game_id', 'homeTeamWon', 'id', 'time', 'timeUntilNextEvent',
       ...
       'xPlayContinuedInZone', 'xPlayContinuedOutsideZone', 'xPlayStopped',
       'xShotWasOnGoal', 'isHomeTeam', 'shotWasOnGoal', 'teamCode',
       'arenaAdjustedXCordABS', 'composite_game_id', 'game_date'],
      dtype='object', length=126)


In [20]:
df_train = df_train.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])


In [22]:
print(df_test.columns)

Index(['shotID', 'arenaAdjustedShotDistance', 'arenaAdjustedXCord',
       'arenaAdjustedXCordABS', 'arenaAdjustedYCord', 'arenaAdjustedYCordAbs',
       'averageRestDifference', 'awayEmptyNet', 'awayPenalty1Length',
       'awayPenalty1TimeLeft',
       ...
       'xGoal', 'xPlayContinuedInZone', 'xPlayContinuedOutsideZone',
       'xPlayStopped', 'xRebound', 'xShotWasOnGoal', 'yCord', 'yCordAdjusted',
       'composite_game_id', 'game_date'],
      dtype='object', length=126)


In [23]:
# Handle missing values for both train and test datasets
for col in df_train.columns:
    if df_train[col].dtype == 'object':  # Categorical data (e.g., 'homeTeam')
        df_train[col].fillna(df_train[col].mode()[0], inplace=True)
        df_test[col].fillna(df_test[col].mode()[0], inplace=True)
    else:  # Numeric data (e.g., 'score_differential')
        df_train[col].fillna(df_train[col].mean(), inplace=True)
        df_test[col].fillna(df_test[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[col].fillna(df_train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna(df_test[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [24]:
# Prepare features and target
X_train = df_train.drop(columns=['homeTeamWon'])  # Assuming 'homeTeamWon' is the target variable
y_train = df_train['homeTeamWon']
X_test = df_test.drop(columns=['homeTeamWon'])
y_test = df_test['homeTeamWon']

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

def preprocess_with_countvectorizer(df_train, df_test=None):
    """
    Preprocess categorical variables using CountVectorizer.
    If test data is provided, applies the same transformation to both datasets.
    
    Parameters:
    df_train (pd.DataFrame): Training dataset
    df_test (pd.DataFrame): Test dataset (optional)
    
    Returns:
    tuple: Processed training DataFrame and test DataFrame (if provided)
    """
    # Create copies to avoid modifying original data
    df_train = df_train.copy()
    if df_test is not None:
        df_test = df_test.copy()
    
    # Initialize dictionary to store vectorizers
    vectorizers = {}
    
    # Process each column
    for column in df_train.columns:
        if df_train[column].dtype == 'object':
            # Convert to string and handle NaN values
            df_train[column] = df_train[column].astype(str).fillna('MISSING')
            
            # Initialize and fit vectorizer
            vectorizer = CountVectorizer(lowercase=False)
            features = vectorizer.fit_transform(df_train[column])
            
            # Create feature names
            feature_names = [f"{column}_{feat}" for feat in vectorizer.get_feature_names_out()]
            
            # Convert to DataFrame
            encoded_features = pd.DataFrame(
                features.toarray(),
                columns=feature_names,
                index=df_train.index
            )
            
            # Store vectorizer for test data
            vectorizers[column] = vectorizer
            
            # Drop original column and add new features
            df_train = df_train.drop(columns=[column])
            df_train = pd.concat([df_train, encoded_features], axis=1)
            
            # Process test data if provided
            if df_test is not None:
                df_test[column] = df_test[column].astype(str).fillna('MISSING')
                test_features = vectorizer.transform(df_test[column])
                test_encoded = pd.DataFrame(
                    test_features.toarray(),
                    columns=feature_names,
                    index=df_test.index
                )
                df_test = df_test.drop(columns=[column])
                df_test = pd.concat([df_test, test_encoded], axis=1)
    
    if df_test is not None:
        return df_train, df_test
    return df_train

def remove_categorical_columns(df_train, df_test=None):
    """
    Remove all categorical (object) columns from the dataset.
    If test data is provided, removes the same columns from both datasets.
    
    Parameters:
    df_train (pd.DataFrame): Training dataset
    df_test (pd.DataFrame): Test dataset (optional)
    
    Returns:
    tuple: Processed training DataFrame and test DataFrame (if provided)
    """
    # Create copies to avoid modifying original data
    df_train = df_train.copy()
    if df_test is not None:
        df_test = df_test.copy()
    
    # Get list of categorical columns
    categorical_columns = df_train.select_dtypes(include=['object']).columns
    
    # Remove categorical columns from training data
    df_train = df_train.drop(columns=categorical_columns)
    
    # Remove categorical columns from test data if provided
    if df_test is not None:
        df_test = df_test.drop(columns=categorical_columns)
        return df_train, df_test
    
    return df_train

# Example usage in your existing code:
# Option 1: Using CountVectorizer
'''
X_train, X_test = preprocess_with_countvectorizer(
    df_train.drop(columns=['homeTeamWon']), 
    df_test.drop(columns=['homeTeamWon'])
)
'''

# Option 2: Removing categorical columns
X_train, X_test = remove_categorical_columns(
    df_train.drop(columns=['homeTeamWon']), 
    df_test.drop(columns=['homeTeamWon'])
)

In [32]:
# Train Random Forest model (CPU-based, no GPU for scikit-learn)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train.values, y_train.values)

In [33]:
# Evaluate Random Forest model
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")



Random Forest Accuracy: 0.5437702732283934


In [40]:
import pandas as pd

# Assuming you've already trained your RandomForestClassifier as rf_model
# And you have your X_train (feature matrix) and y_train (target variable)

# Get feature importances
feature_importances = rf_model.feature_importances_

# Create a DataFrame to show features and their importance scores
feature_names = X_train.columns  # If you used pandas DataFrame for features
# If you used sparse matrices, you may need to handle the feature names differently
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Show the top important features
print(feature_importance_df.head(30))  # Display top 10 features


                          Feature  Importance
15                  homeTeamGoals    0.367317
16                  awayTeamGoals    0.349085
5                            time    0.033986
36                   homeEmptyNet    0.033299
38               homeSkatersOnIce    0.033150
4                              id    0.026402
39               awaySkatersOnIce    0.022793
8                          period    0.022297
37                   awayEmptyNet    0.020518
0                          shotID    0.008932
3                         game_id    0.008605
49                goalieIdForShot    0.007986
53      shootingTeamForwardsOnIce    0.004871
110             composite_game_id    0.004821
9                            goal    0.003248
103          xPlayContinuedInZone    0.003037
107                    isHomeTeam    0.002859
26                 shotOnEmptyNet    0.002547
1                          season    0.002053
73     defendingTeamForwardsOnIce    0.001883
100                         xGoal 

In [37]:
# Train XGBoost model (CPU)
xgb_model = XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

In [42]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

def preprocess_and_improve_features(X_train, X_test, y_train, y_test):
    """
    Preprocess data and engineer new features to improve model performance.
    """
    # Create copies to avoid modifying original data
    X_train = X_train.copy()
    X_test = X_test.copy()
    
    # Scale numerical features
    scaler = StandardScaler()
    numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
    
    # Ensure column consistency between train and test
    common_columns = list(set(X_train.columns) & set(X_test.columns))
    X_train = X_train[common_columns]
    X_test = X_test[common_columns]
    
    return X_train, X_test, y_train, y_test

def tune_random_forest(X_train, y_train):
    """
    Perform grid search to find optimal Random Forest parameters.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'class_weight': ['balanced', None]
    }
    
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    print("Best Random Forest parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate both Random Forest and XGBoost models.
    """
    # Train tuned Random Forest
    print("Tuning Random Forest...")
    rf_model = tune_random_forest(X_train, y_train)
    
    # Train XGBoost with proper parameters
    print("\nTraining XGBoost...")
    xgb_model = XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        enable_categorical=True  # Enable categorical feature support
    )
    
    # Explicitly set feature names for XGBoost to avoid mismatch
    xgb_model.fit(
        X_train, 
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10,
        verbose=False
    )
    
    # Evaluate Random Forest
    y_pred_rf = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred_rf)
    print("\nRandom Forest Results:")
    print(f"Accuracy: {rf_accuracy:.4f}")
    print("\nDetailed Random Forest Classification Report:")
    print(classification_report(y_test, y_pred_rf))
    
    # Evaluate XGBoost
    y_pred_xgb = xgb_model.predict(X_test)
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    print("\nXGBoost Results:")
    print(f"Accuracy: {xgb_accuracy:.4f}")
    print("\nDetailed XGBoost Classification Report:")
    print(classification_report(y_test, y_pred_xgb))
    
    return rf_model, xgb_model

# Main execution
def main():
    # Assuming df_train and df_test are your original dataframes
    X_train = df_train.drop(columns=['homeTeamWon'])
    y_train = df_train['homeTeamWon']
    X_test = df_test.drop(columns=['homeTeamWon'])
    y_test = df_test['homeTeamWon']
    
    # Preprocess data
    X_train_processed, X_test_processed, y_train, y_test = preprocess_and_improve_features(
        X_train, X_test, y_train, y_test
    )
    
    # Train and evaluate models
    rf_model, xgb_model = train_and_evaluate_models(
        X_train_processed, X_test_processed, y_train, y_test
    )
    
    return rf_model, xgb_model

# Run the improved modeling pipeline
rf_model, xgb_model = main()

Tuning Random Forest...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


Python(94983) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(95088) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(95233) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(95485) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(97052) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(97554) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(97780) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(97911) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(97912) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(98476) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(99708) Malloc

ValueError: 
All the 2160 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
           ~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        force_all_finite=False,
        ^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'C'

--------------------------------------------------------------------------------
1728 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/ensemble/_forest.py", line 363, in fit
    X, y = self._validate_data(
           ~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        force_all_finite=False,
        ^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/sklearn/utils/_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/Users/dB/Desktop/fall_24/DS-4210/final-proj/.env/lib/python3.13/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'D'


In [None]:
# Optional: Plot feature importances
plt.barh(X_train.columns, rf_model.feature_importances_)
plt.title("Feature Importance from Random Forest")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.show()