# Capstone Project: Predicting NHL Player Salary

## Part III - Modeling

Author: Charles Ramey

Date: 05/15/2023

---

#### Notebook Links

Part I - Project Intro & Data Cleaning
- [`Part-1_setup-and-cleaning.ipynb`](../code/Part-1_setup-and-cleaning.ipynb)

Part II - Exploratory Data Analysis (EDA)
- [`Part-2_eda.ipynb`](../code/Part-2_eda.ipynb)

Part III - Modeling
- [`Part-3.2_modeling-defense.ipynb`](../code/Part-3.2_modeling-defense.ipynb)
- [`Part-3.3_modeling-goalies.ipynb`](../code/Part-3.3_modeling-goalies.ipynb)

Part IV - Conclusion, Recommendations, and Sources
- [`Part-4_conclusion-and-recommendations.ipynb`](../code/Part-4_conclusion-and-recommendations.ipynb)

### Contents

- [Data Import and Preprocessing](#Data-Import-and-Preprocessing)
- [Modeling](#Modeling)

### Library Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
import xgboost as xg
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict, GridSearchCV

from sklearn.metrics import mean_squared_error
import pickle

## Data Import and Preprocessing

### Data Import

In [2]:
# Read the cleaned forwards data from the project's data folder.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column --
# presumably a row index that was saved with the CSV; confirm whether it
# should be read in as a data column.
forwards = pd.read_csv('../data/forwards_cleaned.csv')

In [3]:
# Preview the first two rows to check the import.
forwards.head(2)

Unnamed: 0,player,pos,contract_aav,season,pct_change,upper_limit,lower_limit,min_salary,team,games_played,...,xgoals_on_rebounds_shots,share_of_xgoals_from_rebounds_shots,xgoals_from_non_rebounds,xgoals_of_expected_rebounds,created_xgoals,created_xgoals_minus_actual_xgoals,shooting_talent_above_average,shooting_talent_adjusted_expected_goals,goals_above_shooting_talent,final_standing
0,Trent Hunter,F,600000,2010-11,0.0825,64300000,48300000,525000,NYI,17.0,...,0.1,0.068,1.9,0.5,2.5,0.4,-0.206,1.6,-0.6,27.0
1,Jason Chimera,F,1750000,2010-11,0.0825,64300000,48300000,525000,WSH,81.0,...,1.6,0.114,12.4,2.2,14.5,0.6,0.013,14.1,-4.1,2.0


### Preprocessing

In [4]:
# Reorder the columns by name using Python's default string sort
# (uppercase sorts before lowercase, so 'Unnamed: 0' ends up first).
ordered_columns = sorted(forwards.columns)
forwards = forwards.reindex(ordered_columns, axis='columns')

In [5]:
# Preview again to confirm the columns are now sorted by name.
forwards.head(2)

Unnamed: 0,assists,assists_per_60_minutes,contract_aav,created_xgoals,created_xgoals_minus_actual_xgoals,defensive_zone_giveaways,expected_goals,expected_goals_per_60_minutes,expected_pct_of_unblocked_shots_that_missed_net,expected_shooting_pct_on_unblocked_shots,...,shots_on_goal_per_60_minutes,shots_that_missed_net,shots_that_were_blocked,takeaways,team,upper_limit,xgoals_from_non_rebounds,xgoals_of_expected_rebounds,xgoals_on_rebounds_shots,xrebounds_created
0,3.0,0.84,600000,2.5,0.4,0.0,2.1,0.57,0.3,0.04,...,8.36,20.0,18.0,3.0,NYI,64300000,1.9,0.5,0.1,2.2
1,16.0,0.89,1750000,14.5,0.6,12.0,13.9,0.78,0.265,0.06,...,9.06,53.0,41.0,26.0,WSH,64300000,12.4,2.2,1.6,10.3


In [6]:
# Hold out the 2020-21 and 2021-22 seasons as the test set; every other
# season forms the training set.
holdout_seasons = ['2020-21', '2021-22']
in_holdout = forwards['season'].isin(holdout_seasons)

forwards_train = forwards[~in_holdout]
forwards_test = forwards[in_holdout]

---
## Modeling

In [7]:
# Columns excluded from the model inputs: the text columns (player, pos,
# team, season) plus the target itself.
# NOTE(review): 'Unnamed: 0' is not in this list, so it is still passed to
# the models as a feature -- confirm that is intended.
non_feature_cols = ['player', 'pos', 'team', 'season', 'contract_aav']

X_train = forwards_train.drop(columns=non_feature_cols)
X_test = forwards_test.drop(columns=non_feature_cols)

# Target: the contract_aav column.
y_train = forwards_train['contract_aav']
y_test = forwards_test['contract_aav']

In [8]:
# Learn the scaling statistics from the training features only, then apply
# that same transformation to both splits.
ss = StandardScaler().fit(X_train)

Xs_train = ss.transform(X_train)
Xs_test = ss.transform(X_test)

In [9]:
# Baseline model: plain linear regression (fit on the scaled features next).
lr = LinearRegression()

In [10]:
# Fit the baseline on the standardized training features.
lr.fit(Xs_train, y_train)

In [11]:
# Baseline score (R^2 for scikit-learn regressors) on the training data and
# on the held-out seasons, displayed as a (train, test) tuple.
lr.score(Xs_train, y_train), lr.score(Xs_test, y_test)

(0.8290299347610826, 0.7483061427411222)

In [12]:

# Scale -> PCA -> linear regression, with the number of principal
# components chosen by a 5-fold cross-validated grid search.
pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('pc', PCA()),
    ('lr', LinearRegression()),
])

# Candidate component counts: 1 through 39.
pgrid = {'pc__n_components': np.arange(1, 40)}

gs = GridSearchCV(estimator=pipe, param_grid=pgrid, cv=5, n_jobs=-1)

In [13]:
# Run the grid search on the unscaled training features; scaling happens
# inside the pipeline.
gs.fit(X_train, y_train)

In [14]:
# Score of the grid search's selected pipeline on the training data.
gs.score(X_train, y_train)

0.7978831994263967

In [15]:
# Score of the same selected pipeline on the held-out seasons.
gs.score(X_test, y_test)

0.7766054864657828

In [16]:
# Instantiation of models to test
# (each is passed to evaluation() as the final pipeline step; random seeds
# are supplied through the parameter grids rather than here)

# Linear Regression
lr = LinearRegression()

# Decision Tree Regressor
dt = DecisionTreeRegressor()

# Random Forest Regressor
rf = RandomForestRegressor()

# Extra Trees Regressor
et = ExtraTreesRegressor()

# Adaptive Boost Regressor
ada = AdaBoostRegressor()

# Extreme Gradient Boost Regressor
xgb = xg.XGBRegressor()

In [17]:
# Hyperparameter grids, one per candidate model. Keys use the
# '<pipeline step>__<parameter>' form, where the step names ('pc' plus the
# model's short name) match the pipeline built inside evaluation().

# Every grid searches the same PCA component counts: 1 through 39.
pca_components = np.arange(1, 40)

# Linear regression: only the PCA size is tuned.
lr_params = {'pc__n_components': pca_components}

# Decision tree
dt_params = {
    'pc__n_components': pca_components,
    'dt__max_depth': [3, 5],
    'dt__min_samples_split': [2, 5, 10],
    'dt__random_state': [42],
}

# Random forest
rf_params = {
    'pc__n_components': pca_components,
    'rf__max_depth': [3, 5],
    'rf__min_samples_split': [2, 5, 10],
    'rf__random_state': [42],
}

# Extra trees
et_params = {
    'pc__n_components': pca_components,
    'et__n_estimators': [50, 100],
    'et__max_depth': [3, 5],
    'et__min_samples_split': [2, 5, 10],
    'et__random_state': [42],
}

# AdaBoost
ada_params = {
    'pc__n_components': pca_components,
    'ada__n_estimators': [50, 100],
    'ada__learning_rate': [0.3, 0.6],
    'ada__random_state': [42],
}

# XGBoost
xgb_params = {
    'pc__n_components': pca_components,
    'xgb__n_estimators': [50, 100],
    'xgb__learning_rate': [0.3, 0.6],
    'xgb__random_state': [42],
}

In [18]:
def evaluation(X_train, y_train, X_test, y_test, model, model_name:str,
               model_params, scores_df=None):
    """Grid-search a scale -> PCA -> model pipeline and record its scores.

    Code adapted from Devin Faye, General Assembly.

    Parameters
    ----------
    X_train, y_train
        Training features and target used to fit the grid search.
    X_test, y_test
        Held-out features and target used for the test score and RMSE.
    model
        Unfitted estimator placed as the final step of the pipeline.
    model_name : str
        Name given to the final pipeline step, so model hyperparameters in
        ``model_params`` must be keyed as ``'<model_name>__<param>'``. It is
        also used as the row label in the scores table.
    model_params : dict
        Parameter grid handed to ``GridSearchCV``. The scaler step is named
        ``'ss'`` and the PCA step ``'pc'``.
    scores_df : pandas.DataFrame, optional
        Scores table returned by a previous call. When omitted, a new empty
        table is created. The new row is written into this object in place.

    Returns
    -------
    pandas.DataFrame
        The scores table with a row for ``model_name`` holding
        ``train_score``, ``test_score``, ``difference`` and ``rmse ($)``.
    """
    # First call: no table was passed in, so start a new one.
    if scores_df is None:
        scores_df = pd.DataFrame(columns = ['train_score', 'test_score', 'difference', 'rmse ($)'])

    # Scale the data, project it with PCA, then feed it to the chosen model.
    pipe = Pipeline([
        ('ss', StandardScaler()),
        ('pc', PCA(random_state=42)),
        (model_name, model)])

    # 5-fold cross-validated search over the supplied hyperparameter grid.
    gs = GridSearchCV(
        pipe,
        model_params,
        cv = 5, n_jobs=-1)
    gs.fit(X_train, y_train)

    print(f"The best parameters for the {model_name} model are: {gs.best_params_}")

    # Predictions from the best pipeline found by the search.
    preds = gs.predict(X_test)

    # Scores are rounded first; `difference` is the gap between the rounded
    # values, so it always matches what the table displays.
    train_score = round(gs.score(X_train, y_train), 2)
    test_score = round(gs.score(X_test, y_test), 2)
    difference = round(train_score - test_score, 2)

    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = int(np.sqrt(mean_squared_error(y_test, preds)))

    # Add (or overwrite) this model's row in the scores table.
    scores_df.loc[model_name,:] = [train_score, test_score, difference, rmse]

    print('')
    print(scores_df)

    return scores_df

In [19]:
# Linear regression: first call, so evaluation() creates a new scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=lr, model_name='lr', model_params=lr_params,
)

The best parameters for the lr model are: {'pc__n_components': 37}

   train_score test_score difference rmse ($)
lr         0.8       0.78       0.02  1257932


In [20]:
# Decision tree: add its row to the existing scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=dt, model_name='dt', model_params=dt_params,
    scores_df=scores,
)

The best parameters for the dt model are: {'dt__max_depth': 3, 'dt__min_samples_split': 10, 'dt__random_state': 42, 'pc__n_components': 1}

   train_score test_score difference rmse ($)
lr         0.8       0.78       0.02  1257932
dt        0.71        0.6       0.11  1680290


In [21]:
# Random forest: add its row to the existing scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=rf, model_name='rf', model_params=rf_params,
    scores_df=scores,
)

The best parameters for the rf model are: {'pc__n_components': 38, 'rf__max_depth': 5, 'rf__min_samples_split': 2, 'rf__random_state': 42}

   train_score test_score difference rmse ($)
lr         0.8       0.78       0.02  1257932
dt        0.71        0.6       0.11  1680290
rf         0.9       0.69       0.21  1476799


In [22]:
# Extra trees: add its row to the existing scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=et, model_name='et', model_params=et_params,
    scores_df=scores,
)

The best parameters for the et model are: {'et__max_depth': 5, 'et__min_samples_split': 2, 'et__n_estimators': 50, 'et__random_state': 42, 'pc__n_components': 20}

   train_score test_score difference rmse ($)
lr         0.8       0.78       0.02  1257932
dt        0.71        0.6       0.11  1680290
rf         0.9       0.69       0.21  1476799
et        0.79       0.63       0.16  1618320


In [23]:
# AdaBoost: add its row to the existing scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=ada, model_name='ada', model_params=ada_params,
    scores_df=scores,
)

The best parameters for the ada model are: {'ada__learning_rate': 0.3, 'ada__n_estimators': 50, 'ada__random_state': 42, 'pc__n_components': 36}

    train_score test_score difference rmse ($)
lr          0.8       0.78       0.02  1257932
dt         0.71        0.6       0.11  1680290
rf          0.9       0.69       0.21  1476799
et         0.79       0.63       0.16  1618320
ada        0.84       0.69       0.15  1478093


In [24]:
# XGBoost: add its row to the existing scores table.
scores = evaluation(
    X_train, y_train, X_test, y_test,
    model=xgb, model_name='xgb', model_params=xgb_params,
    scores_df=scores,
)

The best parameters for the xgb model are: {'pc__n_components': 17, 'xgb__learning_rate': 0.3, 'xgb__n_estimators': 100, 'xgb__random_state': 42}

    train_score test_score difference rmse ($)
lr          0.8       0.78       0.02  1257932
dt         0.71        0.6       0.11  1680290
rf          0.9       0.69       0.21  1476799
et         0.79       0.63       0.16  1618320
ada        0.84       0.69       0.15  1478093
xgb         1.0       0.61       0.39  1660214


In [25]:
# Final model: the scale -> PCA -> linear regression pipeline, rebuilt with
# n_components=37, the value the grid search reported as best for 'lr'.
lr_pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('pc', PCA(n_components=37, random_state=42)),
    ('lr', LinearRegression()),
])

In [26]:
# Fit the final pipeline on the training data.
lr_pipe.fit(X_train, y_train)

In [27]:
# Predict contract_aav for the held-out seasons; used for the RMSE report.
preds = lr_pipe.predict(X_test)

In [28]:
# Score the final pipeline once per split and reuse the values in the
# printed report.
train_r2 = lr_pipe.score(X_train, y_train)
test_r2 = lr_pipe.score(X_test, y_test)

# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
test_rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"Training score: {round(train_r2, 2)}")
print(f"Test score: {round(test_r2, 2)}")
print(f"Difference: {round(train_r2 - test_r2, 2)}")
print(f"RMSE: ${int(test_rmse)}")

Training score: 0.8
Test score: 0.78
Difference: 0.02
RMSE: $1257932


In [29]:
# Save the fitted pipeline (scaler + PCA + regression) under the
# salary-predictor app's models folder.
model_path = '../apps/nhl-salary-predictor/models/forwards_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(lr_pipe, model_file)