Author: John Paul Dillard Jr.

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [2]:
data_cleaned = pd.read_csv('Data_Science_NBA_Project/Datasets/preprocessed_nba_dataset(w_years_of_exp).csv')
data_cleaned[:3]

Unnamed: 0.1,Unnamed: 0,SalStartYr,Salary,GS,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,years_of_exp,salary_cap,Per_of_Salary_Cap
0,7454,2001,5062500.0,12.0,14.5,1.5,3.8,0.387,0.1,0.3,...,2.0,0.8,0.5,0.4,1.2,1.9,3.8,4,42500000,0.119118
1,7455,2001,11250000.0,81.0,40.0,7.5,15.8,0.472,0.1,0.8,...,9.1,3.1,1.1,1.0,2.9,2.9,20.5,5,42500000,0.264706
2,7456,2001,1995741.0,0.0,8.7,0.7,2.2,0.321,0.2,0.6,...,1.0,1.4,0.6,0.0,1.0,1.1,2.0,6,42500000,0.046959


In [3]:
#remove salary information (predicting percent of salary)

cols = set(data_cleaned.columns)
cols.remove('salary_cap')
cols.remove('Salary')

data_cleaned = data_cleaned[cols]
data_cleaned[:3]

Unnamed: 0.1,eFG%,3P%,ORB,FG,Per_of_Salary_Cap,BLK,FT%,MP,3P,years_of_exp,...,FT,PF,SalStartYr,3PA,Unnamed: 0,GS,STL,FTA,PTS,2P%
0,0.405,0.4,0.5,1.5,0.119118,0.4,0.583,14.5,0.1,4,...,0.7,1.9,2001,0.3,7454,12.0,0.5,1.2,3.8,0.386
1,0.477,0.188,2.2,7.5,0.264706,1.0,0.834,40.0,0.1,5,...,5.5,2.9,2001,0.8,7455,81.0,1.1,6.6,20.5,0.487
2,0.357,0.25,0.0,0.7,0.046959,0.0,0.667,8.7,0.2,6,...,0.5,1.1,2001,0.6,7456,0.0,0.6,0.7,2.0,0.35


In [4]:
#extract input features and output vector

X = data_cleaned.loc[:,data_cleaned.columns != 'Per_of_Salary_Cap'].to_numpy()
y = data_cleaned['Per_of_Salary_Cap'].to_numpy()
print(X.shape)

(6222, 27)


In [5]:
#split into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .2, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.2, shuffle = True)

In [11]:
models = []
for i in range(1, 4):
    #fit models
    reg_model = Pipeline([('poly',PolynomialFeatures(i)), ('reg', LinearRegression())])
    reg_model.fit(X_train, y_train)
    models.append(reg_model)
    #evaluate training and testing RMSE
    
    y_train_predict = reg_model.predict(X_train)
    y_val_predict = reg_model.predict(X_val)
    rmse_train = mean_squared_error(y_train, y_train_predict, squared=False)
    rmse_val = mean_squared_error(y_val, y_val_predict, squared=False)
    
    print(f"Degree {i} model train score: {reg_model.score(X_train, y_train)}")
    print(f"Degree {i} model val score: {reg_model.score(X_val, y_val)}")
    print(f"Degree {i} model train RMSE: {rmse_train}")
    print(f"Degree {i} model validation RMSE: {rmse_val}\n")

Degree 1 model train score: 0.5960087037315088
Degree 1 model val score: 0.5979843298075231
Degree 1 model train RMSE: 0.056719128859230764
Degree 1 model validation RMSE: 0.05249910483691516

Degree 2 model train score: 0.7183578693819948
Degree 2 model val score: 0.627429066736432
Degree 2 model train RMSE: 0.047357891500397604
Degree 2 model validation RMSE: 0.05053995981201347

Degree 3 model train score: 0.9205229779829377
Degree 3 model val score: -10.899377747127104
Degree 3 model train RMSE: 0.02515731806335726
Degree 3 model validation RMSE: 0.28562257695259063



Degree 3 overfits the training data.  Best Model: Polynomial degree 2

In [12]:
#evaluate model on test data
best_model_index = 1
best_model = models[best_model_index]
y_test_predict = best_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_test_predict, squared=False)
    
print(f"Degree {best_model_index+1} model test score: {best_model.score(X_test, y_test)}")
print(f"Degree {best_model_index+1} model test RMSE: {rmse_test}")

Degree 2 model test score: 0.6667963983933376
Degree 2 model test RMSE: 0.051781988608136945
