Author: John Paul Dillard Jr.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pickle

In [2]:
data_cleaned = pd.read_csv('Data_Science_NBA_Project/Datasets/preprocessed_nba_dataset(w_years_of_exp).csv')
data_cleaned[:3]

Unnamed: 0.1,Unnamed: 0,SalStartYr,Salary,GS,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,years_of_exp,salary_cap,Per_of_Salary_Cap
0,7454,2001,5062500.0,12.0,14.5,1.5,3.8,0.387,0.1,0.3,...,2.0,0.8,0.5,0.4,1.2,1.9,3.8,4,42500000,0.119118
1,7455,2001,11250000.0,81.0,40.0,7.5,15.8,0.472,0.1,0.8,...,9.1,3.1,1.1,1.0,2.9,2.9,20.5,5,42500000,0.264706
2,7456,2001,1995741.0,0.0,8.7,0.7,2.2,0.321,0.2,0.6,...,1.0,1.4,0.6,0.0,1.0,1.1,2.0,6,42500000,0.046959


In [6]:
cols = set(data_cleaned.columns)
cols.remove('salary_cap')
cols.remove('SalStartYr')
cols.remove('Salary')

data_cleaned = data_cleaned[cols]
data_cleaned[:3]

Unnamed: 0,3P%,TRB,TOV,BLK,3P,STL,PTS,FT,Per_of_Salary_Cap,years_of_exp,...,DRB,ORB,GS,2P,2P%,PF,3PA,FG,FT%,MP
0,0.4,2.0,1.2,0.4,0.1,0.5,3.8,0.7,0.119118,4,...,1.6,0.5,12.0,1.3,0.386,1.9,0.3,1.5,0.583,14.5
1,0.188,9.1,2.9,1.0,0.1,1.1,20.5,5.5,0.264706,5,...,6.9,2.2,81.0,7.3,0.487,2.9,0.8,7.5,0.834,40.0
2,0.25,1.0,1.0,0.0,0.2,0.6,2.0,0.5,0.046959,6,...,1.0,0.0,0.0,0.5,0.35,1.1,0.6,0.7,0.667,8.7


In [8]:
data_cleaned[:3]

Unnamed: 0.1,Unnamed: 0,SalStartYr,Salary,GS,MP,FG,FGA,FG%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,years_of_exp,salary_cap,Per_of_Salary_Cap
0,7454,2001,5062500.0,12.0,14.5,1.5,3.8,0.387,0.1,0.3,...,2.0,0.8,0.5,0.4,1.2,1.9,3.8,4,42500000,0.119118
1,7455,2001,11250000.0,81.0,40.0,7.5,15.8,0.472,0.1,0.8,...,9.1,3.1,1.1,1.0,2.9,2.9,20.5,5,42500000,0.264706
2,7456,2001,1995741.0,0.0,8.7,0.7,2.2,0.321,0.2,0.6,...,1.0,1.4,0.6,0.0,1.0,1.1,2.0,6,42500000,0.046959


In [15]:
#extract input features and output vector
X = data_cleaned.loc[:,data_cleaned.columns != 'Per_of_Salary_Cap'].to_numpy()
y = data_cleaned['Per_of_Salary_Cap'].to_numpy()
print(X.shape)

(6222, 29)


In [16]:
#split into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .2, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.2, shuffle = True)

In [12]:
lin_model_file_names = ['poly_model_deg1.sav', 
                        'ploy_model_deg2.sav', 
                        'ploy_model_deg3.sav', 
                        'poly_model_deg4.sav']

lin_models = [SVR(kernel='poly', degree=1), 
          SVR(kernel='poly', degree=2), 
          SVR(kernel='poly', degree=3), 
          SVR(kernel='poly', degree=4)]

In [20]:
#train models of different polynomial degrees
models = []
for deg in range(1, 4):
    model = SVR(kernel='poly', degree=deg)
    model.fit(X_train, y_train)
    models.append(model) #save for later
    
    y_train_predict = model.predict(X_train)
    y_val_predict = model.predict(X_val)
    rmse_train = mean_squared_error(y_train, y_train_predict, squared=False)
    rmse_val = mean_squared_error(y_val, y_val_predict, squared=False)
    
    print(f"Degree {deg} model train score: {model.score(X_train, y_train)}")
    print(f"Degree {deg} model val score: {model.score(X_val, y_val)}")
    print(f"Degree {deg} model train RMSE: {rmse_train}")
    print(f"Degree {deg} model validation RMSE: {rmse_val}\n")

Degree 1 model train score: 0.6617301834041465
Degree 1 model val score: 0.6933101577608418
Degree 1 model train RMSE: 0.05080218860072337
Degree 1 model validation RMSE: 0.04897892631927916

Degree 2 model train score: 0.49173873718859973
Degree 2 model val score: 0.5229220248588454
Degree 2 model train RMSE: 0.062272213619184384
Degree 2 model validation RMSE: 0.061087766713653936

Degree 3 model train score: 0.3084603215436841
Degree 3 model val score: 0.33454098256861964
Degree 3 model train RMSE: 0.07263724274157198
Degree 3 model validation RMSE: 0.07214732306135732



Degree 1 best 

scores: R^2 score = 0.69 

Validation RMSE = 0.049

In [24]:
#evaluate model on test data
best_model_index = 0
best_model = models[best_model_index]
y_test_predict = best_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_test_predict, squared=False)
    
print(f"Degree {best_model_index+1} model test score: {best_model.score(X_test, y_test)}")
print(f"Degree {best_model_index+1} model test RMSE: {rmse_test}")

Degree 1 model test score: 0.674348454112879
Degree 1 model test RMSE: 0.05212519255603451
