In [2]:
import pandas as pd
import numpy as np

import plotly.express as px

In [19]:
# Load the raw player table from the CSV file (path is relative to the working directory).
csv_path = 'data/NBA_Player_Dataset-2013-2021.csv'
df = pd.read_csv(csv_path)

In [175]:
# Drop the rows whose Year is 2013, 2014 or 2015; later cells only see the rest.
# Boolean-mask filtering keeps the original row labels (the index is not renumbered).
excluded_years = [2013, 2014, 2015]
keep_row = ~df['Year'].isin(excluded_years)
df = df[keep_row]

In [176]:
# Inspect the column names of the filtered frame.
df.columns

Index(['Year', 'FULL NAME', 'TEAM', 'POS', 'AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'ORTG', 'DRTG',
       'TO_100_Games', 'Salary', 'Cap Maximum', 'Salary%OfCap', 'Traded',
       'Center', 'Forward', 'Guard'],
      dtype='object')

In [177]:
# Wide, 23-column candidate feature set.
# NOTE(review): the next cell rebinds x_to_scale to a shorter column list
# before anything reads this one, so this selection currently has no effect
# on the models below.
wide_feature_cols = [
    'Year', 'AGE', 'GP', 'MPG', 'MIN%', 'USG%',
    'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%',
    'eFG%', 'TS%', 'PPG', 'RPG', 'TRB%',
    'APG', 'AST%', 'SPG', 'BPG', 'VI', 'TO_100_Games',
]
x_to_scale = df[wide_feature_cols]

In [178]:
# The nine columns that are standardized below and fed to every model.
scaled_feature_cols = [
    'Year', 'AGE', 'GP', 'MPG',
    'PPG', 'RPG', 'APG', 'SPG', 'BPG',
]
x_to_scale = df[scaled_feature_cols]

In [179]:
from sklearn.preprocessing import StandardScaler

# Standardize each selected column (StandardScaler defaults: subtract the
# column mean, divide by the column standard deviation).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-row statistics influence the scaling of the training
# data.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_to_scale)

In [180]:
# Wrap the scaled values back into a DataFrame, restoring both the column
# names and the row labels from x_to_scale.
# Keeping the row labels matters: a later cell joins this frame with columns
# taken from df via pd.concat(axis=1), which aligns rows on the index.  df
# still carries its pre-filter labels, so a fresh 0..n-1 index here would
# only line up if the filtered-out rows happened to be the trailing ones;
# otherwise rows would be paired incorrectly or padded with NaN.
x_scaled_df = pd.DataFrame(
    x_scaled,
    columns=x_to_scale.columns,
    index=x_to_scale.index,
)

In [181]:
# Display the standardized feature frame (pandas abbreviates the middle rows of a long frame).
x_scaled_df

Unnamed: 0,Year,AGE,GP,MPG,PPG,RPG,APG,SPG,BPG
0,1.416741,-0.950553,0.500433,0.372099,-0.031143,1.287571,-0.442265,-0.229664,0.333104
1,1.416741,0.513794,0.787126,0.745341,-0.220451,2.494507,0.774872,0.520014,0.874509
2,1.416741,-0.433166,-0.072954,1.459831,1.641086,2.735894,0.885521,2.261201,1.046774
3,1.416741,2.412462,-0.031997,0.350771,0.773420,0.804797,-0.552914,-0.737510,1.538961
4,1.416741,-0.701352,0.418521,-0.353055,-0.228339,-0.522832,0.000330,-0.495678,-0.294434
...,...,...,...,...,...,...,...,...,...
2898,-1.511352,-0.827139,0.008959,0.276123,-0.039030,-0.663641,0.608899,0.096809,-0.491308
2899,-1.511352,-1.301806,0.049915,1.886393,1.625310,-0.080289,0.608899,0.616747,-0.454394
2900,-1.511352,2.020862,1.114776,0.532060,0.868075,1.850808,-0.110318,-0.229664,-0.626660
2901,-1.511352,1.308861,0.991907,-0.150438,-0.393985,0.925490,0.000330,0.544197,0.185448


In [182]:
# Feature matrix: the standardized columns side by side with the unscaled
# Center / Forward / Guard columns (presumably position indicator flags).
# pd.concat(axis=1) matches rows by index label, not by position.
position_cols = df[['Center', 'Forward', 'Guard']]
X = pd.concat([x_scaled_df, position_cols], axis=1)

# Regression target.
y = df['Salary']

In [184]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=18,
)

In [185]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters; only the seed is pinned.
# fit() returns the estimator itself, so construction and training are chained.
regr = RandomForestRegressor(random_state=46).fit(X_train, y_train)

# Predictions for the held-out rows; the metrics cell below reads y_pred.
y_pred = regr.predict(X_test)

In [186]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Score the current y_pred (the random-forest predictions when cells run in
# order) against the held-out targets.
# multioutput='raw_values' returns one value per output, so the MSE is a
# one-element array and prints inside brackets.
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_test, y_pred)
# scikit-learn reports MAPE as a fraction, not a percentage (1.0 == 100%).
mape = mean_absolute_percentage_error(y_test, y_pred)

for metric_name, metric_value in (('MSE', mse), ('MAE', mae), ('MAPE', mape)):
    print(metric_name, metric_value)

MSE [2.17682048e+13]
MAE 3042986.9583333335
MAPE 2.7768948060930496


In [187]:
# Pair each held-out target with its prediction, row by row (positional
# pairing; the resulting frame gets a fresh 0..n-1 index).
results = pd.DataFrame(
    list(zip(y_test, y_pred)),
    columns=['Y True', 'Y Tree Pred'],
)
# Absolute error per row.
results['Difference'] = (results['Y True'] - results['Y Tree Pred']).abs()
results

Unnamed: 0,Y True,Y Tree Pred,Difference
0,5195501.0,27647871.21,22452370.21
1,2036280.0,2985280.03,949000.03
2,2159029.0,22950936.39,20791907.39
3,8000000.0,9762810.09,1762810.09
4,2299080.0,6866305.86,4567225.86
...,...,...,...
721,1015695.0,4141835.74,3126140.74
722,1471382.0,6819531.91,5348149.91
723,2412840.0,2323592.46,89247.54
724,1663861.0,1944771.17,280910.17


# AdaBoost

In [188]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor capped at 10 estimators, with a pinned seed.
# NOTE: this rebinds regr and y_pred, replacing the random-forest model and
# its predictions from the earlier cells.
regr = AdaBoostRegressor(n_estimators=10, random_state=7).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = regr.predict(X_test)

In [189]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Score the current y_pred (the AdaBoost predictions when cells run in order)
# against the held-out targets.
# multioutput='raw_values' makes the MSE a one-element array, hence the
# brackets in the printed value.
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_test, y_pred)
# scikit-learn reports MAPE as a fraction, not a percentage (1.0 == 100%).
mape = mean_absolute_percentage_error(y_test, y_pred)

for metric_name, metric_value in (('MSE', mse), ('MAE', mae), ('MAPE', mape)):
    print(metric_name, metric_value)

MSE [2.7756361e+13]
MAE 3995238.0297388467
MAPE 5.449896538121396


# Linear Regression

In [190]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predictions for the held-out rows (overwrites the previous model's y_pred).
y_pred = reg.predict(X_test)

In [191]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Score the current y_pred (the linear-regression predictions when cells run
# in order) against the held-out targets.
# multioutput='raw_values' makes the MSE a one-element array, hence the
# brackets in the printed value.
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_test, y_pred)
# scikit-learn reports MAPE as a fraction, not a percentage (1.0 == 100%).
mape = mean_absolute_percentage_error(y_test, y_pred)

for metric_name, metric_value in (('MSE', mse), ('MAE', mae), ('MAPE', mape)):
    print(metric_name, metric_value)

MSE [2.9692816e+13]
MAE 4029608.7170335124
MAPE 4.3980597004847555


# KNN

In [194]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using the 5 closest training rows.
neigh = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Predictions for the held-out rows (overwrites the previous model's y_pred).
y_pred = neigh.predict(X_test)

In [195]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Score the current y_pred (the KNN predictions when cells run in order)
# against the held-out targets.
# multioutput='raw_values' makes the MSE a one-element array, hence the
# brackets in the printed value.
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
mae = mean_absolute_error(y_test, y_pred)
# scikit-learn reports MAPE as a fraction, not a percentage (1.0 == 100%).
mape = mean_absolute_percentage_error(y_test, y_pred)

for metric_name, metric_value in (('MSE', mse), ('MAE', mae), ('MAPE', mape)):
    print(metric_name, metric_value)

MSE [2.35549529e+13]
MAE 3233834.6231404953
MAPE 2.4311523210793218
