In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso 
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR 
from sklearn.preprocessing import PolynomialFeatures

In [4]:
df = pd.read_csv('nba_salaries.csv')

df.head(10)

Unnamed: 0.1,Unnamed: 0,Player Name,Salary,Position,Age,Team,GP,GS,MP,FG,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional
0,0,Stephen Curry,48070014,PG,34,GSW,56,56,34.7,10.0,...,0.7,5.4,6.1,6.3,0.9,0.4,3.2,2.1,29.4,curryst01
1,1,John Wall,47345760,PG,32,LAC,34,3,22.2,4.1,...,0.4,2.3,2.7,5.2,0.8,0.4,2.4,1.7,11.4,walljo01
2,2,Russell Westbrook,47080179,PG,34,LAL/LAC,73,24,29.1,5.9,...,1.2,4.6,5.8,7.5,1.0,0.5,3.5,2.2,15.9,westbru01
3,3,LeBron James,44474988,PF,38,LAL,55,54,35.5,11.1,...,1.2,7.1,8.3,6.8,0.9,0.6,3.2,1.6,28.9,jamesle01
4,4,Kevin Durant,44119845,PF,34,BRK/PHO,47,47,35.6,10.3,...,0.4,6.3,6.7,5.0,0.7,1.4,3.3,2.1,29.1,duranke01
5,5,Bradley Beal,43279250,SG,29,WAS,50,50,33.5,8.9,...,0.8,3.1,3.9,5.4,0.9,0.7,2.9,2.1,23.2,bealbr01
6,6,Kawhi Leonard,42492492,SF,31,LAC,52,50,33.6,8.6,...,1.1,5.4,6.5,3.9,1.4,0.5,1.7,1.6,23.8,leonaka01
7,7,Paul George,42492492,SF,32,LAC,56,56,34.6,8.2,...,0.8,5.3,6.1,5.1,1.5,0.4,3.1,2.8,23.8,georgpa01
8,8,Giannis Antetokounmpo,42492492,PF,28,MIL,63,63,32.1,11.2,...,2.2,9.6,11.8,5.7,0.8,0.8,3.9,3.1,31.1,antetgi01
9,9,Damian Lillard,42492492,PG,32,POR,58,58,36.3,9.6,...,0.8,4.0,4.8,7.3,0.9,0.3,3.3,1.9,32.2,lillada01


In [5]:
df.drop(df.columns[0], axis=1, inplace=True)
df

Unnamed: 0,Player Name,Salary,Position,Age,Team,GP,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Player-additional
0,Stephen Curry,48070014,PG,34,GSW,56,56,34.7,10.0,20.2,...,0.7,5.4,6.1,6.3,0.9,0.4,3.2,2.1,29.4,curryst01
1,John Wall,47345760,PG,32,LAC,34,3,22.2,4.1,9.9,...,0.4,2.3,2.7,5.2,0.8,0.4,2.4,1.7,11.4,walljo01
2,Russell Westbrook,47080179,PG,34,LAL/LAC,73,24,29.1,5.9,13.6,...,1.2,4.6,5.8,7.5,1.0,0.5,3.5,2.2,15.9,westbru01
3,LeBron James,44474988,PF,38,LAL,55,54,35.5,11.1,22.2,...,1.2,7.1,8.3,6.8,0.9,0.6,3.2,1.6,28.9,jamesle01
4,Kevin Durant,44119845,PF,34,BRK/PHO,47,47,35.6,10.3,18.3,...,0.4,6.3,6.7,5.0,0.7,1.4,3.3,2.1,29.1,duranke01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,Justin Minaya,35096,SF,23,POR,4,0,22.3,1.8,5.8,...,0.8,3.0,3.8,1.0,0.5,1.3,1.0,2.3,4.3,minayju01
463,Kobi Simmons,32795,SG,25,CHO,5,0,5.6,0.2,1.2,...,0.2,0.6,0.8,1.0,0.0,0.4,0.2,0.0,1.0,simmoko01
464,Gabe York,32171,SG,29,IND,3,0,18.7,2.7,7.0,...,0.0,2.0,2.0,1.7,0.7,0.0,0.0,1.7,8.0,yorkga01
465,RaiQuan Gray,5849,PF,23,BRK,1,0,35.0,6.0,12.0,...,3.0,6.0,9.0,7.0,0.0,1.0,4.0,5.0,16.0,grayra01


In [6]:
important_num_cols = list(df.corr()["Salary"][(df.corr()["Salary"]>0.50) | 
                                              (df.corr()["Salary"]<-0.50)].index)
cat_cols = ["Position", "Team"]
important_cols = important_num_cols + cat_cols

df = df[important_cols]

In [7]:
X = df.drop("Salary", axis=1)
y = df["Salary"]

In [8]:
X = pd.get_dummies(X, columns=cat_cols)

In [9]:
important_num_cols.remove("Salary")

scaler = StandardScaler()
X[important_num_cols] = scaler.fit_transform(X[important_num_cols])

In [10]:
X.head()

Unnamed: 0,GS,MP,FG,FGA,2P,2PA,FT,FTA,DRB,TRB,...,Team_POR,Team_SAC,Team_SAS,Team_SAS/MIL,Team_SAS/PHI,Team_SAS/TOR,Team_TOR,Team_UTA,Team_WAS,Team_WAS/TOR
0,1.232158,1.554622,2.707967,2.608452,1.380525,1.254321,2.019667,1.663713,1.604721,1.130666,...,0,0,0,0,0,0,0,0,0,0
1,-0.726051,0.244137,0.304907,0.554745,0.374117,0.665666,0.551725,0.766294,-0.212231,-0.364023,...,0,0,0,0,0,0,0,0,0,0
2,0.049843,0.967525,1.038044,1.292485,1.179243,1.506602,0.870843,1.294188,1.13583,0.998782,...,0,0,0,0,0,0,0,0,0,0
3,1.158263,1.638493,3.155995,3.00723,3.2927,3.076349,2.019667,2.138818,2.601114,2.097818,...,0,0,0,0,0,0,0,0,0,0
4,0.899632,1.648977,2.830156,2.229613,2.990778,2.543756,3.232314,2.77229,2.132223,1.394435,...,0,0,0,0,0,0,0,0,0,0


In [11]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X, y, 
                                     scoring='neg_mean_squared_error', cv=5)).mean()
    return rmse

In [13]:
def evaluation(y, predictions):
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    r_squared = r2_score(y, predictions)
    return mae, mse, rmse, r_squared

In [38]:
models = pd.DataFrame(columns=["Model","MAE","MSE","RMSE","R2 Score",
                               "RMSE (Cross-Validation)"])

In [39]:
#Linear Regression 

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_test)

In [40]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R2 Score: ", r_squared)
rmse_cross_val = rmse_cv(lin_reg)
print("RMSE Cross-Validation: ", rmse_cross_val)

MAE:  5.181730702534094e+16
MSE:  5.206999316741946e+34
RMSE:  2.281885035829357e+17
R2 Score:  -3.4659042156948875e+20
RMSE Cross-Validation:  1.5071038827116344e+18


In [41]:
import warnings
warnings.filterwarnings("ignore")

new_row = {"Model": "LinearRegression", "MAE": mae, "MSE": mse, "RMSE": rmse, 
           "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}

models = models.append(new_row, ignore_index = True)


In [42]:
models

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,RMSE (Cross-Validation)
0,LinearRegression,5.181731e+16,5.206999e+34,2.281885e+17,-3.465904e+20,1.507104e+18


In [46]:
#Ridge Regression

ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)

In [47]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2 Score:', r_squared)

MAE: 6065315.326268968
MSE: 67687243280704.54
RMSE: 8227225.734152706
R2 Score: 0.54945739846544


In [48]:
rmse_cross_val = rmse_cv(ridge)
print("RMSE Cross-Validation:", rmse_cross_val)

RMSE Cross-Validation: 9682975.570331499


In [55]:
new_row = {"Model": "Ridge", "MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared,
          "RMSE (Cross-Validation)": rmse_cross_val}

models = models.append(new_row, ignore_index=True)

In [58]:
models

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,RMSE (Cross-Validation)
0,LinearRegression,5.181731e+16,5.206999e+34,2.281885e+17,-3.465904e+20,1.507104e+18
2,Ridge,6065315.0,67687240000000.0,8227226.0,0.5494574,9682976.0


In [59]:
#Lasso Regression 

lasso = Lasso()
lasso.fit(X_train,y_train)
predictions = lasso.predict(X_test)

In [60]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2 Score:', r_squared)

MAE: 6315622.805370234
MSE: 72010334842948.05
RMSE: 8485890.33884766
R2 Score: 0.5206818593132858


In [62]:
rmse_cross_val = rmse_cv(lasso)
print("RMSE Cross-Validation:", rmse_cross_val)

RMSE Cross-Validation: 9864361.150228053


In [61]:
new_row = {'Model': 'Lasso', 'MAE': mae, 'MSE': mse,  "RMSE": rmse, "R2 Score": r_squared, 
           "RMSE (Cross-Validation)": rmse_cross_val}

models = models.append(new_row, ignore_index=True)

In [65]:
#Elastic Net

elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
predictions = elastic_net.predict(X_test)

In [66]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2 Score:', r_squared)

MAE: 5696216.952141008
MSE: 65437620398827.74
RMSE: 8089352.285494045
R2 Score: 0.5644314304476432


In [67]:
rmse_cross_val = rmse_cv(elastic_net)
print('RMSE Cross-Validation:', rmse_cross_val)

RMSE Cross-Validation: 9067344.650853463


In [70]:
new_row = {'Model': 'ElasticNet', 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2 Score': r_squared,
          'RMSE (Cross-Validation)': rmse_cross_val}

models = models.append(new_row, ignore_index=True)

In [71]:
#Support Vector Machines

svr = SVR(C=100000)
svr.fit(X_train,y_train)
predictions = svr.predict(X_test)

In [72]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print('MAE:', mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

MAE: 7592555.055915048
MSE: 159321487361874.56
RMSE: 12622261.578729644
R2 Score: -0.06048221078663696


In [73]:
rmse_cross_val = rmse_cv(svr)
print("RMSE Cross-Validation:", rmse_cross_val)

RMSE Cross-Validation: 8018215.829990846


In [74]:
new_row = {"Model": "SVR","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, 
           "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

In [75]:
#Random Forest

random_forest = RandomForestRegressor(n_estimators = 100)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)

In [78]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

MAE: 5464042.577354838
MSE: 68366929625661.11
RMSE: 8268429.69527232
R2 Score: 0.5449332423727727


In [80]:
rmse_cross_val = rmse_cv(random_forest)
print("RMSE Cross-Validation:", rmse_cross_val)

RMSE Cross-Validation: 9451277.720829101


In [81]:
new_row = {"Model": "RandomForestRegressor","MAE": mae, "MSE": mse, "RMSE": rmse, 
           "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

In [84]:
#Polynomial Regression

poly_reg = PolynomialFeatures(degree=2)
X_train_2d = poly_reg.fit_transform(X_train)
X_test_2d = poly_reg.transform(X_test)

In [85]:
lin_reg = LinearRegression()
lin_reg.fit(X_train_2d, y_train)
predictions = lin_reg.predict(X_test_2d)

In [87]:
mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)

MAE: 8756908.93036932
MSE: 153411478117009.66
RMSE: 12385938.725708667
R2 Score: -0.021143765147297655


In [88]:
rmse_cross_val = rmse_cv(lin_reg)
print("RMSE Cross-Validation:", rmse_cross_val)

RMSE Cross-Validation: 1.5071038827116344e+18


In [89]:
new_row = {"Model": "Polynomial Regression (degree=2)","MAE": mae, "MSE": mse, "RMSE": rmse, 
           "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

In [90]:
models.sort_values(by='RMSE (Cross-Validation)')

Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,RMSE (Cross-Validation)
4,SVR,7592555.0,159321500000000.0,12622260.0,-0.06048221,8018216.0
3,ElasticNet,5696217.0,65437620000000.0,8089352.0,0.5644314,9067345.0
5,RandomForestRegressor,5464043.0,68366930000000.0,8268430.0,0.5449332,9451278.0
1,Ridge,6065315.0,67687240000000.0,8227226.0,0.5494574,9682976.0
2,Lasso,6315623.0,72010330000000.0,8485890.0,0.5206819,9682976.0
0,LinearRegression,5.181731e+16,5.206999e+34,2.281885e+17,-3.465904e+20,1.507104e+18
6,Polynomial Regression (degree=2),8756909.0,153411500000000.0,12385940.0,-0.02114377,1.507104e+18


In [91]:
#ElesticNet and RandomForest are good models, because they have low RMSE and high r2 score