# PREDICTING NBA SALARIES USING MACHINE LEARNING

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load both input tables in one statement:
#   nba       - NBA players stats 1950-2017   (NBA.csv)
#   salaryCap - NBA annual salary cap data    (SalaryCap.csv)
nba, salaryCap = (pd.read_csv(csv_name) for csv_name in ('NBA.csv', 'SalaryCap.csv'))

### DATA CLEANING

We will drop unknown columns and rows with NaN values, remove rows for seasons before 2001 and players with unknown salaries, and rename columns so they are easier to understand.

In [3]:
# Clean the raw stats table in a single chain; each step returns a new frame.
nba = (
    nba
    # Remove the two placeholder columns.
    .drop(columns=['blanl', 'blank2'])
    # Remove the last two rows of the table (selected by their index labels).
    .pipe(lambda df: df.drop(index=df.tail(2).index))
    # Keep rows that have a salary and whose season start is 2001 or later.
    .loc[lambda df: df[' Player Salary in $ '].notna() & df['Season Start'].ge(2001)]
    # Replace abbreviated headers with descriptive column names.
    .rename(columns={
        "Pos": "Position",
        "Tm": "Team",
        "G": "Games Played",
        "GS": "Games Started",
        " Player Salary in $ ": "Salary",
        "Season Start": "Season",
        "MP": "Minutes Played",
    })
)

We will also remove the dollar sign and the thousands separators (commas) from the salary column, convert it to a numeric type, and convert percentages to decimals.

In [4]:
# Strip the thousands separators and the dollar sign, then store salaries as floats.
# regex=False forces a literal match: interpreted as a regular expression, '$' is
# the end-of-string anchor, and the default of `regex` differs between pandas versions.
nba['Salary'] = (
    nba['Salary']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

# Columns whose name ends in '%' and that hold text are converted to fractions:
# the '%' character is removed, the text is parsed as float and divided by 100.
# Percentage columns that are not of object dtype are left untouched.
for c in nba.columns:
    # str(...).endswith also copes with an empty or non-string column label,
    # where indexing the last character would raise.
    if str(c).endswith('%') and nba[c].dtype == object:
        nba[c] = nba[c].str.replace('%', '', regex=False).astype(float) / 100.0

Since players who have played fewer than a certain number of games may distort the analysis later on, we will only keep players who have played at least 10 games.

In [5]:
# Keep only the rows with at least 10 games played.
nba = nba.loc[nba['Games Played'].ge(10)]

Some positions are the same but named differently (for example, "SG-PG" and "PG-SG"). We normalise those names here, and map "PG-SF" and "SG-PF" to "SF".

In [15]:
# Normalise position labels: reversed hybrid names get one canonical spelling,
# and "PG-SF" / "SG-PF" are mapped to "SF".
# The result is assigned back to the column instead of calling
# replace(..., True) on the selected column: the third positional argument was
# `inplace`, which newer pandas accepts only as a keyword, and an in-place edit
# on a selected column is not guaranteed to reach the parent frame.
# No replacement value is itself a key, so one mapping gives the same result as
# applying the replacements one after another.
nba['Position'] = nba['Position'].replace({
    "SG-PG": "PG-SG",
    "SF-SG": "SG-SF",
    "PF-SF": "SF-PF",
    "C-PF": "PF-C",
    "PG-SF": "SF",
    "SG-PF": "SF",
})

In [16]:
# Preview the cleaned frame: show only the first 10 rows instead of the full table.
nba.head(10)

Unnamed: 0,#,Season,Player Name,Salary,Position,Age,Team,Games Played,Games Started,Minutes Played,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
21,15151.0,2001.0,A.J. Guyton,465850.0,PG,22.0,CHI,33.0,8.0,630.0,...,0.833,10.0,26.0,36.0,64.0,9.0,5.0,24.0,35.0,198.0
22,15684.0,2002.0,A.J. Guyton,18748.0,PG,23.0,CHI,45.0,6.0,607.0,...,0.815,12.0,32.0,44.0,81.0,10.0,7.0,37.0,23.0,244.0
24,24304.0,2017.0,A.J. Hammons,1312611.0,C,24.0,DAL,22.0,0.0,163.0,...,0.450,8.0,28.0,36.0,4.0,1.0,13.0,10.0,21.0,48.0
25,20357.0,2010.0,A.J. Price,762195.0,PG,23.0,IND,56.0,2.0,865.0,...,0.800,12.0,76.0,88.0,106.0,35.0,3.0,59.0,53.0,410.0
26,20966.0,2011.0,A.J. Price,854389.0,PG,24.0,IND,50.0,0.0,795.0,...,0.667,16.0,56.0,72.0,111.0,29.0,1.0,53.0,61.0,323.0
27,21529.0,2012.0,A.J. Price,885120.0,PG,25.0,IND,44.0,1.0,568.0,...,0.800,13.0,48.0,61.0,86.0,20.0,2.0,32.0,30.0,172.0
29,22709.0,2014.0,A.J. Price,62552.0,SG,27.0,MIN,28.0,0.0,99.0,...,0.000,1.0,9.0,10.0,13.0,1.0,0.0,7.0,5.0,44.0
36,18815.0,2008.0,Aaron Brooks,1045560.0,PG,23.0,HOU,51.0,0.0,608.0,...,0.857,13.0,43.0,56.0,87.0,13.0,5.0,44.0,69.0,264.0
37,19415.0,2009.0,Aaron Brooks,1118520.0,PG,24.0,HOU,80.0,35.0,1998.0,...,0.866,33.0,124.0,157.0,238.0,46.0,8.0,125.0,152.0,894.0
42,21751.0,2013.0,Aaron Brooks,3396250.0,PG,28.0,TOT,53.0,20.0,997.0,...,0.769,13.0,67.0,80.0,114.0,30.0,10.0,67.0,94.0,376.0
