# NBA Player Value based on Win-Shares and Salary

In [1]:
#import packages
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


### Create df from web source

In [2]:
#Creating df from url ensures up to date information as site is updated daily
url_stats = 'https://www.basketball-reference.com/leagues/NBA_2019_advanced.html'
html_stats = requests.get(url_stats).content
df_list_stats = pd.read_html(html_stats)
df = df_list_stats[-1]
df = pd.DataFrame(data = df)
print(df)
name_of_data_file = 'nba_player_stats.csv'
#saves file to csv
df.to_csv('nba_player_stats.csv')
df = pd.read_csv('nba_player_stats.csv')



      Rk                  Player  Pos  Age   Tm   G    MP   PER   TS%   3PAr  \
0      1            Alex Abrines   SG   25  OKC  31   588   6.3  .507   .809   
1      2              Quincy Acy   PF   28  PHO  10   123   2.9  .379   .833   
2      3            Jaylen Adams   PG   22  ATL  23   251   9.0  .543   .754   
3      4            Steven Adams    C   25  OKC  69  2323  18.8  .602   .001   
4      5             Bam Adebayo    C   21  MIA  70  1582  17.5  .626   .026   
5      6               Deng Adel   SF   21  CLE  15   181   1.4  .369   .636   
6      7  DeVaughn Akoon-Purcell   SG   25  DEN   7    22   8.3  .322   .400   
7      8       LaMarcus Aldridge    C   33  SAS  70  2306  22.7  .574   .030   
8      9            Rawle Alkins   SG   21  CHI   3     6  37.9  .375   .250   
9     10           Grayson Allen   SG   23  UTA  31   292   4.5  .459   .640   
10    11           Jarrett Allen    C   20  BRK  70  1866  18.9  .633   .085   
11    12            Kadeem Allen   SG   

In [3]:
#create df from downloaded csv of NBA player stats
#pulled data from Basketball-Reference.com

#df = pd.read_csv('D:\dudad\Documents\Data Science Career Track\Data Wrangling for Capstone 1\player_stats.csv')
#print(df)


In [4]:
#examine the data
#data is fairly clean directly from the source
df.head()


Unnamed: 0.1,Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,...,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,0,1,Alex Abrines,SG,25,OKC,31,588,6.3,0.507,...,,0.1,0.6,0.7,0.055,,-2.4,-0.8,-3.2,-0.2
1,1,2,Quincy Acy,PF,28,PHO,10,123,2.9,0.379,...,,-0.1,0.1,0.0,-0.018,,-5.7,-0.1,-5.8,-0.1
2,2,3,Jaylen Adams,PG,22,ATL,23,251,9.0,0.543,...,,0.1,0.1,0.2,0.043,,-1.7,-2.2,-3.9,-0.1
3,3,4,Steven Adams,C,25,OKC,68,2298,18.7,0.601,...,,4.5,3.5,8.1,0.168,,0.7,2.1,2.8,2.8
4,4,5,Bam Adebayo,C,21,MIA,69,1553,17.6,0.629,...,,3.0,2.6,5.5,0.172,,-0.4,3.3,2.9,1.9


In [5]:
#remove unnamed columns from site used for formatting purposes
df = df.drop(columns=['Unnamed: 19', 'Unnamed: 24'])

In [6]:
#understanding how data was initially interpreted
df.get_dtype_counts()


int64      1
object    27
dtype: int64

In [7]:
df.dtypes

Unnamed: 0     int64
Rk            object
Player        object
Pos           object
Age           object
Tm            object
G             object
MP            object
PER           object
TS%           object
3PAr          object
FTr           object
ORB%          object
DRB%          object
TRB%          object
AST%          object
STL%          object
BLK%          object
TOV%          object
USG%          object
OWS           object
DWS           object
WS            object
WS/48         object
OBPM          object
DBPM          object
BPM           object
VORP          object
dtype: object

In [8]:
df['Player'] = df['Player'].astype(str)
print(df.dtypes)

Unnamed: 0     int64
Rk            object
Player        object
Pos           object
Age           object
Tm            object
G             object
MP            object
PER           object
TS%           object
3PAr          object
FTr           object
ORB%          object
DRB%          object
TRB%          object
AST%          object
STL%          object
BLK%          object
TOV%          object
USG%          object
OWS           object
DWS           object
WS            object
WS/48         object
OBPM          object
DBPM          object
BPM           object
VORP          object
dtype: object


## Breakdown of column names: 

In [9]:
# Rank, Player Name, Position, Age, Team, Games, Minutes Played, Player Efficiency Rating,
# True Shooting Percentage, 3-point attempt rate, Free Throw Attempt Rate, Offensive Rebound Percentage,
# Defensive Rebound Percentage, Total Rebound Percentage, Assist Percentage, Steal Percentage, Block Percentage,
# Turnover Percentage, Usage Percentage, Offensive Win Shares, Defensive Win Shares, Win Shares, Win Shares Per 48 minutes,
# Offensive Box plus/minus, Defensive box plus/minus, Box plus/minus, Value over replacement player
#full definition of all columns can be found at https://www.basketball-reference.com/leagues/NBA_2019_advanced.html
list(df)

['Unnamed: 0',
 'Rk',
 'Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [10]:
# Total missing values for each feature
print(df.isnull().sum())

Unnamed: 0    0
Rk            0
Player        0
Pos           0
Age           0
Tm            0
G             0
MP            0
PER           0
TS%           6
3PAr          8
FTr           8
ORB%          0
DRB%          0
TRB%          0
AST%          0
STL%          0
BLK%          0
TOV%          6
USG%          0
OWS           0
DWS           0
WS            0
WS/48         0
OBPM          0
DBPM          0
BPM           0
VORP          0
dtype: int64


In [11]:
#changing data type of Player column in df
df['Player'] = df['Player'].astype(str)

In [12]:
#check Player column data type
print(print(df['Player']))

0                Alex Abrines
1                  Quincy Acy
2                Jaylen Adams
3                Steven Adams
4                 Bam Adebayo
5                   Deng Adel
6      DeVaughn Akoon-Purcell
7           LaMarcus Aldridge
8                Rawle Alkins
9               Grayson Allen
10              Jarrett Allen
11               Kadeem Allen
12            Al-Farouq Aminu
13            Justin Anderson
14              Kyle Anderson
15              Ryan Anderson
16              Ryan Anderson
17              Ryan Anderson
18               Ike Anigbogu
19      Giannis Antetokounmpo
20            Carmelo Anthony
21                 OG Anunoby
22                     Player
23           Ryan Arcidiacono
24               Trevor Ariza
25               Trevor Ariza
26               Trevor Ariza
27              D.J. Augustin
28              Deandre Ayton
29               Dwayne Bacon
                ...          
670         Russell Westbrook
671             Derrick White
672       

## Create dataframe for player salary information from Basketball Reference

### Pull data directly from website for freshness

In [13]:
url_salary = 'https://www.basketball-reference.com/contracts/players.html'
html_salary = requests.get(url_salary).content
df_list_salary = pd.read_html(html_salary)
df_salary = df_list_salary[-1]
print(df_salary)
#saves file to csv for offline use
df.to_csv('nba_player_contracts.csv')
df_salary = pd.read_csv('nba_player_contracts.csv')

    Unnamed: 0_level_0                   Salary Unnamed: 2_level_0  \
                    Rk                   Player                 Tm   
0                    1            Stephen Curry                GSW   
1                    2               Chris Paul                HOU   
2                    3        Russell Westbrook                OKC   
3                    4             LeBron James                LAL   
4                    5            Blake Griffin                DET   
5                    6           Gordon Hayward                BOS   
6                    7               Kyle Lowry                TOR   
7                    8              Paul George                OKC   
8                    9              Mike Conley                MEM   
9                   10             James Harden                HOU   
10                  11             Kevin Durant                GSW   
11                  12             Paul Millsap                DEN   
12                  

In [14]:
#remove first row from table as this is a title row, not column names
df_salary.columns = df_salary.columns.droplevel(0)
print(df_salary)

AttributeError: 'Index' object has no attribute 'droplevel'

In [None]:
#remove rows that are utilized as continued header row
df.drop(df.index[22], inplace = True)
print(df)

In [None]:
#create df_salary from nba player salaries data from Basketball Reference
#df_salary = pd.read_csv('D:\dudad\Documents\Data Science Career Track\Data Wrangling for Capstone 1\player_salary.csv')
#print(df_salary)

In [None]:
df_salary.head()

In [None]:
# Total missing values for each feature
print(df_salary.isnull().sum())

In [None]:
cols_stats = list(df.columns)

In [None]:
print(cols_stats)


In [None]:
cols_salary = list(df_salary.columns)

In [None]:
print(cols_salary)


## Converting columns to correct datatype for future analysis

In [None]:
#df column data is not correct type
print(type(df['WS']))

In [None]:
#found another header row at row 48, removing
df.drop(df.index[48], inplace = True)
print(df)

In [None]:
#found another header row at row 70, removing
df.drop(df.index[70], inplace = True)
print(df)

In [None]:
#found another header row at row 99, removing
df.drop(df.index[99], inplace = True)


In [None]:
#found another header row at row 128, removing
df.drop(df.index[128], inplace = True)

In [None]:
#found another header row at row 150, removing
df.drop(df.index[150], inplace = True)

In [None]:
#found another header row at row 176, removing
df.drop(df.index[176], inplace = True)

In [None]:
#found another header row at row 204, removing
df.drop(df.index[204], inplace = True)

In [None]:
#found another header row at row 228, removing
df.drop(df.index[228], inplace = True)

In [None]:
#found another header row at row 252, removing
df.drop(df.index[252], inplace = True)

In [None]:
#found another header row at row 279, removing
df.drop(df.index[279], inplace = True)

In [None]:
#found another header row at row 303, removing
df.drop(df.index[303], inplace = True)

In [None]:
#found another header row at row 333, removing
df.drop(df.index[333], inplace = True)

In [None]:
#found another header row at row 363, removing
df.drop(df.index[363], inplace = True)

In [None]:
#found another header row at row 387, removing
df.drop(df.index[387], inplace = True)

In [None]:
#found another header row at row 420, removing
df.drop(df.index[420], inplace = True)

In [None]:
#found another header row at row 444, removing
df.drop(df.index[444], inplace = True)

In [None]:
#found another header row at row 470, removing
df.drop(df.index[470], inplace = True)

In [None]:
#found another header row at row 496, removing
df.drop(df.index[496], inplace = True)

In [None]:
#found another header row at row 522, removing
df.drop(df.index[522], inplace = True)

In [None]:
#found another header row at row 546, removing
df.drop(df.index[546], inplace = True)

In [None]:
#found another header row at row 580, removing
df.drop(df.index[580], inplace = True)

In [None]:
#found another header row at row 606, removing
df.drop(df.index[606], inplace = True)

In [None]:
#found another header row at row 628, removing
df.drop(df.index[628], inplace = True)

In [None]:
#found another header row at row 648, removing
df.drop(df.index[648], inplace = True)

In [None]:
#view df after removing header rows throughout df
print(df)

In [None]:
print(df.isnull().sum())

In [None]:
#change necessary columns to correct type
df[["Age","G","MP", "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", 
    "OWS","DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP"]] = df[["Rk", "Age","G","MP", "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", 
    "OWS","DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP"]].apply(pd.to_numeric)

In [None]:
df['WS'].max()

In [None]:
df['WS'].min()