In [1]:
# All the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from operator import itemgetter
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# NBA season we will be analyzing to create our model 
year = 2020
url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)
html = urlopen(url)
soup_adv = BeautifulSoup(html)

In [3]:
# NBA season we will be analyzing to create our model 
year = 2020
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
html = urlopen(url)
soup_pg = BeautifulSoup(html)

In [4]:
# To see the column headers we have
soup_adv.findAll('tr', limit=2)
# GetText() function will help extract the text from the data we need into a list format
headers_adv = [th.getText() for th in soup_adv.findAll('tr', limit=2)[0].findAll('th')]
# Exclude the first column to remove ranking of the players from the webpage
headers_adv = headers_adv[1:]
headers_adv

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 '\xa0',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 '\xa0',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [5]:
# To see the column headers we have
soup_pg.findAll('tr', limit=2)
# GetText() function will help extract the text from the data we need into a list format
headers_pg = [th.getText() for th in soup_pg.findAll('tr', limit=2)[0].findAll('th')]
# Exclude the first column to remove ranking of the players from the webpage
headers_pg = headers_pg[1:]
headers_pg

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# Remove the first header row
rows_adv = soup_adv.findAll('tr')[1:]
player_stats_adv = [[td.getText() for td in rows_adv[i].findAll('td')]
            for i in range(len(rows_adv))]

# Remove the first header row
rows_pg = soup_pg.findAll('tr')[1:]
player_stats_pg = [[td.getText() for td in rows_pg[i].findAll('td')]
            for i in range(len(rows_pg))]

In [7]:
# 2020 Player per game stats
stats_2020_pg = pd.DataFrame(player_stats_pg, columns = headers_pg)
stats_2020_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26,OKC,46,46,26.8,4.5,7.5,.600,...,.586,3.3,5.9,9.2,2.6,0.7,1.1,1.5,1.9,10.8
1,Bam Adebayo,PF,22,MIA,51,51,34.7,6.3,10.7,.585,...,.688,2.5,7.8,10.4,4.8,1.2,1.2,2.7,2.4,16.1
2,LaMarcus Aldridge,C,34,SAS,47,47,33.1,7.3,14.6,.501,...,.842,2.0,5.5,7.4,2.4,0.6,1.7,1.3,2.4,18.8
3,Nickeil Alexander-Walker,SG,21,NOP,40,0,12.5,1.9,5.6,.342,...,.607,0.2,1.8,2.0,1.9,0.3,0.2,1.0,1.1,5.3
4,Grayson Allen,SG,24,MEM,30,0,16.6,2.6,5.9,.449,...,.857,0.2,2.0,2.2,1.4,0.2,0.0,0.8,1.2,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,Thaddeus Young,PF,31,CHI,52,8,23.6,3.9,8.9,.444,...,.586,1.4,3.3,4.6,1.8,1.2,0.3,1.5,1.9,9.7
552,Trae Young,PG,21,ATL,47,47,34.9,9.2,20.6,.445,...,.845,0.6,4.0,4.6,9.0,1.1,0.1,4.9,1.7,29.3
553,Cody Zeller,C,27,CHO,48,29,23.1,4.4,8.5,.521,...,.686,2.8,4.3,7.0,1.5,0.7,0.4,1.3,2.6,11.2
554,Ante Žižić,C,23,CLE,16,0,9.9,1.8,3.1,.571,...,.867,0.9,2.6,3.4,0.2,0.4,0.3,0.3,1.3,4.3


In [8]:
# Use notnull() function to find the non-missing values, when there are missing values in the dataframe.
stats_2020_pg = stats_2020_pg[stats_2020_pg['Player'].notnull()]

# stats_2019_pg.drop(stats_2019_pg.index[0])

# Convert string/object to float
cols = ['Age', 'G', 'GS','MP', 'FG','3P','FT%','FG%', '3P%','AST', 'STL', 'BLK','TRB', 'PTS','eFG%']
stats_2020_pg[cols] = stats_2020_pg[cols].apply(pd.to_numeric, errors='coerce', axis=1)

# Replacing na values to 0
stats_2020_pg = stats_2020_pg[:].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [9]:
stats_2020_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26.0,OKC,46.0,46.0,26.8,4.5,7.5,0.600,...,0.586,3.3,5.9,9.2,2.6,0.7,1.1,1.5,1.9,10.8
1,Bam Adebayo,PF,22.0,MIA,51.0,51.0,34.7,6.3,10.7,0.585,...,0.688,2.5,7.8,10.4,4.8,1.2,1.2,2.7,2.4,16.1
2,LaMarcus Aldridge,C,34.0,SAS,47.0,47.0,33.1,7.3,14.6,0.501,...,0.842,2.0,5.5,7.4,2.4,0.6,1.7,1.3,2.4,18.8
3,Nickeil Alexander-Walker,SG,21.0,NOP,40.0,0.0,12.5,1.9,5.6,0.342,...,0.607,0.2,1.8,2.0,1.9,0.3,0.2,1.0,1.1,5.3
4,Grayson Allen,SG,24.0,MEM,30.0,0.0,16.6,2.6,5.9,0.449,...,0.857,0.2,2.0,2.2,1.4,0.2,0.0,0.8,1.2,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,Thaddeus Young,PF,31.0,CHI,52.0,8.0,23.6,3.9,8.9,0.444,...,0.586,1.4,3.3,4.6,1.8,1.2,0.3,1.5,1.9,9.7
552,Trae Young,PG,21.0,ATL,47.0,47.0,34.9,9.2,20.6,0.445,...,0.845,0.6,4.0,4.6,9.0,1.1,0.1,4.9,1.7,29.3
553,Cody Zeller,C,27.0,CHO,48.0,29.0,23.1,4.4,8.5,0.521,...,0.686,2.8,4.3,7.0,1.5,0.7,0.4,1.3,2.6,11.2
554,Ante Žižić,C,23.0,CLE,16.0,0.0,9.9,1.8,3.1,0.571,...,0.867,0.9,2.6,3.4,0.2,0.4,0.3,0.3,1.3,4.3


In [10]:
# 2019 Player advanced stats
stats_2020_adv = pd.DataFrame(player_stats_adv, columns = headers_adv)
stats_2020_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26,OKC,46,1232,20.8,.610,.003,.420,...,,3.1,1.9,5.0,.194,,1.4,3.2,4.6,2.1
1,Bam Adebayo,PF,22,MIA,51,1770,20.7,.621,.020,.483,...,,4.0,2.8,6.8,.184,,1.2,3.8,5.0,3.1
2,LaMarcus Aldridge,C,34,SAS,47,1554,20.4,.581,.192,.240,...,,3.2,1.3,4.5,.139,,1.5,0.6,2.1,1.6
3,Nickeil Alexander-Walker,SG,21,NOP,40,498,7.9,.445,.520,.124,...,,-0.7,0.3,-0.4,-0.036,,-3.9,-2.1,-6.0,-0.5
4,Grayson Allen,SG,24,MEM,30,498,11.4,.577,.517,.199,...,,0.5,0.2,0.6,.062,,-0.8,-2.7,-3.5,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,Thaddeus Young,PF,31,CHI,52,1226,12.7,.515,.368,.126,...,,-0.2,1.6,1.3,.052,,-2.0,0.8,-1.3,0.2
552,Trae Young,PG,21,ATL,47,1642,24.3,.598,.452,.433,...,,4.1,0.6,4.7,.137,,7.6,-2.3,5.3,3.0
553,Cody Zeller,C,27,CHO,48,1111,18.5,.573,.165,.344,...,,1.9,0.9,2.8,.121,,0.1,-0.3,-0.2,0.5
554,Ante Žižić,C,23,CLE,16,159,18.7,.621,.000,.306,...,,0.4,0.1,0.5,.152,,-2.4,-1.5,-3.9,-0.1


In [11]:
# Use notnull() function to find the non-missing values, when there are missing values in the dataframe.
stats_2020_adv = stats_2020_adv[stats_2020_adv['Player'].notnull()]

# stats_2019_pg.drop(stats_2019_pg.index[0])

# Convert string/object to float
cols = ['Age', 'G', 'TRB%','AST%','STL%','BLK%','PER','TS%', 'WS','USG%','BPM','VORP']
stats_2020_adv[cols] = stats_2020_adv[cols].apply(pd.to_numeric, errors='coerce', axis=1)

# Replacing na values to 0
stats_2020_adv = stats_2020_adv[:].fillna(0)


In [12]:
stats_2020_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26.0,OKC,46.0,1232,20.8,0.610,.003,.420,...,,3.1,1.9,5.0,.194,,1.4,3.2,4.6,2.1
1,Bam Adebayo,PF,22.0,MIA,51.0,1770,20.7,0.621,.020,.483,...,,4.0,2.8,6.8,.184,,1.2,3.8,5.0,3.1
2,LaMarcus Aldridge,C,34.0,SAS,47.0,1554,20.4,0.581,.192,.240,...,,3.2,1.3,4.5,.139,,1.5,0.6,2.1,1.6
3,Nickeil Alexander-Walker,SG,21.0,NOP,40.0,498,7.9,0.445,.520,.124,...,,-0.7,0.3,-0.4,-0.036,,-3.9,-2.1,-6.0,-0.5
4,Grayson Allen,SG,24.0,MEM,30.0,498,11.4,0.577,.517,.199,...,,0.5,0.2,0.6,.062,,-0.8,-2.7,-3.5,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,Thaddeus Young,PF,31.0,CHI,52.0,1226,12.7,0.515,.368,.126,...,,-0.2,1.6,1.3,.052,,-2.0,0.8,-1.3,0.2
552,Trae Young,PG,21.0,ATL,47.0,1642,24.3,0.598,.452,.433,...,,4.1,0.6,4.7,.137,,7.6,-2.3,5.3,3.0
553,Cody Zeller,C,27.0,CHO,48.0,1111,18.5,0.573,.165,.344,...,,1.9,0.9,2.8,.121,,0.1,-0.3,-0.2,0.5
554,Ante Žižić,C,23.0,CLE,16.0,159,18.7,0.621,.000,.306,...,,0.4,0.1,0.5,.152,,-2.4,-1.5,-3.9,-0.1


In [13]:
# Merge both per game and advanced stats
merged_2020_stats = pd.merge(stats_2020_pg, stats_2020_adv, on=['Player','Tm'], how='outer')
merged_2020_stats

Unnamed: 0,Player,Pos_x,Age_x,Tm,G_x,GS,MP_x,FG,FGA,FG%,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26.0,OKC,46.0,46.0,26.8,4.5,7.5,0.600,...,,3.1,1.9,5.0,.194,,1.4,3.2,4.6,2.1
1,Bam Adebayo,PF,22.0,MIA,51.0,51.0,34.7,6.3,10.7,0.585,...,,4.0,2.8,6.8,.184,,1.2,3.8,5.0,3.1
2,LaMarcus Aldridge,C,34.0,SAS,47.0,47.0,33.1,7.3,14.6,0.501,...,,3.2,1.3,4.5,.139,,1.5,0.6,2.1,1.6
3,Nickeil Alexander-Walker,SG,21.0,NOP,40.0,0.0,12.5,1.9,5.6,0.342,...,,-0.7,0.3,-0.4,-0.036,,-3.9,-2.1,-6.0,-0.5
4,Grayson Allen,SG,24.0,MEM,30.0,0.0,16.6,2.6,5.9,0.449,...,,0.5,0.2,0.6,.062,,-0.8,-2.7,-3.5,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,Thaddeus Young,PF,31.0,CHI,52.0,8.0,23.6,3.9,8.9,0.444,...,,-0.2,1.6,1.3,.052,,-2.0,0.8,-1.3,0.2
528,Trae Young,PG,21.0,ATL,47.0,47.0,34.9,9.2,20.6,0.445,...,,4.1,0.6,4.7,.137,,7.6,-2.3,5.3,3.0
529,Cody Zeller,C,27.0,CHO,48.0,29.0,23.1,4.4,8.5,0.521,...,,1.9,0.9,2.8,.121,,0.1,-0.3,-0.2,0.5
530,Ante Žižić,C,23.0,CLE,16.0,0.0,9.9,1.8,3.1,0.571,...,,0.4,0.1,0.5,.152,,-2.4,-1.5,-3.9,-0.1


In [14]:
merged_2020_stats.dtypes

Player     object
Pos_x      object
Age_x     float64
Tm         object
G_x       float64
GS        float64
MP_x      float64
FG        float64
FGA        object
FG%       float64
3P        float64
3PA        object
3P%       float64
2P         object
2PA        object
2P%        object
eFG%      float64
FT         object
FTA        object
FT%       float64
ORB        object
DRB        object
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV        object
PF         object
PTS       float64
Pos_y      object
Age_y     float64
G_y       float64
MP_y       object
PER       float64
TS%       float64
3PAr       object
FTr        object
ORB%       object
DRB%       object
TRB%      float64
AST%      float64
STL%      float64
BLK%      float64
TOV%       object
USG%      float64
           object
OWS        object
DWS        object
WS        float64
WS/48      object
           object
OBPM       object
DBPM       object
BPM       float64
VORP      float64
dtype: obj

In [15]:
# Save the datasets
merged_2020_stats.to_csv('2019_2020_data.csv')