In [1]:
# All the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from operator import itemgetter
plt.style.use('fivethirtyeight')
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# NBA season we will be analyzing to create our model
year = 2020
url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)
# Download the "advanced" stats page and parse it.
# - The response is used as a context manager so the connection is closed
#   as soon as the document has been parsed.
# - The parser is named explicitly: without it BeautifulSoup picks whichever
#   parser happens to be installed (and warns), so results could differ
#   between machines. 'html.parser' ships with the standard library.
with urlopen(url) as html:
    soup_adv = BeautifulSoup(html, 'html.parser')

In [3]:
# Same season as the advanced-stats download; this cell fetches the per-game page.
year = 2020
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
# Download the "per game" stats page and parse it.
# - The response is used as a context manager so the connection is closed
#   as soon as the document has been parsed.
# - The parser is named explicitly: without it BeautifulSoup picks whichever
#   parser happens to be installed (and warns), so results could differ
#   between machines. 'html.parser' ships with the standard library.
with urlopen(url) as html:
    soup_pg = BeautifulSoup(html, 'html.parser')

In [4]:
# Column headers of the advanced-stats table: the text of every <th> cell
# in the first <tr> of the page.
# The first header is dropped so the header list lines up with the <td>
# values collected for each player row (presumably the dropped header is the
# ranking column, which is not a <td> cell -- confirm against the page).
headers_adv = [th.get_text() for th in soup_adv.find('tr').find_all('th')][1:]
headers_adv

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 '\xa0',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 '\xa0',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [5]:
# Column headers of the per-game table: the text of every <th> cell
# in the first <tr> of the page.
# The first header is dropped so the header list lines up with the <td>
# values collected for each player row (presumably the dropped header is the
# ranking column, which is not a <td> cell -- confirm against the page).
headers_pg = [th.get_text() for th in soup_pg.find('tr').find_all('th')][1:]
headers_pg

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# Advanced stats: skip the first <tr> (the header row) and collect the text
# of every <td> cell of each remaining row. Rows without any <td> cells
# yield an empty list here; they are filtered out later via the 'Player'
# not-null check.
rows_adv = soup_adv.find_all('tr')[1:]
player_stats_adv = [[td.get_text() for td in row.find_all('td')]
                    for row in rows_adv]

# Per-game stats: same extraction on the per-game page.
rows_pg = soup_pg.find_all('tr')[1:]
player_stats_pg = [[td.get_text() for td in row.find_all('td')]
                   for row in rows_pg]

In [7]:
# Build the 2020 per-game table: one row per scraped <tr>, labelled with the
# per-game column headers. All values are still strings at this point.
stats_2020_pg = pd.DataFrame(data=player_stats_pg, columns=headers_pg)
stats_2020_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26,OKC,44,44,26.7,4.5,7.5,.600,...,.579,3.4,5.9,9.3,2.6,0.7,1.1,1.5,1.9,10.8
1,Bam Adebayo,PF,22,MIA,47,47,34.5,6.1,10.4,.589,...,.692,2.5,7.9,10.4,4.7,1.2,1.1,2.8,2.5,16.0
2,LaMarcus Aldridge,C,34,SAS,43,43,33.2,7.4,14.6,.510,...,.845,2.0,5.5,7.5,2.3,0.6,1.7,1.4,2.5,19.1
3,Nickeil Alexander-Walker,SG,21,NOP,39,0,12.7,2.0,5.8,.342,...,.607,0.2,1.8,2.0,1.9,0.3,0.2,1.0,1.1,5.4
4,Grayson Allen,SG,24,MEM,30,0,16.6,2.6,5.9,.449,...,.857,0.2,2.0,2.2,1.4,0.2,0.0,0.8,1.2,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Thaddeus Young,PF,31,CHI,50,6,23.3,3.8,8.8,.434,...,.596,1.4,3.2,4.6,1.8,1.1,0.3,1.4,1.9,9.4
543,Trae Young,PG,21,ATL,44,44,35.1,9.3,20.8,.446,...,.842,0.6,4.0,4.6,9.2,1.2,0.1,4.8,1.6,29.4
544,Cody Zeller,C,27,CHO,45,26,23.1,4.4,8.4,.517,...,.686,2.8,4.2,7.0,1.3,0.6,0.4,1.3,2.5,11.2
545,Ante Žižić,C,23,CLE,16,0,9.9,1.8,3.1,.571,...,.867,0.9,2.6,3.4,0.2,0.4,0.3,0.3,1.3,4.3


In [8]:
# Keep only rows that actually carry a player name. The explicit .copy()
# makes the result an independent frame, so the column assignment below no
# longer triggers pandas' SettingWithCopyWarning.
stats_2020_pg = stats_2020_pg[stats_2020_pg['Player'].notnull()].copy()

# Convert the scraped strings of the columns used for modelling to float.
# The conversion runs column by column (one vectorised call per column rather
# than one per row); unparseable values (e.g. empty strings) become NaN.
# astype(float) keeps every converted column float64.
cols = ['Age', 'G', 'GS','MP', 'FG%', '3P%','AST', 'STL', 'BLK','TRB', 'PTS','eFG%']
stats_2020_pg[cols] = stats_2020_pg[cols].apply(pd.to_numeric, errors='coerce').astype(float)

# Replace missing values with 0
stats_2020_pg = stats_2020_pg.fillna(0)

# A player may appear on several rows; keep only the first one
stats_2020_pg = stats_2020_pg.drop_duplicates(['Player'], keep='first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [9]:
# Display the cleaned per-game table as this cell's output.
stats_2020_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26.0,OKC,44.0,44.0,26.7,4.5,7.5,0.600,...,.579,3.4,5.9,9.3,2.6,0.7,1.1,1.5,1.9,10.8
1,Bam Adebayo,PF,22.0,MIA,47.0,47.0,34.5,6.1,10.4,0.589,...,.692,2.5,7.9,10.4,4.7,1.2,1.1,2.8,2.5,16.0
2,LaMarcus Aldridge,C,34.0,SAS,43.0,43.0,33.2,7.4,14.6,0.510,...,.845,2.0,5.5,7.5,2.3,0.6,1.7,1.4,2.5,19.1
3,Nickeil Alexander-Walker,SG,21.0,NOP,39.0,0.0,12.7,2.0,5.8,0.342,...,.607,0.2,1.8,2.0,1.9,0.3,0.2,1.0,1.1,5.4
4,Grayson Allen,SG,24.0,MEM,30.0,0.0,16.6,2.6,5.9,0.449,...,.857,0.2,2.0,2.2,1.4,0.2,0.0,0.8,1.2,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Thaddeus Young,PF,31.0,CHI,50.0,6.0,23.3,3.8,8.8,0.434,...,.596,1.4,3.2,4.6,1.8,1.1,0.3,1.4,1.9,9.4
543,Trae Young,PG,21.0,ATL,44.0,44.0,35.1,9.3,20.8,0.446,...,.842,0.6,4.0,4.6,9.2,1.2,0.1,4.8,1.6,29.4
544,Cody Zeller,C,27.0,CHO,45.0,26.0,23.1,4.4,8.4,0.517,...,.686,2.8,4.2,7.0,1.3,0.6,0.4,1.3,2.5,11.2
545,Ante Žižić,C,23.0,CLE,16.0,0.0,9.9,1.8,3.1,0.571,...,.867,0.9,2.6,3.4,0.2,0.4,0.3,0.3,1.3,4.3


In [10]:
# Build the 2020 advanced-stats table: one row per scraped <tr>, labelled with
# the advanced column headers. All values are still strings at this point.
# Note: headers_adv contains two blank '\xa0' spacer headers, so the frame
# ends up with two identically named (empty) columns.
stats_2020_adv = pd.DataFrame(data=player_stats_adv, columns=headers_adv)
stats_2020_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26,OKC,44,1173,21.1,.609,.003,.424,...,,3.0,1.8,4.8,.197,,1.6,3.3,4.8,2.0
1,Bam Adebayo,PF,22,MIA,47,1621,20.6,.625,.022,.503,...,,3.6,2.6,6.2,.185,,1.1,3.8,5.0,2.8
2,LaMarcus Aldridge,C,34,SAS,43,1426,20.7,.590,.189,.247,...,,3.2,1.2,4.4,.147,,1.7,0.6,2.3,1.6
3,Nickeil Alexander-Walker,SG,21,NOP,39,494,8.2,.445,.520,.124,...,,-0.6,0.3,-0.3,-0.034,,-3.8,-2.2,-6.0,-0.5
4,Grayson Allen,SG,24,MEM,30,498,11.5,.577,.517,.199,...,,0.5,0.1,0.6,.059,,-0.7,-2.9,-3.6,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Thaddeus Young,PF,31,CHI,50,1165,12.5,.505,.368,.130,...,,-0.3,1.5,1.3,.052,,-2.1,0.8,-1.3,0.2
543,Trae Young,PG,21,ATL,44,1543,24.5,.597,.447,.429,...,,4.0,0.6,4.6,.142,,7.8,-2.3,5.5,2.9
544,Cody Zeller,C,27,CHO,45,1041,18.2,.573,.177,.361,...,,1.8,0.8,2.6,.119,,0.0,-0.6,-0.6,0.4
545,Ante Žižić,C,23,CLE,16,159,18.8,.621,.000,.306,...,,0.4,0.2,0.5,.155,,-2.3,-1.4,-3.7,-0.1


In [11]:
# Keep only rows that actually carry a player name. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not trigger pandas' SettingWithCopyWarning.
stats_2020_adv = stats_2020_adv[stats_2020_adv['Player'].notnull()].copy()

# Convert the scraped strings of the columns used for modelling to float.
# The conversion runs column by column (one vectorised call per column rather
# than one per row); unparseable values (e.g. empty strings) become NaN.
# astype(float) keeps every converted column float64.
cols = ['Age', 'G', 'PER','TS%', 'WS','USG%','BPM','VORP']
stats_2020_adv[cols] = stats_2020_adv[cols].apply(pd.to_numeric, errors='coerce').astype(float)

# Replace missing values with 0
stats_2020_adv = stats_2020_adv.fillna(0)

# A player may appear on several rows; keep only the first one
stats_2020_adv = stats_2020_adv.drop_duplicates(['Player'], keep='first')

In [12]:
# Display the cleaned advanced-stats table as this cell's output.
stats_2020_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26.0,OKC,44.0,1173,21.1,0.609,.003,.424,...,,3.0,1.8,4.8,.197,,1.6,3.3,4.8,2.0
1,Bam Adebayo,PF,22.0,MIA,47.0,1621,20.6,0.625,.022,.503,...,,3.6,2.6,6.2,.185,,1.1,3.8,5.0,2.8
2,LaMarcus Aldridge,C,34.0,SAS,43.0,1426,20.7,0.590,.189,.247,...,,3.2,1.2,4.4,.147,,1.7,0.6,2.3,1.6
3,Nickeil Alexander-Walker,SG,21.0,NOP,39.0,494,8.2,0.445,.520,.124,...,,-0.6,0.3,-0.3,-0.034,,-3.8,-2.2,-6.0,-0.5
4,Grayson Allen,SG,24.0,MEM,30.0,498,11.5,0.577,.517,.199,...,,0.5,0.1,0.6,.059,,-0.7,-2.9,-3.6,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Thaddeus Young,PF,31.0,CHI,50.0,1165,12.5,0.505,.368,.130,...,,-0.3,1.5,1.3,.052,,-2.1,0.8,-1.3,0.2
543,Trae Young,PG,21.0,ATL,44.0,1543,24.5,0.597,.447,.429,...,,4.0,0.6,4.6,.142,,7.8,-2.3,5.5,2.9
544,Cody Zeller,C,27.0,CHO,45.0,1041,18.2,0.573,.177,.361,...,,1.8,0.8,2.6,.119,,0.0,-0.6,-0.6,0.4
545,Ante Žižić,C,23.0,CLE,16.0,159,18.8,0.621,.000,.306,...,,0.4,0.2,0.5,.155,,-2.3,-1.4,-3.7,-0.1


In [13]:
# Combine the per-game and advanced tables into one frame, matching rows on
# the player name and keeping players that appear in either table.
# Columns present in both inputs (Pos, Age, Tm, G, MP) are not merged:
# pandas keeps both copies, suffixed _x (per-game) and _y (advanced).
merged_2020_stats = stats_2020_pg.merge(stats_2020_adv, how='outer', on='Player')
merged_2020_stats

Unnamed: 0,Player,Pos_x,Age_x,Tm_x,G_x,GS,MP_x,FG,FGA,FG%,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Steven Adams,C,26.0,OKC,44.0,44.0,26.7,4.5,7.5,0.600,...,,3.0,1.8,4.8,.197,,1.6,3.3,4.8,2.0
1,Bam Adebayo,PF,22.0,MIA,47.0,47.0,34.5,6.1,10.4,0.589,...,,3.6,2.6,6.2,.185,,1.1,3.8,5.0,2.8
2,LaMarcus Aldridge,C,34.0,SAS,43.0,43.0,33.2,7.4,14.6,0.510,...,,3.2,1.2,4.4,.147,,1.7,0.6,2.3,1.6
3,Nickeil Alexander-Walker,SG,21.0,NOP,39.0,0.0,12.7,2.0,5.8,0.342,...,,-0.6,0.3,-0.3,-0.034,,-3.8,-2.2,-6.0,-0.5
4,Grayson Allen,SG,24.0,MEM,30.0,0.0,16.6,2.6,5.9,0.449,...,,0.5,0.1,0.6,.059,,-0.7,-2.9,-3.6,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Thaddeus Young,PF,31.0,CHI,50.0,6.0,23.3,3.8,8.8,0.434,...,,-0.3,1.5,1.3,.052,,-2.1,0.8,-1.3,0.2
491,Trae Young,PG,21.0,ATL,44.0,44.0,35.1,9.3,20.8,0.446,...,,4.0,0.6,4.6,.142,,7.8,-2.3,5.5,2.9
492,Cody Zeller,C,27.0,CHO,45.0,26.0,23.1,4.4,8.4,0.517,...,,1.8,0.8,2.6,.119,,0.0,-0.6,-0.6,0.4
493,Ante Žižić,C,23.0,CLE,16.0,0.0,9.9,1.8,3.1,0.571,...,,0.4,0.2,0.5,.155,,-2.3,-1.4,-3.7,-0.1


In [14]:
# Show the dtype of every merged column. Only the columns explicitly converted
# in the cleaning cells are float64; the remaining columns are still object.
merged_2020_stats.dtypes

Player     object
Pos_x      object
Age_x     float64
Tm_x       object
G_x       float64
GS        float64
MP_x      float64
FG         object
FGA        object
FG%       float64
3P         object
3PA        object
3P%       float64
2P         object
2PA        object
2P%        object
eFG%      float64
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV        object
PF         object
PTS       float64
Pos_y      object
Age_y     float64
Tm_y       object
G_y       float64
MP_y       object
PER       float64
TS%       float64
3PAr       object
FTr        object
ORB%       object
DRB%       object
TRB%       object
AST%       object
STL%       object
BLK%       object
TOV%       object
USG%      float64
           object
OWS        object
DWS        object
WS        float64
WS/48      object
           object
OBPM       object
DBPM       object
BPM       float64
VORP      

In [15]:
# Persist the merged table as CSV in the working directory.
# The DataFrame index is written as well (to_csv default), so the file's
# first column is the unnamed row index.
merged_2020_stats.to_csv(path_or_buf='2019_2020_data.csv')