In [1]:
# All the packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from operator import itemgetter
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# NBA season we will be analyzing to create our model 
year = 2019
url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)
html = urlopen(url)
soup_adv = BeautifulSoup(html)

In [3]:
# NBA season we will be analyzing to create our model 
year = 2019
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
html = urlopen(url)
soup_pg = BeautifulSoup(html)

In [4]:
# To see the column headers we have
soup_adv.findAll('tr', limit=2)
# GetText() function will help extract the text from the data we need into a list format
headers_adv = [th.getText() for th in soup_adv.findAll('tr', limit=2)[0].findAll('th')]
# Exclude the first column to remove ranking of the players from the webpage
headers_adv = headers_adv[1:]
headers_adv

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 '\xa0',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 '\xa0',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [5]:
# To see the column headers we have
soup_pg.findAll('tr', limit=2)
# GetText() function will help extract the text from the data we need into a list format
headers_pg = [th.getText() for th in soup_pg.findAll('tr', limit=2)[0].findAll('th')]
# Exclude the first column to remove ranking of the players from the webpage
headers_pg = headers_pg[1:]
headers_pg

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# Remove the first header row
rows_adv = soup_adv.findAll('tr')[1:]
player_stats_adv = [[td.getText() for td in rows_adv[i].findAll('td')]
            for i in range(len(rows_adv))]

# Remove the first header row
rows_pg = soup_pg.findAll('tr')[1:]
player_stats_pg = [[td.getText() for td in rows_pg[i].findAll('td')]
            for i in range(len(rows_pg))]

In [7]:
# 2019 Player per game stats
stats_2019_pg = pd.DataFrame(player_stats_pg, columns = headers_pg)
stats_2019_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,.357,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,.222,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,.345,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,.595,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,.576,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,Tyler Zeller,C,29,MEM,4,1,20.5,4.0,7.0,.571,...,.778,2.3,2.3,4.5,0.8,0.3,0.8,1.0,4.0,11.5
730,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,.553,...,.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8
731,Ivica Zubac,C,21,TOT,59,37,17.6,3.6,6.4,.559,...,.802,1.9,4.2,6.1,1.1,0.2,0.9,1.2,2.3,8.9
732,Ivica Zubac,C,21,LAL,33,12,15.6,3.4,5.8,.580,...,.864,1.6,3.3,4.9,0.8,0.1,0.8,1.0,2.2,8.5


In [8]:
# Use notnull() function to find the non-missing values, when there are missing values in the dataframe.
stats_2019_pg = stats_2019_pg[stats_2019_pg['Player'].notnull()]

# stats_2019_pg.drop(stats_2019_pg.index[0])

# Convert string/object to float
cols = ['Age', 'G', 'GS','MP', 'FG%', '3P%','AST', 'STL', 'BLK','TRB', 'PTS','eFG%']
stats_2019_pg[cols] = stats_2019_pg[cols].apply(pd.to_numeric, errors='coerce', axis=1)

# Replacing na values to 0
stats_2019_pg = stats_2019_pg[:].fillna(0)

# Keep the first record in the dataset
stats_2019_pg = stats_2019_pg.drop_duplicates(['Player'], keep = 'first')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [9]:
stats_2019_pg

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,SG,25.0,OKC,31.0,2.0,19.0,1.8,5.1,0.357,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,Quincy Acy,PF,28.0,PHO,10.0,0.0,12.3,0.4,1.8,0.222,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,Jaylen Adams,PG,22.0,ATL,34.0,1.0,12.6,1.1,3.2,0.345,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,Steven Adams,C,25.0,OKC,80.0,80.0,33.4,6.0,10.1,0.595,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,Bam Adebayo,C,21.0,MIA,82.0,28.0,23.3,3.4,5.9,0.576,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,Trae Young,PG,20.0,ATL,81.0,81.0,30.9,6.5,15.5,0.418,...,.829,0.8,2.9,3.7,8.1,0.9,0.2,3.8,1.7,19.1
726,Cody Zeller,C,26.0,CHO,49.0,47.0,25.4,3.9,7.0,0.551,...,.787,2.2,4.6,6.8,2.1,0.8,0.8,1.3,3.3,10.1
727,Tyler Zeller,C,29.0,TOT,6.0,1.0,15.5,2.7,5.0,0.533,...,.778,1.8,2.2,4.0,0.7,0.2,0.5,0.7,3.3,7.7
730,Ante Žižić,C,22.0,CLE,59.0,25.0,18.3,3.1,5.6,0.553,...,.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8


In [10]:
# 2019 Player advanced stats
stats_2019_adv = pd.DataFrame(player_stats_adv, columns = headers_adv)
stats_2019_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Álex Abrines,SG,25,OKC,31,588,6.3,.507,.809,.083,...,,0.1,0.6,0.6,.053,,-2.4,-0.9,-3.4,-0.2
1,Quincy Acy,PF,28,PHO,10,123,2.9,.379,.833,.556,...,,-0.1,0.0,-0.1,-0.022,,-5.7,-0.3,-5.9,-0.1
2,Jaylen Adams,PG,22,ATL,34,428,7.6,.474,.673,.082,...,,-0.1,0.2,0.1,.011,,-3.1,-1.3,-4.4,-0.3
3,Steven Adams,C,25,OKC,80,2669,18.5,.591,.002,.361,...,,5.1,4.0,9.1,.163,,0.6,2.1,2.7,3.2
4,Bam Adebayo,C,21,MIA,82,1913,17.9,.623,.031,.465,...,,3.4,3.4,6.8,.171,,-0.6,3.6,3.0,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,Tyler Zeller,C,29,MEM,4,82,19.4,.640,.000,.643,...,,0.2,0.1,0.3,.191,,-0.3,-1.3,-1.5,0.0
730,Ante Žižić,C,22,CLE,59,1082,16.2,.590,.000,.399,...,,1.7,0.3,2.0,.087,,-1.7,-1.4,-3.1,-0.3
731,Ivica Zubac,C,21,TOT,59,1040,18.9,.604,.000,.332,...,,1.9,1.3,3.2,.148,,-1.2,1.0,-0.3,0.5
732,Ivica Zubac,C,21,LAL,33,516,19.5,.633,.000,.342,...,,1.1,0.6,1.8,.165,,-1.3,0.4,-0.9,0.1


In [11]:
# Use notnull() function to find the non-missing values, when there are missing values in the dataframe.
stats_2019_adv = stats_2019_adv[stats_2019_adv['Player'].notnull()]

# stats_2019_pg.drop(stats_2019_pg.index[0])

# Convert string/object to float
cols = ['Age', 'G', 'PER','TS%', 'WS','USG%','BPM','VORP']
stats_2019_adv[cols] = stats_2019_adv[cols].apply(pd.to_numeric, errors='coerce', axis=1)

# Replacing na values to 0
stats_2019_adv = stats_2019_adv[:].fillna(0)

# Keep the first record in the dataset
stats_2019_adv = stats_2019_adv.drop_duplicates(['Player'], keep = 'first')

In [12]:
stats_2019_adv

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Álex Abrines,SG,25.0,OKC,31.0,588,6.3,0.507,.809,.083,...,,0.1,0.6,0.6,.053,,-2.4,-0.9,-3.4,-0.2
1,Quincy Acy,PF,28.0,PHO,10.0,123,2.9,0.379,.833,.556,...,,-0.1,0.0,-0.1,-0.022,,-5.7,-0.3,-5.9,-0.1
2,Jaylen Adams,PG,22.0,ATL,34.0,428,7.6,0.474,.673,.082,...,,-0.1,0.2,0.1,.011,,-3.1,-1.3,-4.4,-0.3
3,Steven Adams,C,25.0,OKC,80.0,2669,18.5,0.591,.002,.361,...,,5.1,4.0,9.1,.163,,0.6,2.1,2.7,3.2
4,Bam Adebayo,C,21.0,MIA,82.0,1913,17.9,0.623,.031,.465,...,,3.4,3.4,6.8,.171,,-0.6,3.6,3.0,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,Trae Young,PG,20.0,ATL,81.0,2503,17.0,0.539,.384,.330,...,,2.5,0.7,3.3,.062,,1.8,-2.8,-1.1,0.6
726,Cody Zeller,C,26.0,CHO,49.0,1243,17.2,0.611,.064,.409,...,,2.6,1.2,3.9,.150,,0.2,1.7,2.0,1.2
727,Tyler Zeller,C,29.0,TOT,6.0,93,17.3,0.607,.033,.600,...,,0.2,0.1,0.3,.167,,-1.0,-1.2,-2.2,0.0
730,Ante Žižić,C,22.0,CLE,59.0,1082,16.2,0.590,.000,.399,...,,1.7,0.3,2.0,.087,,-1.7,-1.4,-3.1,-0.3


In [13]:
# Merge both per game and advanced stats
merged_2019_stats = pd.merge(stats_2019_pg, stats_2019_adv, on='Player', how='outer')
merged_2019_stats

Unnamed: 0,Player,Pos_x,Age_x,Tm_x,G_x,GS,MP_x,FG,FGA,FG%,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Álex Abrines,SG,25.0,OKC,31.0,2.0,19.0,1.8,5.1,0.357,...,,0.1,0.6,0.6,.053,,-2.4,-0.9,-3.4,-0.2
1,Quincy Acy,PF,28.0,PHO,10.0,0.0,12.3,0.4,1.8,0.222,...,,-0.1,0.0,-0.1,-0.022,,-5.7,-0.3,-5.9,-0.1
2,Jaylen Adams,PG,22.0,ATL,34.0,1.0,12.6,1.1,3.2,0.345,...,,-0.1,0.2,0.1,.011,,-3.1,-1.3,-4.4,-0.3
3,Steven Adams,C,25.0,OKC,80.0,80.0,33.4,6.0,10.1,0.595,...,,5.1,4.0,9.1,.163,,0.6,2.1,2.7,3.2
4,Bam Adebayo,C,21.0,MIA,82.0,28.0,23.3,3.4,5.9,0.576,...,,3.4,3.4,6.8,.171,,-0.6,3.6,3.0,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,Trae Young,PG,20.0,ATL,81.0,81.0,30.9,6.5,15.5,0.418,...,,2.5,0.7,3.3,.062,,1.8,-2.8,-1.1,0.6
526,Cody Zeller,C,26.0,CHO,49.0,47.0,25.4,3.9,7.0,0.551,...,,2.6,1.2,3.9,.150,,0.2,1.7,2.0,1.2
527,Tyler Zeller,C,29.0,TOT,6.0,1.0,15.5,2.7,5.0,0.533,...,,0.2,0.1,0.3,.167,,-1.0,-1.2,-2.2,0.0
528,Ante Žižić,C,22.0,CLE,59.0,25.0,18.3,3.1,5.6,0.553,...,,1.7,0.3,2.0,.087,,-1.7,-1.4,-3.1,-0.3


In [14]:
merged_2019_stats.dtypes

Player     object
Pos_x      object
Age_x     float64
Tm_x       object
G_x       float64
GS        float64
MP_x      float64
FG         object
FGA        object
FG%       float64
3P         object
3PA        object
3P%       float64
2P         object
2PA        object
2P%        object
eFG%      float64
FT         object
FTA        object
FT%        object
ORB        object
DRB        object
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV        object
PF         object
PTS       float64
Pos_y      object
Age_y     float64
Tm_y       object
G_y       float64
MP_y       object
PER       float64
TS%       float64
3PAr       object
FTr        object
ORB%       object
DRB%       object
TRB%       object
AST%       object
STL%       object
BLK%       object
TOV%       object
USG%      float64
           object
OWS        object
DWS        object
WS        float64
WS/48      object
           object
OBPM       object
DBPM       object
BPM       float64
VORP      

In [16]:
# Save the datasets
merged_2019_stats.to_csv('2018_2019_data.csv')