In [137]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from bs4 import BeautifulSoup
import urllib.request
import numpy as np

In [85]:
def clean_bkref_dataframe(df):
    """ Cleans dataframes we get from importing bkref CSV files.
    """
    df = df.drop(columns=['Unnamed: 29'])
    df = df.drop(columns=['Rk'])
    df = df.drop_duplicates(subset=['Player'])
    df['Year'] = 2019
    df[['Player', 'player_id']] = df['Player'].str.split('\\', expand=True)
    df = df.reset_index(drop=True)
    return df

In [86]:
team_df = pd.read_csv("data/nba-api-team-stats.csv")
team_df = team_df.drop(columns=['Unnamed: 0'])
team_df = team_df[team_df.YEAR >= 1980]
team_df.columns = ['Tm', 'Year', 'GP', 'WINS', 'LOSSES', 'WIN_PCT', 'CONF_RANK',
       'DIV_RANK', 'PO_WINS', 'PO_LOSSES', 'CONF_COUNT', 'DIV_COUNT',
       'NBA_FINALS_APPEARANCE', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'PF',
       'STL', 'TOV', 'BLK', 'PTS', 'PTS_RANK']
team_df.to_json("data/team-stats.json")

In [87]:
player_df = pd.read_json("data/1980-2018-per100-labeled.json")

In [88]:
player_team_combined = pd.merge(player_df, team_df, on=['Tm','Year'])

In [89]:
player_team_combined.to_json("data/combined-player-team-stats.json")

In [90]:
player_df19 = pd.read_json("data/2019-per-100.json")

In [91]:
combined_19 = pd.merge(player_df19, team_df, on=['Tm','Year'])

In [92]:
combined_19.to_json("data/2019-combined.json")

Using BeautifulSoup to parse basketball-reference.com per possession stats

In [109]:
urllib.request.urlretrieve("https://www.basketball-reference.com/leagues/NBA_2019_per_poss.html", filename="data/2019_per_poss.html")
with open("data/2019_per_poss.html") as html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')

In [110]:
table = soup.find("table")
table_body = table.find('tbody')
rows = table_body.find_all('tr')

In [129]:
bkref_table_data = []
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    bkref_table_data.append([ele for ele in cols]) 

In [130]:
scraped_player_df = pd.DataFrame(bkref_table_data)

In [131]:
table_head = table.find('thead')
headers = table_head.find_all('th')
bkref_header_data = []
cols = table_head.find_all('th')
cols = [ele.text.strip() for ele in cols]
bkref_header_data.append([ele for ele in cols])
bkref_header_data = bkref_header_data[0]
bkref_header_data.remove('Rk')
scraped_player_df.columns=bkref_header_data

In [132]:
scraped_player_df['Year'] = 2019

In [139]:
scraped_player_df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Unnamed: 29,ORtg,DRtg,Year
0,Alex Abrines,SG,25,OKC,29,2,573,4.5,12.5,.359,3.3,10.1,.323,1.2,2.4,.517,1.0,1.1,.923,0.4,3.4,3.8,1.6,1.4,0.5,1.1,4.2,13.2,,104,110,2019
1,Quincy Acy,PF,28,PHO,7,0,68,1.4,7.1,.200,1.4,7.1,.200,0.0,0.0,,3.5,4.3,.833,1.4,3.5,5.0,3.5,0.7,2.8,0.7,10.6,7.8,,107,116,2019
2,Jaylen Adams,PG,22,ATL,9,0,43,3.2,10.7,.300,2.1,6.4,.333,1.1,4.3,.250,0.0,0.0,,0.0,3.2,3.2,6.4,3.2,1.1,1.1,4.3,8.6,,99,111,2019
3,Steven Adams,C,25,OKC,45,45,1536,8.8,14.4,.609,0.0,0.0,.000,8.8,14.4,.610,3.4,6.2,.557,6.4,7.4,13.8,2.3,2.0,1.1,2.2,3.6,21.0,,125,105,2019
4,Bam Adebayo,C,21,MIA,45,6,994,6.6,11.9,.554,0.0,0.4,.125,6.6,11.5,.568,4.4,6.1,.732,4.6,10.3,14.8,4.7,1.6,2.0,3.1,5.6,17.7,,119,105,2019
5,Deng Adel,SF,21,CLE,2,0,11,4.6,4.6,1.000,4.6,4.6,1.000,0.0,0.0,,0.0,0.0,,4.6,9.1,13.7,0.0,0.0,0.0,0.0,4.6,13.7,,271,121,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,22,6.8,22.6,.300,0.0,9.1,.000,6.8,13.6,.500,2.3,4.5,.500,2.3,6.8,9.1,13.6,4.5,0.0,4.5,9.1,15.8,,84,104,2019
7,LaMarcus Aldridge,C,33,SAS,48,48,1563,12.4,24.2,.511,0.1,0.5,.188,12.3,23.7,.518,6.5,7.7,.837,4.4,8.7,13.0,3.7,0.8,1.8,2.9,3.3,31.3,,115,110,2019
8,Rawle Alkins,SG,21,CHI,2,0,4,12.2,48.9,.250,12.2,12.2,1.000,0.0,36.7,.000,0.0,0.0,,36.7,0.0,36.7,24.5,12.2,0.0,0.0,0.0,36.7,,126,94,2019
9,Grayson Allen,SG,23,UTA,21,1,237,5.9,18.3,.322,3.3,11.6,.281,2.6,6.7,.394,4.7,6.1,.767,0.2,1.2,1.4,3.0,0.4,0.4,2.8,5.5,19.7,,91,112,2019


In [140]:
scraped_player_df.to_json("data/scraped-player-data-2019.json")

In [141]:
scraped_player_df = pd.read_json("data/scraped-player-data-2019.json")

In [144]:
scraped_player_df.replace('', np.nan)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Unnamed: 29,ORtg,DRtg,Year
0,Alex Abrines,SG,25.0,OKC,29.0,2.0,573.0,4.5,12.5,.359,3.3,10.1,.323,1.2,2.4,.517,1.0,1.1,.923,0.4,3.4,3.8,1.6,1.4,0.5,1.1,4.2,13.2,,104,110.0,2019
1,Quincy Acy,PF,28.0,PHO,7.0,0.0,68.0,1.4,7.1,.200,1.4,7.1,.200,0.0,0.0,,3.5,4.3,.833,1.4,3.5,5.0,3.5,0.7,2.8,0.7,10.6,7.8,,107,116.0,2019
10,Jarrett Allen,C,20.0,BRK,46.0,46.0,1236.0,8.0,13.9,.576,0.2,1.1,.138,7.9,12.8,.615,5.0,6.7,.741,4.8,11.1,15.9,2.6,1.0,2.9,2.5,4.2,21.2,,124,107.0,2019
100,Michael Carter-Williams,PG,27.0,HOU,16.0,1.0,145.0,8.6,20.9,.410,2.4,6.5,.368,6.2,14.4,.429,4.1,8.9,.462,1.0,3.4,4.5,7.2,3.1,2.1,3.4,9.3,23.6,,99,111.0,2019
101,Alex Caruso,SG,24.0,LAL,3.0,0.0,17.0,5.5,8.2,.667,2.7,2.7,1.000,2.7,5.5,.500,2.7,11.0,.250,2.7,0.0,2.7,2.7,0.0,5.5,0.0,8.2,16.5,,133,111.0,2019
102,Omri Casspi,SF,30.0,MEM,34.0,0.0,483.0,7.9,15.3,.521,1.6,4.5,.349,6.4,10.8,.592,3.9,5.9,.661,1.6,9.5,11.1,2.7,1.6,0.9,2.2,3.4,21.3,,111,108.0,2019
103,Willie Cauley-Stein,C,25.0,SAC,47.0,47.0,1339.0,9.5,17.9,.532,0.0,0.0,1.000,9.5,17.8,.531,2.9,5.3,.536,4.0,10.4,14.5,4.4,1.9,1.0,2.1,4.9,21.9,,113,109.0,2019
104,Troy Caupain,PG,23.0,ORL,2.0,0.0,9.0,11.0,22.0,.500,5.5,11.0,.500,5.5,11.0,.500,0.0,0.0,,0.0,11.0,11.0,16.5,5.5,0.0,0.0,0.0,27.4,,147,101.0,2019
105,Tyler Cavanaugh,PF,24.0,UTA,4.0,0.0,9.0,5.4,21.4,.250,0.0,10.7,.000,5.4,10.7,.500,0.0,0.0,,5.4,5.4,10.7,0.0,0.0,0.0,0.0,0.0,10.7,,70,112.0,2019
106,Tyson Chandler,C,36.0,TOT,43.0,6.0,758.0,3.6,5.9,.611,0.0,0.1,.000,3.6,5.8,.617,2.8,4.6,.613,5.3,11.8,17.1,2.0,1.2,1.1,2.1,5.9,10.0,,123,107.0,2019


In [145]:
scraped_player_df = scraped_player_df.loc[scraped_player_df.MP > 300]
scraped_player_df = scraped_player_df.fillna(0)

In [146]:
combined_19 = pd.merge(scraped_player_df, team_df, on=['Tm','Year'])
combined_19.to_json("data/2019-combined.json")

In [108]:
scraped_player_df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,Year
0,Alex Abrines,SG,25.0,OKC,29.0,2.0,573.0,4.5,12.5,0.359,3.3,10.1,0.323,1.2,2.4,0.517,1.0,1.1,0.923,0.4,3.4,3.8,1.6,1.4,0.5,1.1,4.2,13.2,104.0,110.0,2019
10,Jarrett Allen,C,20.0,BRK,46.0,46.0,1236.0,8.0,13.9,0.576,0.2,1.1,0.138,7.9,12.8,0.615,5.0,6.7,0.741,4.8,11.1,15.9,2.6,1.0,2.9,2.5,4.2,21.2,124.0,107.0,2019
102,Omri Casspi,SF,30.0,MEM,34.0,0.0,483.0,7.9,15.3,0.521,1.6,4.5,0.349,6.4,10.8,0.592,3.9,5.9,0.661,1.6,9.5,11.1,2.7,1.6,0.9,2.2,3.4,21.3,111.0,108.0,2019
103,Willie Cauley-Stein,C,25.0,SAC,47.0,47.0,1339.0,9.5,17.9,0.532,0.0,0.0,1.0,9.5,17.8,0.531,2.9,5.3,0.536,4.0,10.4,14.5,4.4,1.9,1.0,2.1,4.9,21.9,113.0,109.0,2019
106,Tyson Chandler,C,36.0,TOT,43.0,6.0,758.0,3.6,5.9,0.611,0.0,0.1,0.0,3.6,5.8,0.617,2.8,4.6,0.613,5.3,11.8,17.1,2.0,1.2,1.1,2.1,5.9,10.0,123.0,107.0,2019
108,Tyson Chandler,C,36.0,LAL,36.0,6.0,669.0,3.5,5.8,0.602,0.0,0.1,0.0,3.5,5.7,0.61,2.5,4.0,0.632,5.4,11.1,16.5,1.9,1.2,1.2,1.9,5.2,9.5,126.0,106.0,2019
109,Wilson Chandler,PF,31.0,PHI,33.0,29.0,865.0,4.5,10.3,0.441,2.1,5.5,0.386,2.4,4.8,0.506,0.7,1.0,0.722,2.2,6.5,8.7,3.6,0.9,0.9,1.8,4.9,11.9,111.0,111.0,2019
11,Al-Farouq Aminu,PF,28.0,POR,48.0,48.0,1439.0,5.3,12.2,0.432,2.2,6.2,0.355,3.1,6.0,0.511,2.9,3.5,0.835,2.6,10.6,13.2,1.8,1.6,0.6,1.3,3.3,15.6,119.0,108.0,2019
111,Gary Clark,PF,24.0,HOU,33.0,1.0,513.0,3.4,10.5,0.324,2.8,9.5,0.296,0.6,1.0,0.6,0.6,0.6,1.0,1.7,7.3,9.0,1.4,1.4,2.1,0.4,3.3,10.2,109.0,112.0,2019
112,Ian Clark,SG,27.0,NOP,31.0,0.0,408.0,5.0,13.7,0.361,2.1,6.7,0.31,2.9,7.0,0.41,1.3,1.4,0.917,0.6,4.4,5.0,4.5,0.8,0.5,2.7,5.0,13.3,93.0,117.0,2019


In [121]:
test_cols = rows[1].find_all('td')