In [94]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Never truncate wide frames when displaying them (None = no column limit).
pd.options.display.max_columns = None


# Load the player table from the workbook that lives next to the notebook folder.
data = pd.read_excel(io='../data/nba_player_data.xlsx')

In [95]:
# Eyeball five randomly chosen rows (unseeded, so the rows differ on every run).
data.sample(n=5)

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
1983,2014-15,Playoffs,101111,130,Charlie Villanueva,1610612742,DAL,5,43,11,25,0.44,8,19,0.421,0,0,0.0,4,9,13,3,1,1,0,10,30,34,0.0,0.0
2831,2016-17,Regular%20Season,202083,79,Wesley Matthews,1610612742,DAL,73,2495,333,847,0.393,174,479,0.363,146,179,0.816,18,241,259,210,77,15,102,161,986,898,2.06,0.76
4489,2018-19,Regular%20Season,203937,286,Kyle Anderson,1610612763,MEM,43,1282,150,276,0.543,9,34,0.265,37,64,0.578,48,203,251,128,54,37,58,112,346,605,2.21,0.93
6767,2021-22,Regular%20Season,1630631,297,Jose Alvarado,1610612740,NOP,54,834,131,294,0.446,32,110,0.291,36,53,0.679,25,75,100,152,71,7,40,73,330,440,3.8,1.77
6919,2021-22,Regular%20Season,1630792,449,Malcolm Hill,1610612741,CHI,19,212,24,52,0.462,13,36,0.361,11,14,0.786,11,24,35,8,7,3,3,24,72,91,2.67,2.33


In [96]:
# (rows, columns) of the freshly loaded frame, before any cleaning.
data.shape

(8835, 30)

Data cleaning & analysis preparation

In [97]:
# Remove the RANK and EFF columns, which the rest of the notebook does not use.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
data = data.drop(columns=['RANK', 'EFF'], errors='ignore')

In [98]:
# 'Year' is a season label such as '2014-15'; keep its first four characters as an integer.
data['season_start_year'] = data['Year'].str.slice(stop=4).astype(int)

In [99]:
# Collapse both New Orleans abbreviations into the single code 'NO'.
data['TEAM'] = data['TEAM'].replace({'NOP': 'NO', 'NOH': 'NO'})
# Sanity check: there should be 30 unique teams.
data['TEAM'].nunique()

30

In [100]:
# The regular-season label arrives URL-encoded ('%20' for the space); shorten it to 'Regular'.
data['Season_type'] = data['Season_type'].replace({'Regular%20Season': 'Regular'})

In [101]:
# Split the cleaned table by season type: regular season vs. playoffs.
rs_df = data.loc[data['Season_type'].eq('Regular')]
playoffs_df = data.loc[data['Season_type'].eq('Playoffs')]

In [102]:
# List the columns that remain after cleaning (RANK/EFF dropped, season_start_year added).
data.columns

Index(['Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID', 'TEAM', 'GP',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')

In [103]:
# Counting-stat columns that get summed per player-season and then scaled per minute.
total_cols = (
    'MIN FGM FGA FG3M FG3A FTM FTA '
    'OREB DREB REB AST STL BLK TOV PF PTS'
).split()

Which player stats are correlated with each other?

In [104]:
# Season totals per player: one row per (PLAYER, PLAYER_ID, Year), summing every row of
# `data` that shares that key.
season_totals = data.groupby(['PLAYER', 'PLAYER_ID', 'Year'])[total_cols].sum().reset_index()

# Only look at players who have recorded at least 50 min of play time in a season.
# Filtering *before* dividing keeps zero-minute rows from producing inf/NaN rates, and
# .drop() returns a new frame, so the assignments below never write into a slice of
# season_totals (the original in-place drop on a filtered slice can warn about that).
data_per_min = season_totals.loc[season_totals['MIN'] >= 50].drop(columns='PLAYER_ID')

# Turn every counting stat except MIN itself into a per-minute rate. Columns are picked
# by name; a positional slice would silently shift if the groupby keys ever changed.
for col in total_cols:
    if col != 'MIN':
        data_per_min[col] = data_per_min[col] / data_per_min['MIN']

# Shooting and efficiency ratios. Numerator and denominator are both per-minute rates,
# so the minutes cancel and each ratio equals the ratio of the season totals.
data_per_min['FG%'] = data_per_min['FGM'] / data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M'] / data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM'] / data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A'] / data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS'] / data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M'] / data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA'] / data_per_min['FGA']
# NOTE(review): the commonly quoted true-shooting formula weights FTA by 0.44; the 0.475
# used here is kept unchanged -- confirm it is intentional.
data_per_min['TRU%'] = 0.5 * data_per_min['PTS'] / (data_per_min['FGA'] + 0.475 * data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST'] / data_per_min['TOV']

# Correlate the numeric columns only. A zero denominator above yields +/-inf (x/0) rather
# than NaN; treat those as missing so they are skipped instead of leaking into the
# correlation coefficients. data_per_min itself is left untouched.
corr_input = (
    data_per_min
    .select_dtypes(include='number')
    .replace([float('inf'), float('-inf')], float('nan'))
)

fig = px.imshow(
    corr_input.corr(),
    width=640,
    height=640,
    title='Correlation between per-minute player stats (player-seasons with MIN >= 50)',
)
fig.show()

---

Distribution of minutes played

In [108]:
# Distribution of the MIN column over the regular-season rows, as a percentage of rows.
# Passing the frame plus the column name (rather than a bare Series as x=) lets plotly
# label the axis from the column instead of the generic "x".
fig = px.histogram(
    rs_df,
    x='MIN',
    histnorm='percent',
    labels={'MIN': 'Minutes played (MIN)'},
    title='Regular season: distribution of minutes played',
)
fig.show()

In [None]:
# Minutes per game
def hist_data(df=None, min_MIN=0, min_GP=0, col='MIN'):
    """Return a per-game series (``df[col] / df['GP']``) for the qualifying rows.

    Parameters
    ----------
    df : DataFrame, optional
        Frame with at least the columns 'MIN', 'GP' and ``col``. When omitted, the
        global ``rs_df`` is looked up at call time, so a rebuilt ``rs_df`` is picked up
        (a ``df=rs_df`` default would stay frozen at the frame that existed when the
        function was defined).
    min_MIN : number, default 0
        Keep only rows whose 'MIN' value is at least this.
    min_GP : number, default 0
        Keep only rows whose 'GP' value is at least this.
    col : str, default 'MIN'
        Column used as the numerator; the default gives minutes per game.

    Returns
    -------
    Series
        ``col`` divided by 'GP' for the rows that pass both thresholds, indexed like
        the corresponding rows of ``df``.
    """
    if df is None:
        df = rs_df
    # Build the row filter once and reuse it for numerator and denominator.
    eligible = df.loc[(df['MIN'] >= min_MIN) & (df['GP'] >= min_GP)]
    return eligible[col] / eligible['GP']


In [117]:
# Overlay the per-game distributions returned by hist_data for the regular season
# (MIN >= 50, GP >= 5) and the playoffs (MIN >= 5, GP >= 1), using 1-wide bins on 0-46.
fig = go.Figure(
    data=[
        go.Histogram(
            x=hist_data(rs_df, 50, 5),
            histnorm='percent',
            name='RS',
            xbins=dict(start=0, end=46, size=1),
        ),
        go.Histogram(
            x=hist_data(playoffs_df, 5, 1),
            histnorm='percent',
            name='Playoffs',
            xbins=dict(start=0, end=46, size=1),
        ),
    ]
)

# Draw both histograms on the same bars' positions, half-transparent so each stays visible.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

In [None]:
# Fraction of qualifying regular-season rows (MIN >= 50, GP >= 5) whose per-game value,
# as returned by whichever hist_data is currently defined, lies in [12, 34] inclusive.
rs_per_game = hist_data(rs_df, 50, 5)
rs_per_game.between(12, 34).mean()

np.float64(0.7418734708144006)

In [None]:
# Points per game
def hist_data(df=None, min_MIN=0, min_GP=0, col='PTS'):
    """Return a per-game series (``df[col] / df['GP']``) for the qualifying rows.

    Note: defining this rebinds the name ``hist_data``, so any function previously
    bound to that name in the kernel is replaced from this point on.

    Parameters
    ----------
    df : DataFrame, optional
        Frame with at least the columns 'MIN', 'GP' and ``col``. When omitted, the
        global ``rs_df`` is looked up at call time, so a rebuilt ``rs_df`` is picked up
        (a ``df=rs_df`` default would stay frozen at the frame that existed when the
        function was defined).
    min_MIN : number, default 0
        Keep only rows whose 'MIN' value is at least this.
    min_GP : number, default 0
        Keep only rows whose 'GP' value is at least this.
    col : str, default 'PTS'
        Column used as the numerator; the default gives points per game.

    Returns
    -------
    Series
        ``col`` divided by 'GP' for the rows that pass both thresholds, indexed like
        the corresponding rows of ``df``.
    """
    if df is None:
        df = rs_df
    # Build the row filter once and reuse it for numerator and denominator.
    eligible = df.loc[(df['MIN'] >= min_MIN) & (df['GP'] >= min_GP)]
    return eligible[col] / eligible['GP']


# Overlay the per-game distributions returned by hist_data for the regular season
# (MIN >= 50, GP >= 5) and the playoffs (MIN >= 5, GP >= 1), using 1-wide bins on 0-38.
fig = go.Figure()
for trace_name, frame, min_minutes, min_games in [
    ('RS', rs_df, 50, 5),
    ('Playoffs', playoffs_df, 5, 1),
]:
    per_game = hist_data(frame, min_minutes, min_games)
    fig.add_trace(
        go.Histogram(
            x=per_game,
            histnorm='percent',
            name=trace_name,
            xbins={'start': 0, 'end': 38, 'size': 1},
        )
    )

# Same bar positions for both traces, half-transparent so each stays visible.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

---