In [3]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Load the player statistics table; the path is relative to the notebook.
data = pd.read_csv(filepath_or_buffer='NBA_player_data.csv')

In [4]:
# Preview ten random rows. A fixed random_state makes the preview identical
# on every Restart & Run All instead of changing with each execution.
data.sample(10, random_state=42)

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
5838,2020-21,Regular%20Season,1627826,147,Ivica Zubac,LAC,72,1609,257,394,0.652,1,4,0.25,135,171,0.789,189,330,519,90,24,62,81,187,650,1091,1.11,0.3
3908,2017-18,Regular%20Season,1628504,453,Xavier Rathan-Mayes,MEM,5,118,12,42,0.286,1,14,0.071,4,9,0.444,0,5,5,18,6,3,11,18,29,15,1.64,0.55
3366,2016-17,Playoffs,203477,128,Isaiah Canaan,CHI,3,95,13,26,0.5,5,14,0.357,4,6,0.667,0,4,4,4,3,0,5,6,35,26,0.8,0.6
1245,2013-14,Playoffs,202710,86,Jimmy Butler,CHI,5,218,22,57,0.386,6,20,0.3,18,23,0.783,6,20,26,11,6,0,3,13,68,68,3.67,2.0
4619,2018-19,Regular%20Season,1629057,415,Robert Williams III,BOS,32,283,36,51,0.706,0,0,0.0,9,15,0.6,27,54,81,7,9,40,10,36,81,187,0.7,0.9
2489,2015-16,Regular%20Season,1627362,428,Briante Weber,MIA,7,169,14,39,0.359,0,5,0.0,3,4,0.75,8,17,25,21,9,3,9,12,31,54,2.33,1.0
5871,2020-21,Regular%20Season,1630191,180,Isaiah Stewart,DET,68,1455,226,409,0.553,21,63,0.333,64,92,0.696,159,294,453,59,39,86,67,184,537,896,0.88,0.58
615,2012-13,Playoffs,201947,148,Earl Clark,LAL,4,82,7,19,0.368,0,4,0.0,0,0,0.0,2,10,12,1,1,1,6,4,14,11,0.17,0.17
3587,2017-18,Regular%20Season,1628381,134,John Collins,ATL,74,1785,314,545,0.576,16,47,0.34,133,186,0.715,176,365,541,98,47,80,105,215,777,1154,0.93,0.45
5074,2019-20,Regular%20Season,1629634,129,Brandon Clarke,MEM,58,1300,296,479,0.618,23,64,0.359,85,112,0.759,92,253,345,81,32,48,55,100,700,941,1.47,0.58


In [5]:
# (number of rows, number of columns) of the loaded table.
data.shape

(7293, 29)

# DATA CLEANING AND ANALYSIS

In [6]:
# Remove the RANK and EFF columns. Rebinding `data` (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: running it a
# second time no longer raises KeyError because the columns are already gone.
data = data.drop(columns=['RANK','EFF'], errors='ignore')


In [7]:
# 'Year' holds season strings such as '2020-21'; keep the first four
# characters and store them as an integer start year.
data['season_start_year'] = data['Year'].str.slice(0, 4).astype(int)

In [8]:
# Merge the two New Orleans team codes into a single 'NO' code.
# Assign the result back to the column: calling replace(..., inplace=True) on
# `data['TEAM']` mutates an intermediate object, which emits a chained-assignment
# warning in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['TEAM'] = data['TEAM'].replace(to_replace=['NOP','NOH'], value='NO')

In [9]:
# Shorten the URL-encoded 'Regular%20Season' label to 'RS'.
# Assign the result back to the column: calling replace(..., inplace=True) on
# `data['Season_type']` mutates an intermediate object, which emits a
# chained-assignment warning in pandas 2.x and leaves `data` unchanged under
# Copy-on-Write.
data['Season_type'] = data['Season_type'].replace('Regular%20Season', 'RS')

In [10]:
# Split the table by season type using explicit boolean masks.
is_regular_season = data['Season_type'].eq('RS')
is_playoffs = data['Season_type'].eq('Playoffs')

rs_df = data.loc[is_regular_season]
playoffs_df = data.loc[is_playoffs]

In [11]:
# Counting-stat columns that are summed in the aggregations below.
total_cols = [
    'MIN',
    'FGM', 'FGA',
    'FG3M', 'FG3A',
    'FTM', 'FTA',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV',
    'PTS',
]

# Which player stats are correlated with each other?

In [12]:
# Sum each player's counting stats per season (all season types in `data`
# combined), keep player-seasons with at least 50 total minutes, and drop the
# numeric PLAYER_ID so it does not take part in the correlation.
# Chaining returns a fresh frame, so no in-place drop on a filtered slice is needed.
data_per_min = (
    data.groupby(['PLAYER','PLAYER_ID','Year'])[total_cols].sum()
    .reset_index()
    .query('MIN >= 50')
    .drop(columns='PLAYER_ID')
)

# Convert every counting stat except MIN itself into a per-minute rate.
rate_cols = [col for col in total_cols if col != 'MIN']
data_per_min[rate_cols] = data_per_min[rate_cols].div(data_per_min['MIN'], axis=0)

# Shooting and efficiency ratios. Numerator and denominator are both
# per-minute rates, so the minutes cancel out.
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
# NOTE(review): free-throw weight 0.475 kept as written; the commonly cited
# true-shooting formula uses 0.44 -- confirm which one is intended.
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']

# A zero denominator (e.g. no field-goal attempts) yields +/-inf, which would
# turn that column's correlations into NaN; treat such ratios as missing.
data_per_min = data_per_min.replace([float('inf'), float('-inf')], float('nan'))

# PLAYER and Year are strings: restrict the correlation to numeric columns.
# Without numeric_only=True pandas 1.5 emits a FutureWarning and pandas >= 2.0
# raises when it meets the non-numeric columns.
fig = px.imshow(data_per_min.corr(numeric_only=True))
fig.show()

  fig = px.imshow(data_per_min.corr())


# How are minutes played distributed?

In [13]:
# Percent-normalised histogram of the MIN column for playoff rows.
playoff_minutes = playoffs_df['MIN']
fig = px.histogram(x=playoff_minutes, histnorm='percent')
fig.show()

We can see that a large number of players get very little playing time, while a very small percentage play almost the whole game.

In [14]:
def hist_data(df=rs_df, min_MIN=0, min_GP=0):
    """Return MIN divided by GP for the rows of ``df`` that meet both minimums.

    A row is kept when its ``MIN`` is at least ``min_MIN`` and its ``GP`` is
    at least ``min_GP``. The result is a Series aligned to the kept rows.
    """
    # Build the row filter once and reuse it for both columns.
    qualifies = (df['MIN'] >= min_MIN) & (df['GP'] >= min_GP)
    minutes = df.loc[qualifies, 'MIN']
    games = df.loc[qualifies, 'GP']
    return minutes / games

In [15]:
# Overlay the minutes-per-game distributions of the regular season and the
# playoffs, each with its own minimum-minutes / minimum-games thresholds.
fig = go.Figure()
for label, minutes_per_game in (('RS', hist_data(rs_df,50,5)),
                                ('Playoffs', hist_data(playoffs_df,5,1))):
    fig.add_trace(go.Histogram(x=minutes_per_game, histnorm='percent', name=label,
                               xbins=dict(start=0, end=46, size=1)))
# Draw the bars on top of each other, half transparent so both stay visible.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

## Share of players in the middle minutes band: playoffs vs. regular season
We can see that player rotations tighten when the playoffs begin: the more productive players get more minutes, while the less productive players see fewer.

In [16]:
# Share of qualifying playoff rows whose minutes per game fall within 12-34
# (both ends inclusive).
hist_data(playoffs_df,5,1).between(12, 34).mean()

0.4944038929440389

In [17]:
# Share of qualifying regular-season rows whose minutes per game fall within
# 12-34 (both ends inclusive).
hist_data(rs_df,50,5).between(12, 34).mean()

0.7495223943960942

# How has the game changed over the past 10 Years?

In [18]:
# Season-level totals of every counting stat, one row per season start year.
change_df = data.groupby('season_start_year')[total_cols].sum().reset_index()

# Possession estimate, placed directly after MIN (column position 2).
change_df.insert(
    2, 'POSS_est',
    change_df['FGA']-change_df['OREB']+change_df['TOV']+0.44*change_df['FTA'],
)

# Ratio columns derived from the season totals, appended in this order.
change_df = change_df.assign(**{
    'FG%': lambda t: t['FGM']/t['FGA'],
    '3PT%': lambda t: t['FG3M']/t['FG3A'],
    'FT%': lambda t: t['FTM']/t['FTA'],
    'AST%': lambda t: t['AST']/t['FGM'],
    'FG3A%': lambda t: t['FG3A']/t['FGA'],
    'PTS/FGA': lambda t: t['PTS']/t['FGA'],
    'FG3M/FGM': lambda t: t['FG3M']/t['FGM'],
    'FTA/FGA': lambda t: t['FTA']/t['FGA'],
    'TRU%': lambda t: 0.5*t['PTS']/(t['FGA']+0.475*t['FTA']),
    'AST_TOV': lambda t: t['AST']/t['TOV'],
})

change_df

Unnamed: 0,season_start_year,MIN,POSS_est,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,FG%,3PT%,FT%,AST%,FG3A%,PTS/FGA,FG3M/FGM,FTA/FGA,TRU%,AST_TOV
0,2012,635884,248201.92,97235,215105,18808,52569,44125,58618,29237,81362,110599,57694,20376,13444,36542,257403,0.452035,0.357777,0.752755,0.593346,0.244388,1.196639,0.193428,0.272509,0.529748,1.578841
1,2013,638373,254032.8,99251,218411,20480,56952,47219,62420,28669,83812,112481,57657,20156,12369,36826,266201,0.454423,0.359601,0.756472,0.580921,0.260756,1.218808,0.206346,0.285791,0.536565,1.56566
2,2014,634546,253004.12,98251,219265,20724,59276,45098,60248,28566,85231,113797,57727,20261,12665,35796,262324,0.448092,0.349619,0.748539,0.587546,0.27034,1.196379,0.210929,0.274773,0.529129,1.612666
3,2015,636391,258064.8,100351,222344,22524,63673,46516,61520,27426,87611,115037,58251,20562,13046,36078,269742,0.451332,0.353745,0.756112,0.580473,0.286372,1.213174,0.224452,0.276688,0.536126,1.614585
4,2016,632482,258443.8,102147,223333,25408,71018,46806,60620,26470,87173,113643,59162,20143,12409,34908,276508,0.457375,0.357768,0.772121,0.579185,0.317992,1.238097,0.24874,0.271433,0.54835,1.694798
5,2017,633425,260904.52,103729,225523,27530,76245,43721,57008,25397,88678,114075,60739,20181,12636,35695,278709,0.459949,0.361073,0.766927,0.585555,0.338081,1.235834,0.265403,0.252781,0.551677,1.701611
6,2018,634231,268739.84,107374,233717,29817,84143,46671,60811,27128,91360,118488,64257,19940,12984,35394,291236,0.459419,0.354361,0.767476,0.598441,0.360021,1.246105,0.277693,0.260191,0.554519,1.815477
7,2019,552262,234384.64,92997,202223,28032,78279,40949,52906,22802,79318,102120,55445,17368,11085,31685,254975,0.459874,0.358104,0.773995,0.596202,0.387092,1.260861,0.301429,0.261622,0.560746,1.749882
8,2020,562518,235759.48,95849,205754,29549,80653,39624,50917,22918,80151,103069,57311,17491,11272,30520,260871,0.465843,0.366372,0.778208,0.59793,0.391988,1.267878,0.308287,0.247465,0.56726,1.877818
9,2021,635572,264004.96,106569,231293,32733,92552,44740,57709,27052,89602,116654,64618,20006,12387,34372,290611,0.460753,0.353671,0.775269,0.606349,0.40015,1.256463,0.307153,0.249506,0.561665,1.87996


# Per-48-minute stats

In [19]:
# Rescale the season totals to a rate per 48 minutes.
change_per48_df = change_df.copy()

# Only the counting columns (POSS_est through PTS) are rescaled. They are
# selected by label: the previous positional slice [2:18] also caught 'FG%'
# (position 17), so a ratio was wrongly divided by minutes.
per48_cols = change_per48_df.loc[:, 'POSS_est':'PTS'].columns
# MIN is a sum of individual player minutes; the factor 5 presumably accounts
# for five players sharing the floor, giving a per-team, per-48-minute rate.
change_per48_df[per48_cols] = change_per48_df[per48_cols].div(change_per48_df['MIN'], axis=0)*48*5

change_per48_df = change_per48_df.drop(columns='MIN')

# One line per statistic across seasons.
fig = go.Figure()
for col in change_per48_df.columns[1:]:
    fig.add_trace(go.Scatter(x=change_per48_df['season_start_year'],
                             y=change_per48_df[col], name=col))
fig.update_layout(title='Per-48-minute stats by season',
                  xaxis_title='Season start year')
fig.show()

# Per-possession stats (per 100 possessions)

In [20]:
# Rescale the season totals to a rate per 100 estimated possessions.
change_per100_df = change_df.copy()

# Only the counting columns (FGM through PTS) are rescaled. They are selected
# by label: the previous positional slice [3:18] also caught 'FG%'
# (position 17), so a ratio was wrongly divided by possessions.
per100_cols = change_per100_df.loc[:, 'FGM':'PTS'].columns
change_per100_df[per100_cols] = change_per100_df[per100_cols].div(change_per100_df['POSS_est'], axis=0)*100

change_per100_df = change_per100_df.drop(columns=['MIN','POSS_est'])

# One line per statistic across seasons.
fig = go.Figure()
for col in change_per100_df.columns[1:]:
    fig.add_trace(go.Scatter(x=change_per100_df['season_start_year'],
                             y=change_per100_df[col], name=col))
fig.update_layout(title='Per-100-possession stats by season',
                  xaxis_title='Season start year')
fig.show()

In [21]:
import plotly.express as px

# Free-throw percentage against total minutes played, one point per row of `data`.
scatter_plot = px.scatter(data, x='MIN', y='FT_PCT', hover_data=['PLAYER', 'TEAM'],
                          title='Scatter Plot of Free Throw Percentage vs Play Time',
                          labels={'MIN': 'Minutes Played', 'FT_PCT': 'Free Throw Percentage (%)'})

# FT_PCT values are fractions (e.g. 0.789 in the sampled rows), while the axis
# label promises percent; render the ticks as percentages so both agree.
scatter_plot.update_yaxes(tickformat='.0%')

# Display the plot
scatter_plot.show()
