# What Makes an NBA Champion?

#### Daniel Abboudi and Sean Campi
##### Data Bootcamp, NYU Stern 4/25/2021

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from sklearn.manifold import TSNE
import seaborn as sns
import plotly.express as px

Data is from [NBA.com](https://www.nba.com/stats/teams/)
<br>
Includes Advanced Stats, Traditional Stats (standardized over 100 possessions), and Opponent Stats (standardized over 100 possessions)
<br>
<br>
*Note: Unfortunately, NBA.com does not allow for scraping or downloading of their data. We copy and pasted the relevant data into three excel files and uploaded them to the project's github repository*

In [2]:
# Import the data sets
adv = pd.read_excel('https://github.com/danielabboudi/DB_Project/raw/main/NBA_Advanced.xlsx')
trad = pd.read_excel('https://github.com/danielabboudi/DB_Project/raw/main/NBA_Traditional_per100.xlsx')
opp = pd.read_excel('https://github.com/danielabboudi/DB_Project/raw/main/NBA_Opponent_per100.xlsx')

In [20]:
# Merge the data sets
merge1 = pd.merge(trad,adv,how='left',left_on=['Season','TEAM'],right_on=['Season','TEAM'])
df = pd.merge(merge1,opp,how='left',left_on=['Season','TEAM'],right_on=['Season','TEAM'])

In [23]:
# Convert percentages into decimals
df['FG%'] = df['FG%']/100
df['3P%'] = df['3P%']/100
df['FT%'] = df['FT%']/100
df['OREB%'] = df['OREB%']/100
df['DREB%'] = df['DREB%']/100
df['TOV%'] = df['TOV%']/100
df['TS%'] = df['TS%']/100

df['OPPFG%'] = df['OPPFG%']/100
df['OPP3P%'] = df['OPP3P%']/100
df['OPPFT%'] = df['OPPFT%']/100

In [8]:
df.columns

Index(['Season', 'TEAM', 'GP_x', 'W_x', 'L_x', 'WIN%', 'MIN_x', 'PTS', 'FGM',
       'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', '+/-',
       'Unnamed: 28', 'GP_y', 'W_y', 'L_y', 'MIN_y', 'OFFRTG', 'DEFRTG',
       'NETRTG', 'AST%', 'AST/TO', 'AST RATIO', 'OREB%', 'DREB%', 'REB%',
       'TOV%', 'EFG%', 'TS%', 'PACE', 'PIE', 'POSS', 'GP', 'W', 'L', 'MIN',
       'OPPFGM', 'OPPFGA', 'OPPFG%', 'OPP3PM', 'OPP3PA', 'OPP3P%', 'OPPFTM',
       'OPPFTA', 'OPPFT%', 'OPPOREB', 'OPPDREB', 'OPPREB', 'OPPAST', 'OPPTOV',
       'OPPSTL', 'OPPBLK', 'OPPBLKA', 'OPPPF', 'OPPPFD', 'OPPPTS', 'OPP +/-'],
      dtype='object')

Statistician Dean Oliver famously determined that there are ["four factors"](https://www.basketball-reference.com/about/factors.html) that contribute to winning NBA games.
   1. Shooting - eFG% or TS% (shooting efficiency, weighting 3-pointers higher than 2-pointers)
   2. Turnovers - TOV% (turnovers per total plays in a game)
   3. Rebounding - OREB% and DREB% (rebounds per available total rebounds in a game)
   4. Free Throws - FTR (free throw rate per field goals attempted)
<br>

We may want to expand some of these metrics to get a better comparison between teams. For example, two teams can have identical TS%, but with one team making more of their 3-point attempts and the other highly efficient on 2-pointers at the rim and making a lot of free throws. Statistically, those two teams might look similar based on TS%, but they are very different stylistically.

In [24]:
# Calculate new columns for additional metrics
df['2P%'] = round((df['FGM']-df['3PM'])/(df['FGA']-df['3PA']),4)                 # Expanding shooting
df['FTR'] = round(df['FTA']/df['FGA'],4)
df['3PFREQ'] = round(df['3PA']/df['FGA'],4)                                      # Expanding shooting
df['FGAFREQ'] = round(df['FGA']/(df['FGA']+df['TOV']+0.44*df['FTA']),4)          # Expanding shooting
df['STL%'] = round(df['STL']/df['OPPTOV'],4)                                     # Expanding turnovers

df['OPPTS%'] = round(df['OPPPTS']/(2*(df['OPPFGA']+0.44*df['OPPFTA'])),4)
df['OPP2P%'] = round((df['OPPFGM']-df['OPP3PM'])/(df['OPPFGA']-df['OPP3PA']),4)
df['OPPFTR'] = round(df['OPPFTA']/df['OPPFGA'],4)
df['OPP3PFREQ'] = round(df['OPP3PA']/df['OPPFGA'],4)
df['OPPFGAFREQ'] = round(df['OPPFGA']/(df['OPPFGA']+df['OPPTOV']+0.44*df['OPPFTA']),4)
df['OPPFTAFREQ'] = round((0.44*df['OPPFTA'])/(df['OPPFGA']+df['OPPTOV']+0.44*df['OPPFTA']),4)
df['OPPTOV%'] = round(df['OPPTOV']/(df['OPPFGA']+df['OPPTOV']+0.44*df['OPPFTA']),4)
df['OPPSTL%'] = round(df['OPPSTL']/df['TOV'],4)
df['OPPOREB%'] = 1-df['DREB%']
df['OPPDREB%'] = 1-df['OREB%']
df['OPPAST/TO'] = df['OPPAST']/df['OPPTOV']

In [52]:
df.columns

Index(['Season', 'TEAM', 'GP_x', 'W_x', 'L_x', 'WIN%', 'MIN_x', 'PTS', 'FGM',
       'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB',
       'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', '+/-',
       'Unnamed: 28', 'GP_y', 'W_y', 'L_y', 'MIN_y', 'OFFRTG', 'DEFRTG',
       'NETRTG', 'AST%', 'AST/TO', 'AST RATIO', 'OREB%', 'DREB%', 'REB%',
       'TOV%', 'EFG%', 'TS%', 'PACE', 'PIE', 'POSS', 'GP', 'W', 'L', 'MIN',
       'OPPFGM', 'OPPFGA', 'OPPFG%', 'OPP3PM', 'OPP3PA', 'OPP3P%', 'OPPFTM',
       'OPPFTA', 'OPPFT%', 'OPPOREB', 'OPPDREB', 'OPPREB', 'OPPAST', 'OPPTOV',
       'OPPSTL', 'OPPBLK', 'OPPBLKA', 'OPPPF', 'OPPPFD', 'OPPPTS', 'OPP +/-',
       '2P%', 'FTR', '3PFREQ', 'FGAFREQ', 'FTAFREQ', 'STL%', 'OPPTS%',
       'OPP2P%', 'OPPFTR', 'OPP3PFREQ', 'OPPFGAFREQ', 'OPPFTAFREQ', 'OPPTOV%',
       'OPPSTL%', 'OPPOREB%', 'OPPDREB%', 'OPPAST/TO'],
      dtype='object')

In [96]:
# Select the columns we want based on the four factors
general = ['Season','TEAM','WIN%','OFFRTG','DEFRTG','NETRTG','PACE']
shooting = ['TS%','2P%','3P%','FGAFREQ','3PFREQ',]
turnovers = ['TOV%','AST/TO','STL%']
rebounding = ['OREB%','DREB%']
free_throws = ['FT%','FTR']

opp_shooting = ['OPPTS%','OPP2P%','OPP3P%','OPPFGAFREQ','OPP3PFREQ',]
opp_turnovers = ['OPPTOV%','OPPAST/TO','OPPSTL%']
opp_rebounding = ['OPPOREB%','OPPDREB%']
opp_free_throws = ['OPPFT%','OPPFTR']

df = df[general+shooting+turnovers+rebounding+free_throws+opp_shooting+opp_turnovers+opp_rebounding+opp_free_throws]

In [97]:
# Create a dictionary of NBA Champions
champions = {2001: 'Los Angeles Lakers',
             2002: 'Los Angeles Lakers',
             2003: 'San Antonio Spurs',
             2004: 'Detroit Pistons',
             2005: 'San Antonio Spurs',
             2006: 'Miami Heat',
             2007: 'San Antonio Spurs',
             2008: 'Boston Celtics',
             2009: 'Los Angeles Lakers',
             2010: 'Los Angeles Lakers',
             2011: 'Dallas Mavericks',
             2012: 'Miami Heat',
             2013: 'Miami Heat',
             2014: 'San Antonio Spurs',
             2015: 'Golden State Warriors',
             2016: 'Cleveland Cavaliers',
             2017: 'Golden State Warriors',
             2018: 'Golden State Warriors',
             2019: 'Toronto Raptors',
             2020: 'Los Angeles Lakers'}

In [98]:
# Pull Champions from our Dictionary
df['Champion'] = 0
for i in range(0,len(df),1):
    if df['Season'][i] == 2021:
        df['Champion'][i] = 0 
    elif df['TEAM'][i] == champions[df['Season'][i]]:
        df['Champion'][i] = 1
    else:
        df['Champion'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Champion'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Champion'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Champion'][i] = 0


In [99]:
df[df['Champion']==1][['Season','TEAM']]

Unnamed: 0,Season,TEAM
1,2001,Los Angeles Lakers
30,2002,Los Angeles Lakers
59,2003,San Antonio Spurs
92,2004,Detroit Pistons
118,2005,San Antonio Spurs
150,2006,Miami Heat
178,2007,San Antonio Spurs
206,2008,Boston Celtics
237,2009,Los Angeles Lakers
268,2010,Los Angeles Lakers


In [100]:
df.columns

Index(['Season', 'TEAM', 'WIN%', 'OFFRTG', 'DEFRTG', 'NETRTG', 'PACE', 'TS%',
       '2P%', '3P%', 'FGAFREQ', '3PFREQ', 'TOV%', 'AST/TO', 'STL%', 'OREB%',
       'DREB%', 'FT%', 'FTR', 'OPPTS%', 'OPP2P%', 'OPP3P%', 'OPPFGAFREQ',
       'OPP3PFREQ', 'OPPTOV%', 'OPPAST/TO', 'OPPSTL%', 'OPPOREB%', 'OPPDREB%',
       'OPPFT%', 'OPPFTR', 'Champion'],
      dtype='object')

In [101]:
correlation = df.drop(['Season','TEAM'],axis=1).corr()

In [102]:
# We can take a look to see how our four factors are correlated with the other metrics
print(correlation['TS%'][shooting+opp_shooting+['WIN%']].sort_values(ascending=False))
print('---------------------------')
print(correlation['TOV%'][turnovers+opp_turnovers+['WIN%']].sort_values(ascending=False))
print('---------------------------')
print(correlation['OREB%'][rebounding+opp_rebounding+['WIN%']].sort_values(ascending=False))
print('---------------------------')
print(correlation['FTR'][free_throws+opp_free_throws+['WIN%']].sort_values(ascending=False))

TS%           1.000000
2P%           0.925569
3PFREQ        0.675575
3P%           0.612801
OPP3PFREQ     0.566530
WIN%          0.539498
OPPFGAFREQ    0.387208
OPP2P%        0.364764
OPPTS%        0.294266
FGAFREQ       0.264997
OPP3P%       -0.012028
Name: TS%, dtype: float64
---------------------------
TOV%         1.000000
OPPTOV%      0.325924
STL%        -0.157755
OPPSTL%     -0.274090
OPPAST/TO   -0.304324
WIN%        -0.309321
AST/TO      -0.796330
Name: TOV%, dtype: float64
---------------------------
OREB%       1.000000
OPPOREB%    0.461334
WIN%       -0.029494
DREB%      -0.461334
OPPDREB%   -1.000000
Name: OREB%, dtype: float64
---------------------------
FTR       1.000000
OPPFTR    0.482705
WIN%      0.106939
OPPFT%   -0.236406
FT%      -0.256643
Name: FTR, dtype: float64


In [149]:
# We don't want to over fit the model by inputting high indicator data types or our main four factor stats
# We would like the model to be able to group teams based on efficiency without the overriding metrics
df2 = df.drop(['WIN%','OFFRTG','DEFRTG','NETRTG','TS%','OPPTS%','TOV%','OPPTOV%','OREB%','DREB%','OPPOREB%',
               'OPPDREB%','FTR','OPPFTR','Champion'],axis=1).set_index(['Season','TEAM'])

In [150]:
tsne_2d = pd.DataFrame(TSNE().fit_transform(
    df2),index=df2.index).reset_index()

In [151]:
tsne_2d = pd.merge(tsne_2d,df,how='left',left_on=['Season','TEAM'],right_on=['Season','TEAM'])
tsne_2d

Unnamed: 0,Season,TEAM,0,1,WIN%,OFFRTG,DEFRTG,NETRTG,PACE,TS%,...,OPPFGAFREQ,OPP3PFREQ,OPPTOV%,OPPAST/TO,OPPSTL%,OPPOREB%,OPPDREB%,OPPFT%,OPPFTR,Champion
0,2001,San Antonio Spurs,-22.046196,21.041525,0.707,105.0,96.6,8.3,90.87,0.541,...,0.7867,0.1543,0.1284,1.462585,0.5263,0.297,0.703,0.741,0.2453,0
1,2001,Los Angeles Lakers,2.079633,10.176159,0.683,107.0,103.6,3.5,92.90,0.535,...,0.7730,0.1708,0.1216,1.546763,0.5195,0.309,0.662,0.754,0.3100,1
2,2001,Philadelphia 76ers,-11.089959,9.551980,0.683,101.9,97.3,4.6,92.09,0.518,...,0.7704,0.1866,0.1455,1.397590,0.5176,0.296,0.666,0.749,0.2480,0
3,2001,Sacramento Kings,23.568558,-11.533553,0.671,104.2,98.5,5.7,95.61,0.529,...,0.7730,0.1582,0.1449,1.359281,0.5461,0.321,0.693,0.749,0.2413,0
4,2001,Dallas Mavericks,21.644693,0.442421,0.646,105.6,101.2,4.4,94.52,0.544,...,0.7481,0.1702,0.1421,1.398773,0.5479,0.322,0.728,0.731,0.3333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,2021,Oklahoma City Thunder,-13.366204,-27.275124,0.333,103.6,112.8,-9.2,101.25,0.545,...,0.8089,0.4007,0.1149,1.945312,0.5669,0.260,0.753,0.767,0.2142,0
622,2021,Orlando Magic,1.701016,-29.260538,0.305,105.0,113.3,-8.3,98.89,0.528,...,0.8057,0.4139,0.1190,1.984848,0.5891,0.251,0.749,0.791,0.2125,0
623,2021,Detroit Pistons,5.477417,-28.308479,0.295,108.2,112.3,-4.1,98.30,0.557,...,0.7818,0.3581,0.1288,1.722222,0.5333,0.276,0.726,0.763,0.2597,0
624,2021,Minnesota Timberwolves,-14.435663,-25.339956,0.279,108.0,114.9,-6.9,101.84,0.550,...,0.7780,0.3739,0.1321,1.763514,0.5357,0.285,0.733,0.792,0.2626,0


In [152]:
tsne_2d.corr()[[0,1]]

Unnamed: 0,0,1
Season,0.178322,-0.673225
0,1.0,-0.521911
1,-0.521911,1.0
WIN%,-0.060695,0.064358
OFFRTG,0.107719,-0.46296
DEFRTG,0.175013,-0.548428
NETRTG,-0.051932,0.051457
PACE,0.358173,-0.934802
TS%,0.115158,-0.515595
2P%,0.093456,-0.586241


In [153]:
px.scatter(tsne_2d.dropna(),x=0,y=1,
           hover_data=['TEAM','Season','WIN%','OFFRTG','DEFRTG','3PFREQ','TS%','Champion'],
           color='FGAFREQ',size='WIN%')