In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, Normalizer, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

sns.set_style("whitegrid")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
nba_stats2021 = pd.read_csv('./nba_stats2021_original.csv')
nba_stats2021.head()

Unnamed: 0,Player,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,%FTA,%OREB,%REB,%AST,%TOV,%STL,%BLKA,%PF,%PFD,%PTS
0,Stephen Curry,GSW,33,58,32,26,34.1,31.5,10.3,21.3,...,37.9,8.0,17.9,27.6,29.8,20.8,32.5,12.1,32.1,37.1
1,Bradley Beal,WAS,27,57,29,28,35.6,31.1,11.1,22.8,...,37.0,15.9,14.3,22.9,29.6,20.1,26.5,14.3,32.3,34.9
2,Joel Embiid,PHI,27,47,36,11,31.5,29.1,9.2,18.0,...,58.3,43.3,37.2,18.4,34.7,16.1,35.0,21.5,53.0,37.1
3,Luka Doncic,DAL,22,59,35,24,35.1,28.5,10.1,20.8,...,45.4,12.5,25.6,52.4,49.9,21.3,32.2,16.8,38.5,33.7
4,Damian Lillard,POR,30,61,34,27,35.7,28.4,8.8,20.0,...,43.6,6.0,12.8,46.2,36.6,18.0,29.1,11.5,43.1,32.0


In [3]:
# Creating List of offensive stat features
off_stat = []
for feature in nba_stats2021.columns:
    if feature.startswith(('3','2','A','PT','%P','TO','e','PO')):
        off_stat.append(feature)
print(len(off_stat))
off_stat

27


['AGE',
 'PTS',
 '3PM',
 '3PA',
 '3P%',
 'AST',
 'TOV',
 'AST%',
 'AST/TO',
 'AST\xa0Ratio',
 'TO\xa0Ratio',
 'eFG%',
 'POSS',
 '%PTS2PT',
 '%PTS2PT\xa0MR',
 '%PTS3PT',
 '%PTSFBPs',
 '%PTSFT',
 '%PTSOffTO',
 '%PTSPITP',
 '2FGM%AST',
 '2FGM%UAST',
 '3FGM%AST',
 '3FGM%UAST',
 '%PF',
 '%PFD',
 '%PTS']

In [4]:
# Creating List of defensive stat features
def_stat = []
for feature in nba_stats2021.columns:
    if feature.startswith(('D','%D','S','%S','B','R','%B')):
        def_stat.append(feature)
print(len(def_stat))
def_stat

15


['DREB_x',
 'REB',
 'STL_x',
 'BLK_x',
 'DD2',
 'DEFRTG',
 'DREB%_x',
 'REB%',
 'DEF\xa0RTG',
 '%DREB_x',
 'STL%',
 '%BLK_x',
 'DEFWS',
 '%STL',
 '%BLKA']

In [5]:
# List of Traditional/Boxscore Stats
trad_list = ['MIN','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%','OREB','DREB_x','REB','AST','STL_x','BLK_x','PF','PTS','+/-']
# List of stats that describe pure or shooting offensive performance
pure_off = ['3P%','AST%','eFG%','%PTS2PT','%PTSOffTO','%PTSFT','%PTSFBPs','%PTSPITP','2FGM%AST','3FGM%AST','2FGM%UAST']

## ```Traditional Stats PCA Model```

In [38]:
def optimize_pca(category, n_components, n_cluster):
    # Set up X and y
    X = nba_stats2021[category]
    y = nba_stats2021['Player']
    evaluate = []
    # Set up scaler/ reduction
    sc = StandardScaler()
    nn = Normalizer()
    X_nn = nn.fit_transform(X)
    # Dimensionality reduction
    for i in range(1,n_components):
        pca = PCA(n_components=i)
        pca.fit(X_nn)
        X_pca = pca.transform(X_nn)
        explained_ratio = pca.explained_variance_ratio_.sum()
        clusters = KMeans(n_clusters=n_cluster, random_state=42)
        clusters.fit(X_pca)
        y_preds = clusters.predict(X_pca)
        sil = silhouette_score(X_nn, clusters.labels_)
        evaluate.append([i,explained_ratio,sil])
        

    eval_df = pd.DataFrame(evaluate)
    eval_df.columns = ['n_Components','explained_ratio','sil']
    return eval_df

optimize_pca(trad_list, 15, 4)

Unnamed: 0,n_Components,explained_ratio,sil
0,1,0.415216,0.159577
1,2,0.645399,0.262507
2,3,0.820339,0.46771
3,4,0.938462,0.463425
4,5,0.973529,0.474175
5,6,0.988622,0.46546
6,7,0.993908,0.46441
7,8,0.996188,0.467882
8,9,0.997727,0.46546
9,10,0.99845,0.463022
