In [56]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, Normalizer, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

sns.set_style("whitegrid")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
nba_stats2021 = pd.read_csv('./nba_stats2021_original.csv')
nba_stats2021.head()

Unnamed: 0,Player,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,...,%FTA,%OREB,%REB,%AST,%TOV,%STL,%BLKA,%PF,%PFD,%PTS
0,Stephen Curry,GSW,33,58,32,26,34.1,31.5,10.3,21.3,...,37.9,8.0,17.9,27.6,29.8,20.8,32.5,12.1,32.1,37.1
1,Bradley Beal,WAS,27,57,29,28,35.6,31.1,11.1,22.8,...,37.0,15.9,14.3,22.9,29.6,20.1,26.5,14.3,32.3,34.9
2,Joel Embiid,PHI,27,47,36,11,31.5,29.1,9.2,18.0,...,58.3,43.3,37.2,18.4,34.7,16.1,35.0,21.5,53.0,37.1
3,Luka Doncic,DAL,22,59,35,24,35.1,28.5,10.1,20.8,...,45.4,12.5,25.6,52.4,49.9,21.3,32.2,16.8,38.5,33.7
4,Damian Lillard,POR,30,61,34,27,35.7,28.4,8.8,20.0,...,43.6,6.0,12.8,46.2,36.6,18.0,29.1,11.5,43.1,32.0


In [19]:
nba_stats2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 84 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Player             534 non-null    object 
 1   TEAM               534 non-null    object 
 2   AGE                534 non-null    int64  
 3   GP                 534 non-null    int64  
 4   W                  534 non-null    int64  
 5   L                  534 non-null    int64  
 6   MIN                534 non-null    float64
 7   PTS                534 non-null    float64
 8   FGM                534 non-null    float64
 9   FGA                534 non-null    float64
 10  FG%                534 non-null    float64
 11  3PM                534 non-null    float64
 12  3PA                534 non-null    float64
 13  3P%                534 non-null    float64
 14  FTM                534 non-null    float64
 15  FTA                534 non-null    float64
 16  FT%                534 non

In [4]:
# Creating List of defensive stat features
def_stat = []
for feature in nba_stats2021.columns:
    if feature.startswith(('D','%D','S','%S','B','R','%B')):
        def_stat.append(feature)
print(len(def_stat))
def_stat

15


['DREB_x',
 'REB',
 'STL_x',
 'BLK_x',
 'DD2',
 'DEFRTG',
 'DREB%_x',
 'REB%',
 'DEF\xa0RTG',
 '%DREB_x',
 'STL%',
 '%BLK_x',
 'DEFWS',
 '%STL',
 '%BLKA']

In [81]:
# Creating List of offensive stat features
off_stat = []
for feature in nba_stats2021.columns:
    if feature.startswith(('3','2','A','PT','%P','TO','e','PO')):
        off_stat.append(feature)
print(len(off_stat))
off_stat

27


['AGE',
 'PTS',
 '3PM',
 '3PA',
 '3P%',
 'AST',
 'TOV',
 'AST%',
 'AST/TO',
 'AST\xa0Ratio',
 'TO\xa0Ratio',
 'eFG%',
 'POSS',
 '%PTS2PT',
 '%PTS2PT\xa0MR',
 '%PTS3PT',
 '%PTSFBPs',
 '%PTSFT',
 '%PTSOffTO',
 '%PTSPITP',
 '2FGM%AST',
 '2FGM%UAST',
 '3FGM%AST',
 '3FGM%UAST',
 '%PF',
 '%PFD',
 '%PTS']

In [104]:
# List of Traditional/Boxscore Stats
trad_list = ['MIN','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%','OREB','DREB_x','REB','AST','STL_x','BLK_x','PF','PTS','+/-']
# List of stats that describe pure or shooting offensive performance
pure_off = ['3P%','AST%','eFG%','%PTS2PT','%PTSOffTO','%PTSFT','%PTSFBPs','%PTSPITP','2FGM%AST','3FGM%AST','2FGM%UAST']

## ```Baseline model```
---

Here 5 clusters were used since their are 5 traditional positions in Basketball.

## ```Defense Model```

List of all defensive stats

In [94]:
X_deff = nba_stats2021[def_stat]

# Standard Scalar X
sc = StandardScaler()
X_sc = sc.fit_transform(X_deff)

In [95]:
# Instantiate KMeans with Parameters
km = KMeans(n_clusters=5, random_state=42)

# Fit model
km.fit(X_sc)

KMeans(n_clusters=5, random_state=42)

In [96]:
km.labels_

array([0, 0, 3, 3, 0, 3, 3, 0, 0, 3, 3, 0, 3, 0, 0, 0, 3, 0, 3, 3, 0, 0,
       3, 3, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3,
       0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       3, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 3, 0, 4, 0,
       0, 0, 3, 3, 3, 1, 3, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 3, 0, 3,
       1, 0, 0, 4, 0, 4, 3, 0, 4, 0, 3, 0, 1, 3, 1, 0, 4, 3, 4, 1, 4, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 4, 1, 3, 3, 1, 0, 3, 0, 4, 0, 1,
       0, 4, 0, 4, 0, 1, 4, 3, 4, 0, 4, 0, 0, 0, 3, 4, 0, 4, 0, 0, 0, 1,
       4, 0, 4, 0, 0, 4, 0, 0, 1, 1, 1, 0, 4, 0, 0, 0, 4, 4, 0, 4, 4, 0,
       0, 0, 0, 3, 0, 3, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 3, 4, 3, 0, 0,
       3, 0, 0, 3, 4, 0, 0, 4, 1, 1, 4, 3, 1, 0, 0, 0, 0, 0, 1, 1, 4, 4,
       0, 0, 1, 0, 0, 4, 4, 4, 1, 1, 3, 1, 0, 1, 4, 1, 1, 0, 4, 4, 1, 4,
       1, 1, 4, 3, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 4, 0, 1, 4, 0, 3,
       1, 0, 4, 4, 0, 0, 1, 0, 1, 0, 1, 4, 0, 0, 0,

In [97]:
km.cluster_centers_

array([[ 2.05563919e-02, -4.56557288e-02,  7.14431702e-01,
        -1.41912461e-01, -1.51022481e-01,  9.10205434e-02,
        -3.52095410e-01, -4.06853989e-01,  9.10205434e-02,
        -3.56459055e-01,  6.84848848e-01, -3.04494367e-01,
         2.85666847e-01,  6.84527816e-01, -5.50439501e-03],
       [ 1.16691648e-01,  2.37024283e-01, -6.31305888e-01,
         4.97395553e-01, -1.88476373e-01, -1.51847284e-01,
         9.90265230e-01,  1.08415407e+00, -1.51847284e-01,
         9.93601072e-01, -5.46867619e-01,  1.04588527e+00,
        -2.44080407e-01, -5.46116287e-01,  2.27973670e-01],
       [-1.49812801e+00, -1.33739179e+00, -1.56699075e+00,
        -9.98755648e-01, -4.18336792e-01,  7.96128737e+00,
        -2.29926228e+00, -1.44262635e+00,  7.96128737e+00,
        -2.39255821e+00, -2.11281974e+00, -1.14321832e+00,
        -3.38269518e+00, -2.11235468e+00, -1.68731930e+00],
       [ 2.08674896e+00,  2.12466098e+00,  8.03652685e-01,
         1.61206257e+00,  2.24533417e+00,  1.37511306

In [98]:
print(f'Sil score: {silhouette_score(X_sc, km.labels_)}')
print('------')
print(f'Inertia score: {km.inertia_}')

Sil score: 0.17427808428600602
------
Inertia score: 4416.82346692033


## ```Offense Model```

List of all offense stats

In [111]:
def baseline(list):
    X = nba_stats2021[list]
    sc = StandardScaler()
    X_sc = sc.fit_transform(X)
    km = KMeans(n_clusters=5, random_state=42)
    # Fit model
    km.fit(X_sc)
    return print(f'Sil score: {silhouette_score(X_sc, km.labels_)}\n Inertia score: {km.inertia_}')

baseline(off_stat)

Sil score: 0.174952602944137
 Inertia score: 8706.83937777845


## ```Traditional/Boxscore Model```

List of Traditional boxscore stats

In [115]:
baseline(trad_list)

Sil score: 0.20837288724021316
 Inertia score: 4322.942491024253


## ```Pure Offense Model```

List of stats that describe pure or shooting offensive performance

In [116]:
baseline(pure_off)

Sil score: 0.20841507237891116
 Inertia score: 2904.2111097358033


## ```Baseline Models Summary```
---
Defensive:

    Sil Score: 0.1742
    Inertia Score: 4416.8
    
Offensive:

     Sil Score: 0.17495
    Inertia Score: 8706.8
    
Pure Offensive:

    Sil Score: 0.20842
    Inertia Score: 2904.2
    
Traditional:

    Sil Score: 0.20837
    Inertia Score: 4322.9

## ```Manual GridSearch Models```
---
- Trying to beat Baseline scores
- Running manual Grid search with different Scalers and K-values

## ```GridSearch Defensive Model```

List of all defensive stats

In [121]:
# Manual Grid Search to Find best Scaler/# of Centroids

def optimizer(list, n):
    # set x, y to drop species
    X = nba_stats2021[list]

    # set all scaling methods
    sc = StandardScaler()
    X_sc = sc.fit_transform(X)
    nor = Normalizer()
    X_normal = nor.fit_transform(X)
    min_max = MinMaxScaler()
    X_mm = min_max.fit_transform(X)
    # loop through each scaler from list
    scaler_list = [X_sc, X_normal, X_mm]
    scaler = ['StandardScaler', 'Normalize', 'MinMaxScaler']
    counter = 0
    for scale in scaler_list:
        # now loop through range for optimal clusters
        sil_scores = []
        for k in range(2,n):
            km3 = KMeans(n_clusters=k)
            km3.fit(scale)
            sil_score = silhouette_score(scale, km3.labels_)
            sil_scores.append([k, sil_score])
        # saving to dataframe
        score_df = pd.DataFrame(data=sil_scores, columns=['k','sil'])
        # sorting values by silhouette score
        score_df.sort_values(by='sil', ascending=False)
        # show top 5 # of clusters and silhouette score
        print(scaler[counter])
        print(score_df.head(1))
        counter += 1
pass

optimizer(def_stat,10)

StandardScaler
   k       sil
0  2  0.285092
Normalize
   k       sil
0  2  0.361631
MinMaxScaler
   k       sil
0  2  0.307237


In [132]:
# Double Checking my GridSearch with Best parameters
scores = []

X = nba_stats2021[def_stat]
nn = Normalizer()
X_nn = nn.fit_transform(X)

for k in range(2,10):
    cl = KMeans(n_clusters=k)
    cl.fit(X_nn)
    inertia = cl.inertia_
    sil = silhouette_score(X_nn,cl.labels_)
    scores.append([k,inertia, sil])
    
scores_df = pd.DataFrame(scores)
scores_df.columns = ['K', 'inertia', 'silhouette']
scores_df['silhouette'].max() 

0.361631474645697

In [133]:
scores_df.head()

Unnamed: 0,K,inertia,silhouette
0,2,10.282247,0.361631
1,3,8.786337,0.213574
2,4,7.777607,0.214459
3,5,7.005465,0.228127
4,6,6.276823,0.216799


## ```Offensive Gridsearch```

List of all Offensive Stats

In [125]:
optimizer(off_stat,10)

StandardScaler
   k       sil
0  2  0.159066
Normalize
   k       sil
0  2  0.721928
MinMaxScaler
   k      sil
0  2  0.34308


In [134]:
# Double Checking my GridSearch with Best parameters
scores = []

X = nba_stats2021[off_stat]
nn = Normalizer()
X_nn = nn.fit_transform(X)

for k in range(2,11):
    cl = KMeans(n_clusters=k)
    cl.fit(X_nn)
    inertia = cl.inertia_
    sil = silhouette_score(X_nn,cl.labels_)
    scores.append([k,inertia, sil])
    
scores_df = pd.DataFrame(scores)
scores_df.columns = ['K', 'inertia', 'silhouette']
scores_df['silhouette'].max() 

0.7300294465924135

In [135]:
scores_df.head()

Unnamed: 0,K,inertia,silhouette
0,2,38.152328,0.721928
1,3,29.86581,0.725332
2,4,23.226511,0.730029
3,5,19.891956,0.633879
4,6,17.778814,0.626515


## ```Traditional GridSearch```

List of all Traditional stats

In [136]:
optimizer(trad_list,10)

StandardScaler
   k       sil
0  2  0.304904
Normalize
   k       sil
0  2  0.700862
MinMaxScaler
   k       sil
0  2  0.344298


In [138]:
# Double Checking my GridSearch with Best parameters
scores = []

X = nba_stats2021[trad_list]
nn = Normalizer()
X_nn = nn.fit_transform(X)

for k in range(2,11):
    cl = KMeans(n_clusters=k)
    cl.fit(X_nn)
    inertia = cl.inertia_
    sil = silhouette_score(X_nn,cl.labels_)
    scores.append([k,inertia, sil])
    
scores_df = pd.DataFrame(scores)
scores_df.columns = ['K', 'inertia', 'silhouette']
scores_df['silhouette'].max()

0.7008617884039994

In [139]:
scores_df.head()

Unnamed: 0,K,inertia,silhouette
0,2,36.155831,0.700862
1,3,28.222608,0.45344
2,4,22.227681,0.467882
3,5,18.173019,0.291928
4,6,15.514911,0.29215


## ```Pure Offensive GridSearch```

List of all stats that describe player scoring

In [141]:
optimizer(pure_off,11)

StandardScaler
   k     sil
0  2  0.4204
Normalize
   k       sil
0  2  0.537453
MinMaxScaler
   k       sil
0  2  0.487013


In [142]:
# Double Checking my GridSearch with Best parameters
scores = []

X = nba_stats2021[pure_off]
nn = Normalizer()
X_nn = nn.fit_transform(X)

for k in range(2,11):
    cl = KMeans(n_clusters=k)
    cl.fit(X_nn)
    inertia = cl.inertia_
    sil = silhouette_score(X_nn,cl.labels_)
    scores.append([k,inertia, sil])
    
scores_df = pd.DataFrame(scores)
scores_df.columns = ['K', 'inertia', 'silhouette']
scores_df['silhouette'].max()

0.5374532181627872

In [143]:
scores_df.head()

Unnamed: 0,K,inertia,silhouette
0,2,58.693583,0.537453
1,3,45.426674,0.320131
2,4,34.067387,0.358776
3,5,28.841432,0.296299
4,6,24.849613,0.313837


## ```Manual GridSearch Best Parameters Summary```
---
Defensive:
    
    Best K clusters = 2
    Best scalar = Normalize
    Sil score = 0.36

Offensive:

    Best K clusters = 3
    Best scalar = Normalize
    Sil score = 0.725332   
    
Traditonal Stats:

    Best K clusters = 2
    Best scalar = Normalize
    Sil score = 0.700862

Pure Offensive:

    Best K clusters = 2
    Best scalar = Normalize
    Sil score = 0.537453    
    

    