In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import PercentFormatter
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# A simple example of clustering

In [None]:
# Load the small example dataset used by the demo plots and clusterings below.
df_example = pd.read_csv('clustering-example.csv')

In [None]:
# Split the example rows by position: the first five rows are used as the
# steals leaders, every remaining row as the blocks leaders.
stl_leaders = df_example.iloc[:5]
blk_leaders = df_example.iloc[5:]

In [None]:
plt.style.use('fivethirtyeight')

stl_blk, ax = plt.subplots()

# Every example player as one point: steals on x, blocks on y.
ax.scatter(df_example['STL'], df_example['BLK'])
ax.set(xlabel = 'STL/G', ylabel = 'BLK/G')

stl_blk.suptitle("Top 5 players in steals and blocks", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
stl_blk.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    **footer_style)
stl_blk.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontname = 'Rockwell', **footer_style)

stl_blk.savefig('stl-blk.png', dpi = 400, bbox_inches = 'tight')

In [None]:
plt.style.use('fivethirtyeight')

stl_blk_separated, ax = plt.subplots()

# One scatter series per hand-made group so each gets its own colour and legend entry.
for leaders, legend_name in ((stl_leaders, 'STL leaders'), (blk_leaders, 'BLK leaders')):
    ax.scatter(leaders['STL'], leaders['BLK'], label = legend_name)

ax.legend(loc='best', prop={'size': 12, "family": "Rockwell"})
ax.set(xlabel = 'STL/G', ylabel = 'BLK/G')

stl_blk_separated.suptitle("Top 5 players in steals and blocks", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
stl_blk_separated.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    **footer_style)
stl_blk_separated.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontname = 'Rockwell', **footer_style)

stl_blk_separated.savefig('stl-blk-separated.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Cluster the example players into two groups using steals and blocks only.
# n_init is pinned to 10 (the long-standing scikit-learn default): newer
# releases default to n_init='auto', which can yield a different clustering
# for the same random_state.
kmeans = KMeans(n_clusters = 2, random_state = 0, n_init = 10)

# One row per player; columns are (STL, BLK).
x = np.column_stack((df_example['STL'], df_example['BLK']))

# Fit the model and get each row's cluster label in a single call.
y_kmeans = kmeans.fit_predict(x)

In [None]:
plt.style.use('fivethirtyeight')

stl_blk_clustered, ax = plt.subplots()

# Pick each cluster's rows with a boolean mask on the labels. Unlike stacking
# rows collected in a Python loop, this also works when a cluster is empty.
cluster_1 = x[y_kmeans == 0]
cluster_2 = x[y_kmeans == 1]

ax.scatter(cluster_1[:, 0], cluster_1[:, 1], label = "Cluster 1")
ax.scatter(cluster_2[:, 0], cluster_2[:, 1], label = "Cluster 2")

# Overlay the fitted cluster centres as large, semi-transparent markers.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c = 'black', s = 200, alpha = .5, label = 'Cluster center')

ax.legend(loc='best', prop={'size': 12, "family": "Rockwell"})

ax.set_xlabel('STL/G')
ax.set_ylabel('BLK/G')

stl_blk_clustered.suptitle("Clustered steals and blocks leaders", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
stl_blk_clustered.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

stl_blk_clustered.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

stl_blk_clustered.savefig('stl-blk-clustered.png', dpi = 400, bbox_inches = 'tight')

Let's add a few more features (points, rebounds and assists) and see how that changes the way the players are grouped.

In [None]:
# Re-cluster the example players, this time on five box-score stats.
# n_init is pinned to 10 (the long-standing scikit-learn default): newer
# releases default to n_init='auto', which can yield a different clustering
# for the same random_state.
kmeans = KMeans(n_clusters = 2, random_state = 0, n_init = 10)

# One row per player; columns are (PTS, TRB, AST, STL, BLK).
x = np.column_stack((df_example['PTS'], df_example['TRB'], df_example['AST'], df_example['STL'], df_example['BLK']))

# Fit the model and get each row's cluster label in a single call.
y_kmeans = kmeans.fit_predict(x)

In [None]:
# List every example player next to the cluster label the model gave them.
for player, label in zip(df_example['Player'], y_kmeans):
    print(player, label)

When we add other features to the clustering, the model groups the superstars together.

# Let's cluster the top 50 players in PPG

In [None]:
# Counting stats and advanced stats are read from two separate files.
# NOTE(review): later cells combine columns from both frames row by row, so the
# two files are assumed to list the same players in the same order — confirm.
df_counting = pd.read_csv('counting-stats.csv')
df_advanced = pd.read_csv('advanced-stats.csv')

In [None]:
# Three clusters on points and usage rate.
# n_init is pinned to 10 (the long-standing scikit-learn default): newer
# releases default to n_init='auto', which can yield a different clustering
# for the same random_state.
kmeans = KMeans(n_clusters = 3, random_state = 98, n_init = 10)

# One row per player; columns are (PTS, USG%), taken from the two stat frames.
x = np.column_stack((df_counting['PTS'], df_advanced['USG%']))

# Fit the model and get each row's cluster label in a single call.
y_kmeans = kmeans.fit_predict(x)

In [None]:
# List every player next to the points/usage cluster label they received.
for player, label in zip(df_counting['Player'], y_kmeans):
    print(player, label)

In [None]:
plt.style.use('fivethirtyeight')

pts_usg_clustered, ax = plt.subplots()

# Pick each cluster's rows with a boolean mask on the labels. Unlike stacking
# rows collected in a Python loop, this also works when a cluster is empty.
# NOTE: the descriptions in the legend are hard-coded per label number, so
# re-check them whenever the clustering settings change.
cluster_1 = x[y_kmeans == 0]
cluster_2 = x[y_kmeans == 1]
cluster_3 = x[y_kmeans == 2]

ax.scatter(cluster_1[:, 0], cluster_1[:, 1], label = "Cluster 1 (secondary scorers)")
ax.scatter(cluster_2[:, 0], cluster_2[:, 1], label = "Cluster 2 (primary scorers)")
ax.scatter(cluster_3[:, 0], cluster_3[:, 1], label = "Cluster 3 (James Harden)")

# Overlay the fitted cluster centres as large, semi-transparent markers.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c = 'black', s = 200, alpha = .5, label = 'Cluster center')

ax.legend(loc='best', prop={'size': 12, "family": "Rockwell"})

ax.set_xlabel('PPG')
ax.set_ylabel('USG%')

pts_usg_clustered.suptitle("Clustered points and usage", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
pts_usg_clustered.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

pts_usg_clustered.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

pts_usg_clustered.savefig('pts-usg-clustered.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Four clusters on assists and usage rate.
# n_init is pinned to 10 (the long-standing scikit-learn default): newer
# releases default to n_init='auto', which can yield a different clustering
# for the same random_state.
kmeans = KMeans(n_clusters = 4, random_state = 99, n_init = 10)

# One row per player; columns are (AST, USG%), taken from the two stat frames.
x = np.column_stack((df_counting['AST'], df_advanced['USG%']))

# Fit the model and get each row's cluster label in a single call.
y_kmeans = kmeans.fit_predict(x)

In [None]:
# List every player next to the assists/usage cluster label they received.
for player, label in zip(df_counting['Player'], y_kmeans):
    print(player, label)

In [None]:
plt.style.use('fivethirtyeight')

ast_usg_clustered, ax = plt.subplots()

# Pick each cluster's rows with a boolean mask on the labels. Unlike stacking
# rows collected in a Python loop, this also works when a cluster is empty.
# The displayed cluster numbers deliberately differ from the raw labels
# (label 2 is shown as "Cluster 1", label 0 as "Cluster 3").
# NOTE: the descriptions in the legend are hard-coded per label number, so
# re-check them whenever the clustering settings change.
cluster_1 = x[y_kmeans == 2]
cluster_2 = x[y_kmeans == 1]
cluster_3 = x[y_kmeans == 0]
cluster_4 = x[y_kmeans == 3]

ax.scatter(cluster_1[:, 0], cluster_1[:, 1], label = "Cluster 1 (scorers)")
ax.scatter(cluster_2[:, 0], cluster_2[:, 1], label = "Cluster 2 (passers)")
ax.scatter(cluster_3[:, 0], cluster_3[:, 1], label = "Cluster 3 (stars)")
ax.scatter(cluster_4[:, 0], cluster_4[:, 1], label = "Cluster 4 (James Harden)")

# Overlay the fitted cluster centres as large, semi-transparent markers.
centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c = 'black', s = 200, alpha = .5, label = 'Cluster center')

ax.legend(loc='best', prop={'size': 12, "family": "Rockwell"})

ax.set_xlabel('AST')
ax.set_ylabel('USG%')

ast_usg_clustered.suptitle("Clustered assists and usage", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
ast_usg_clustered.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

ast_usg_clustered.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

ast_usg_clustered.savefig('ast-usg-clustered.png', dpi = 400, bbox_inches = 'tight')

# Now let's cluster NBA rotation players

In [None]:
# Load the player dataset used for the rest of the notebook (scaling, PCA and clustering).
df_full_nba = pd.read_csv('full-nba-data.csv')

In [None]:
# Preview the first rows of the loaded data.
df_full_nba.head()

In [None]:
# Show the available column names; the feature list below is chosen from these.
df_full_nba.columns.values

In [None]:
# Columns that are scaled and fed into PCA. The order is significant only in
# that it fixes the column order of the feature matrix.
# NOTE(review): 'Pos' is scaled together with the numeric columns, so it is
# presumably already encoded as a number in the CSV — confirm.
features = [
    'Pos',
    # shooting
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
    # rebounds and other box-score stats
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # efficiency and rate stats
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    # win shares, box plus/minus, VORP
    'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
]

In [None]:
# Standardize the selected feature columns before running PCA on them.
feature_matrix = df_full_nba[features].values
x = StandardScaler().fit_transform(feature_matrix)

In [None]:
# Total explained variance ratio for 2 through 10 principal components.
pca_list = []

for n_components in range(2, 11):
    pca = PCA(n_components = n_components)
    # Only the fitted variance ratios are needed here, so fit the model
    # without also computing a projection that would be thrown away.
    pca.fit(x)
    pca_variance = sum(pca.explained_variance_ratio_)
    pca_list.append(pca_variance)
    print("For n_components = {}, explained variance ratio is {}".format(n_components, pca_variance))

In [None]:
plt.style.use('fivethirtyeight')

pca_fig, ax = plt.subplots()

# pca_list holds one value per component count from 2 to 10.
ax.plot(range(2, 11), pca_list)
ax.set(xlabel = 'Number of components', ylabel = 'Explained variance ratio')

# One tick per component count.
ax.set_xticks(np.arange(2, 11, 1.0))

pca_fig.suptitle("Finding the right number of components", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
pca_fig.text(x = -0.05, y = -0.08,
    s = '______________________________________________________________',
    **footer_style)
pca_fig.text(x = -0.05, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontname = 'Rockwell', **footer_style)

pca_fig.savefig('pca-variance.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Change in explained variance for each additional component. numpy is already
# imported as np at the top of the notebook, so np.diff is used instead of a
# separate mid-notebook import. dx is 1 because component counts step by one.
dx = 1
y = pca_list
dy = np.diff(y) / dx
print(dy)

In [None]:
pca_deriv, ax = plt.subplots()

# dy has one entry per step between consecutive component counts, so the
# x values start at 3 (the change from 2 to 3 components).
ax.plot(range(3, 11), dy)
ax.set(xlabel = 'Number of components', ylabel = 'dy(explained variance)/dx')

# One tick per component count.
ax.set_xticks(np.arange(3, 11, 1.0))

pca_deriv.suptitle("Finding the right number of components", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
pca_deriv.text(x = -0.05, y = -0.08,
    s = '______________________________________________________________',
    **footer_style)
pca_deriv.text(x = -0.05, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontname = 'Rockwell', **footer_style)

pca_deriv.savefig('pca-deriv.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Names of the six component columns: 'PC 1' ... 'PC 6'.
pc_columns = ['PC {}'.format(k) for k in range(1, 7)]

# Project the scaled features onto six principal components.
pca = PCA(n_components = 6)
components = pca.fit_transform(x)

# One row per player: the name first, followed by the six component scores.
pca_df = pd.DataFrame(data = components, columns = pc_columns)
pca_df['Player'] = df_full_nba['Player']
pca_df = pca_df[['Player'] + pc_columns]

In [None]:
# Preview the component scores for the first few players.
pca_df.head()

In [None]:
# Variance explained by each of the retained components, then their total.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(sum(variance_ratios))

In [None]:
# Cluster on the six principal-component scores, one row per player.
x = np.column_stack((pca_df['PC 1'], pca_df['PC 2'], pca_df['PC 3'], pca_df['PC 4'], pca_df['PC 5'], pca_df['PC 6']))

# Silhouette score for every cluster count from 2 to 20.
silhouette = []

for n_clusters in range(2, 21):
    # n_init is pinned to 10 (the long-standing scikit-learn default): newer
    # releases default to n_init='auto', which can change the scores.
    kmeans = KMeans(n_clusters = n_clusters, random_state = 99, n_init = 10)
    cluster_labels = kmeans.fit_predict(x)

    score = silhouette_score(x, cluster_labels)
    silhouette.append(score)
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))

In [None]:
plt.style.use('fivethirtyeight')

silhouette_fig, ax = plt.subplots()

# silhouette holds one score per cluster count from 2 to 20.
ax.plot(range(2, 21), silhouette)
ax.set(xlabel = 'Number of clusters', ylabel = 'Silhouette score')

# A tick every third cluster count.
ax.set_xticks(np.arange(2, 21, 3.0))

silhouette_fig.suptitle("Finding the right number of clusters", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
silhouette_fig.text(x = -0.05, y = -0.08,
    s = '______________________________________________________________',
    **footer_style)
silhouette_fig.text(x = -0.05, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontname = 'Rockwell', **footer_style)

silhouette_fig.savefig('silhouette-score.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# For each cluster count from 3 upward: how much of the remaining gap to a
# score of 1 is closed compared with using one cluster fewer.
silhouette_diff = []

for n_cluster, (prev_score, curr_score) in enumerate(zip(silhouette, silhouette[1:]), start = 3):
    improvement = 1 - ((1 - curr_score) / (1 - prev_score))
    silhouette_diff.append(improvement)
    print("For n_cluster = {}, percent improvement = {}".format(n_cluster, improvement))

In [None]:
plt.style.use('fivethirtyeight')

silhouette_imp_fig, ax = plt.subplots()

# silhouette_diff starts at 3 clusters because each entry compares a cluster
# count with the one before it.
ax.plot(range(3, 21), silhouette_diff)

ax.set_xlabel('Number of clusters')
ax.set_ylabel('% silhouette improvement')

ax.set_xticks(np.arange(3, 21, 2.0))

# Render the y ticks as whole percentages through a formatter. Assigning fixed
# label strings without fixing the tick positions can mislabel the axis if the
# ticks move, and newer Matplotlib versions warn about it.
ax.yaxis.set_major_formatter(PercentFormatter(xmax = 1, decimals = 0))

silhouette_imp_fig.suptitle("Finding the right number of clusters", weight = 'bold', size = 18)

# Footer below the axes: a grey rule with the blog URL underneath it.
silhouette_imp_fig.text(x = -0.05, y = -0.08,
    s = '______________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

silhouette_imp_fig.text(x = -0.05, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

silhouette_imp_fig.savefig('silhouette-diff-score.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Final model: twelve clusters on the principal-component scores in x.
# n_init is pinned to 10 (the long-standing scikit-learn default): newer
# releases default to n_init='auto', which can yield a different clustering
# for the same random_state and would invalidate the hand-written role names.
kmeans = KMeans(n_clusters = 12, random_state = 1, n_init = 10)

# Fit the model and get each row's cluster label in a single call.
y_kmeans = kmeans.fit_predict(x)

In [None]:
# Pair every player with the cluster label assigned by the final model.
df_cluster = pd.DataFrame({'Player': df_full_nba['Player'], 'Cluster': y_kmeans})

df_cluster.head()

In [None]:
# Stats used to characterize each cluster.
interest_stats = ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'USG%']

def cluster_stats(cluster):
    """Print the mean of every stat in interest_stats for one cluster.

    Reads the notebook-level frames df_full_nba and df_cluster: the rows of
    df_full_nba are selected where df_cluster['Cluster'] equals `cluster`.
    Nothing is returned; the averages are only printed.
    """
    in_cluster = df_cluster['Cluster'] == cluster
    cluster_means = df_full_nba.loc[in_cluster, interest_stats].mean()
    print(cluster_means)

In [None]:
# Players whose cluster label is 11 (at this point the labels are still the
# zero-based values returned by KMeans).
df_cluster[df_cluster['Cluster'] == 11]

In [None]:
# Average stats of the players carrying cluster label 11.
cluster_stats(11)

In [None]:
# Copy the stats of interest next to each player's cluster label.
df_cluster[interest_stats] = df_full_nba[interest_stats]

In [None]:
# Shift the zero-based KMeans labels to 1-12. Deriving the value from y_kmeans
# instead of incrementing the column in place keeps this cell idempotent:
# re-running it no longer pushes every label up by one more each time.
df_cluster['Cluster'] = y_kmeans + 1

In [None]:
# Hand-assigned role name for each cluster, listed in cluster-number order.
# NOTE(review): 'Perimiter' looks like a misspelling of 'Perimeter'; it is left
# unchanged here because the role names end up in the exported CSV.
roles = [
    '3&D guard', 'Perimiter scorer', 'Rim runner', '???',
    'Star big', 'Shooter', 'Star ball handler', 'Team-leading guard',
    'Do it all big', 'Floor general', 'Inside big', '3&D forward',
]

# Cluster number (starting at 1) -> role name.
cluster_dict = dict(enumerate(roles, start = 1))

In [None]:
# Translate each cluster number into its role name via cluster_dict.
df_cluster['Role'] = df_cluster['Cluster'].map(cluster_dict)

In [None]:
# Preview the frame now that the stats and role columns have been added.
df_cluster.head()

In [None]:
# Put the identifying columns first, followed by the stats of interest, then
# write the result to disk.
column_order = ['Player', 'Cluster', 'Role'] + interest_stats
df_cluster = df_cluster[column_order]
df_cluster.to_csv('clustered-nba.csv')

In [None]:
# Average of each stat of interest per role.
df_avg = df_cluster.groupby('Role')[interest_stats].mean()

In [None]:
# Display the per-role averages.
df_avg