# K-means Clustering

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates

## Notebook variables

In [None]:
# NOTE: The data_root path must end in a '/'!
data_root = "abfss://[YOUR DATALAKE URL]/data/totals/"

# Fail fast here rather than later inside load_data() if the trailing '/' is missing.
if not data_root.endswith("/"):
    raise ValueError(f"data_root must end in '/': {data_root!r}")

# Columns that are assembled into the feature vector for clustering and
# drawn as the axes of the parallel-coordinates plots.
feature_cols = ["PPG", "RPG", "APG", "Years_played"]

# Number of clusters requested from KMeans (also the number of plot panels).
k = 6

## Run the Common notebook to bring its shared definitions into this session

In [None]:
%run "Common"

## Load dataframe and drop null "BLK" values

In [None]:
# Load the data found under data_root and discard every row whose "BLK" value is null.
df = load_data(data_root).dropna(subset=["BLK"])

## Generate the career stats dataframe

In [None]:
# Both helpers are defined outside this notebook; each receives the cleaned dataframe.
career_df = career_totals(df)
years_played_df = years_played(df)

# Left-join on PlayerId so every career row is kept, then sort so the
# largest Years_played values come first.
career_df = (
    career_df
    .join(years_played_df, on="PlayerId", how="left")
    .orderBy(F.desc("Years_played"))
)
display(career_df)


## Perform K-means clustering

In [None]:
# Name of the assembled vector column, shared by the assembler and the estimator.
featuresCol = "features"

# Pack the individual feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol=featuresCol)
feature_df = assembler.transform(career_df)

# Fit k clusters; the fixed seed keeps the initialisation repeatable between runs.
kmeans = KMeans(k=k, seed=123, featuresCol=featuresCol)
model = kmeans.fit(feature_df)

# Score every row, then keep only the player -> cluster assignment.
predictions_df = model.transform(feature_df)
player_clusters_df = predictions_df.select("PlayerId", "prediction")

display(player_clusters_df)


## Evaluate the clustering

In [None]:
# Training cost recorded in the fitted model's summary.
inertia = model.summary.trainingCost

# Score the cluster assignments with a default-configured ClusteringEvaluator.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions_df)

print(f"Inertia: {inertia}")
print(f"Silhouette: {silhouette}")

## Get the cluster a player belongs to

In [165]:
# Take the first element of what get_player_id returns for this name.
# The result is deliberately NOT stored in a variable called `id`: that would
# shadow the builtin id() for every later cell sharing this namespace.
james_id = get_player_id(df, "LeBron James")[0]

# First element of what players_cluster returns for that player.
james_cluster = players_cluster(player_clusters_df, james_id)[0]
james_cluster

StatementMeta(sparkPool01, 3, 195, Finished, Available)

1

## Show the players in the same cluster

In [None]:
# Show what players_in_cluster returns for james_cluster, ordered by PlayerId.
display(
    players_in_cluster(player_clusters_df, james_cluster)
    .orderBy("PlayerId")
)

## Plot the clusters

In [None]:
# Lay the panels out two per row, with enough rows for all k clusters
# (3 rows for k = 6).
n_cols = 2
n_rows = (k + n_cols - 1) // n_cols  # ceiling division

# The height scales with the row count; for three rows this is the 16x8 figure.
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(16, 8 * n_rows / 3),
    squeeze=False,  # always a 2-D array of axes, whatever the grid shape
)
panel_axes = axes.ravel()

# Collect the plotted columns to pandas once, instead of running one Spark job per cluster.
plot_pdf = predictions_df.select(*feature_cols, "prediction").toPandas()

# The loop variable must not be called `k`: that would overwrite the notebook-level
# cluster count and change the result of re-running the clustering cell.
for cluster_id, ax in zip(range(k), panel_axes):
    cluster_pdf = plot_pdf[plot_pdf["prediction"] == cluster_id]
    parallel_coordinates(cluster_pdf, "prediction", ax=ax)
    ax.set_title(f"Cluster {cluster_id}")
    ax.set_ylim(0, 30)

# Hide any panel left over when k does not fill the grid.
for ax in panel_axes[k:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()