# K-means Clustering

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator

# Notebook variables

In [None]:
# Root folder of the input data in the ADLS Gen2 account; passed to load_data() below.
# NOTE: The data_root path must end in a '/'!
# (presumably because file names are appended to it directly — TODO confirm in Common)
data_root = "abfss://msjhrwsynfs@msjhrwsyndl.dfs.core.windows.net/data/totals/"

# Reference Common notebook

In [None]:
%run "Common"

## Load dataframe and replace null "BLK" values

In [None]:
# Load the raw data via the load_data helper (expected to come from the
# Common notebook referenced above).
df = load_data(data_root)

# Approximate quartiles of the "BLK" column (relative error 0.01).
# The quantiles are taken from `df` itself: `career_df` is only created in a
# later cell, so referencing it here fails on a clean top-to-bottom run.
percentiles = [0.25, 0.5, 0.75]
blk_percentiles = df.approxQuantile("BLK", percentiles, 0.01)

# Replace null "BLK" column values with the 50th percentile value.
# Look the median up by its position in `percentiles` instead of relying on
# a negative index that silently changes meaning if the list is edited.
blk_median = blk_percentiles[percentiles.index(0.5)]
df = df.fillna({"BLK": blk_median})

## Generate the career stats dataframe

In [None]:
# Build per-player career totals and the number of years each player played
# (both helpers are expected to come from the Common notebook).
years_played_df = years_played(df)
career_df = career_totals(df)

# Left join keeps every player from the career totals; the result is then
# sorted with the longest careers first.
career_df = career_df.join(years_played_df, on="PlayerId", how="left")
career_df = career_df.orderBy(F.desc("Years_played"))

display(career_df)

## Perform K-means clustering

In [None]:
# Per-game stat columns used as the clustering features.
feature_cols = ["PPG", "RPG", "APG", "BPG"]

# Pack the feature columns into the single vector column the estimator reads.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
feature_df = assembler.transform(career_df)

# Fit K-means with 8 clusters; the fixed seed keeps runs repeatable.
kmeans = KMeans(k=8, seed=123, featuresCol="features")
model = kmeans.fit(feature_df)

# Label every row with its cluster index (added as the "prediction" column).
predictions_df = model.transform(feature_df)

# Keep only the player-to-cluster mapping for the lookups below.
player_clusters_df = predictions_df.select(["PlayerId", "prediction"])

display(player_clusters_df)


## Evaluate the clustering

In [None]:
# Silhouette score of the clustering, using ClusteringEvaluator's defaults.
# The evaluator must receive `predictions_df` (the output of model.transform);
# the previously referenced name `predictions` is never defined in this notebook.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions_df)

# Training cost reported by the fitted model's summary, printed as "Inertia".
inertia = model.summary.trainingCost
print(f"Inertia: {inertia}")
print(f"Silhouette: {silhouette}")

## Show the players in cluster 1

In [None]:
# List the members of cluster 1, sorted by player id for a stable display order.
cluster_1_players_df = players_in_cluster(player_clusters_df, 1)
display(cluster_1_players_df.orderBy("PlayerId"))

## Get the cluster a player belongs to

In [None]:
# Look up the player's id by name and take the first match.
# Stored as `player_id` rather than `id` so the builtin id() is not shadowed
# for the rest of the notebook session.
player_id = get_player_id(df, "Michael Jordan")[0]

# Last expression of the cell, so the notebook renders the player's cluster.
players_cluster(player_clusters_df, player_id)

In [None]:
from pandas.plotting import parallel_coordinates

# Collect only the clustered feature columns and the cluster label.
# predictions_df also carries PlayerId, the assembled "features" vector and
# other non-feature columns; without this restriction they would all be
# pulled to the driver and drawn as axes.
plot_columns = feature_cols + ["prediction"]
data = predictions_df.select(*plot_columns).toPandas()

# One line per player, coloured by cluster, one vertical axis per feature.
parallel_coordinates(data, "prediction", cols=feature_cols)

# NOTE(review): `plt` is not imported in this notebook; presumably it is
# provided by the Common notebook (matplotlib.pyplot) — confirm.
plt.show()