Silhouette Score Analysis for optimizing KMeans
---
Diogo Pessoa

In [None]:
"""Uncomment this Section if running locally or working on this notebook and loading dataset directly here."""
# Optional standalone setup, kept commented out. When enabled it loads a `.env` file,
# reads `images_path` and `data_dir` from the environment, appends the parent directory
# to `sys.path`, and runs the data-collection and feature-engineering notebooks.
# The plotting cell further down uses `images_path`, and the scaling cell reads
# `sampled_df_with_added_features_indexed`.
# NOTE(review): presumably a parent notebook supplies those names when this cell stays
# commented out — confirm.
#
# import os
# import sys
#
# from dotenv import load_dotenv
#
# # Load environment variables from a .env file
# load_dotenv()
# images_path = os.getenv('IMAGES_PATH')
# data_dir = os.getenv('DATA_COLLECTION_DIR')
#
# # Loading local helper modules
# module_path = os.path.abspath(os.path.join('..'))
# if module_path not in sys.path:
#     sys.path.append(module_path)
# # data_collection
# %run './data_collection.ipynb'
# # features engineering
# %run './feature_engineering.ipynb'
# """
# Local Dataset from local files (download if not present)
#  :returns sampled_df_with_added_features: DataFrame, sampled_df_with_added_features_indexed: DataFrame
# """

In [None]:
# Names of the input columns that get assembled into the clustering feature vector.
features = [
    'start_station_id_index',
    'day_period_index',
]

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Stage 1: pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Stage 2: rescale that vector into "scaled_features"; standard-deviation scaling is
# enabled (withStd=True) and mean-centering is disabled (withMean=False).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=False,
)

# Chain both stages so they are fitted and applied together.
pipeline = Pipeline(stages=[assembler, scaler])

# Fit the pipeline on the indexed DataFrame, then apply it to that same DataFrame.
sampled_df_scaled = (
    pipeline
    .fit(sampled_df_with_added_features_indexed)
    .transform(sampled_df_with_added_features_indexed)
)


In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Score each clustering on the same column KMeans is trained on. ClusteringEvaluator
# reads the "features" column unless told otherwise, and the predictions DataFrame still
# carries that unscaled column next to "scaled_features", so leaving the default would
# silently measure the silhouette on different vectors than the ones that were clustered.
evaluator = ClusteringEvaluator(featuresCol="scaled_features")

# Candidate cluster counts: 2 through 10 inclusive.
k_values = list(range(2, 11))

# Silhouette score for each entry of k_values, in the same order.
silhouette_scores = []

for k in k_values:
    # Fixed seed so repeated runs produce the same clustering for a given k.
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("scaled_features")

    # Fit the model to the scaled data.
    model = kmeans.fit(sampled_df_scaled)

    # Add the cluster assignment column to the dataset.
    predictions = model.transform(sampled_df_scaled)

    # Evaluate the clustering and record its score.
    silhouette = evaluator.evaluate(predictions)
    silhouette_scores.append(silhouette)




In [None]:
# `os` is imported here because this cell calls os.path.join and no other active cell
# imports it; without this the cell fails with NameError on a fresh kernel.
import os

import numpy as np
from matplotlib import pyplot as plt

# Plot the silhouette score obtained for each candidate k.
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, color='orange')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette score vs. Number of clusters (k)')
# One tick per candidate k.
plt.xticks(np.arange(min(k_values), max(k_values) + 1, 1.0))
plt.grid(True)
# `images_path` is not defined in this cell; it must already exist in the kernel
# namespace (the commented-out setup cell reads it from the IMAGES_PATH variable).
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig(os.path.join(images_path, 'silhouette_score_day_period_start_stations_vs_number_of_clusters.png'))
plt.show()

# Report the k with the highest silhouette score; on ties the smallest such k wins
# because list.index returns the first match.
best_score = max(silhouette_scores)
optimal_k = k_values[silhouette_scores.index(best_score)]
print(f"The optimal number of clusters k is: {optimal_k}")
