# Finding the appropriate number of clusters

In [None]:
%matplotlib inline

In [None]:
import sklearn
import sklearn.cluster as clustering
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import numpy as np
import seaborn as sns
import matplotlib.cm as cm


In [None]:
# Read the combined ASOS CSV (2023, per the file name) from the shared directory.
combined_asos = pd.read_csv(
    filepath_or_buffer="/share/share/combined_asos_2023.csv",
)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
combined_asos

In [None]:
# Collapse the table to one row per station by averaging its lat/lon columns.
stations_lls = (
    combined_asos
    .groupby("station")[["lat", "lon"]]
    .mean()
)

In [None]:
# Quick map of the stations: longitude on x, latitude on y.
plt.scatter(x=stations_lls['lon'], y=stations_lls['lat'])

In [None]:
# Baseline 5-cluster KMeans fit on the per-station mean lat/lon.
# n_init='auto' is passed explicitly, matching the other KMeans calls in this
# notebook, so the number of initialisations does not depend on the installed
# scikit-learn version's default (and no FutureWarning is raised on 1.2/1.3).
my_clustering = clustering.KMeans(n_clusters=5, n_init='auto').fit(stations_lls)

## The Elbow Method

In [None]:
# Elbow Method
# Fit KMeans on the station lat/lon table for every k in 1..20 and keep each
# fit's inertia_, used here as the sum of squared errors (SSE) for that k.
num_clusters = np.arange(1, 21, 1)
sses = np.empty(num_clusters.shape)
for idx, k in enumerate(num_clusters):
    curr_clusters = clustering.KMeans(
        n_clusters=k,
        max_iter=1000,
        n_init='auto',
    ).fit(stations_lls)
    # idx is the position of k in num_clusters (k - 1, since k starts at 1).
    sses[idx] = curr_clusters.inertia_

In [None]:
# Draw the elbow curve: SSE against the number of clusters tried.
sns.set_style("darkgrid")
plt.plot(num_clusters, sses)
plt.xticks(ticks=np.arange(1, 21, 2))  # label every other k
plt.grid(True)
plt.xlim(left=1, right=20)
plt.ylim(bottom=0, top=500000)  # fixed y-range chosen for this dataset
plt.xlabel(xlabel="Number of clusters")
plt.ylabel(ylabel="Sum of Squared Errors")

In [None]:
# Per-station averages of position (lat, lon) and the tmpf/dwpf/sknt columns.
station_mean_vals = (
    combined_asos
    .groupby('station')[['lat', 'lon', 'tmpf', 'dwpf', 'sknt']]
    .mean()
)

In [None]:
# Discard every station row that has a missing value in any column
# (the dropna defaults, spelled out).
station_mean_vals = station_mean_vals.dropna(axis=0, how="any")

In [None]:
# Bare expression: the notebook renders the cleaned per-station table as this cell's output.
station_mean_vals

In [None]:
# Elbow Method
# Same sweep over k = 1..20, but clustering on the tmpf/dwpf/sknt station
# means instead of position; each fit's inertia_ is stored as that k's SSE.
num_clusters = np.arange(1, 21, 1)
sses = np.empty(num_clusters.shape)
for idx, k in enumerate(num_clusters):
    curr_clusters = clustering.KMeans(
        n_clusters=k,
        max_iter=1000,
        n_init='auto',
    ).fit(station_mean_vals[['tmpf', 'dwpf', 'sknt']])
    # idx is the position of k in num_clusters (k - 1, since k starts at 1).
    sses[idx] = curr_clusters.inertia_

In [None]:
# Draw the elbow curve: SSE against the number of clusters tried.
# No y-limit is set here, so matplotlib autoscales the SSE axis.
sns.set_style("darkgrid")
plt.plot(num_clusters, sses)
plt.xticks(ticks=np.arange(1, 21, 2))  # label every other k
plt.grid(True)
plt.xlim(left=1, right=20)
plt.xlabel(xlabel="Number of clusters")
plt.ylabel(ylabel="Sum of Squared Errors")

In [None]:
# Fit the 5-cluster model on the tmpf/dwpf/sknt station means.
curr_clusters = clustering.KMeans(
    n_clusters=5,
    max_iter=1000,
    n_init='auto',
).fit(station_mean_vals[['tmpf', 'dwpf', 'sknt']])

In [None]:
# Map the stations by position, coloured by the cluster label each was assigned.
plt.scatter(
    x=station_mean_vals['lon'],
    y=station_mean_vals['lat'],
    c=curr_clusters.labels_,
)

## Silhouette Method

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples

In [None]:
# Silhouette Method
# adapted from sklearn documentation
#
# For each candidate k, fit KMeans on the tmpf/dwpf/sknt station means, store
# the average silhouette score in `scores`, and draw one silhouette plot
# (one filled band per cluster) for that k.

num_clusters = np.arange(2, 11, 2)
scores = np.empty(num_clusters.shape)
# The same three feature columns feed the fit and both silhouette calls.
features = station_mean_vals[['tmpf', 'dwpf', 'sknt']]
# `idx` replaces the old loop name `iter`, which shadowed the builtin.
for idx, k in enumerate(num_clusters):
    fig, ax1 = plt.subplots(1, 1)
    curr_clusters = clustering.KMeans(n_clusters=k, max_iter=1000, n_init='auto').fit(features)
    labels = curr_clusters.labels_

    sil_score = silhouette_score(features, labels)
    # Record the per-k average; without this `scores` keeps whatever
    # uninitialised values np.empty handed back.
    scores[idx] = sil_score
    sample_vals = silhouette_samples(features, labels)

    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_vals[labels == i]

        # Boolean indexing returned a copy, so the in-place sort does not
        # disturb sample_vals.
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=sil_score, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.show()
    # Close each figure after showing it so open figures do not pile up across k.
    plt.close(fig)
