In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.decomposition import PCA


# function to reduce dimensionality 
def pca(data):
    """Project ``data`` onto its first two principal components.

    Args:
        data: Input accepted by ``PCA.fit_transform``.

    Returns:
        A ``(frame, headers)`` tuple: ``frame`` is a DataFrame holding the two
        projected components and ``headers`` is the list of its column names.
    """
    component_names = ['principal component 1', 'principal component 2']
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(data)
    frame = pd.DataFrame(data=projected, columns=component_names)
    return frame, list(frame.columns)

def elbow_plot(data, title, features, max_k=499):
    """Plot the K-means elbow curve for ``data[features]`` and locate its knee.

    K-means is fitted for every ``k`` from 1 up to ``max_k`` (capped at the
    number of samples) and the inertia of each fit is plotted against ``k``.
    The knee of the curve, when one is found, is highlighted on the figure.

    Args:
        data: Container indexed as ``data[features]`` to obtain the samples
            to cluster.
        title: Title placed on the figure.
        features: Key(s) used to select the clustering features from ``data``.
        max_k: Largest number of clusters to try. Defaults to 499, the upper
            end of the sweep this function has always used.

    Returns:
        The number of clusters at the detected elbow, or ``None`` when no
        elbow could be determined.

    Raises:
        ValueError: If ``max_k`` is smaller than 1 or there are no samples.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    # Select the feature columns once; the selection does not depend on k.
    samples = data[features]
    n_samples = len(samples)
    if n_samples == 0:
        raise ValueError("elbow_plot requires at least one sample")

    # KMeans cannot fit more clusters than there are samples, so cap the sweep
    # instead of failing part-way through on small datasets.
    K = range(1, min(max_k, n_samples) + 1)
    distortions = [KMeans(n_clusters=k).fit(samples).inertia_ for k in K]

    # A knee is not meaningful with fewer than three points on the curve,
    # so skip detection for tiny sweeps.
    optimal_k = None
    if len(K) >= 3:
        kneedle = KneeLocator(K, distortions, curve='convex', direction='decreasing')
        optimal_k = kneedle.elbow

    plt.figure(figsize=(10, 6))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')
    plt.title(title)
    plt.grid(True)

    # Highlight the optimal k
    if optimal_k is not None:
        # K starts at 1, so the distortion for k lives at index k - 1.
        knee_distortion = distortions[optimal_k - 1]
        plt.axvline(x=optimal_k, color='r', linestyle='--')
        plt.scatter(optimal_k, knee_distortion, color='red', s=100, zorder=5)
        plt.text(optimal_k, knee_distortion, f'  k={optimal_k}', fontsize=12, color='red', verticalalignment='bottom')

    return optimal_k



# Function to generate inducing points
def k_means(data, n_clusters):
    """Return the K-means cluster centres of ``data`` for use as inducing points.

    Args:
        data: Samples passed to ``KMeans.fit``.
        n_clusters: Number of clusters (and therefore centres) to compute.

    Returns:
        The fitted model's ``cluster_centers_``.
    """
    # Seed fixed at 0 so repeated calls yield the same centres.
    model = KMeans(n_clusters=n_clusters, random_state=0)
    model.fit(data)
    return model.cluster_centers_

def k_means_sillohette(data, n_clusters):
    """Cluster ``data`` with K-means and report the silhouette score.

    The score is printed and also returned.

    Args:
        data: Samples passed to ``KMeans.fit`` and ``silhouette_score``.
        n_clusters: Number of clusters to fit.

    Returns:
        The silhouette score of the resulting labelling.

    Raises:
        ValueError: Propagated from ``silhouette_score`` when the labelling is
            not scorable (per the scikit-learn documentation it needs at least
            2 and at most ``n_samples - 1`` distinct labels).
    """
    # Use the same fixed seed as k_means so the score is reproducible and
    # describes the same clustering that k_means produces for these inputs.
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
    sil_score = silhouette_score(data, model.labels_)
    print(f'Silhouette Score: {sil_score}')
    return sil_score


  