In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

def load_and_prepare_data(input_path):
    """Read the clustering table and return its feature columns.

    Rows whose ``TCITY15NM`` value equals ``'London'`` are removed, and the
    ``TCITY15NM`` column itself is dropped from the result. The original
    row index of the CSV is kept (it is not reset after filtering).

    Args:
        input_path: Path to a CSV file containing a ``TCITY15NM`` column.

    Returns:
        tuple: ``(X, data_cols)`` where ``X`` is the filtered DataFrame
        without ``TCITY15NM`` and ``data_cols`` is the list of its column
        names, in order.
    """
    raw_table = pd.read_csv(input_path)
    keep_row = raw_table['TCITY15NM'] != 'London'
    without_london = raw_table.loc[keep_row].copy()
    features = without_london.drop(columns=['TCITY15NM'])
    return features, list(features.columns)

def train_kmeans_model(X_train, num_clusters):
    """Fit the scale -> normalize -> PCA -> K-means pipeline and score it.

    The data is standardised with ``StandardScaler``, passed through
    ``normalize`` (default settings), projected onto two principal
    components, and clustered with K-means (``random_state=42``). All three
    quality metrics are computed on the 2-component projection.

    Args:
        X_train: Feature table/array accepted by ``StandardScaler``.
        num_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        tuple: ``(kmeans_model, scaler, pca, labels, X_principal,
        silhouette_avg, davies_bouldin, calinski_harabasz)``.
    """
    scaler = StandardScaler()
    pca = PCA(n_components=2)
    # NOTE(review): n_init is left at the library default, which may differ
    # between scikit-learn releases — confirm whether it should be pinned.
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)

    # Preprocessing chain: standardise, normalise, then reduce to 2-D.
    X_principal = pca.fit_transform(normalize(scaler.fit_transform(X_train)))
    labels = kmeans_model.fit_predict(X_principal)

    # Score the partition in the same space the model was fitted in.
    metric_functions = (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    silhouette_avg, davies_bouldin, calinski_harabasz = (
        metric(X_principal, labels) for metric in metric_functions
    )

    return (
        kmeans_model,
        scaler,
        pca,
        labels,
        X_principal,
        silhouette_avg,
        davies_bouldin,
        calinski_harabasz,
    )

def plot_variable_distributions(X_train, labels, run_path, cols):
    """Save per-cluster ECDF and KDE grids for every column in ``cols``.

    Two PNG files are written to ``<run_path>/variable_distributions``
    (created if it does not exist): ``ecdf_all_distributions.png`` and
    ``kde_all_distributions.png``. Each figure has one subplot per column,
    two subplots per row, with one curve per distinct value in ``labels``.

    Args:
        X_train: Tabular data convertible to a DataFrame with columns ``cols``.
        labels: Cluster label for each row of ``X_train``.
        run_path: Directory under which the plots folder is created.
        cols: Names of the columns to plot.
    """
    frame = pd.DataFrame(X_train, columns=cols)
    frame['Cluster'] = labels

    plots_folder = os.path.join(run_path, 'variable_distributions')
    os.makedirs(plots_folder, exist_ok=True)

    grid_width = 2
    grid_height = -(-len(cols) // grid_width)  # ceiling division
    cluster_ids = np.unique(labels)

    # (seaborn plotter, extra keyword args, title prefix, y-axis label, file name)
    plot_specs = (
        (sns.ecdfplot, {}, 'ECDF', 'ECDF', 'ecdf_all_distributions.png'),
        (sns.kdeplot, {'fill': True}, 'KDE', 'Density', 'kde_all_distributions.png'),
    )

    for draw, extra_kwargs, kind, y_label, filename in plot_specs:
        plt.figure(figsize=(11, 5 * grid_height))
        for position, col in enumerate(cols, start=1):
            plt.subplot(grid_height, grid_width, position)
            for cluster in cluster_ids:
                # Missing values are dropped per cluster before plotting.
                values = frame.loc[frame['Cluster'] == cluster, col].dropna()
                draw(values, label=f'Cluster {cluster}', **extra_kwargs)
            plt.title(f'{kind} of {col} by Cluster')
            plt.xlabel(col)
            plt.ylabel(y_label)
            plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(os.path.join(plots_folder, filename))
        # Close the figure once saved so figures do not accumulate.
        plt.close()

def save_results(output_path, num_clusters, kmeans_model, scaler, pca, labels, X_principal, silhouette_avg, davies_bouldin, calinski_harabasz):
    """Persist one clustering run under ``<output_path>/<num_clusters>/``.

    Writes the three fitted objects with ``joblib``, the labels and the
    PCA-projected data as CSV files, and a plain-text ``run_summary.txt``
    with the timestamp and the three clustering metrics.

    Args:
        output_path: Root results directory; the run folder is created inside it.
        num_clusters: Cluster count; used as the run folder name.
        kmeans_model: Fitted clustering model, saved as ``kmeans_model.pkl``.
        scaler: Fitted scaler, saved as ``scaler.pkl``.
        pca: Fitted PCA object, saved as ``pca.pkl``.
        labels: Cluster label per data point, saved as ``labels.csv``.
        X_principal: PCA-projected data, saved as ``X_principal.csv`` with
            columns ``PC1 .. PCn`` (one per component actually present).
        silhouette_avg: Silhouette score written to the summary.
        davies_bouldin: Davies-Bouldin index written to the summary.
        calinski_harabasz: Calinski-Harabasz index written to the summary.
    """
    run_path = os.path.join(output_path, str(num_clusters))
    os.makedirs(run_path, exist_ok=True)

    for filename, fitted_object in (
        ('kmeans_model.pkl', kmeans_model),
        ('scaler.pkl', scaler),
        ('pca.pkl', pca),
    ):
        joblib.dump(fitted_object, os.path.join(run_path, filename))

    pd.DataFrame(labels, columns=['Cluster']).to_csv(os.path.join(run_path, 'labels.csv'), index=False)

    # Derive the component column names from the data's actual width, so a
    # projection with a number of components other than two is saved
    # instead of raising on a column-count mismatch.
    components = pd.DataFrame(np.asarray(X_principal))
    components.columns = [f'PC{i + 1}' for i in range(components.shape[1])]
    components.to_csv(os.path.join(run_path, 'X_principal.csv'), index=False)

    # The summary keeps its historical layout: a leading blank line and a
    # four-space indent on every line, including the separator lines.
    summary_lines = [
        "",
        "    K-means Clustering Run Summary",
        "    ==============================",
        f"    Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "    ",
        "    Clustering Metrics:",
        f"    - Silhouette Score: {silhouette_avg:.4f}",
        f"    - Davies-Bouldin Index: {davies_bouldin:.4f}",
        f"    - Calinski-Harabasz Index: {calinski_harabasz:.4f}",
        "    ",
        "    Files saved:",
        "    - kmeans_model.pkl: Trained K-means model",
        "    - scaler.pkl: StandardScaler object",
        "    - pca.pkl: PCA object",
        "    - labels.csv: Cluster labels for each data point",
        "    - X_principal.csv: PCA-transformed data",
        "    - ecdf_all_distributions.png: ECDF plots for all variables",
        "    - kde_all_distributions.png: KDE plots for all variables",
        "    ",
    ]
    summary = "\n".join(summary_lines)

    with open(os.path.join(run_path, 'run_summary.txt'), 'w', encoding='utf-8') as f:
        f.write(summary)

In [9]:

# NOTE(review): these are absolute, machine-specific locations; adjust them
# (or derive them from a configurable base directory) when running elsewhere.
input_path = "/Users/gracecolverd/City_clustering/resv3_clustering_data.csv"
output_path = '/Users/gracecolverd/City_clustering/clustering_results/kmeans'

# The feature table does not depend on the cluster count, so read the CSV
# once instead of re-parsing it on every iteration of the sweep.
X, data_cols = load_and_prepare_data(input_path)

# Sweep k = 5..12; each run is written to its own <output_path>/<k>/ folder.
for num_clusters in [5, 6, 7, 8, 9, 10, 11, 12]:
    (
        kmeans_model,
        scaler,
        pca,
        labels,
        X_principal,
        silhouette_avg,
        davies_bouldin,
        calinski_harabasz,
    ) = train_kmeans_model(X, num_clusters)
    plot_variable_distributions(X, labels, os.path.join(output_path, str(num_clusters)), data_cols)
    save_results(
        output_path,
        num_clusters,
        kmeans_model,
        scaler,
        pca,
        labels,
        X_principal,
        silhouette_avg,
        davies_bouldin,
        calinski_harabasz,
    )




In [None]:

    K-means Clustering Run Summary
    ==============================
    Date: 2024-09-27 17:07:21
    
    Clustering Metrics:
    - Silhouette Score: 0.4547
    - Davies-Bouldin Index: 0.6935
    - Calinski-Harabasz Index: 151.7551
    
    Files saved:
    - kmeans_model.pkl: Trained K-means model
    - scaler.pkl: StandardScaler object
    - pca.pkl: PCA object
    - labels.csv: Cluster labels for each data point
    - X_principal.csv: PCA-transformed data
    - ecdf_all_distributions.png: ECDF plots for all variables
    - kde_all_distributions.png: KDE plots for all variables

108

In [None]:

    K-means Clustering Run Summary
    ==============================
    Date: 2024-09-27 17:05:10
    
    Clustering Metrics:
    - Silhouette Score: 0.4477
    - Davies-Bouldin Index: 0.7023
    - Calinski-Harabasz Index: 152.9982
    
    Files saved:
    - kmeans_model.pkl: Trained K-means model
    - scaler.pkl: StandardScaler object
    - pca.pkl: PCA object
    - labels.csv: Cluster labels for each data point
    - X_principal.csv: PCA-transformed data
    - ecdf_all_distributions.png: ECDF plots for all variables
    - kde_all_distributions.png: KDE plots for all variables
    

In [None]:

    Spectral Clustering Run Summary
    ===============================
    Date: 2024-09-27 17:01:27
    
    Clustering Metrics:
    - Silhouette Score: 0.4223
    - Davies-Bouldin Index: 0.6641
    - Calinski-Harabasz Index: 123.5564
    

    
    Files saved:
    - spectral_model_rbf.pkl: Trained Spectral Clustering model
    - scaler.pkl: StandardScaler object
    - pca.pkl: PCA object
    - labels.csv: Cluster labels for each data point
    - X_principal.csv: PCA-transformed data
    - ecdf_all_distributions.png: ECDF plots for all variables
    - kde_all_distributions.png: KDE plots for all variables

    