In [None]:
# Stel dat we een dataset hebben met wijkenkenmerken, bijvoorbeeld demografische en sociaaleconomische data
import numpy as np
import pandas as pd
data = pd.read_csv('wijken.csv')  # e.g. a dataset with CBS neighbourhood characteristics
display(data.head())
# Check the dtypes. DataFrame.info() prints its report itself and returns None,
# so it is called directly; wrapping it in display() only adds a stray "None".
data.info()
# Total number of missing values across the whole table
print(data.isna().sum().sum())
# Columns stored as generic objects (typically strings) cannot be fed to the
# clustering steps as-is, so show them for inspection.
object_columns_df = data.select_dtypes(include=['object'])
print(object_columns_df.head())


In [None]:
# Use the 'Codering_3' column as the row index. The guard makes the cell safe
# to re-run: once the column has become the index it is no longer in
# data.columns, and an unguarded set_index would raise a KeyError.
if 'Codering_3' in data.columns:
    data = data.set_index('Codering_3')
data.head()

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def plot_elbow_curve(X, max_k=10):
    """Draw the KMeans elbow curve: inertia as a function of the cluster count.

    One KMeans model (random_state=42) is fitted on X for every k from 1 up to
    and including max_k, and the inertia of each fit is plotted against k.
    """
    candidate_ks = list(range(1, max_k + 1))

    # inertia_ is the sum of squared distances of samples to their closest centre
    inertia_per_k = [
        KMeans(n_clusters=n, random_state=42).fit(X).inertia_
        for n in candidate_ks
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(candidate_ks, inertia_per_k, 'bo-', markersize=8)
    plt.title('Elbow Plot for Optimal k')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia (Sum of Squared Distances)')
    plt.xticks(candidate_ks)
    plt.grid(True)
    plt.show()



# Build the elbow curve on standardized features, the same preprocessing the
# clustering pipelines below apply. On raw values the inertia is dominated by
# whichever columns have the largest numeric range, so the suggested k would
# not match the clustering that is actually run.
from sklearn.preprocessing import StandardScaler

plot_elbow_curve(StandardScaler().fit_transform(data))

Standaard pipeline voor clusteranalyse

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def apply_standard_scaling(data):
    """Return `data` standardized with a freshly fitted StandardScaler."""
    return StandardScaler().fit_transform(data)

def apply_kmeans(data, n_clusters=8):
    """Fit KMeans (random_state=42) on `data`.

    Returns a tuple ``(labels, centroids)`` taken from the fitted model's
    ``labels_`` and ``cluster_centers_`` attributes.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    return model.labels_, model.cluster_centers_

def apply_pca(data, n_components=2):
    """Fit a new PCA on `data` and return its projection onto `n_components` axes."""
    return PCA(n_components=n_components).fit_transform(data)

def plot_clusters(pca_data, labels, centroids):
    """Scatter the 2D projection coloured by cluster and mark the centroids.

    The first two columns of `pca_data` and of `centroids` are used as the
    x and y coordinates.
    """
    xs, ys = pca_data[:, 0], pca_data[:, 1]
    centre_xs, centre_ys = centroids[:, 0], centroids[:, 1]

    plt.figure(figsize=(8, 6))

    # Samples, coloured by their cluster label
    points = plt.scatter(xs, ys, c=labels, cmap='viridis', alpha=0.5)

    # Cluster centres as large red crosses
    plt.scatter(centre_xs, centre_ys, s=300, c='red', label='Centroids', marker='X')

    plt.title('PCA of Clusters')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.colorbar(points)
    plt.show()

def plot_centroid_analysis(centroids):
    """Placeholder for a plot of the centroid differences per feature.

    Not implemented yet: the argument is accepted and nothing is done.
    """
    return None

# Main pipeline

def main(data):
    """Run the KMeans pipeline: standardize, cluster, project with PCA and plot.

    Parameters
    ----------
    data : feature table accepted by StandardScaler, one row per sample.

    Returns
    -------
    The KMeans cluster label for every row of `data`.
    """
    # Step 1: standardize the data (optional, but recommended)
    scaled = apply_standard_scaling(data)

    # Step 2: apply KMeans clustering; the centroids live in the scaled space
    k = 3  # Number of clusters
    labels, centroids = apply_kmeans(scaled, n_clusters=k)

    # Step 3: fit ONE PCA on the scaled samples for the 2D visualization
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled)

    # Step 4: project the centroids with that same fitted PCA. Fitting a
    # separate PCA on the centroids alone would give unrelated axes, so the
    # red markers would no longer sit inside their clusters.
    pca_centroids = pca.transform(centroids)
    plot_clusters(pca_data, labels, pca_centroids)
    plot_centroid_analysis(centroids)
    return labels


# Run the KMeans + PCA pipeline. Each later pipeline cell assigns `labels`
# again, so this value is only current until the next pipeline cell runs.
labels = main(data)


Hoe ziet het eruit met t-SNE? (t-SNE is niet-lineair, dus de centroids kunnen niet in dezelfde projectie worden getekend.)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def apply_standard_scaling(data):
    """Return `data` standardized with a freshly fitted StandardScaler."""
    return StandardScaler().fit_transform(data)

def apply_kmeans(data, n_clusters=8):
    """Fit KMeans (random_state=42) on `data` and return only the labels."""
    return KMeans(n_clusters=n_clusters, random_state=42).fit(data).labels_

def apply_tsne(data, n_components=2, perplexity=30):
    """Embed `data` with t-SNE (random_state=42) and return the embedding."""
    reducer = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        random_state=42,
    )
    embedding = reducer.fit_transform(data)
    return embedding

def plot_clusters(tsne_data, labels):
    """Scatter the 2D t-SNE embedding, coloured by cluster label.

    The first two columns of `tsne_data` are the x and y coordinates. Both a
    per-cluster legend and a colour bar are drawn.
    """
    xs, ys = tsne_data[:, 0], tsne_data[:, 1]

    plt.figure(figsize=(8, 6))
    points = plt.scatter(xs, ys, c=labels, cmap='viridis', alpha=0.7)

    plt.title('t-SNE of Clusters')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')

    # legend_elements() yields one (handle, text) pair per distinct colour value
    handles, texts = points.legend_elements()
    plt.legend(handles, texts, title="Clusters")
    plt.colorbar(points)
    plt.show()

# Main pipeline

def main(data):
    """Run the KMeans pipeline with a t-SNE view: scale, cluster, embed, plot.

    Returns the KMeans labels for the rows of `data`.
    """
    scaled = apply_standard_scaling(data)

    # Three clusters, found on the standardized features
    cluster_labels = apply_kmeans(scaled, n_clusters=3)

    # 2D embedding for plotting only; adjust perplexity to the dataset size
    embedding = apply_tsne(scaled, perplexity=10)

    plot_clusters(embedding, cluster_labels)
    return cluster_labels


# Run the KMeans + t-SNE pipeline; `labels` now holds these KMeans labels and
# is what the feature-importance cell below receives.
labels = main(data)


In [None]:
# Train a random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Single module-level model: feature_importance() below refits this same
# object on every call.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

def feature_importance(X, clusters):
    """Fit the module-level `clf` on the cluster labels and plot its feature importances.

    `X` must expose `.columns` (the feature names); `clusters` is the target
    passed to `clf.fit`. The importances are shown as a horizontal bar chart,
    most important feature at the top.
    """
    import matplotlib.pyplot as plt

    clf.fit(X, clusters)

    # One row per feature, ordered from most to least important
    ranked = pd.DataFrame(
        {'Feature': X.columns, 'Importance': clf.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(8, 20))
    plt.barh(ranked['Feature'], ranked['Importance'])
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance van Cluster Classificatie')
    # barh draws the first row at the bottom; flip so the top feature is on top
    plt.gca().invert_yaxis()
    plt.show()

# Uses whatever `labels` currently holds; when the notebook is run top to
# bottom these are the KMeans labels from the t-SNE pipeline cell above.
feature_importance(data, labels)

Om de code aan te passen om DBSCAN (Density-Based Spatial Clustering of Applications with Noise) te gebruiken in plaats van KMeans, moet je de volgende aanpassingen maken:

- **DBSCAN gebruikt geen centroiden**, in tegenstelling tot KMeans. We hoeven dus geen centroiden te berekenen of te visualiseren.
- **DBSCAN clustering**: DBSCAN wijst een label van `-1` toe aan ruispunten die tot geen enkele cluster behoren. We moeten hiermee rekening houden bij het plotten.
- **PCA voor dimensiereductie**: We zullen nog steeds PCA gebruiken om de clusters in 2D te visualiseren.
- **DBSCAN parameters**: DBSCAN heeft twee belangrijke parameters: `eps` (de maximale afstand tussen twee samples om als buren te worden beschouwd) en `min_samples` (het minimum aantal punten dat nodig is om een dichte regio te vormen). Je moet mogelijk deze parameters aanpassen op basis van je dataset.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def apply_standard_scaling(data):
    """Return `data` standardized with a freshly fitted StandardScaler."""
    return StandardScaler().fit_transform(data)

def apply_dbscan(data, eps=0.5, min_samples=5):
    """Cluster `data` with DBSCAN and return the label assigned to each sample."""
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)

def apply_pca(data, n_components=2):
    """Fit a new PCA on `data` and return its projection onto `n_components` axes."""
    return PCA(n_components=n_components).fit_transform(data)

def plot_clusters(pca_data, labels):
    """Plot the PCA-reduced data with DBSCAN cluster assignments.

    DBSCAN gives noise points the label -1. They are drawn separately as grey
    crosses with their own legend entry, so they are not mistaken for a
    regular cluster on the colour scale.

    Parameters
    ----------
    pca_data : 2D array-like; the first two columns are the x and y coordinates.
    labels : one cluster label per row of `pca_data`.
    """
    pca_data = np.asarray(pca_data)
    labels = np.asarray(labels)
    is_noise = labels == -1
    is_clustered = ~is_noise

    plt.figure(figsize=(8, 6))

    # Clustered samples, coloured by label (skipped when everything is noise)
    if is_clustered.any():
        scatter = plt.scatter(
            pca_data[is_clustered, 0], pca_data[is_clustered, 1],
            c=labels[is_clustered], cmap='viridis', alpha=0.7,
        )
        plt.colorbar(scatter, label='Cluster Label')

    # Noise samples in a neutral colour, kept off the cluster colour scale
    if is_noise.any():
        plt.scatter(
            pca_data[is_noise, 0], pca_data[is_noise, 1],
            c='lightgrey', marker='x', alpha=0.7, label='Noise (-1)',
        )
        plt.legend()

    plt.title('PCA of DBSCAN Clusters')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

# Main pipeline

def main(data):
    """Run the DBSCAN pipeline: scale, cluster, project with PCA and plot.

    Returns the DBSCAN label for every row of `data`.
    """
    scaled = apply_standard_scaling(data)

    # Adjust eps and min_samples to the dataset.
    # NOTE(review): eps=0.05 is ten times smaller than apply_dbscan's default
    # of 0.5 -- confirm it is not so strict that most points end up as noise.
    cluster_labels = apply_dbscan(scaled, eps=0.05, min_samples=5)

    # 2D projection is for plotting only; clustering used all scaled features
    plot_clusters(apply_pca(scaled), cluster_labels)
    return cluster_labels

# Assumes the DataFrame `data` from the cells above is available and cleaned.
# `labels` is overwritten here with the DBSCAN labels.
labels = main(data)


Nu pas ik de code aan voor hiërarchische agglomeratieve clustering (HAC).

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

def apply_standard_scaling(data):
    """Return `data` standardized with a freshly fitted StandardScaler."""
    return StandardScaler().fit_transform(data)

def apply_hac(data, n_clusters=3, linkage='ward'):
    """Cluster `data` with AgglomerativeClustering and return the label per sample."""
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    return model.fit_predict(data)

def apply_pca(data, n_components=2):
    """Fit a new PCA on `data` and return its projection onto `n_components` axes."""
    return PCA(n_components=n_components).fit_transform(data)

def plot_clusters(pca_data, labels):
    """Scatter the 2D PCA projection, coloured by HAC cluster label.

    The first two columns of `pca_data` are the x and y coordinates.
    """
    xs, ys = pca_data[:, 0], pca_data[:, 1]

    plt.figure(figsize=(8, 6))
    points = plt.scatter(xs, ys, c=labels, cmap='viridis', alpha=0.7)

    plt.title('PCA of HAC Clusters')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.colorbar(points, label='Cluster Label')
    plt.show()

# Main pipeline

def main(data):
    """Run the HAC pipeline: scale, cluster, project with PCA and plot.

    Returns the agglomerative-clustering label for every row of `data`.
    """
    scaled = apply_standard_scaling(data)

    # Two clusters with Ward linkage; adjust n_clusters and linkage as needed
    cluster_labels = apply_hac(scaled, n_clusters=2, linkage='ward')

    # 2D projection is for plotting only; clustering used all scaled features
    plot_clusters(apply_pca(scaled), cluster_labels)
    return cluster_labels

# Assumes the DataFrame `data` from the cells above is available and cleaned.
# `labels` is overwritten here with the HAC labels.
labels = main(data)



In [None]:

from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

def plot_dendrogram(data, method='ward'):
    """Compute the hierarchical linkage of `data` and draw it as a dendrogram.

    `method` is forwarded to scipy's `linkage` to select the linkage criterion.
    """
    merge_tree = linkage(data, method=method)

    plt.figure(figsize=(10, 7))
    dendrogram(merge_tree)
    plt.title('Dendrogram')
    plt.xlabel('Samples')
    plt.ylabel('Distance')
    plt.show()

# Draw the dendrogram on the standardized features, i.e. the same input the
# HAC pipeline above clusters. On the raw table the Ward distances would be
# dominated by the columns with the largest numeric range, and the tree would
# not correspond to the HAC labels.
plot_dendrogram(apply_standard_scaling(data), method='ward')

In [None]:
# Uses whatever `labels` currently holds; when the notebook is run top to
# bottom these are the HAC labels from the pipeline cell above.
feature_importance(data, labels)