Import dependencies.

In [None]:
# To mute annoying warnings in notebook
import warnings

# For Data science
import pandas as pd
import numpy as np

# Math plot
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import (
    preprocessing,
    metrics,
    decomposition,
    cluster,
)

from clustergram import Clustergram

# NOTE: with no category or module given, this hides every warning for the rest
# of the kernel session, not only the noisy ones.
warnings.filterwarnings("ignore")

# Getting data, observations
## Get dataset

In [None]:
# Load the space-separated dataset file into a dataframe
data = pd.read_csv("../data/SouthGermanCredit.asc", sep=" ")

In [None]:
# Print a concise summary of the dataframe (columns, non-null counts, dtypes)
data.info()

# Preprocessing

## Scaling

In [None]:
# Fit a standard scaler on the dataset ...
scaler = preprocessing.StandardScaler()
scaler.fit(data)

# ... and apply it to the same dataset
X_scaled = scaler.transform(data)

# Processing

## Clustering

Choose the data and the maximal cluster amount for the agglomerative and k-means methods.

In [None]:
# Use the scaled data as the input of the clustering cells below
data_to_clustering = X_scaled

# Maximal cluster amount to test; the agglomerative and k-means loops below
# iterate over range(2, cluster_amount + 1), i.e. this bound is included
cluster_amount = 10

## Agglomerative

In [None]:
# Set metric data lists (one entry per tested cluster number)
silhouette_agglomerative = []
davies_agglomerative = []
calinski_agglomerative = []
# Nothing in this cell fills this list; the name is kept so it stays defined
distortions_agglomerative = []

# Iterate over cluster number
for i in range(2, cluster_amount + 1):
    clustering = cluster.AgglomerativeClustering(
        n_clusters=i,
    ).fit(data_to_clustering)

    # Reuse the labels of the model fitted above instead of fitting it a second
    # time with fit_predict; after the loop this holds the labels of the last
    # (largest) cluster number
    agglomerative_labels = clustering.labels_

    ss = metrics.silhouette_score(data_to_clustering, agglomerative_labels)
    silhouette_agglomerative.append(ss)

    dbs = metrics.davies_bouldin_score(data_to_clustering, agglomerative_labels)
    davies_agglomerative.append(dbs)

    chs = metrics.calinski_harabasz_score(data_to_clustering, agglomerative_labels)
    calinski_agglomerative.append(chs)

Look at clusters by label

In [None]:
# Put every entry position next to the cluster label it received
agglomerative_labels_labels_df = pd.DataFrame(
    {
        "Index": list(range(len(agglomerative_labels))),
        "Label": list(agglomerative_labels),
    },
    columns=["Index", "Label"],
)

# Count entries per label value
label_counts = agglomerative_labels_labels_df.groupby("Label").size()

# Show label distribution as a bar chart
label_counts.plot.bar(
    title="Clusters from agglomerative",
    xlabel="Cluster label",
    ylabel="Cluster entries",
);

## K-means

In [None]:
# Metric containers, one entry per tested cluster number
silhouette_KMeans, davies_KMeans, calinski_KMeans, distortions_KMeans = [], [], [], []

# Fit k-means for every cluster number and record its quality metrics
for k in range(2, cluster_amount + 1):
    clustering = cluster.KMeans(n_clusters=k, random_state=0, n_init="auto")
    clustering.fit(data_to_clustering)

    # After the loop this holds the labels of the last (largest) cluster number
    kmeans_labels = clustering.labels_

    silhouette_KMeans.append(
        metrics.silhouette_score(data_to_clustering, kmeans_labels)
    )
    davies_KMeans.append(
        metrics.davies_bouldin_score(data_to_clustering, kmeans_labels)
    )
    calinski_KMeans.append(
        metrics.calinski_harabasz_score(data_to_clustering, kmeans_labels)
    )
    distortions_KMeans.append(clustering.inertia_)

Look at clusters by label

In [None]:
# Put every entry position next to the cluster label it received
kmeans_labels_df = pd.DataFrame(
    {
        "Index": list(range(len(kmeans_labels))),
        "Label": list(kmeans_labels),
    },
    columns=["Index", "Label"],
)

# Count entries per label value
label_counts = kmeans_labels_df.groupby("Label").size()

# Show label distribution as a bar chart
label_counts.plot.bar(
    title="Clusters from kmeans",
    xlabel="Cluster label",
    ylabel="Cluster entries",
);

Normalize metrics data for better visualization

In [None]:
# Set function to normalize list
def get_list_normalized(list_to_normalize: list) -> list:
    """Divide every element of a list by the largest element of that list.

    Args:
        list_to_normalize: numeric values to rescale.

    Returns:
        A new list of the same length whose largest element is 1 (given a
        positive maximum). An empty input gives an empty list.
    """
    # max() raises on an empty sequence, so answer that case directly
    if not list_to_normalize:
        return []

    max_value = max(list_to_normalize)

    # NOTE: a maximum of zero is not handled here and still fails on division
    return [value / max_value for value in list_to_normalize]


# Normalize lists with metrics data
# For agglomerative
(
    silhouette_agglomerative_normalized,
    davies_agglgomerative_normalized,
    calinski_agglomerative_normalized,
) = (
    get_list_normalized(metric_values)
    for metric_values in (
        silhouette_agglomerative,
        davies_agglomerative,
        calinski_agglomerative,
    )
)

# For k-means
(
    silhouette_KMeans_normalized,
    davies_KMeans_normalized,
    calinski_KMeans_normalized,
    distortions_KMeans_normalized,
) = (
    get_list_normalized(metric_values)
    for metric_values in (
        silhouette_KMeans,
        davies_KMeans,
        calinski_KMeans,
        distortions_KMeans,
    )
)

In [None]:
# Set function to plot metrics
def get_list_plot(*lists, first_cluster_amount=2):
    """Draw every metric list as a line with markers on the current axes.

    Args:
        *lists: sequences of metric values, one sequence per plotted line.
        first_cluster_amount: cluster amount the first element of each list
            belongs to. Defaults to 2 because the metric loops of this notebook
            start at two clusters, so the x axis shows real cluster amounts.
    """
    for data_list in lists:
        # x values are the cluster amounts the metric values were computed for
        cluster_amounts = range(
            first_cluster_amount,
            first_cluster_amount + len(data_list),
        )

        plt.plot(
            cluster_amounts,
            data_list,
            marker="o",
        )


# Plot metrics for agglomerative
get_list_plot(
    silhouette_agglomerative_normalized,
    davies_agglgomerative_normalized,
    calinski_agglomerative_normalized,
)

# Name the lines in the order they were passed above, so the figure is readable
plt.legend(["Silhouette", "Davies-Bouldin", "Calinski-Harabasz"])

plt.title("Metrics for agglomerative method")
plt.xlabel("Cluster amount")
plt.ylabel("Normalized metrics value");

The metrics give no clear indication of the cluster number, but some effect is visible between 2 and 4 clusters.

In [None]:
# Plot metrics for k-means
get_list_plot(
    silhouette_KMeans_normalized,
    davies_KMeans_normalized,
    calinski_KMeans_normalized,
    distortions_KMeans_normalized,
)

# Name the lines in the order they were passed above, so the figure is readable
plt.legend(["Silhouette", "Davies-Bouldin", "Calinski-Harabasz", "Distortion (inertia)"])

plt.title("Metrics for k-means method")
plt.xlabel("Cluster amount")
plt.ylabel("Normalized metrics value");

The metrics give no clear indication of the cluster number, but some effect is visible between 3 and 5 clusters.

## HDBSCAN

In [None]:
# Get HDBSCAN clustering on the same scaled data that the metrics below (and
# the other clustering methods) use; the raw dataframe was fitted here before
clustering = cluster.HDBSCAN(
    min_cluster_size=10,
    cluster_selection_epsilon=0.5,
).fit(data_to_clustering)

# Get cluster labels
hdbscan_labels = clustering.labels_

# Get metrics
# NOTE(review): negative (noise) labels are scored as if they were a cluster of
# their own — confirm whether they should be excluded from the metrics
ss = metrics.silhouette_score(data_to_clustering, hdbscan_labels)
dbs = metrics.davies_bouldin_score(data_to_clustering, hdbscan_labels)
chs = metrics.calinski_harabasz_score(data_to_clustering, hdbscan_labels)

# Get dataframe for labels
hdbscan_labels_df = pd.DataFrame(
    {"Index": list(range(len(hdbscan_labels))), "Label": list(hdbscan_labels)},
    columns=["Index", "Label"],
)

# Filter insufficient labels (-1, -2 etc)
hdbscan_labels_filtered = hdbscan_labels_df[hdbscan_labels_df["Label"] > -1]

# Group dataframe by label value
label_counts = hdbscan_labels_filtered.groupby("Label").size()

# Show label distribution
label_counts.plot.bar()

plt.title("Clusters from HDBSCAN")
plt.xlabel("Cluster label")
plt.ylabel("Cluster entries");

Two clusters dominate the dataset, but to leave some margin, three clusters could be accepted.

In [None]:
# Collect the three HDBSCAN scores in a two-column dataframe
hdbscan_metrics = pd.DataFrame(
    list(zip(["ss", "dbs", "chs"], [ss, dbs, chs])),
    columns=["metrics", "metrics_data"],
)

# Show HDBSCAN metrics as one bar per score, without a legend
hdbscan_metrics.plot.bar(
    x="metrics",
    y="metrics_data",
    legend=None,
    title="Metrics for HDBSCAN",
);

## Clustergram

In [None]:
# Get clustergram up to and including the maximal cluster amount
# (range excludes its end value, hence the + 1)
clustergram = Clustergram(
    k_range=range(1, cluster_amount + 1),
    verbose=False,
)

# Fit data
clustergram.fit(data_to_clustering)

# Show clustergram
clustergram.plot();

Starting from three clusters, we observe three main threads that look stable, with no significant flow of entries between them.

# Processing

## Decomposition

In [None]:
# Get principal component analyzer (no component limit given)
pca = decomposition.PCA(random_state=0)

# Fit scaled data and keep its projection on the principal components
# (plain fit() returns the analyzer itself, not transformed data)
X_pca = pca.fit_transform(data_to_clustering)

# Get explained variance (amount of variance explained by each of the selected components)
explained_variance_ratio = pca.explained_variance_ratio_

# Get cumulative explained variance for retained features
cumulative_explained_variance_ratio = np.cumsum(explained_variance_ratio)

# 1-based component counts, so the x axis matches its "Number of ..." label
component_counts = np.arange(1, len(explained_variance_ratio) + 1)

# Plot explained variance
sns.lineplot(
    x=component_counts,
    y=explained_variance_ratio,
    label="variance",
    color="g",
    marker="o",
)

plt.xlabel("Number of principal components")
plt.ylabel("explained_variance_ratio")

# Second y axis for the cumulative curve, sharing the x axis
axis_2 = plt.gca().twinx()

sns.lineplot(
    x=component_counts,
    y=cumulative_explained_variance_ratio,
    label="cumulative variance",
    color="r",
    marker="s",
    ax=axis_2,
)

plt.title("Explained variance by principal components")
axis_2.set_ylabel("cumulative_explained_variance_ratio");

In [None]:
# Get amount of retained components whose cumulative explained variance reaches
# the 0.71 threshold. np.argmax gives the 0-based position of the first ratio
# that satisfies the condition, so 1 is added to turn the position into a count.
retained_components = int(np.argmax(cumulative_explained_variance_ratio >= 0.71)) + 1

print(f"Number of retained components: {retained_components}.")

In [None]:
# Get analyzer that keeps only the retained amount of components
pca_reduced = decomposition.PCA(n_components=retained_components)

# Get decomposition for chosen components
X_reduced = pca_reduced.fit_transform(X_scaled)

# Get variance loss: the share of variance not explained by the components that
# were actually kept (taken from the reduced analyzer itself, so it always
# matches n_components)
loss = 1 - pca_reduced.explained_variance_ratio_.sum()

print(f"Variance loss is {loss:.2f}.")

## Reduced dataset clusterization

In [None]:
# Cluster the reduced dataset with HDBSCAN
clustering = cluster.HDBSCAN(min_cluster_size=10, cluster_selection_epsilon=0.5)
clustering.fit(X_reduced)

# Labels assigned to every entry
hdbscan_labels = clustering.labels_

# Quality scores of the labelling
# NOTE(review): the scores are computed on data_to_clustering, not on X_reduced
# that was clustered above — confirm this is intended
ss = metrics.silhouette_score(data_to_clustering, hdbscan_labels)
dbs = metrics.davies_bouldin_score(data_to_clustering, hdbscan_labels)
chs = metrics.calinski_harabasz_score(data_to_clustering, hdbscan_labels)

# Put every entry position next to the cluster label it received
hdbscan_labels_df = pd.DataFrame(
    {
        "Index": list(range(len(hdbscan_labels))),
        "Label": list(hdbscan_labels),
    },
    columns=["Index", "Label"],
)

# Keep only entries whose label is above -1
has_cluster = hdbscan_labels_df["Label"].gt(-1)
hdbscan_labels_filtered = hdbscan_labels_df[has_cluster]

# Count entries per label value
label_counts = hdbscan_labels_filtered.groupby("Label").size()

# Show label distribution as a bar chart
label_counts.plot.bar(
    title="Clusters from HDBSCAN",
    xlabel="Cluster label",
    ylabel="Cluster entries",
);

Two clusters dominate the dataset.

In [None]:
# Get clustergram up to and including the maximal cluster amount
# (range excludes its end value, hence the + 1)
clustergram = Clustergram(
    k_range=range(1, cluster_amount + 1),
    method="kmeans",
    verbose=False,
)

# Fit data
clustergram.fit(X_reduced)

# Show clustergram
clustergram.plot();

Strangely, after the reduction the clustergram still shows three threads.

# Summary
1. Three clustering methods were used: k-means, hierarchical (agglomerative) and HDBSCAN.
2. Three dominating clusters were found in the non-reduced dataset.
3. After decomposition, two dominating clusters were found.
4. The conservative assessment is three clusters.