# DM - Clustering analysis [TASK 2]

Explore the dataset using various clustering techniques.

Library imports and initial settings.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import mode
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

from pyclustering.cluster import cluster_visualizer, cluster_visualizer_multidim
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.silhouette import silhouette


from utils import fetch_preprocessed_dataset, store_preprocessed_dataset, build_grid_plot

import sys
import logging as lg

# Configure the root logger so that the lg.info(...) calls in the following
# cells are written to stdout (i.e. the notebook output) with a timestamp.
root = lg.getLogger()
root.setLevel(lg.INFO)

formatter = lg.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# Re-running this cell must not stack a second stream handler on the root
# logger (every message would then be printed twice), so the handler is
# registered under a fixed name and reused when it is already attached.
_HANDLER_NAME = "notebook_stdout"
handler = next((h for h in root.handlers if h.get_name() == _HANDLER_NAME), None)
if handler is None:
    handler = lg.StreamHandler(sys.stdout)
    handler.set_name(_HANDLER_NAME)
    root.addHandler(handler)

# The handler itself lets everything through (DEBUG); the effective filter is
# the INFO level set on the root logger.
handler.setLevel(lg.DEBUG)
handler.setFormatter(formatter)

Fetching users dataset saved in data preparation phase.

In [None]:
# Load the artifacts stored for the "users_total" step through the project
# helper; the result is indexed by a file-like key.
# NOTE(review): the exact return type of fetch_preprocessed_dataset is not
# visible here — confirm in utils.
dataset = fetch_preprocessed_dataset(step_name="users_total")
# Users table used by every following cell (key name suggests it was read
# back from a pickle file).
users = dataset['users.pickle']

`users_num` is the subset of `users` composed of its numerical attributes (the boolean `bot` flag is cast to an integer).

In [None]:
# Attributes of `users` that take part in the clustering analysis.
users_num_cols = [
    'statuses_count',
    'bot',
    'tweets_num',
    'tweets_2020_num',
    'likes_num',
    'comments_num',
    'ratio_likes_x_tweets',
    'entropy',
    'texts_mean_length',
    'texts_special_chars_length',
    'mean_pub_range',
    'mean_texts_similarity',
]

# Project `users` on those attributes; `bot` is cast to int64 so that the
# whole frame can be handled as numeric data.
bot_as_integer = {'bot': 'int64'}
users_num = users[users_num_cols].astype(bot_as_integer)

# Summary of column dtypes and non-null counts.
users_num.info()

## Scaling phase

`MinMaxScaler` is used to rescale every numerical attribute to the range [0, 1].

In [None]:
# Learn the per-attribute minimum and maximum of `users_num`; the fitted
# scaler is kept because later cells use it to transform the data and to map
# cluster centres back to the original units.
users_num_matrix = users_num.values
scaler = MinMaxScaler().fit(users_num_matrix)

Using the `transform` function we scale our values. The fitted scaler is also kept for the inverse transformation, which is needed later to read cluster centers in the original units.

In [None]:
# Apply the fitted scaler to the raw attribute matrix; the result is the
# array every clustering algorithm below is run on.
users_num_scaled = scaler.transform(users_num.values)

# Bare expression: displays the scaled array as the cell output.
users_num_scaled

## Base clustering

We use base clustering methods to analyze the `users` dataset.

### KMeans

First we run the K-Means algorithm several times, once for each candidate value of k, and collect the SSE of every run to understand which value is the proper one to assign to k.

In [None]:
# Elbow analysis: fit K-Means once for every k in [2, max_k] and record the
# SSE (sklearn's `inertia_`) of each fit. sse_list[0] therefore belongs to
# k = 2, sse_list[1] to k = 3, and so on.
sse_list = []
max_k = 30
for k in range(2, max_k + 1):
    # Centroid initialisation is random: without a fixed seed the curve (and
    # the k read off it) changes on every Restart & Run All.
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=100, random_state=42)
    kmeans.fit(users_num_scaled)

    sse = kmeans.inertia_
    sse_list.append(sse)

We plot results below.

In [None]:
# Elbow curve: SSE as a function of the number of clusters.
# The first entry of sse_list was computed with k = 2, hence the offset.
k_values = range(2, 2 + len(sse_list))

plt.plot(k_values, sse_list)
plt.xlabel('K')
plt.ylabel('SSE')
plt.grid(True)
plt.show()

Now we build our model with the proper k and print different metrics to understand the goodness of our model.

In [None]:
# Final K-Means model with k = 6, the value retained after the SSE analysis.
# The seed is fixed so that the cluster labels, the metrics logged here and
# every plot derived from `kmeans` are the same on each run.
kmeans = KMeans(n_clusters=6, n_init=10, max_iter=100, random_state=42)
kmeans.fit(users_num_scaled)

# Quality measures of the clustering, all computed on the scaled data:
# SSE (inertia), mean silhouette coefficient and Davies-Bouldin index
# (logged under the label "Separation"; lower values mean better separation).
lg.info(f'SSE: {kmeans.inertia_}')
lg.info(f'Silhouette: {silhouette_score(users_num_scaled, kmeans.labels_)}')
lg.info(f'Separation {metrics.davies_bouldin_score(users_num_scaled, kmeans.labels_)}')

Here we plot the distribution of labels among different clusters.

In [None]:
# Size of each K-Means cluster: np.unique returns the distinct labels and how
# many points carry each of them.
labels_distribution = np.unique(kmeans.labels_, return_counts=True)
cluster_ids, cluster_sizes = labels_distribution

plt.bar(cluster_ids, cluster_sizes)
plt.show()

Parallel coordinates plot used to see distinguishability of clusters for each attribute.

In [None]:
# Map the centroids from the [0, 1] scaled space back to the original
# attribute units so that the plot is readable.
centers = scaler.inverse_transform(kmeans.cluster_centers_)

# One polyline per centroid, one x position per attribute.
for cluster_id, center in enumerate(centers):
    plt.plot(center, marker='o', label=f'Cluster {cluster_id}')

attribute_names = users_num.columns
plt.xticks(range(len(attribute_names)), attribute_names, rotation=90)
plt.legend(fontsize=10)
# Attributes live on very different scales, hence the logarithmic y axis.
# NOTE(review): a log axis cannot display non-positive values — confirm that
# no centroid coordinate (e.g. for `bot`) is <= 0, or those points are hidden.
plt.yscale('log')
plt.show()

Scatter-plots for some attribute couples.

In [None]:
# Attribute pairs shown in the scatter plots, given as (x, y) indices.
# Presumably these are column positions in `users_num` (0 = statuses_count,
# 2 = tweets_num, 5 = comments_num, 8 = texts_mean_length) — confirm against
# build_grid_plot in utils.
scatter_plt_pairs = [(0, 2), (0, 8), (2, 5), (2, 8)]

# One plot configuration per pair, coloured by K-Means label and carrying the
# centroids expressed in the original units.
configs = []
for x_index, y_index in scatter_plt_pairs:
    configs.append({
        'type': 'scatter',
        'df': users_num,
        'labels': kmeans.labels_,
        'centers': centers,
        'x_index': x_index,
        'y_index': y_index,
    })

build_grid_plot(configs=configs)

Bar plot able to show categorical attribute distribution among clusters.

In [None]:
# Contingency table: rows are K-Means clusters, columns are the account kind
# ('Bot' / 'User') derived from the integer `bot` flag, cells are user counts.
users_pct = pd.crosstab(kmeans.labels_, users_num['bot'].map(lambda x: 'Bot' if x else 'User'))

# Grouped bars: for every cluster, how many bots and how many genuine users.
users_pct.plot(kind='bar', stacked=False, title='bot x cluster')
plt.xlabel('cluster')
# The bar height is a count of users, not the `bot` flag itself.
plt.ylabel('number of users')
plt.show()

### DBSCAN

To choose a good model we start with a grid search over the minimum number of points (k, the `min_samples` parameter of DBSCAN). We fix `eps` to 0.1 and try several values of k; the silhouette score tells us how good each model is, and we keep the k with the best score.

In [None]:
# Grid search over DBSCAN's min_samples with eps fixed to 0.1: keep the value
# whose clustering obtains the highest silhouette score.
best_k = None
# The silhouette lies in [-1, 1]; starting from -inf guarantees that a
# candidate is selected even when every score is negative (starting from 0
# would leave best_k = None in that case and break the cells that use k).
best_metric = float('-inf')

for k in tqdm_notebook([10, 20, 50, 100, 150, 200, 500, 1000]):
    dbscan = DBSCAN(eps=0.1, min_samples=k)
    dbscan.fit(users_num_scaled)

    # silhouette_score raises ValueError when fewer than two distinct labels
    # exist (e.g. every point is noise, or a single cluster without noise),
    # so such candidates are skipped instead of aborting the whole search.
    if len(np.unique(dbscan.labels_)) < 2:
        lg.info(f'min_samples={k}: a single label was found, silhouette is undefined - skipped')
        continue

    # Note: noise points (label -1) are scored as if they were one cluster.
    metric = silhouette_score(users_num_scaled, dbscan.labels_)
    if metric > best_metric:
        best_metric = metric
        best_k = k

if best_k is None:
    raise ValueError('No candidate min_samples produced at least two labels with eps=0.1')

# `k` is reused by the following cells (knee method and final DBSCAN model).
k = best_k

lg.info(f'Best k is {best_k}')
lg.info(f'Best silhouette is {best_metric}')

Here we compute the pairwise distances among `users_num_scaled` to use in knee method.

In [None]:
# pair wise distance
# Euclidean distance between every pair of users: pdist yields the condensed
# (upper-triangle) vector, squareform expands it into the full symmetric
# n x n matrix, one row per user.
dist = squareform(pdist(users_num_scaled, metric='euclidean'))

lg.info(dist)

Now we apply the knee method to find the best value for the radius (eps) given the minimum number of points (k). To do this we take, for every point, the distance from its k-th nearest neighbour.

In [None]:
# Distance of every user from its k-th nearest neighbour. Each row of `dist`
# holds the distances from one user to all users, itself included (the
# diagonal is 0), so position k of the sorted row is the k-th neighbour.
# np.partition only puts the k-th smallest value in place, which yields the
# same number as a full argsort of the row without sorting all of it.
kth_distances = [np.partition(d, k)[k] for d in dist]

Plot of the sorted distances from the k-th neighbours. We use a log scale on the y axis to make the knee easier to see.

In [None]:
# Knee plot: k-th neighbour distances in increasing order; the point where
# the curve bends sharply upwards suggests a value for eps.
ordered_distances = sorted(kth_distances)
positions = range(len(kth_distances))

plt.plot(positions, ordered_distances)
plt.xlabel('sorted distances')
plt.ylabel(f'dist from {k}th neighbor')
plt.yscale('log')
plt.grid(True)
plt.show()

Construction of our DBSCAN model using fixed k and eps selected from the previous analysis.

In [None]:
# Final DBSCAN model: min_samples is the k selected by the grid search and
# eps = 0.11 is the radius retained after the knee analysis.
dbscan = DBSCAN(eps=0.11, min_samples=k).fit(users_num_scaled)

# Same quality measures used for K-Means, computed on the scaled data
# (noise points, labelled -1, are scored as if they formed one cluster).
dbscan_silhouette = silhouette_score(users_num_scaled, dbscan.labels_)
lg.info(f'Silhouette {dbscan_silhouette}')

dbscan_separation = metrics.davies_bouldin_score(users_num_scaled, dbscan.labels_)
lg.info(f'Separation {dbscan_separation}')

Here we show in a bar-plot the labels distribution. -1 is related to outliers.

In [None]:
# Size of each DBSCAN group: distinct labels (sorted, so -1 comes first when
# noise exists) paired with the number of points carrying each label.
labels_distribution = np.unique(dbscan.labels_, return_counts=True)
group_labels, group_sizes = labels_distribution

plt.bar(group_labels, group_sizes)
plt.show()

Print the number of outliers detected from DBSCAN.

In [None]:
# Count the noise points explicitly (DBSCAN labels them -1). Taking the first
# entry of the label histogram is only correct when label -1 exists: if
# DBSCAN finds no noise, that entry is the size of cluster 0.
n_outliers = np.count_nonzero(dbscan.labels_ == -1)
lg.info(f'Number of detected outliers: {n_outliers} ({n_outliers / len(users) * 100}%)')

The following scatter plots show the DBSCAN labels for some pairs of attributes of the `users_num` dataset.

In [None]:
# Same attribute pairs used for K-Means, this time coloured by DBSCAN label;
# no 'centers' entry is passed because DBSCAN has no centroids.
configs = []
for x_index, y_index in scatter_plt_pairs:
    configs.append({
        'type': 'scatter',
        'df': users_num,
        'labels': dbscan.labels_,
        'x_index': x_index,
        'y_index': y_index,
    })

build_grid_plot(configs=configs)

## Hierarchical clustering

Hierarchical clustering builds dendrograms that represent how clusters are progressively merged according to the Euclidean distance. We start by computing the pairwise distances of the `users_num_scaled` dataset.

In [None]:
# Condensed vector of pairwise Euclidean distances between scaled users;
# it is computed once and shared by all the linkage methods below.
data_dist = pdist(users_num_scaled, metric='euclidean')

Dendrogram related to hierarchical clustering based on MIN (single) link.

In [None]:
# Single linkage (MIN): merges are computed from the precomputed distances.
data_link = linkage(
    data_dist,
    method='single',
    metric='euclidean',
)
# Only the last merges are drawn ('lastp' truncation); links below the
# colour threshold of 0.45 get a cluster-specific colour.
res = dendrogram(
    data_link,
    truncate_mode='lastp',
    color_threshold=0.45,
)

Dendrogram related to hierarchical clustering based on MAX (complete) link.

In [None]:
# Complete linkage (MAX): merges are computed from the precomputed distances.
data_link = linkage(
    data_dist,
    method='complete',
    metric='euclidean',
)
# Only the last merges are drawn ('lastp' truncation); links below the
# colour threshold of 1.25 get a cluster-specific colour.
res = dendrogram(
    data_link,
    truncate_mode='lastp',
    color_threshold=1.25,
)

Dendrogram related to hierarchical clustering based on group average link.

In [None]:
# Average linkage: merges are computed from the precomputed distances.
data_link = linkage(
    data_dist,
    method='average',
    metric='euclidean',
)
# Only the last merges are drawn ('lastp' truncation); links below the
# colour threshold of 1 get a cluster-specific colour.
res = dendrogram(
    data_link,
    truncate_mode='lastp',
    color_threshold=1,
)

Dendrogram related to hierarchical clustering based on centroid link.

In [None]:
# Centroid linkage: merges are computed from the precomputed distances.
data_link = linkage(
    data_dist,
    method='centroid',
    metric='euclidean',
)
# Only the last merges are drawn ('lastp' truncation); links below the
# colour threshold of 0.9 get a cluster-specific colour.
res = dendrogram(
    data_link,
    truncate_mode='lastp',
    color_threshold=0.9,
)

## Advanced clustering

### XMeans

X-means initialization and processing.

In [None]:
# amount of initial centers defines amount of clusters from which X-Means will start analysis
amount_initial_centers = 2
initial_centers = kmeans_plusplus_initializer(users_num_scaled, amount_initial_centers).initialize()

# create instance of X-Means algorithm. The algorithm will start analysis from 2 clusters to the assigned max
xmeans_instance = xmeans(users_num_scaled, initial_centers, 20)
xmeans_instance.process()

# extract clustering results: clusters and their centers
clusters = xmeans_instance.get_clusters()
centers = xmeans_instance.get_centers()

lg.info(f'Number of clusters: {len(clusters)}')

Some metrics calculations.

In [None]:
# Total within-cluster error of the X-Means result, logged as SSE.
lg.info(f'SSE: {xmeans_instance.get_total_wce()}')

# pyclustering returns one silhouette value per point; the mean of those
# values is reported as the overall score.
point_silhouettes = silhouette(users_num_scaled, clusters).process().get_score()
lg.info(f'Silhouette: {np.mean(point_silhouettes)}')

Bar-plot used to understand the distribution among clusters.

In [None]:
# Number of points assigned to each X-Means cluster, in cluster order.
np_clusters = list(map(len, clusters))
cluster_positions = list(range(len(np_clusters)))

plt.bar(cluster_positions, np_clusters)
plt.show()

Some scatter-plot to visualize clusters and centroids.

In [None]:
# Multi-dimensional visualizer from pyclustering: the clusters are given as
# point indices, so the scaled data is passed along as a plain list.
visualizer = cluster_visualizer_multidim()
scaled_points = users_num_scaled.tolist()
visualizer.append_clusters(clusters, scaled_points)
# The centres are added as an extra group drawn with red stars.
visualizer.append_cluster(centers, None, marker='*', markersize=10, color='r')

# One figure per attribute pair, reusing the pairs of the earlier scatter plots.
for attribute_pair in scatter_plt_pairs:
    visualizer.show(pair_filter=[attribute_pair])