##### This notebook performs and evaluates different baseline algorithms
In more detail it performs the following clustering algorithm categories:
1. Density-based clustering algorithms
2. Spectral clustering algorithms

In [20]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, OPTICS, HDBSCAN, SpectralClustering, KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [2]:
data = pd.read_pickle('../data/feature_elimination_temps/correlation_training_df.pkl')
data

Unnamed: 0,id,sleep_points,exertion_points,altitude,calories,lightly_active_minutes,moderately_active_minutes,sedentary_minutes,steps,very_active_minutes,...,WORK/SCHOOL,badges,exercises,exercise_duration,is_weekend,is_holiday,day_sin,hour_sin,day_cos,hour_cos
0,621e2e8e67b776a24055b564,0.810469,0.622928,0.0,0.029382,0.254701,0.083045,0.495139,0.017563,0.080685,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.500000,0.574623,1.000000
1,621e2e8e67b776a24055b564,0.810469,0.622928,0.0,0.002914,0.254701,0.083045,0.495139,0.000000,0.080685,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.629410,0.574623,0.982963
2,621e2e8e67b776a24055b564,0.810469,0.622928,0.0,0.000729,0.254701,0.083045,0.495139,0.046184,0.080685,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.750000,0.574623,0.933013
3,621e2e8e67b776a24055b564,0.810469,0.622928,0.0,0.012860,0.254701,0.083045,0.495139,0.002661,0.080685,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.853553,0.574623,0.853553
4,621e2e8e67b776a24055b564,0.810469,0.622928,0.0,0.003315,0.254701,0.083045,0.495139,0.000000,0.080685,...,0.0,0.0,0.074074,0.001865,0.0,0.0,0.005131,0.933013,0.574623,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159921,621e375b67b776a240290cdc,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.982963,0.020417,0.370590
159922,621e375b67b776a240290cdc,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.933013,0.020417,0.250000
159923,621e375b67b776a240290cdc,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.853553,0.020417,0.146447
159924,621e375b67b776a240290cdc,0.680095,0.720932,0.0,0.000383,0.000000,0.000000,0.490278,0.064983,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.350126,0.750000,0.020417,0.066987


Prepare data for clustering

In [3]:
user_id = data['id']
data.drop(columns=['id'], inplace=True)

##### 1. Perform multiple density-based clustering algorithms

Perform DBSCAN

In [None]:
start = time.time()
print("Clustering with DBSCAN ... ")

# perform clustering
dbscan = DBSCAN(eps=3, min_samples=2)
y = dbscan.fit_predict(data)
dbscan_data = data.copy()
dbscan_data['cluster'] = y

# evaluate clustering results
print ("The Silhouette score is:", silhouette_score(data, y))
print ("The Davies-Bouldin Index is:", davies_bouldin_score(data, y))
print ("The Calinski-Harabasz Index is:", calinski_harabasz_score(data, y))

print("DBSCAN finished after", time.time() - start)

Clustering with DBSCAN ... 


In [None]:
dbscan_data = pd.concat([user_id, dbscan_data], axis=1, ignore_index=True)
dbscan_data.to_csv('../data/clustering_results/dbscan_results.csv')

Perform OPTICS

In [None]:
start = time.time()
print("Clustering with OPTICS ... ")

# perform clustering
optics = OPTICS(min_samples=2)
y = optics.fit_predict(data)
optics_data = data.copy()
optics_data['cluster'] = y

# evaluate clustering results
print ("The Silhouette score is:", silhouette_score(data, y))
print ("The Davies-Bouldin Index is:", davies_bouldin_score(data, y))
print ("The Calinski-Harabasz Index is:", calinski_harabasz_score(data, y))

print("OPTICS finished after", time.time() - start)

In [None]:
optics_data = pd.concat([user_id, optics_data], axis=1, ignore_index=True)
optics_data.to_csv('../data/clustering_results/optics_results.csv')

Perform HDBSCAN

In [6]:
start = time.time()
print("Clustering with HDBSCAN ... ")

# perform clustering
hdbscan = HDBSCAN(min_cluster_size=100)
y = hdbscan.fit_predict(data)
hdbscan_data = data.copy()
hdbscan_data['cluster'] = y

# evaluate clustering results
print ("The Silhouette score is:", silhouette_score(data, y))
print ("The Davies-Bouldin Index is:", davies_bouldin_score(data, y))
print ("The Calinski-Harabasz Index is:", calinski_harabasz_score(data, y))

print("HDBSCAN finished after", time.time() - start)

Clustering with HDBSCAN ... 
The Silhouette score is: -0.055715805357403274
The Davies-Bouldin Index is: 2.0665128211784927
The Calinski-Harabasz Index is: 2565.3137153946623
HDBSCAN finished after 1968.430926322937


In [15]:
print("HDBSCAN detected", len(np.unique(y)), "clusters (including outliers).")

HDBSCAN detected 32 clusters (including outliers).


In [7]:
hdbscan_data = pd.concat([user_id, hdbscan_data], axis=1, ignore_index=True)
hdbscan_data.to_csv('../data/clustering_results/hdbscan_results.csv')

##### 2. Perform the spectral clustering algorithm

Perform Spectral clustering

In [None]:
start = time.time()
print("Clustering with Spectral ... ")

# perform clustering
spectral = SpectralClustering(n_clusters=2, assign_labels='cluster_qr', random_state=0)
y = spectral.fit_predict(data)
spectral_data = data.copy()
spectral_data['cluster'] = y

# evaluate clustering results
print ("The Silhouette score is:", silhouette_score(data, y))
print ("The Davies-Bouldin Index is:", davies_bouldin_score(data, y))
print ("The Calinski-Harabasz Index is:", calinski_harabasz_score(data, y))

print("Spectral finished after", time.time() - start)

In [None]:
spectral_data = pd.concat([user_id, spectral_data], axis=1, ignore_index=True)
spectral_data.to_csv('../data/clustering_results/spectral_results.csv')

##### K-means

In [None]:
start = time.time()
print("Clustering with K-means ... ")

# perform clustering
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
y = kmeans.fit_predict(data)
kmeans_data = data.copy()
kmeans_data['cluster'] = y

# evaluate clustering results
print ("The Silhouette score is:", silhouette_score(data, y))
print ("The Davies-Bouldin Index is:", davies_bouldin_score(data, y))
print ("The Calinski-Harabasz Index is:", calinski_harabasz_score(data, y))

print("K-means finished after", time.time() - start)

In [None]:
kmeans_data = pd.concat([user_id, kmeans_data], axis=1, ignore_index=True)
kmeans_data.to_csv('../data/clustering_results/kmeans_results.csv')

##### Visualize HDBSCAN results

In [25]:
clusters = np.unique(y)
for i in clusters:
    plt.scatter(hdbscan_data[y == i , 0] , hdbscan_data[y == i , 1] , y = i)
plt.legend()
plt.show()

InvalidIndexError: (array([False, False,  True, ...,  True,  True,  True]), 0)