In [1]:
import pandas as pd
from pathlib import Path

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
main_dir = "drive/MyDrive/dataset/"

In [None]:
final_df = pd.read_csv(main_dir + 'final_data_new_regency.csv')
final_df.head()

In [None]:
final_df.describe()

# Clustering DBScan

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Assuming `final_df` is your dataset with appropriate columns

# Define the columns that need to be scaled and encoded
num_cols = ['total_guest_capacity', 'avg_price_per_day', 'area_distance_to_airport']
cat_cols = ['property_type', 'area_regency_city']

# Create the transformers
scaler = MinMaxScaler()
encoder = OneHotEncoder(drop='first')

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_cols),
        ('cat', encoder, cat_cols)
    ])

# Define the KMeans clustering algorithm with the new parameters
kmeans = KMeans(n_clusters=8, init='random', n_init=50, max_iter=300, tol=1e-04, random_state=0)

# Define the DBSCAN clustering algorithm
dbscan = DBSCAN(eps=1.15, min_samples=3)

# Define the AgglomerativeClustering algorithm
agglo = AgglomerativeClustering(n_clusters=19, linkage='single')

# Create pipelines for each clustering algorithm
pipeline_kmeans = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', kmeans)])
pipeline_dbscan = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', dbscan)])
pipeline_agglo = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', agglo)])

# Fit the pipelines
pipeline_kmeans.fit(final_df)
pipeline_dbscan.fit(final_df)
pipeline_agglo.fit(final_df)

# Get cluster labels from each algorithm
labels_kmeans = pipeline_kmeans.named_steps['cluster'].labels_
labels_dbscan = pipeline_dbscan.named_steps['cluster'].labels_
labels_agglo = pipeline_agglo.named_steps['cluster'].labels_

# If DBSCAN results in a single cluster, modify eps or min_samples
if len(set(labels_dbscan)) <= 1:
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    pipeline_dbscan = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', dbscan)])
    pipeline_dbscan.fit(final_df)
    labels_dbscan = pipeline_dbscan.named_steps['cluster'].labels_

# Gather clustering results
clusterings = [labels_kmeans, labels_dbscan, labels_agglo]

# Create the co-association matrix
n_samples = final_df.shape[0]
co_assoc_matrix = np.zeros((n_samples, n_samples))

for labels in clusterings:
    for i in range(n_samples):
        for j in range(n_samples):
            if labels[i] == labels[j]:
                co_assoc_matrix[i, j] += 1

# Normalize the co-association matrix
co_assoc_matrix /= len(clusterings)

# Perform clustering on the co-association matrix
distance_matrix = 1 - co_assoc_matrix
hierarchical_clustering = linkage(squareform(distance_matrix), method='average')
ensemble_labels = fcluster(hierarchical_clustering, t=8, criterion='maxclust')

# Add ensemble labels to the dataframe
final_df['ensemble_cluster_label'] = ensemble_labels

# Preprocess the data for evaluation metrics
preprocessed_data = preprocessor.fit_transform(final_df)

# Check the number of unique clusters in the ensemble result
unique_clusters = len(set(ensemble_labels))
if unique_clusters > 1:
    # Evaluate the ensemble clustering result
    silhouette_avg = silhouette_score(preprocessed_data, ensemble_labels)
    davies_bouldin_avg = davies_bouldin_score(preprocessed_data, ensemble_labels)
    print('Ensemble Silhouette Score:', silhouette_avg)
    print('Ensemble Davies-Bouldin Index:', davies_bouldin_avg)
else:
    print(f"Ensemble clustering resulted in {unique_clusters} unique cluster(s). Silhouette score and Davies-Bouldin Index cannot be computed.")

# Print the dataframe with cluster labels
print(final_df[['unit_id', 'ensemble_cluster_label']])


In [6]:
silhouette_avg = silhouette_score(preprocessed_data, ensemble_labels)
davies_bouldin_avg = davies_bouldin_score(preprocessed_data, ensemble_labels)
print('Ensemble Silhouette Score:', silhouette_avg)
print('Ensemble Davies-Bouldin Index:', davies_bouldin_avg)

Ensemble Silhouette Score: 0.7845887998465851
Ensemble Davies-Bouldin Index: 0.8429919540465185


In [None]:
pd.set_option('display.max_rows', None)
final_df.sort_values(by='ensemble_cluster_label')

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the columns that need to be scaled and encoded
num_cols = ['total_guest_capacity', 'bedroom', 'bathroom', 'beds', 'avg_price_per_day', 'area_distance_to_airport']
cat_cols = ['property_type', 'area_regency_city']

# Create the transformers
scaler = MinMaxScaler()
encoder = OneHotEncoder(drop='first')

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_cols),
        ('cat', encoder, cat_cols)
    ])

eps_value = 1.15
min_samples_value = 3
# Define the DBSCAN clustering algorithm
cluster = DBSCAN(eps=eps_value, min_samples=min_samples_value)  # You may need to tune these parameters

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('cluster', cluster)])

# Fit the pipeline
pipeline.fit(final_df)

# Get cluster labels and add them to the DataFrame
cluster_labels = pipeline.named_steps['cluster'].labels_
final_df['cluster_label'] = cluster_labels

print(final_df[['unit_id', 'cluster_label']])

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
# from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from scipy.cluster.hierarchy import linkage, fcluster
# from scipy.spatial.distance import squareform
# from sklearn.metrics import silhouette_score, davies_bouldin_score

# # Assuming `final_df` is your dataset with appropriate columns

# # Define the columns that need to be scaled and encoded
# num_cols = ['total_guest_capacity', 'avg_price_per_day', 'area_distance_to_airport']
# cat_cols = ['property_type', 'area_regency_city']

# # Create the transformers
# scaler = MinMaxScaler()
# encoder = OneHotEncoder(drop='first')

# # Create the column transformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', scaler, num_cols),
#         ('cat', encoder, cat_cols)
#     ])

# # Define the clustering algorithms with optimized parameters
# kmeans = KMeans(n_clusters=8, init='random', n_init=50, max_iter=300, tol=1e-04, random_state=0)
# dbscan = DBSCAN(eps=0.5, min_samples=5)
# agglo = AgglomerativeClustering(n_clusters=8, linkage='average')

# # Create pipelines for each clustering algorithm
# pipeline_kmeans = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', kmeans)])
# pipeline_dbscan = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', dbscan)])
# pipeline_agglo = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', agglo)])

# # Fit the pipelines
# pipeline_kmeans.fit(final_df)
# pipeline_dbscan.fit(final_df)
# pipeline_agglo.fit(final_df)

# # Get cluster labels from each algorithm
# labels_kmeans = pipeline_kmeans.named_steps['cluster'].labels_
# labels_dbscan = pipeline_dbscan.named_steps['cluster'].labels_
# labels_agglo = pipeline_agglo.named_steps['cluster'].labels_

# # If DBSCAN results in a single cluster, we already changed parameters above
# if len(set(labels_dbscan)) <= 1:
#     print("DBSCAN resulted in a single cluster, parameters may still need optimization.")

# # Gather clustering results
# clusterings = [labels_kmeans, labels_dbscan, labels_agglo]

# # Create the co-association matrix
# n_samples = final_df.shape[0]
# co_assoc_matrix = np.zeros((n_samples, n_samples))

# for labels in clusterings:
#     for i in range(n_samples):
#         for j in range(n_samples):
#             if labels[i] == labels[j]:
#                 co_assoc_matrix[i, j] += 1

# # Normalize the co-association matrix
# co_assoc_matrix /= len(clusterings)

# # Perform clustering on the co-association matrix
# distance_matrix = 1 - co_assoc_matrix
# hierarchical_clustering = linkage(squareform(distance_matrix), method='average')
# ensemble_labels = fcluster(hierarchical_clustering, t=8, criterion='maxclust')

# # Add ensemble labels to the dataframe
# final_df['ensemble_cluster_label'] = ensemble_labels

# # Preprocess the data for evaluation metrics
# preprocessed_data = preprocessor.fit_transform(final_df)

# # Check the number of unique clusters in the ensemble result
# unique_clusters = len(set(ensemble_labels))
# if unique_clusters > 1:
#     # Evaluate the ensemble clustering result
#     silhouette_avg = silhouette_score(preprocessed_data, ensemble_labels)
#     davies_bouldin_avg = davies_bouldin_score(preprocessed_data, ensemble_labels)
#     print('Ensemble Silhouette Score:', silhouette_avg)
#     print('Ensemble Davies-Bouldin Index:', davies_bouldin_avg)
# else:
#     print(f"Ensemble clustering resulted in {unique_clusters} unique cluster(s). Silhouette score and Davies-Bouldin Index cannot be computed.")

# # Print the dataframe with cluster labels
# print(final_df[['unit_id', 'ensemble_cluster_label']])


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

num_cols = ['total_guest_capacity', 'avg_price_per_day', 'area_distance_to_airport']
cat_cols = ['property_type', 'area_regency_city']

scaler = MinMaxScaler()
encoder = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_cols),
        ('cat', encoder, cat_cols)
    ])

cluster = KMeans(n_clusters=8, init='random', n_init=50, max_iter=300, tol=1e-04, random_state=0)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                        ('cluster', cluster)])

pipeline.fit(final_df)
cluster_labels = pipeline.named_steps['cluster'].labels_

final_df['cluster_label'] = cluster_labels

print(final_df[['unit_id', 'cluster_label']])

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Assuming `final_df` is your dataset with appropriate columns

# Define the columns that need to be scaled and encoded
num_cols = ['total_guest_capacity', 'avg_price_per_day', 'area_distance_to_airport']
cat_cols = ['property_type', 'area_regency_city']

# Create the transformers
scaler = MinMaxScaler()
encoder = OneHotEncoder(drop='first')

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_cols),
        ('cat', encoder, cat_cols)
    ])

# Define the KMeans clustering algorithm with the new parameters
kmeans = KMeans(n_clusters=8, init='random', n_init=50, max_iter=300, tol=1e-04, random_state=0)

# Define the DBSCAN clustering algorithm
dbscan = DBSCAN(eps=1.15, min_samples=3)

# Create pipelines for each clustering algorithm
pipeline_kmeans = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', kmeans)])
pipeline_dbscan = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', dbscan)])

# Fit the pipelines
pipeline_kmeans.fit(final_df)
pipeline_dbscan.fit(final_df)

# Get cluster labels from each algorithm
labels_kmeans = pipeline_kmeans.named_steps['cluster'].labels_
labels_dbscan = pipeline_dbscan.named_steps['cluster'].labels_

# If DBSCAN results in a single cluster, modify eps or min_samples
if len(set(labels_dbscan)) <= 1:
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    pipeline_dbscan = Pipeline(steps=[('preprocessor', preprocessor), ('cluster', dbscan)])
    pipeline_dbscan.fit(final_df)
    labels_dbscan = pipeline_dbscan.named_steps['cluster'].labels_

# Gather clustering results
clusterings = [labels_kmeans, labels_dbscan]

# Create the co-association matrix
n_samples = final_df.shape[0]
co_assoc_matrix = np.zeros((n_samples, n_samples))

for labels in clusterings:
    for i in range(n_samples):
        for j in range(n_samples):
            if labels[i] == labels[j]:
                co_assoc_matrix[i, j] += 1

# Normalize the co-association matrix
co_assoc_matrix /= len(clusterings)

# Perform clustering on the co-association matrix
distance_matrix = 1 - co_assoc_matrix
hierarchical_clustering = linkage(squareform(distance_matrix), method='average')
ensemble_labels = fcluster(hierarchical_clustering, t=8, criterion='maxclust')

# Add ensemble labels to the dataframe
final_df['ensemble_cluster_label'] = ensemble_labels

# Preprocess the data for evaluation metrics
preprocessed_data = preprocessor.fit_transform(final_df)

# Check the number of unique clusters in the ensemble result
unique_clusters = len(set(ensemble_labels))
if unique_clusters > 1:
    # Evaluate the ensemble clustering result
    silhouette_avg = silhouette_score(preprocessed_data, ensemble_labels)
    davies_bouldin_avg = davies_bouldin_score(preprocessed_data, ensemble_labels)
    print('Ensemble Silhouette Score:', silhouette_avg)
    print('Ensemble Davies-Bouldin Index:', davies_bouldin_avg)
else:
    print(f"Ensemble clustering resulted in {unique_clusters} unique cluster(s). Silhouette score and Davies-Bouldin Index cannot be computed.")

# Print the dataframe with cluster labels
print(final_df[['unit_id', 'ensemble_cluster_label']])


In [8]:
silhouette_avg = silhouette_score(preprocessed_data, ensemble_labels)
davies_bouldin_avg = davies_bouldin_score(preprocessed_data, ensemble_labels)
print('Ensemble Silhouette Score:', silhouette_avg)
print('Ensemble Davies-Bouldin Index:', davies_bouldin_avg)

Ensemble Silhouette Score: 0.782397197411991
Ensemble Davies-Bouldin Index: 0.8882339537840735
