In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift, DBSCAN, estimate_bandwidth
from sklearn.metrics import silhouette_score

In [3]:
# Seed NumPy's global (legacy) random generator once, before any analysis runs.
SEED = 42
np.random.seed(SEED)

In [4]:
def _load_segment(csv_name):
    """Read one comma-delimited segment file from the working directory."""
    return pd.read_csv(csv_name, delimiter=',')


# One feature table per segment; each is clustered separately further down.
df_b = _load_segment('df_behaviour.csv')
df_d = _load_segment('df_demo.csv')
df_p = _load_segment('df_psy.csv')

In [5]:
def _index_by_customer(frame):
    """Return ``frame`` indexed by its ``customer_id`` column.

    A frame whose index is already named ``customer_id`` is returned as is, so
    re-running this cell is a no-op instead of raising ``KeyError`` (which the
    in-place ``set_index`` did once the column had been consumed).
    """
    if frame.index.name == "customer_id":
        return frame
    return frame.set_index("customer_id")


# Move the identifier out of the columns so it is not used as a clustering feature.
df_b = _index_by_customer(df_b)
df_d = _index_by_customer(df_d)
df_p = _index_by_customer(df_p)

# Behaviour Segment

In [24]:
# Grid-search the `quantile` given to estimate_bandwidth: for each candidate,
# fit MeanShift on df_b and record the cluster count and mean silhouette score.
quantile_range = np.linspace(0.01, 0.2, 20)  # 20 evenly spaced quantiles, 0.01 .. 0.20
results = []
n_samples = len(df_b)

for quantile in quantile_range:
    # Search-only names are used for the loop temporaries so this cell never
    # overwrites the `bandwidth`, `ms` and `ms_labels_b` of the final-fit cells.
    grid_bandwidth = estimate_bandwidth(df_b, quantile=quantile, random_state=42, n_jobs=-1)

    # Skip quantiles whose estimated bandwidth is not strictly positive.
    if grid_bandwidth <= 0:
        continue

    grid_labels = MeanShift(bandwidth=grid_bandwidth, bin_seeding=True, n_jobs=-1).fit_predict(df_b)
    n_clusters = len(np.unique(grid_labels))

    # silhouette_score accepts only 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise; such partitions get the lowest possible score (-1)
    # so they can never be selected over a valid clustering.
    if 1 < n_clusters < n_samples:
        silhouette_avg = silhouette_score(df_b, grid_labels)
    else:
        silhouette_avg = -1

    results.append({
        'quantile': quantile,
        'bandwidth': grid_bandwidth,
        'n_clusters': n_clusters,
        'silhouette_score': silhouette_avg
    })

# One row per evaluated quantile.
results_df = pd.DataFrame(results)

# Pick the row with the highest silhouette score. If every quantile was
# skipped the frame has no columns, so report None instead of failing on the
# missing 'silhouette_score' column.
best_result = None if results_df.empty else results_df.loc[results_df['silhouette_score'].idxmax()]
print("Best configuration:", best_result)

# Print all results for inspection
print(results_df)


Best configuration: quantile            0.080000
bandwidth           0.181866
n_clusters          3.000000
silhouette_score    0.361326
Name: 7, dtype: float64
    quantile  bandwidth  n_clusters  silhouette_score
0       0.01   0.084726          63          0.258128
1       0.02   0.108696          25          0.264796
2       0.03   0.126565          16          0.204330
3       0.04   0.140672          13          0.197699
4       0.05   0.152355           7          0.258493
5       0.06   0.163331           7          0.208584
6       0.07   0.172897           4          0.296025
7       0.08   0.181866           3          0.361326
8       0.09   0.190205           3          0.352835
9       0.10   0.197899           3          0.349826
10      0.11   0.205236           3          0.348965
11      0.12   0.212398           3          0.357562
12      0.13   0.219459           3          0.361125
13      0.14   0.226344           2          0.264787
14      0.15   0.233005       

In [10]:
# Bandwidth for the final behaviour clustering, estimated at quantile 0.07.
# NOTE(review): the printed search results rank quantile 0.08 highest by
# silhouette (0.361, 3 clusters) versus 0.296 (4 clusters) for 0.07 — confirm
# that 0.07 is a deliberate choice.
bandwidth = estimate_bandwidth(
    df_b,
    quantile=0.07,
    random_state=42,
    n_jobs=-1,
)
bandwidth

0.1728967810297375

In [11]:
# Fit MeanShift on the behaviour features with the bandwidth estimated above,
# keep the per-row labels, and report how many distinct labels were produced.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(df_b)
ms_labels_b = ms.labels_

ms_n_clusters = np.unique(ms_labels_b).size
print("Number of estimated clusters : %d" % ms_n_clusters)

Number of estimated clusters : 4


In [31]:
# Display the MeanShift label assigned to each row of df_b
# (NumPy elides the middle of long arrays in the repr).
ms_labels_b

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)

In [12]:
# Mean silhouette coefficient of the behaviour clustering; stored under its own
# name because the results table at the end of the notebook reads it.
silhouette_avg_b = silhouette_score(X=df_b, labels=ms_labels_b)
print("The average silhouette score is :", silhouette_avg_b)

The average silhouette score is : 0.2960251613337531


# Demographic Segment

In [34]:
# Grid-search the `quantile` given to estimate_bandwidth: for each candidate,
# fit MeanShift on df_d and record the cluster count and mean silhouette score.
quantile_range = np.linspace(0.01, 0.2, 20)  # 20 evenly spaced quantiles, 0.01 .. 0.20
results = []
n_samples = len(df_d)

for quantile in quantile_range:
    # Search-only names are used for the loop temporaries so this cell never
    # overwrites the `bandwidth`, `ms` and `ms_labels_d` of the final-fit cells.
    grid_bandwidth = estimate_bandwidth(df_d, quantile=quantile, random_state=42, n_jobs=-1)

    # Skip quantiles whose estimated bandwidth is not strictly positive.
    if grid_bandwidth <= 0:
        continue

    grid_labels = MeanShift(bandwidth=grid_bandwidth, bin_seeding=True, n_jobs=-1).fit_predict(df_d)
    n_clusters = len(np.unique(grid_labels))

    # silhouette_score accepts only 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise; such partitions get the lowest possible score (-1)
    # so they can never be selected over a valid clustering.
    if 1 < n_clusters < n_samples:
        silhouette_avg = silhouette_score(df_d, grid_labels)
    else:
        silhouette_avg = -1

    results.append({
        'quantile': quantile,
        'bandwidth': grid_bandwidth,
        'n_clusters': n_clusters,
        'silhouette_score': silhouette_avg
    })

# One row per evaluated quantile.
results_df = pd.DataFrame(results)

# Pick the row with the highest silhouette score. If every quantile was
# skipped the frame has no columns, so report None instead of failing on the
# missing 'silhouette_score' column.
best_result = None if results_df.empty else results_df.loc[results_df['silhouette_score'].idxmax()]
print("Best configuration:", best_result)

# Print all results for inspection
print(results_df)


Best configuration: quantile             0.010000
bandwidth            0.002448
n_clusters          66.000000
silhouette_score     0.999869
Name: 0, dtype: float64
    quantile  bandwidth  n_clusters  silhouette_score
0       0.01   0.002448          66          0.999869
1       0.02   0.005190          66          0.999869
2       0.03   0.007309          66          0.999869
3       0.04   0.010096          66          0.999869
4       0.05   0.013577          42          0.782459
5       0.06   0.016815          32          0.666123
6       0.07   0.019245          27          0.612597
7       0.08   0.024718          23          0.581784
8       0.09   0.027530          19          0.555262
9       0.10   0.029400          22          0.557308
10      0.11   0.031294          15          0.562115
11      0.12   0.032378          15          0.560617
12      0.13   0.033650          14          0.546160
13      0.14   0.035619          14          0.490154
14      0.15   0.037050   

In [13]:
# Bandwidth for the final demographic clustering, estimated at quantile 0.19.
# NOTE(review): the printed search results rank quantiles 0.01-0.04 highest by
# silhouette (0.9999, 66 clusters each); presumably those were passed over
# because 66 clusters is not a usable segmentation — confirm the choice of 0.19.
bandwidth = estimate_bandwidth(
    df_d,
    quantile=0.19,
    random_state=42,
    n_jobs=-1,
)
bandwidth

0.04406926450521398

In [14]:
# Fit MeanShift on the demographic features with the bandwidth estimated above,
# keep the per-row labels, and report how many distinct labels were produced.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(df_d)
ms_labels_d = ms.labels_

ms_n_clusters = np.unique(ms_labels_d).size
print("Number of estimated clusters : %d" % ms_n_clusters)

Number of estimated clusters : 6


In [15]:
# Mean silhouette coefficient of the demographic clustering; stored under its
# own name because the results table at the end of the notebook reads it.
silhouette_avg_d = silhouette_score(X=df_d, labels=ms_labels_d)
print("The average silhouette score is :", silhouette_avg_d)

The average silhouette score is : 0.6272116711478545


# Psychographic Segment

In [38]:
# Grid-search the `quantile` given to estimate_bandwidth: for each candidate,
# fit MeanShift on df_p and record the cluster count and mean silhouette score.
quantile_range = np.linspace(0.01, 0.2, 20)  # 20 evenly spaced quantiles, 0.01 .. 0.20
results = []
n_samples = len(df_p)

for quantile in quantile_range:
    # Search-only names are used for the loop temporaries so this cell never
    # overwrites the `bandwidth`, `ms` and `ms_labels_p` of the final-fit cells.
    grid_bandwidth = estimate_bandwidth(df_p, quantile=quantile, random_state=42, n_jobs=-1)

    # Skip quantiles whose estimated bandwidth is not strictly positive.
    if grid_bandwidth <= 0:
        continue

    grid_labels = MeanShift(bandwidth=grid_bandwidth, bin_seeding=True, n_jobs=-1).fit_predict(df_p)
    n_clusters = len(np.unique(grid_labels))

    # silhouette_score accepts only 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise; such partitions get the lowest possible score (-1)
    # so they can never be selected over a valid clustering.
    if 1 < n_clusters < n_samples:
        silhouette_avg = silhouette_score(df_p, grid_labels)
    else:
        silhouette_avg = -1

    results.append({
        'quantile': quantile,
        'bandwidth': grid_bandwidth,
        'n_clusters': n_clusters,
        'silhouette_score': silhouette_avg
    })

# One row per evaluated quantile.
results_df = pd.DataFrame(results)

# Pick the row with the highest silhouette score. If every quantile was
# skipped the frame has no columns, so report None instead of failing on the
# missing 'silhouette_score' column.
best_result = None if results_df.empty else results_df.loc[results_df['silhouette_score'].idxmax()]
print("Best configuration:", best_result)

# Print all results for inspection
print(results_df)


Best configuration: quantile            0.190000
bandwidth           0.052117
n_clusters          3.000000
silhouette_score    0.545880
Name: 18, dtype: float64
    quantile  bandwidth  n_clusters  silhouette_score
0       0.01   0.022210        1190          0.159333
1       0.02   0.027953         717          0.157185
2       0.03   0.032055         450          0.178993
3       0.04   0.035184         291          0.190812
4       0.05   0.037720         216          0.201547
5       0.06   0.039872         147          0.230102
6       0.07   0.041654          93          0.268369
7       0.08   0.043162          70          0.288685
8       0.09   0.044414          49          0.308235
9       0.10   0.045523          40          0.322757
10      0.11   0.046533          27          0.354861
11      0.12   0.047449          24          0.355714
12      0.13   0.048276          15          0.388325
13      0.14   0.049024           6          0.489448
14      0.15   0.049724      

In [6]:
# Bandwidth for the final psychographic clustering, estimated at quantile 0.15.
# NOTE(review): the printed search results rank quantile 0.19 highest by
# silhouette (0.546, 3 clusters) versus 0.503 (5 clusters) for 0.15 — confirm
# that 0.15 is a deliberate choice.
bandwidth = estimate_bandwidth(
    df_p,
    quantile=0.15,
    random_state=42,
    n_jobs=-1,
)
bandwidth

0.04972364612900115

In [7]:
# Fit MeanShift on the psychographic features with the bandwidth estimated above,
# keep the per-row labels, and report how many distinct labels were produced.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(df_p)
ms_labels_p = ms.labels_

ms_n_clusters = np.unique(ms_labels_p).size
print("Number of estimated clusters : %d" % ms_n_clusters)

Number of estimated clusters : 5


In [8]:
# Mean silhouette coefficient of the psychographic clustering; stored under its
# own name because the results table at the end of the notebook reads it.
silhouette_avg_p = silhouette_score(X=df_p, labels=ms_labels_p)
print("The average silhouette score is :", silhouette_avg_p)

The average silhouette score is : 0.5033976800704945


# Results

In [16]:
# Summarise the final MeanShift clustering of each segment in one table.
segments = ['Behaviour', 'Demographic', 'Psychographic']
silhouette_scores = [silhouette_avg_b, silhouette_avg_d, silhouette_avg_p]

# Count clusters from the fitted labels instead of hard-coding them, so the
# table cannot drift from the clustering when a quantile is changed.
# Assumes ms_labels_b / ms_labels_d / ms_labels_p still hold the final fits
# (true for a top-to-bottom run) — re-run the fit cells first if in doubt.
components = [
    len(np.unique(segment_labels))
    for segment_labels in (ms_labels_b, ms_labels_d, ms_labels_p)
]

table = pd.DataFrame({
    'Segment': segments,
    'Silhouette Score': silhouette_scores,
    'Number of Components': components
})

print(table)

         Segment  Silhouette Score  Number of Components
0      Behaviour          0.296025                     4
1    Demographic          0.627212                     6
2  Psychographic          0.503398                     5
