In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift, DBSCAN, estimate_bandwidth
from sklearn.metrics import silhouette_score

In [3]:
# Seed NumPy's legacy global RNG so any NumPy-based randomness below is repeatable.
np.random.seed(seed=42)

In [4]:
# Read the three comma-separated segment tables from the working directory:
# behaviour (df_b), demographic (df_d) and psychographic (df_p).
df_b = pd.read_csv('df_behaviour.csv', sep=',')
df_d = pd.read_csv('df_demo.csv', sep=',')
df_p = pd.read_csv('df_psy.csv', sep=',')

# Behaviour Segment

In [12]:
# Estimate the MeanShift bandwidth for the behaviour table; the fixed
# random_state keeps the estimate repeatable between runs.
bandwidth = estimate_bandwidth(
    X=df_b,
    quantile=0.05,
    random_state=42,
    n_jobs=-1,
)
# Echo the estimate as the cell output.
bandwidth

740.0607277304204

In [13]:
# Cluster the behaviour table with MeanShift using the bandwidth estimated in
# the previous cell (the `bandwidth` name is reused by later segments, so this
# cell must run right after the behaviour estimate).
ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=True,
    n_jobs=-1,
)
ms_labels_b = ms.fit_predict(df_b)

# Count the distinct labels assigned to the rows.
ms_n_clusters = np.unique(ms_labels_b).size
print("Number of estimated clusters : %d" % (ms_n_clusters,))

Number of estimated clusters : 20


In [9]:
# Show the per-row cluster labels assigned to the behaviour table.
ms_labels_b

array([10, 10, 10, ...,  3,  3,  3], dtype=int64)

In [14]:
# Mean silhouette over all rows of the behaviour table for the MeanShift labels.
silhouette_avg_b = silhouette_score(X=df_b, labels=ms_labels_b)
print(
    "The average silhouette score is :",
    silhouette_avg_b,
)

The average silhouette score is : 0.5259462986109608


# Demographic Segment

In [16]:
# Estimate the MeanShift bandwidth for the demographic segment, which uses
# only the `customer_age` column (kept as a one-column DataFrame).
bandwidth = estimate_bandwidth(
    X=df_d[['customer_age']],
    quantile=0.05,
    random_state=42,
    n_jobs=-1,
)
# Echo the estimate as the cell output.
bandwidth

0.013577474433663474

In [17]:
# Cluster the single `customer_age` column with MeanShift, using the bandwidth
# estimated for the demographic segment in the previous cell.
# NOTE(review): the recorded run reports 42 clusters on one column, which may
# simply be one cluster per distinct age value - confirm the quantile choice.
ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=True,
    n_jobs=-1,
)
ms_labels_d = ms.fit_predict(df_d[['customer_age']])

# Count the distinct labels assigned to the rows.
ms_n_clusters = np.unique(ms_labels_d).size
print("Number of estimated clusters : %d" % (ms_n_clusters,))

Number of estimated clusters : 42


In [18]:
# Mean silhouette of the MeanShift labels, measured on the same one-column
# `customer_age` frame that was clustered.
silhouette_avg_d = silhouette_score(
    X=df_d[['customer_age']],
    labels=ms_labels_d,
)
print(
    "The average silhouette score is :",
    silhouette_avg_d,
)

The average silhouette score is : 0.7824587500414355


# Psychographic Segment

In [19]:
# Estimate the MeanShift bandwidth for the psychographic table; the fixed
# random_state keeps the estimate repeatable between runs.
bandwidth = estimate_bandwidth(
    X=df_p,
    quantile=0.05,
    random_state=42,
    n_jobs=-1,
)
# Echo the estimate as the cell output.
bandwidth

755.3943879180619

In [20]:
# Cluster the psychographic table with MeanShift using the bandwidth estimated
# in the previous cell.
ms = MeanShift(
    bandwidth=bandwidth,
    bin_seeding=True,
    n_jobs=-1,
)
ms_labels_p = ms.fit_predict(df_p)

# Count the distinct labels assigned to the rows.
ms_n_clusters = np.unique(ms_labels_p).size
print("Number of estimated clusters : %d" % (ms_n_clusters,))

Number of estimated clusters : 24


In [21]:
# Mean silhouette over all rows of the psychographic table for the MeanShift labels.
silhouette_avg_p = silhouette_score(X=df_p, labels=ms_labels_p)
print(
    "The average silhouette score is :",
    silhouette_avg_p,
)

The average silhouette score is : 0.5089928444376317


# Results

In [22]:
# Summarise the three MeanShift runs (one row per segment) in a single table.
segments = ['Behaviour', 'Demographic', 'Psychographic']
silhouette_scores = [silhouette_avg_b, silhouette_avg_d, silhouette_avg_p]

# Derive the cluster counts from the fitted labels instead of hard-coding the
# numbers printed by earlier cells; hand-copied literals silently go stale
# whenever the data, quantile or bandwidth changes. `ms_n_clusters` cannot be
# used here because every segment overwrites it.
components = [
    len(np.unique(labels))
    for labels in (ms_labels_b, ms_labels_d, ms_labels_p)
]

# The column label 'Number of Components' is kept for output compatibility;
# the values are MeanShift cluster counts.
table = pd.DataFrame({
    'Segment': segments,
    'Silhouette Score': silhouette_scores,
    'Number of Components': components
})

print(table)

         Segment  Silhouette Score  Number of Components
0      Behaviour          0.525946                    20
1    Demographic          0.782459                    42
2  Psychographic          0.508993                    24
