In [None]:
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
# Clustering quality metrics (silhouette, Davies-Bouldin) used by the evaluation cells.
from sklearn import metrics
import warnings
# Silence RuntimeWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Read the customer table from the CSV file and preview its first five rows.
csv_path = 'Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head(5)

In [None]:
# Print column names, dtypes and non-null counts to check for missing values.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

In [None]:
# CustomerID is dropped before plotting so it does not appear in the pair plot.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
data = data.drop(columns=['CustomerID'], errors='ignore')

# Pairwise scatter plots / distributions of the remaining columns, coloured by gender.
sns.pairplot(data, hue="Gender")
plt.show()

In [None]:
# Encode Gender numerically (Male -> 0, Female -> 1) so it can take part in the
# correlation matrix.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the `data['Gender']` selection: that form is a
# chained in-place update, which pandas warns about and which leaves `data`
# unchanged under Copy-on-Write. With Gender still holding strings, data.corr()
# fails on pandas >= 2.0.
gender_codes = {'Male': 0, 'Female': 1}
# Values without an entry in the mapping (e.g. codes produced by an earlier run
# of this cell) pass through unchanged, so re-running the cell is safe.
data['Gender'] = data['Gender'].map(lambda g: gender_codes.get(g, g))

# Annotated correlation heatmap of all (now numeric) columns.
sns.heatmap(data.corr(), annot=True, linewidths=0.2)
fig = plt.gcf()
plt.show()

In [None]:
# Clustering is done on two features only: annual income and spending score.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data.loc[:, feature_columns]
X.head()

In [None]:
import scipy.cluster.hierarchy as sch

# Build the Ward linkage matrix first, then draw it as a dendrogram.
ward_linkage = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

# Axis labels are in Korean: x = "customers", y = "Euclidean distance".
plt.title('ward method')
plt.xlabel('고객')
plt.ylabel('유클리디안 거리')
plt.show()

In [None]:
# Same dendrogram as for Ward, but with centroid linkage for comparison.
# Relies on `sch` (scipy.cluster.hierarchy) imported in the Ward dendrogram cell.
centroid_linkage = sch.linkage(X, method='centroid')
dendrogram = sch.dendrogram(centroid_linkage)

# Axis labels are in Korean: x = "customers", y = "Euclidean distance".
plt.title('centroid method')
plt.xlabel('고객')
plt.ylabel('유클리디안 거리')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Candidate cluster counts to compare with Ward-linkage agglomerative clustering.
li = [2, 5, 8]

for i in li:
    # No `affinity=` argument: it was deprecated in scikit-learn 1.2 and removed
    # in 1.4 (renamed to `metric`). Ward linkage only works with Euclidean
    # distances, which is what the estimator uses by default, so omitting the
    # argument gives the same clustering on old and new versions alike.
    agg = AgglomerativeClustering(n_clusters=i, linkage='ward').fit(X)
    labels = agg.labels_

    # Scatter of the two features, coloured by assigned cluster.
    plt.scatter(X['Annual Income (k$)'], X['Spending Score (1-100)'], c=labels, cmap='rainbow')
    plt.xlabel('Annual Income (k$)')
    plt.ylabel('Spending Score (1-100)')
    plt.title(f'Agglomerative clustering (ward), n_clusters={i}')
    plt.show()

    # Internal validation: higher silhouette is better, lower Davies-Bouldin is better.
    print('Silhouette Coefficient: ', metrics.silhouette_score(X, labels, metric='euclidean'))
    print('Davies_bouldin_score: ', metrics.davies_bouldin_score(X, labels))

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..19 and record the inertia
# (within-cluster sum of squared distances) of each fit.
cluster_counts = range(1, 20)
inertias = []
for i in cluster_counts:
    # Fixed seed so the curve is identical on every Restart & Run All;
    # without it the centroid initialisation changes from run to run.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

plt.plot(cluster_counts, inertias)
plt.xlabel('cluster number')
plt.ylabel('inertias')
plt.show()

In [None]:
# Final K-Means model with 5 clusters and a fixed seed.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
kmeans.fit(X)
label = kmeans.labels_          # cluster index per row, reused by the metrics cell
y_kmeans = kmeans.predict(X)    # assignment of the same rows, used for colouring
centers = kmeans.cluster_centers_

# Points coloured by cluster, with the centroids overlaid as large translucent dots.
plt.scatter(X['Annual Income (k$)'], X['Spending Score (1-100)'], c=y_kmeans, s=50, cmap='viridis')
# Centroid columns follow the column order of X (income, then spending score).
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('K-Means, n_clusters=5')
# Explicit show() so the cell renders the figure instead of echoing the
# PathCollection repr of the last scatter call.
plt.show()

In [None]:
# Internal validation of the labelling held in `label`
# (higher silhouette is better, lower Davies-Bouldin is better).
silhouette = metrics.silhouette_score(X, label, metric='euclidean')
print('Silhouette Coefficient: ', silhouette)
davies_bouldin = metrics.davies_bouldin_score(X, label)
print('Davies_bouldin_score: ', davies_bouldin)

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Sample sizes passed to the bandwidth estimator; one MeanShift fit per value.
li = [10, 100, 500]

# The two feature columns are loop-invariant, so look them up once.
income = X['Annual Income (k$)']
spending = X['Spending Score (1-100)']

for i in li:
    # Kernel bandwidth estimated from `i` samples at the 0.2 quantile.
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=i)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    # Points coloured by cluster, with the cluster centres overlaid.
    plt.scatter(income, spending, c=labels, s=50, cmap='viridis')
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()

    # Internal validation of this bandwidth's clustering.
    silhouette = metrics.silhouette_score(X, labels, metric='euclidean')
    print('Silhouette Coefficient: ', silhouette)
    davies_bouldin = metrics.davies_bouldin_score(X, labels)
    print('Davies_bouldin_score: ', davies_bouldin)

In [None]:
from sklearn.cluster import DBSCAN

# Neighbourhood radii (eps) to compare; min_samples is held at 5.
li = [2, 5, 10, 15]

for i in li:
    # Fit once: fit_predict() both fits the model and returns the labels, so a
    # separate .fit(X) beforehand would only repeat the same work.
    db = DBSCAN(eps=i, min_samples=5)
    cluster = db.fit_predict(X)
    label = db.labels_

    plt.scatter(x=X['Annual Income (k$)'], y=X['Spending Score (1-100)'], c=cluster)
    plt.xlabel("Annual Income")
    plt.ylabel("Spending Score")
    plt.title(f'DBSCAN, eps={i}, min_samples=5')
    plt.show()

    # Both metrics raise ValueError unless there are between 2 and n_samples - 1
    # distinct labels. Some eps values can yield a single label (e.g. every
    # point marked as noise), so guard the calls instead of letting one eps
    # abort the whole comparison.
    # NOTE: DBSCAN labels noise as -1; as before, noise points are passed to
    # the metrics as if they formed one more cluster.
    n_groups = len(set(label))
    if 2 <= n_groups <= len(X) - 1:
        print('Silhouette Coefficient: ', metrics.silhouette_score(X, label, metric='euclidean'))
        print('Davies_bouldin_score: ', metrics.davies_bouldin_score(X, label))
    else:
        print(f'eps={i}: {n_groups} distinct label(s) found; '
              'silhouette / Davies-Bouldin scores are undefined and were skipped')