# 고객 클러스터링

  1. 조건 및 파라미터 설정

In [None]:
# Analysis parameters shared by the cells below.
dict_args = {
    # Candidate numbers of clusters to evaluate: 2 through 10.
    'cluster_count': list(range(2, 11)),
    # Input CSV file with the raw transactions.
    'base_file': 'OnlineRetail.csv',
    # NOTE(review): not referenced in the visible cells; presumably a lower
    # bound on acceptable cluster size - confirm where it is consumed.
    'min_cluster_size': 30,
}

In [None]:
# Echo the configured parameters as the cell output.
dict_args

  2. 샘플용 데이터 로딩

In [None]:
import pandas as pd
# Load the raw transaction file named in dict_args['base_file'];
# the file is decoded as cp1252.
df = pd.read_csv(dict_args['base_file'], encoding = 'cp1252')
# Render the frame as the cell output (`display` is not imported here;
# presumably supplied by the notebook environment).
display(df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,09-12-2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,09-12-2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,09-12-2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,09-12-2011 12:50,4.15,12680.0,France


In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

  3. 데이터 전처리

In [None]:
# Work on a copy so the raw frame `df` is left unchanged.
df_source = df.copy()

# Turn CustomerID into a string label: missing IDs become -1, then the
# float values (e.g. 17850.0) are cast to int and stringified ("17850").
df_source['CustomerID'] = (
    df_source['CustomerID']
    .fillna(-1)
    .astype(float)
    .astype(int)
    .astype(str)
)

# Amount of each row: unit price times quantity.
df_source['Amount'] = df_source['UnitPrice'] * df_source['Quantity']

# Parse the day-first timestamp strings (e.g. "01-12-2010 08:26").
invoice_date_format = '%d-%m-%Y %H:%M'
df_source['InvoiceDate'] = pd.to_datetime(
    df_source['InvoiceDate'],
    format=invoice_date_format,
)

In [None]:
# Preview the first rows of the preprocessed frame.
df_source.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


고객 클러스터링을 해야 하는데, 사용할 만한 수치형 변수가 Quantity, UnitPrice, 그리고 이 둘을 곱한 파생변수 Amount밖에 없음.

따라서 고객별로 R(Recency), F(Frequency), M(Monetary) 기준의 수치형 변수를 생성함.

In [None]:
import numpy as np
# Build one row per customer with the three RFM features.
# Aggregations are passed to `agg` by string name ('min', 'sum') rather than
# as the callables np.min / builtin sum: pandas 2.x warns that such callables
# will stop being mapped to the groupby implementations in a future version.

# Recency: days between the latest invoice in the data set and the
# customer's most recent invoice (smallest time difference).
df_max = df_source['InvoiceDate'].max()
df_source['diff'] = df_max - df_source['InvoiceDate']
df_diffs = df_source.groupby(['CustomerID']).agg(Recency=('diff', 'min'))
df_diffs['Recency'] = df_diffs['Recency'].dt.days

# Frequency: number of non-null InvoiceNo rows per customer.
# NOTE(review): this counts rows (line items), not distinct invoices;
# switch to 'nunique' if the number of orders is what is intended.
df_frequency = df_source.groupby(['CustomerID']).agg(Frequency=('InvoiceNo', 'count'))

# Monetary: total Amount per customer.
df_amount = df_source.groupby(['CustomerID']).agg(Amount=('Amount', 'sum'))

# Combine the three features on the shared CustomerID index.
df_final = df_diffs.join(df_frequency, how='inner')
df_final = df_final.join(df_amount, how='inner')
df_final = df_final.fillna(-1)

# Drop the '-1' group, i.e. rows whose CustomerID was missing and was
# filled with -1 during preprocessing.
df_final = df_final[df_final.index != '-1']
df_final.head()

Unnamed: 0_level_0,Recency,Frequency,Amount
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,325,2,0.0
12347,1,182,4310.0
12348,74,31,1797.24
12349,18,73,1757.55
12350,309,17,334.4


RFM 방식으로 산출한 변수(Recency, Frequency, Amount)를 이용해 고객 클러스터링 실행

In [None]:
#to scale the data using z-score
from sklearn.preprocessing import StandardScaler

# importing clustering algorithms
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Silhouette score
from sklearn.metrics import silhouette_score

# Cluster on a copy so that df_final itself is not modified.
df_base_cluster_source = df_final.copy()

# Standardize every feature column with StandardScaler, then wrap the
# resulting array in a DataFrame that reuses the original column names.
# Only `columns` is passed, so the new frame gets a default integer index.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(df_base_cluster_source),
    columns=df_base_cluster_source.columns,
)
data_scaled.head()

Unnamed: 0,Recency,Frequency,Amount
0,2.322023,-0.39172,-0.231001
1,-0.893733,0.382657,0.293432
2,-0.169196,-0.266959,-0.012316
3,-0.725005,-0.086271,-0.017146
4,2.16322,-0.327188,-0.190312


In [None]:
# Keep an independent deep copy of the scaled features.
data_scaled_copy = data_scaled.copy(deep = True)

In [None]:
import matplotlib.pyplot as plt

# Metrics collected per candidate k.
sse = {}  # inertia: sum of squared distances from samples to their cluster centre
sc = {}   # mean silhouette coefficient over all samples

for k in dict_args['cluster_count']:
    # NOTE(review): k = 2 is excluded from this scan even though it is part
    # of dict_args['cluster_count']; confirm that this is intentional.
    if k != 2:
        kmeans = KMeans(n_clusters=k, random_state=1, init='k-means++')
        labels = kmeans.fit(data_scaled).predict(data_scaled)

        sse[k] = kmeans.inertia_
        sc[k] = silhouette_score(data_scaled, labels)

# Elbow plot: SSE against the number of clusters.
plt.figure()
plt.plot(list(sse), list(sse.values()), 'bx--')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

# Silhouette plot: mean silhouette score against the number of clusters.
plt.figure()
plt.plot(list(sc), list(sc.values()), 'bx--')
plt.xlabel("Number of cluster")
plt.ylabel("Silhouette Score")
plt.show()

k = 5에서 실루엣 계수가 가장 높은 것을 확인할 수 있음 (단, k = 2는 위 반복문에서 제외하여 비교 대상에 포함되지 않음).

<https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html>

In [None]:
def visualize_silouette(cluster_lists, X_features):
    """Draw one silhouette plot per candidate cluster count, side by side.

    For every value in ``cluster_lists`` a KMeans model is fitted on
    ``X_features`` and the per-sample silhouette coefficients are drawn as
    one horizontal band per cluster, sorted within the band. A red dashed
    vertical line marks the mean silhouette score over all samples.

    Args:
        cluster_lists: Sequence of cluster counts to compare; one subplot is
            drawn per entry. A single-element sequence is supported.
        X_features: Feature data passed to KMeans and to the silhouette
            functions.

    Returns:
        None. The figure is created through matplotlib as a side effect.
        Nothing is drawn when ``cluster_lists`` is empty.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import numpy as np

    n_cols = len(cluster_lists)
    if n_cols == 0:
        # No cluster counts to compare, so there is nothing to plot.
        return

    # squeeze=False makes subplots always return a 2-D array of Axes. With
    # the default squeeze=True a single column yields a bare Axes object and
    # indexing it (axs[ind]) fails for one-element cluster_lists.
    fig, axs = plt.subplots(
        figsize=(4 * n_cols, 4), nrows=1, ncols=n_cols, squeeze=False
    )
    axs = axs[0]

    for ind, n_cluster in enumerate(cluster_lists):
        ax = axs[ind]

        clusterer = KMeans(
            n_clusters=n_cluster, max_iter=500, random_state=1, init='k-means++'
        )
        cluster_labels = clusterer.fit_predict(X_features)

        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)

        ax.set_title("Number of Cluster : "+ str(n_cluster) + '\n Silhouette Score :' + str(round(sil_avg,3)) )
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.set_xlim([-0.1, 1])
        # Leave a 10-unit gap below, between and above the cluster bands.
        ax.set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        ax.set_yticks([])
        ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        y_lower = 10
        for i in range(n_cluster):
            # Silhouette values of the samples in cluster i, ascending.
            ith_cluster_sil_values = sil_values[cluster_labels == i]
            ith_cluster_sil_values.sort()

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(
                np.arange(y_lower, y_upper), 0, ith_cluster_sil_values,
                facecolor=color, edgecolor=color, alpha=0.7,
            )
            # Label the band with its cluster index at mid-height.
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        # Mean silhouette score across all samples.
        ax.axvline(x=sil_avg, color="red", linestyle="--")

In [None]:
# Show the configured candidate cluster counts.
dict_args['cluster_count']

In [None]:
# Compare silhouette plots for the first four configured cluster counts.
visualize_silouette(dict_args['cluster_count'][:4], data_scaled)

In [None]:
# Same call as in the preceding cell (first four configured cluster counts).
visualize_silouette(dict_args['cluster_count'][:4], data_scaled)

  * 빨간색 점선 : 전체 데이터의 실루엣 계수 평균임.



> 실루엣 계수는 -1에서 1 사이의 값을 가지며, 1에 가까울수록 근처 군집과 멀리 떨어져 있음을, 0에 가까울수록 근처 군집과 가까움을 의미합니다. 값이 음수(-)이면 해당 데이터가 아예 다른 군집에 할당됐음을 의미합니다.

  * Silhouette score를 보면 군집 수가 5일 때 가장 높지만, 실루엣 계수가 음수인 데이터들이 존재함. -> 실루엣 계수가 음수이면 아예 다른 군집에 데이터가 할당됨
  * 군집 수가 3일 때는 클러스터별 실루엣 계수가 비교적 균일함.

In [None]:
def visualize_silouette(cluster_lists, X_features):
    """Draw one silhouette plot per candidate cluster count, side by side.

    For every value in ``cluster_lists`` a KMeans model is fitted on
    ``X_features`` and the per-sample silhouette coefficients are drawn as
    one horizontal band per cluster, sorted within the band. A red dashed
    vertical line marks the mean silhouette score over all samples.

    Args:
        cluster_lists: Sequence of cluster counts to compare; one subplot is
            drawn per entry. A single-element sequence is supported.
        X_features: Feature data passed to KMeans and to the silhouette
            functions.

    Returns:
        None. The figure is created through matplotlib as a side effect.
        Nothing is drawn when ``cluster_lists`` is empty.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import numpy as np

    n_cols = len(cluster_lists)
    if n_cols == 0:
        # No cluster counts to compare, so there is nothing to plot.
        return

    # squeeze=False makes subplots always return a 2-D array of Axes. With
    # the default squeeze=True a single column yields a bare Axes object and
    # indexing it (axs[ind]) fails for one-element cluster_lists.
    fig, axs = plt.subplots(
        figsize=(4 * n_cols, 4), nrows=1, ncols=n_cols, squeeze=False
    )
    axs = axs[0]

    for ind, n_cluster in enumerate(cluster_lists):
        ax = axs[ind]

        clusterer = KMeans(
            n_clusters=n_cluster, max_iter=500, random_state=1, init='k-means++'
        )
        cluster_labels = clusterer.fit_predict(X_features)

        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)

        ax.set_title("Number of Cluster : "+ str(n_cluster) + '\n Silhouette Score :' + str(round(sil_avg,3)) )
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.set_xlim([-0.1, 1])
        # Leave a 10-unit gap below, between and above the cluster bands.
        ax.set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        ax.set_yticks([])
        ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        y_lower = 10
        for i in range(n_cluster):
            # Silhouette values of the samples in cluster i, ascending.
            ith_cluster_sil_values = sil_values[cluster_labels == i]
            ith_cluster_sil_values.sort()

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(
                np.arange(y_lower, y_upper), 0, ith_cluster_sil_values,
                facecolor=color, edgecolor=color, alpha=0.7,
            )
            # Label the band with its cluster index at mid-height.
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        # Mean silhouette score across all samples.
        ax.axvline(x=sil_avg, color="red", linestyle="--")

In [None]:
# Draw the silhouette plot for a single cluster count, k = 3.
visualize_silouette([3], data_scaled)

In [None]:
# Same call as in the preceding cell (single cluster count, k = 3).
visualize_silouette([3], data_scaled)

군집 수를 3으로 하여 0, 1, 2 세 집단으로 나누는 것이 가장 잘 나누어진 결과로 보임