RFM기반 군집분석

In [65]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the RFM table from CSV.
rfm_file = "RFM_Score.csv"
rfm_df = pd.read_csv(rfm_file, encoding="utf-8")

# Only the three RFM measures are used as clustering features.
rfm_features = rfm_df[["Recency", "Frequency", "Monetary"]]

# Standardize the features so that no single measure dominates the
# Euclidean distances K-Means relies on.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_features)

# Elbow method: record the inertia (within-cluster sum of squares) for K = 2..10.
distortions = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_scaled)
    distortions.append(kmeans.inertia_)

# Final K-Means clustering with K = 4.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
rfm_df["클러스터"] = kmeans.fit_predict(rfm_scaled)

# Mean RFM values per cluster.
# The result is bound to `cluster_analysis`, the same name that is displayed
# below; the previous version assigned to `cluster_analysis_`, so the display
# line raised NameError on a fresh kernel (or showed a stale object).
cluster_analysis = (
    rfm_df.groupby("클러스터")[["Recency", "Frequency", "Monetary"]]
    .mean()
    .reset_index()
)

cluster_analysis

Unnamed: 0,클러스터,Recency,Frequency,Monetary
0,0,199.894821,6.494371,83510.1
1,1,32.692089,31.096529,353041.2
2,2,1.628415,583.907104,7706210.0
3,3,6.934623,200.198799,2330087.0


고객특성기반 군집분석

In [64]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the member table (the file is CP949-encoded).
df_member = pd.read_csv("Member_Data.csv", encoding="cp949")

# Keep the four profile columns, replace missing entries with the "미정"
# placeholder, and store the subscription flag as text so the encoder sees
# a single string-typed category column.
customer_data = (
    df_member[["나이", "성별", "결혼", "구독여부"]]
    .fillna("미정")
    .astype({"구독여부": str})
)

# Feature encoding: standardize age, one-hot encode the categorical columns.
column_transformer = ColumnTransformer(
    [
        ("num", StandardScaler(), ["나이"]),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", dtype=int),
            ["성별", "결혼", "구독여부"],
        ),
    ]
)

# K-Means clustering with K = 4 on the encoded feature matrix.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
customer_data["클러스터"] = kmeans.fit_predict(
    column_transformer.fit_transform(customer_data)
)

customer_data

Unnamed: 0,나이,성별,결혼,구독여부,클러스터
0,68,여,기혼,False,0
1,83,남,미정,False,0
2,39,여,기혼,False,1
3,73,여,미정,미정,0
4,52,여,기혼,False,0
...,...,...,...,...,...
12535,34,여,미혼,False,3
12536,24,남,미혼,False,3
12537,28,남,미정,False,2
12538,19,남,미혼,미정,3


상품기반 군집분석

In [55]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns


# Load the sales transactions and the product master table.
df_sales = pd.read_csv("Sales_Data05.csv", encoding="cp949")
product_df = pd.read_csv("Product_Data.csv", encoding="utf-8")

# Keep only member id, product id and purchased quantity.
sales_data = df_sales[["회원번호", "제품번호", "구매수량"]]

# Attach each product's top-level category via a left join on product id.
data = sales_data.merge(
    product_df[["제품번호", "물품대분류"]],
    on="제품번호",
    how="left",
)

# Member x category matrix of summed purchase quantities (0 where none).
customer_product = data.pivot_table(
    index="회원번호",
    columns="물품대분류",
    values="구매수량",
    aggfunc="sum",
    fill_value=0,
)

# Standardize every category column before clustering.
scaler = StandardScaler()
customer_scaled = scaler.fit_transform(customer_product)

# Elbow search: collect the inertia for K = 1..10.
inertia = []
k_range = range(1, 11)
for k in k_range:
    inertia.append(
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(customer_scaled).inertia_
    )

# Final K-Means clustering with K = 3.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(customer_scaled)

# Store the cluster label next to each member's purchase profile.
customer_product["Cluster"] = clusters

# Average purchase quantity per category within each cluster.
cluster_summary = customer_product.groupby("Cluster").mean()

cluster_summary

물품대분류,가루,건강일반,건어물,견과,과실주,과일,과일채소,과자,기름/식초,김장채소,...,절임/장아찌,중량(정육),즉석조리,차,찹쌀,콩/화본/깨,콩나물,해조,홍삼/녹용,화장품
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.307604,1.501252,2.937839,0.926244,0.037164,4.924494,3.991366,9.710612,2.013722,0.0,...,1.504288,3.5992,1.188679,1.037164,0.627216,1.82733,5.38422,3.562607,0.178193,1.777587
1,0.513531,0.250607,0.368863,0.109813,0.004258,0.410666,0.301817,1.516706,0.26126,0.0,...,0.183478,0.349451,0.154996,0.197388,0.076079,0.23609,0.741957,0.53151,0.037609,0.272426
2,9.327354,5.37,8.248879,3.242152,0.192825,23.596637,20.087892,37.055336,6.130045,0.008969,...,3.816143,12.050673,4.349776,2.986547,1.713004,5.529148,11.93722,8.780269,0.649552,7.381166


할인/포인트사용기반 군집분석

In [62]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the sales transactions.
# The frame is bound to `df_sales`, the name used on the next statement; the
# previous version assigned to the misspelled `df_saels`, so this cell only
# worked when an earlier cell had already left a `df_sales` in the kernel.
df_sales = pd.read_csv("Sales_Data05.csv", encoding='cp949')

# Keep member id, used reward credit and used Naver points; treat missing
# values as 0. `fillna` returns a new frame, so the loaded table is untouched.
discount_data = df_sales[['회원번호', '사용 적립금', '사용 포인트 네이버']].fillna(0)

# Standardize the two usage columns (member id is not a clustering feature).
scaler = StandardScaler()
discount_scaled = scaler.fit_transform(discount_data[['사용 적립금', '사용 포인트 네이버']])

# Elbow search: collect the inertia for K = 1..10.
inertia = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(discount_scaled)
    inertia.append(kmeans.inertia_)

# Final K-Means clustering with K = 3.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
discount_data['Cluster'] = kmeans.fit_predict(discount_scaled)

# Mean discount / point usage per cluster.
# NOTE(review): the mean also covers '회원번호', which is an identifier, so that
# column of the summary carries no meaning — consider restricting the
# aggregation to the two usage columns.
cluster_summary = discount_data.groupby('Cluster').mean()
cluster_summary

Unnamed: 0_level_0,회원번호,사용 적립금,사용 포인트 네이버
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,740864900.0,19.394576,53.988632
1,733224500.0,457.962963,251600.0
2,729785200.0,7773.125583,61.723817
