# K-means clustering으로 5개 고객군으로 분류


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
from qrytool import load_data_into_dataframe, insert_dataframe_into_table
import numpy as np


# Load the customer / real-estate rows, including the model's predicted price.
qry = """
SELECT "index",all_id ,이름,결제전화,"Predictions"::int AS 예측가격,아파트명,동네,건물종류,사용승인년월일::date,최단지하철역,도보거리,근접노선수,거래년월일
FROM clbe_customer_realestate_model
"""

df = load_data_into_dataframe(qry)

# Replace missing predicted prices with the column mean so KMeans receives no NaN.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form acts on a temporary object and stops working in pandas 3.0.
mean_value = df['예측가격'].mean()
df['예측가격'] = df['예측가격'].fillna(mean_value)

# Cluster on the single predicted-price feature into 5 groups.
kmeans = KMeans(n_clusters=5, random_state=0)
df['cluster'] = kmeans.fit_predict(df[['예측가격']])

# KMeans label numbers are arbitrary; renumber them by ascending cluster center
# so that cluster 0 has the lowest center and cluster 4 the highest.
sorted_centers = np.argsort(kmeans.cluster_centers_.flatten())
new_cluster_labels = {old: new for new, old in enumerate(sorted_centers)}
df['cluster'] = df['cluster'].map(new_cluster_labels)

# Quick check of the clustering result.
print(df[['예측가격', 'cluster']].head())


# Scatter plot of the predicted price per row, coloured by cluster number.
fig = px.scatter(df, x=df.index, y='예측가격', color='cluster',
                 labels={'x': 'index', '예측가격': '예측가격'},
                 hover_data=['아파트명', '동네', '건물종류', '이름', '결제전화'],
                 title='K-Means Clustering on 예측가격 with Plotly',
                 template='plotly_dark', height=600)

# Sorted cluster centers and the midpoints between neighbouring centers.
centers = np.sort(kmeans.cluster_centers_.flatten())
boundaries = (centers[:-1] + centers[1:]) / 2

# Put the y-axis ticks on the centers and midpoints, labelled as value / 10000 with an "억" suffix.
tick_values = np.concatenate([centers, boundaries])
tick_text = [f"{x/10000:.1f}억" for x in tick_values]
fig.update_yaxes(tickvals=tick_values, ticktext=tick_text)

# The points are plotted against df.index, so the x-range must come from the same
# values (previously it was taken from the unrelated 'index' column).
fig.update_xaxes(range=[df.index.min(), df.index.max()])

# Dotted horizontal line at every cluster center. add_hline spans the full plot
# width, so it no longer depends on the 'all_id' value range, which is not the
# quantity shown on the x-axis.
for center in centers:
    fig.add_hline(y=center, line=dict(color='red', width=2, dash='dot'))

fig.update_traces(marker=dict(size=8, opacity=0.8, line=dict(width=0.5, color='DarkSlateGrey')))

fig

       예측가격  cluster
0   13187.0        0
1   19512.0        0
2   18605.0        0
3  174639.0        2
4   58333.0        1



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.







In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
import plotly.express as px

# Re-run the 5-cluster KMeans on the predicted price and derive per-cluster
# counts, percentages and price ranges for the charts built afterwards.
# Expects `df` to already hold the '예측가격' column.

# Fill missing prices with the mean. The result is assigned back because the chained
# df[col].fillna(..., inplace=True) form stops working in pandas 3.0.
mean_value = df['예측가격'].mean()
df['예측가격'] = df['예측가격'].fillna(mean_value)
kmeans = KMeans(n_clusters=5, random_state=0)
df['cluster'] = kmeans.fit_predict(df[['예측가격']])

# Renumber the cluster labels by ascending cluster center.
sorted_centers = np.argsort(kmeans.cluster_centers_.flatten())
new_cluster_labels = {old: new for new, old in enumerate(sorted_centers)}
df['cluster'] = df['cluster'].map(new_cluster_labels)

# Number of rows in each cluster, ordered by cluster number.
cluster_counts = df['cluster'].value_counts().sort_index()

# Price range per cluster. The aggregations are named by string because passing
# the builtins min/max is deprecated in pandas and will change meaning.
cluster_ranges = df.groupby('cluster')['예측가격'].agg(['min', 'max'])

# Axis labels carrying each cluster's min and max price (value / 10000, "억" suffix).
cluster_labels = [f"Cluster {i}: {row['min']/10000:.1f}억 - {row['max']/10000:.1f}억" for i, row in cluster_ranges.iterrows()]
percentages = (cluster_counts / cluster_counts.sum() * 100).round(2)
# Per-bar text showing both the count and the percentage.
bar_texts = [f"{count} ({percent}%)"
             for count, percent in zip(cluster_counts, percentages)]

# Legend labels combining the price range with the count and percentage.
enhanced_labels = [f"{label} ({count}개, {percent}%)" for label, count, percent in zip(cluster_labels, cluster_counts, percentages)]

# One colour per cluster, taken from Plotly's default qualitative palette.
colors = px.colors.qualitative.Plotly
cluster_palette = colors[:len(cluster_labels)]

# Side-by-side layout: bar chart on the left, pie chart on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'bar'}, {'type': 'pie'}]])

# Bar trace: row count per cluster, annotated above each bar with "count (percent%)".
count_bar = go.Bar(
    x=cluster_labels,
    y=cluster_counts,
    text=bar_texts,
    textposition='outside',
    marker={'color': cluster_palette},
    name='Data Count by Cluster',
)

# Pie trace: same counts, with the value and share shown inside the slices.
share_pie = go.Pie(
    labels=enhanced_labels,
    values=cluster_counts,
    textinfo='value+percent',
    name='Data Distribution by Cluster',
    marker={'colors': cluster_palette},
)

fig.add_trace(count_bar, row=1, col=1)
fig.add_trace(share_pie, row=1, col=2)

# Overall title and figure height.
fig.update_layout(title_text='Data Distribution Analysis by Cluster', height=600)
fig


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.






The provided callable <built-in function min> is currently using SeriesGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "min" instead.


The provided callable <built-in function max> is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.



In [12]:
# Write the clustered frame `df` to the 'clbe_realestate_clustering2' table.
# NOTE(review): with if_exists='append', re-running this cell presumably adds the
# same rows again — confirm against insert_dataframe_into_table's behaviour.
insert_dataframe_into_table(df, 'clbe_realestate_clustering2', if_exists='append')

# 클러스터된 고객을 일반상품 고객과 매핑


In [None]:
#!/home/max/miniconda3/bin/python
import string
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import mysql.connector
import time
import json
from gspread_formatting import *
import pandas as pd
from qrytool import load_data_into_dataframe, insert_dataframe_into_table
from gspread_dataframe import set_with_dataframe

from logging_config import configure_logging
logger = configure_logging("notebook:4.clustering_customers.ipynb")


# Query to fetch the order list from the WooCommerce tables.
# Joins 'shop_order' posts with their line items, line-item meta (product,
# variation, quantity, totals, tax), order-level meta (tax, total, discount,
# shipping/billing fields) and coupon items; post_date is shifted from +00:00 to +09:00.
# Rows are dropped when the status is refunded/failed/cancelled/on-hold, when the
# product id is 2796 or 8613 (or missing), or when the user id is in the fixed list.
# NOTE(review): the in-query comment says "only completed orders", but the filter
# only excludes four statuses — confirm which statuses are intended.
# NOTE(review): GROUP BY is per order (p.ID, oi.order_id) while item-level columns
# are selected without aggregation; an order with several line items presumably
# collapses to a single row — confirm this is intended.
sale_query = """
SELECT
    CONVERT_TZ(p.post_date, '+00:00', '+09:00') AS '주문일시',
    wp_product.post_title AS '상품명',
    oim_quantity.meta_value AS '주문수량',
    ROUND(oim_price.meta_value) AS '상품금액',
    ROUND(pim_tax.meta_value) AS '상품세금',
    ROUND(oim_subtotal.meta_value - oim_price.meta_value) AS '상품할인금액',
    ROUND(pm_total.meta_value) AS '실결제금액',
    ROUND(pm_discount.meta_value) AS '총할인금액',
    ROUND(pm_tax.meta_value) AS '총주문세금',
    MAX(CASE WHEN pm_address.meta_key = '_shipping_first_name' THEN pm_address.meta_value END) AS '수취인',
    MAX(CASE WHEN pm_address.meta_key = '_shipping_phone' THEN pm_address.meta_value END) AS '수취인휴대폰',
    MAX(CASE WHEN pm_address.meta_key = '_shipping_postcode' THEN pm_address.meta_value END) AS '배송지우편번호',
    MAX(CASE WHEN pm_address.meta_key = '_shipping_address_1' THEN pm_address.meta_value END) AS '배송지주소',
    MAX(CASE WHEN pm_address.meta_key = '_order_notes' THEN pm_address.meta_value END) AS '배송메세지',
    MAX(CASE WHEN pm_address.meta_key = '_billing_first_name' THEN pm_address.meta_value END) AS '구매자명',
    MAX(CASE WHEN pm_address.meta_key = '_billing_phone' THEN pm_address.meta_value END) AS '구매자휴대폰',
    GROUP_CONCAT(DISTINCT CASE WHEN oi_cpn.order_item_type = 'coupon' THEN oi_cpn.order_item_id END SEPARATOR ', ') AS '쿠폰id',
    GROUP_CONCAT(DISTINCT CASE WHEN oi_cpn.order_item_type = 'coupon' THEN oi_cpn.order_item_name END SEPARATOR ', ') AS '쿠폰코드',
    p.post_status AS '결제상태',
    pm.meta_value AS '구매자id',
    oi.order_id AS '주문id',
    oim_product.meta_value AS '상품id',
    oi.order_item_id AS '주문상품id',
    oim_variation.meta_value AS '변형id'
FROM wp_posts p
    LEFT JOIN wp_postmeta pm ON p.ID = pm.post_id AND pm.meta_key = '_customer_user'
    LEFT JOIN wp_users u ON pm.meta_value = u.ID
    LEFT JOIN wp_woocommerce_order_items oi ON p.ID = oi.order_id AND oi.order_item_type = 'line_item'
    LEFT JOIN wp_woocommerce_order_items oi_cpn ON p.ID = oi_cpn.order_id AND oi_cpn.order_item_type = 'coupon'
    LEFT JOIN wp_woocommerce_order_itemmeta oim_product ON oi.order_item_id = oim_product.order_item_id AND oim_product.meta_key = '_product_id'
    LEFT JOIN wp_woocommerce_order_itemmeta oim_variation ON oi.order_item_id = oim_variation.order_item_id AND oim_variation.meta_key = '_variation_id'
    LEFT JOIN wp_posts wp_product ON wp_product.ID = oim_product.meta_value
    LEFT JOIN wp_woocommerce_order_itemmeta oim_price ON oi.order_item_id = oim_price.order_item_id AND oim_price.meta_key = '_line_total'
    LEFT JOIN wp_woocommerce_order_itemmeta oim_subtotal ON oi.order_item_id = oim_subtotal.order_item_id AND oim_subtotal.meta_key = '_line_subtotal'
    LEFT JOIN wp_woocommerce_order_itemmeta oim_quantity ON oi.order_item_id = oim_quantity.order_item_id AND oim_quantity.meta_key = '_qty'
    LEFT JOIN wp_woocommerce_order_itemmeta pim_tax ON oi.order_item_id = pim_tax.order_item_id AND pim_tax.meta_key = '_line_subtotal_tax'
    LEFT JOIN wp_postmeta pm_tax ON p.ID = pm_tax.post_id AND pm_tax.meta_key = '_order_tax'
    LEFT JOIN wp_postmeta pm_total ON p.ID = pm_total.post_id AND pm_total.meta_key = '_order_total'
    LEFT JOIN wp_postmeta pm_discount ON p.ID = pm_discount.post_id AND pm_discount.meta_key = '_cart_discount'
    LEFT JOIN wp_postmeta pm_address ON p.ID = pm_address.post_id
WHERE
    p.post_type = 'shop_order'
    AND p.post_status NOT IN ('wc-refunded','wc-failed','wc-cancelled','wc-on-hold') -- Only include completed orders
    AND oim_product.meta_value NOT IN (2796, 8613)
    AND oim_product.meta_value IS NOT NULL
    AND u.ID NOT IN (1,2,24,153,86,109,51,164,3,33,35,158,79,54,56,80,105)
GROUP BY p.ID, oi.order_id
ORDER BY p.post_date DESC;
"""


# Query returning the selected options for every order item that has a
# '_product_id' item-meta row, keyed by the order item id ('주문상품id').
# Each option prefers the pa_* attribute key and falls back to the Korean-labelled
# meta key; the thickness and cycle fallbacks are cut to their first 3 and 2 characters.
# '전체옵션' joins the non-empty option values with ', ' (NULLIF turns '' into NULL,
# which CONCAT_WS skips).
opt_query = """
SELECT
    oim_product.order_item_id AS '주문상품id',
    COALESCE(pm_size.meta_value, pm_size_kr.meta_value) AS '크기',
    COALESCE(pm_color.meta_value, pm_color_kr.meta_value) AS '색상',
    COALESCE(pm_duvtype.meta_value, LEFT(pm_duvtype_kr.meta_value, 3)) AS '두께',
    COALESCE(pm_reps.meta_value, LEFT(pm_reps_kr.meta_value, 2)) AS '주기',
    COALESCE(pm_count.meta_value, '') AS '수량',
    CONCAT_WS(', ',
        NULLIF(COALESCE(pm_size.meta_value, pm_size_kr.meta_value), ''),
        NULLIF(COALESCE(pm_color.meta_value, pm_color_kr.meta_value), ''),
        NULLIF(COALESCE(pm_duvtype.meta_value, LEFT(pm_duvtype_kr.meta_value, 3)), ''),
        NULLIF(COALESCE(pm_reps.meta_value, LEFT(pm_reps_kr.meta_value, 2)), ''),
        NULLIF(COALESCE(pm_count.meta_value, ''), '')
    ) AS '전체옵션'
FROM wp_woocommerce_order_itemmeta oim_product
    LEFT JOIN wp_woocommerce_order_itemmeta pm_size ON oim_product.order_item_id = pm_size.order_item_id AND pm_size.meta_key = 'pa_size'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_color ON oim_product.order_item_id = pm_color.order_item_id AND pm_color.meta_key = 'pa_color'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_duvtype ON oim_product.order_item_id = pm_duvtype.order_item_id AND pm_duvtype.meta_key = 'pa_duvtype'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_reps ON oim_product.order_item_id = pm_reps.order_item_id AND pm_reps.meta_key = 'pa_reps'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_count ON oim_product.order_item_id = pm_count.order_item_id AND pm_count.meta_key = 'pa_count'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_size_kr ON oim_product.order_item_id = pm_size_kr.order_item_id AND pm_size_kr.meta_key = '사이즈'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_color_kr ON oim_product.order_item_id = pm_color_kr.order_item_id AND pm_color_kr.meta_key = '색상'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_duvtype_kr ON oim_product.order_item_id = pm_duvtype_kr.order_item_id AND pm_duvtype_kr.meta_key = '이불솜 두께'
    LEFT JOIN wp_woocommerce_order_itemmeta pm_reps_kr ON oim_product.order_item_id = pm_reps_kr.order_item_id AND pm_reps_kr.meta_key = '교체 주기'
WHERE
    oim_product.meta_key = '_product_id';

"""


def colnum_string(n):
    """Convert a 1-based column number into spreadsheet column letters.

    1 -> "A", 26 -> "Z", 27 -> "AA". Values of 0 or below yield "".
    """
    letters = []
    remaining = n
    while remaining > 0:
        # Subtract 1 first: the letters form a bijective base-26 system with no zero digit.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(string.ascii_uppercase[offset])
    # Letters were collected least-significant first.
    return "".join(reversed(letters))


# Retry policy for the Google Sheets call: maximum attempts and the wait between them.
MAX_RETRIES = 5
RETRY_DELAY = 30  # seconds


# OAuth scopes for the Sheets and Drive APIs.
scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets',
         "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"]

# NOTE(review): `creds` is not defined anywhere in the visible source; it must
# already exist before this line runs — confirm where it is created.
client = gspread.authorize(creds)


# NOTE(review): the try body contains only `break`, so no API call is actually
# retried here, and `spreadsheet` (used after the loop) is never assigned in the
# visible source. The call that opens the spreadsheet presumably belongs inside
# the try — confirm and restore it.
for attempt in range(MAX_RETRIES):
    try:
        break
    except gspread.exceptions.APIError as error:
        if error.code == 503 and attempt < MAX_RETRIES - 1:
            # Service temporarily unavailable: wait, then try again.
            # logger.warning replaces the deprecated logger.warn alias.
            logger.warning(f"Service unavailable. Retrying in {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)
        else:
            # Retries exhausted or a different API error: log and re-raise.
            # The message is now an f-string so the script name is interpolated
            # instead of logging the literal "{__file__}"; __file__ is looked up
            # defensively because it is not defined when run inside a notebook.
            logger.error(f"매출원장 업데이트 실패로 {globals().get('__file__', 'notebook')} 실행 종료")
            raise
worksheet = spreadsheet.worksheet('클러스터5')
# sheet.clear()


# Run the order and option queries over one MySQL connection.
# NOTE(review): `mysql_config` is not defined in the visible source — confirm
# where the connection settings come from.
# NOTE(review): pandas warns that raw DBAPI connections other than sqlite3 are
# untested with read_sql_query; consider passing a SQLAlchemy engine instead.
logger.info('MySql connection open')
connection = mysql.connector.connect(**mysql_config)
try:
    df = pd.read_sql_query(sale_query, connection)
    df_opt = pd.read_sql_query(opt_query, connection)
finally:
    # Close the connection even when a query raises, so it is never leaked.
    connection.close()
    logger.info('MySql connection closed')


# pandas display settings: up to 200 columns, no row limit, no line wrapping
# of wide frames, and no truncation of cell contents.
for option_name, option_value in (
    ('display.max_columns', 200),
    ('display.max_rows', None),
    ('display.expand_frame_repr', False),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)


# Attach the option columns to every order row through the shared order-item id.
df = df.merge(df_opt, on='주문상품id', how='left')

# Look up the cluster number of each buyer in the clustering table.
cluster_query = """
SELECT all_id, cluster as cluster5
FROM clbe_realestate_clustering2
WHERE all_id IN (:buyer_ids);
"""
# Buyer ids are shifted by 50000 before the lookup.
# NOTE(review): presumably all_id for these customers is the WooCommerce user id
# plus 50000 — confirm against how all_id is generated.
df['구매자id'] = df['구매자id'].astype(int) + 50000
buyer_ids = tuple(df['구매자id'].dropna().unique())
cluster_df = load_data_into_dataframe(cluster_query, parameters={'buyer_ids': buyer_ids})

# Give the key the same name and integer dtype on both sides before merging.
cluster_df = cluster_df.rename(columns={'all_id': '구매자id'}).astype({'구매자id': int})

# Left merge: buyers without a cluster row keep their order rows, with a missing cluster5.
df = df.merge(cluster_df, on='구매자id', how='left')

# insert_dataframe_into_table(df, 'clbe_alacarte_clustering', if_exists='append')


# Replace the worksheet contents with the merged frame.
worksheet.clear()
set_with_dataframe(worksheet, df)

# Frame dimensions, used for the header range and the filter range below.
num_cols = len(df.columns)
num_rows = len(df)
# Header range covers row 1 from column A to the last frame column.
header_range = f"A1:{colnum_string(num_cols)}1"

# Bold header on a light-grey background, frozen so it stays visible while scrolling.
worksheet.format(header_range, {'textFormat': {'bold': True}, 'backgroundColor': {
    'red': 0.9, 'green': 0.9, 'blue': 0.9}})
worksheet.freeze(rows=1)

# Use the public `id` property rather than reaching into the private `_properties` dict.
worksheet_id = worksheet.id
# Basic filter over the header row plus every data row (hence num_rows + 1).
requests = [{
    "setBasicFilter": {
        "filter": {
            "range": {
                "sheetId": worksheet_id,
                "startRowIndex": 0,
                "endRowIndex": num_rows + 1
            }
        }
    }
}]
batch_update_spreadsheet_request_body = {
    'requests': requests
}
response = spreadsheet.batch_update(batch_update_spreadsheet_request_body)
logger.info(f'WC의 일반 상품 구매건을 읽어 google sheet 클린베딩_상품견적서_주문리스트(공유)의 클러스터5 탭에 저장 {len(df)}')


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



In [14]:
# Order rows per cluster5 value, sorted by cluster number
# (value_counts leaves out rows whose cluster5 is missing).
cluster_counts = df['cluster5'].value_counts().sort_index()

# Number of rows that have a cluster5 value at all.
total_counts = df['cluster5'].count()

# Share of each cluster among the clustered rows, in percent with two decimals.
cluster_percentages = cluster_counts.div(total_counts).mul(100).round(2)

# Assemble the summary table.
result_df = pd.DataFrame({
    'Cluster': cluster_counts.index,
    'Count': cluster_counts.to_numpy(),
    'Percentage': cluster_percentages.to_numpy(),
})

# Display the table as the cell output.
result_df

Unnamed: 0,Cluster,Count,Percentage
0,0.0,62,65.96
1,1.0,21,22.34
2,2.0,5,5.32
3,3.0,5,5.32
4,4.0,1,1.06
