### K-means 클러스터링을 이용한 SELLER 군집화 시도

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


In [None]:
# Load the two input CSV files from ./data_in into DataFrames.
reviews = pd.read_csv('./data_in/reviews.csv')

df = pd.read_csv('./data_in/total_df.csv')

In [None]:
# Show the column labels of df (rendered as the notebook cell's output).
df.columns

In [None]:
# Print pandas' DataFrame.info() summary for the reviews table.
reviews.info()

In [None]:
# Show the column labels of df.
# NOTE(review): the same expression is evaluated in an earlier cell — possibly redundant.
df.columns

In [None]:
# Columns of the table that are used as clustering features.
features = [
    'Revenue',
    'Payment_installments',
    'Freight_value',
    'Review_score',
    'Items_per_order',
]

In [None]:
# One-hot encode three categorical columns of df, producing one indicator
# frame per column with a column-name prefix.
# NOTE(review): none of the generated indicator columns appear in the column
# filter applied to df_plus further down, and Customer_id may have a very
# large number of distinct values — confirm this step is still needed.
customer_dummy, city_dummy, product_dummy = (
    pd.get_dummies(df[source_col], prefix=prefix)
    for source_col, prefix in (
        ('Customer_id', 'customer'),
        ('Customer_city', 'city'),
        ('Product_category_name', 'category'),
    )
)

In [None]:
# Join df and the three indicator frames side by side (column-wise concat).
df_plus = pd.concat(
    objs=[df, customer_dummy, city_dummy, product_dummy],
    axis='columns',
)

In [None]:
# Keep only the columns listed below.
column_filter = [
    'Product_id', 'Seller_id', 'Price',
    'Freight_value', 'Customer_id',
    'Product_category_name', 'Revenue',
    'Customer_city',
    'Payment_sequential', 'Payment_type', 'Payment_installments', 'Payment_value',
    'Review_score', 'Items_per_order',
]

# Take an explicit copy: df_plus is modified afterwards (missing values are
# filled and a 'cluster' column is added), and writing to a bare column
# selection of a wider frame is what pandas reports as SettingWithCopyWarning.
df_plus = df_plus[column_filter].copy()

In [None]:
# Replace missing Review_score values with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df_plus['Review_score']: that chained in-place
# form acts on an intermediate object, is deprecated in recent pandas, and
# leaves df_plus unchanged when Copy-on-Write is enabled.
# NOTE(review): presumably valid scores are strictly positive, so 0 acts as an
# out-of-range placeholder that will influence scaling and clustering — confirm
# this is the intended treatment of missing reviews.
df_plus['Review_score'] = df_plus['Review_score'].fillna(0)

In [None]:
# Standardize the selected feature columns with StandardScaler; the fitted
# scaler is kept in `scaler` and the transformed values in `df_scaled`.
feature_frame = df_plus[features]
scaler = StandardScaler().fit(feature_frame)
df_scaled = scaler.transform(feature_frame)

In [None]:
# k-means clustering on the standardized features; the predicted label of each
# row is stored in df_plus['cluster'].
# n_init is set explicitly because the library default changed between
# scikit-learn releases; pinning it (together with random_state) keeps the
# result independent of the installed version.
# NOTE(review): every row of df_plus is clustered as-is — no aggregation by
# Seller_id is visible in this notebook, so the clusters describe rows rather
# than sellers. Confirm whether a per-seller aggregation was intended.
n_clusters = 2  # number of clusters
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=3)
df_plus['cluster'] = kmeans.fit_predict(df_scaled)

In [None]:
# Scatter plot of Revenue against Items_per_order, one series per cluster.
for cluster in range(n_clusters):
    # Select the rows carrying this cluster label once and reuse them for both axes.
    members = df_plus.loc[df_plus['cluster'] == cluster]
    plt.scatter(
        members['Revenue'],
        members['Items_per_order'],
        label=f'Cluster {cluster + 1}',  # legend numbering starts at 1
    )

plt.title('K-Means Clustering')
plt.xlabel('Revenue')
plt.ylabel('Items_per_order')
plt.legend()
plt.show()

# Print the fitted cluster centers. They come from a model fitted on
# df_scaled, so the coordinates are in the scaled feature space.
print("Cluster centers:", kmeans.cluster_centers_, sep="\n")
