# In[35]:
import warnings

# Suppress all warnings.
# NOTE(review): no category or module is given, so this silences every warning
# raised after this point (library deprecation notices included) — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings(action='ignore')

# In[36]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

def categorization(df, quantiles=(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)):
    """Replace every numeric column of ``df`` with a quantile-bin index, in place.

    For each numeric column ``c`` a new column ``f"{c}_category"`` is added that
    holds the 0-based index of the quantile bin each value falls into, and the
    original numeric columns are then dropped. Non-numeric columns are left
    untouched.

    Args:
        df: DataFrame to transform. It is mutated in place.
        quantiles: Increasing quantile edges passed to ``pd.qcut``. The default
            is the decile grid (ten bins).

    Returns:
        None. The result is visible through ``df`` itself.

    Note:
        Quantile edges that coincide (heavily tied data) are merged via
        ``duplicates='drop'``, so a column can end up with fewer bins than
        ``len(quantiles) - 1``.
    """
    numeric_cols = list(df.select_dtypes(include='number').columns)

    for col in numeric_cols:
        # labels=False makes qcut return the integer bin index directly, so no
        # second pd.cut pass over the same edges is needed.
        df[f'{col}_category'] = pd.qcut(
            df[col],
            q=list(quantiles),
            labels=False,
            duplicates='drop',
        )

    df.drop(columns=numeric_cols, inplace=True)

def clustering(
    df,
    n_clusters=5,
    random_state=2023,
    output_dir="C:/Users/Admin/Desktop/최종프로젝트/cluster",
):
    """Run KMeans separately for each service category and write one CSV per category.

    For every distinct value of the '서비스_업종_코드_명' column (the
    service-category name column) the matching rows are selected, the feature
    columns are passed through ``categorization`` (defined in this file; it
    replaces numeric columns with ``<col>_category`` bin indices in place),
    the resulting numeric columns are standardized, any object-dtype columns
    are label-encoded, and KMeans is fitted on the result. The original rows
    of that category, plus a ``cluster`` column holding the KMeans label, are
    written to ``<output_dir>/<service name>.csv``.

    Args:
        df: Input DataFrame. It must contain '서비스_업종_코드_명' and the seven
            feature columns listed in ``feature_cols`` below. It is not modified.
        n_clusters: Number of KMeans clusters fitted per service category.
        random_state: Seed passed to KMeans.
        output_dir: Directory the per-category CSV files are written to. It is
            created if it does not exist.

    Returns:
        None. The output is the set of CSV files.

    NOTE(review): KMeans is expected to reject a category that has fewer rows
    than ``n_clusters`` or that contains missing feature values — confirm the
    input data rules these cases out.
    """
    service_col = '서비스_업종_코드_명'
    feature_cols = [
        service_col, '집객시설수', '당월_매출_금액', '유사_업종_점포_수',
        '교통시설수', '총_상주인구_수', '총_유동인구_수', '총_직장_인구_수',
    ]

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for service in df[service_col].unique():
        in_service = df[service_col] == service

        # Work on an explicit copy: the frame is mutated below, and writing
        # into a filtered slice is ambiguous in pandas.
        features = df.loc[in_service, feature_cols].copy()

        categorization(features)

        num_cols = list(features.select_dtypes(include='number').columns)
        features[num_cols] = StandardScaler().fit_transform(features[num_cols])

        cat_cols = list(features.select_dtypes(include='object').columns)
        features[cat_cols] = features[cat_cols].apply(LabelEncoder().fit_transform)

        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(features)
        labels = kmeans.predict(features)

        # Labels are positional, matching the row order of the same mask.
        result = df.loc[in_service].copy()
        result['cluster'] = labels

        # A path separator inside the service name would otherwise be read as
        # a sub-directory, so strip it from the file name only.
        file_stem = str(service).replace('/', '').replace('\\', '')
        result.to_csv(out_dir / f"{file_stem}.csv", index=False)
    

# In[37]:
# Load the modelling table from the working directory.
df = pd.read_csv("model_df.csv")

# clustering() names each output file after the service name, so remove the
# "/" from the one service name that contains it before clustering.
df.loc[df['서비스_업종_코드_명'].eq('운동/경기용품'), '서비스_업종_코드_명'] = '운동경기용품'

# Fit one KMeans model per service category and write the per-category CSVs.
clustering(df)