# 1. 데이터 불러오기

In [1]:
import pandas as pd
import os

pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

PATH = './data/'

file_list= os.listdir(PATH)
csv_list = list()

for file in file_list:
    if file.split(".")[-1] == 'csv':
        csv_list.append(file)
data = ['living_popul', 'resident_popul', 'area', 'store', 'working_popul', 'facilitie', 'estimated_sales']

for name, file in zip(data, csv_list):
    globals()[name] = pd.read_csv(PATH + file, encoding = "ansi")

store = store.drop("Unnamed: 0", axis = 1)
estimated_sales = estimated_sales.drop("Unnamed: 0", axis = 1)

# 2. 데이터 전처리

## 2-0. 시점 통일

In [2]:
years = [20223, 20224, 20231, 20232]

living_popul = living_popul[living_popul['기준_년분기_코드'].isin(years)]
resident_popul = resident_popul[resident_popul['기준_년분기_코드'].isin(years)]
store = store[store['기준_년분기_코드'].isin(years)]
working_popul = working_popul[working_popul['기준_년분기_코드'].isin(years)]
facilitie = facilitie[facilitie['기준_년분기_코드'].isin(years)]
estimated_sales = estimated_sales[estimated_sales['기준_년분기_코드'].isin(years)]

## 2-1. 길단위인구 전처리

### 2-1-1 변수 처리

In [3]:
# 파생변수 생성
living_popul['시간대_21_06_유동인구_수'] = living_popul['시간대_00_06_유동인구_수'] + living_popul['시간대_21_24_유동인구_수']
living_popul['주중_유동인구_수'] = living_popul['월요일_유동인구_수'] + living_popul['화요일_유동인구_수'] + living_popul['수요일_유동인구_수'] + living_popul['목요일_유동인구_수'] + living_popul['금요일_유동인구_수']
living_popul['주말_유동인구_수'] = living_popul['토요일_유동인구_수'] + living_popul['일요일_유동인구_수']
living_popul['연령대_30_40_유동인구_수'] = living_popul['연령대_30_유동인구_수'] + living_popul['연령대_40_유동인구_수']
living_popul['연령대_50_60_유동인구_수'] = living_popul['연령대_50_유동인구_수'] + living_popul['연령대_60_이상_유동인구_수']

# 불필요한 변수 제거
living_popul = living_popul.drop(['월요일_유동인구_수', '화요일_유동인구_수', '수요일_유동인구_수', '목요일_유동인구_수', '금요일_유동인구_수', '토요일_유동인구_수', '일요일_유동인구_수', 
                                  '시간대_00_06_유동인구_수', '시간대_21_24_유동인구_수',
                                  '연령대_30_유동인구_수','연령대_40_유동인구_수','연령대_50_유동인구_수','연령대_60_이상_유동인구_수'], axis = 1)

### 2-1-2. 결측치/이상치/중복

In [4]:
# 결측값 확인
living_popul.isnull().sum()

기준_년분기_코드           0
상권_구분_코드_명          0
상권_코드_명             0
총_유동인구_수            0
남성_유동인구_수           0
여성_유동인구_수           0
연령대_10_유동인구_수       0
연령대_20_유동인구_수       0
시간대_06_11_유동인구_수    0
시간대_11_14_유동인구_수    0
시간대_14_17_유동인구_수    0
시간대_17_21_유동인구_수    0
시간대_21_06_유동인구_수    0
주중_유동인구_수           0
주말_유동인구_수           0
연령대_30_40_유동인구_수    0
연령대_50_60_유동인구_수    0
dtype: int64

In [5]:
# 통계량 확인
living_popul.describe()

Unnamed: 0,기준_년분기_코드,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수
count,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0
mean,20227.499,831723.936,396454.429,435269.521,106249.107,148852.466,169196.692,107398.432,108468.099,144295.668,302365.121,600110.514,231613.502,281556.79,295065.647
std,4.032,900980.836,437696.274,466905.782,113447.145,216584.734,179874.328,134146.79,137040.744,167412.286,321245.522,660890.833,246084.966,332193.428,308808.842
min,20223.0,12.0,12.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,221703.0,106884.0,113949.0,25620.0,32184.0,45807.0,28462.0,28587.0,38158.0,75590.0,159214.0,60098.0,74298.0,79315.0
50%,20224.0,568553.0,269829.0,297045.0,71540.0,82940.0,117002.0,72028.0,72359.0,96289.0,205750.0,406266.0,159173.0,185168.0,206051.0
75%,20231.0,1138476.0,537610.0,593960.0,147393.0,184525.0,234575.0,140269.0,140608.0,194982.0,417273.0,828500.0,314287.0,370667.0,412023.0
max,20232.0,8657826.0,4693188.0,4032096.0,829062.0,3487417.0,1678321.0,1853219.0,1861965.0,1793125.0,2942275.0,6833618.0,2124591.0,3506912.0,3565244.0


In [6]:
# 중복 확인
living_popul[living_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수


In [7]:
# 하위 5% 삭제
living_popul = living_popul[living_popul['총_유동인구_수']>living_popul['총_유동인구_수'].quantile(0.05)]

### 2-1-3. 분기별 데이터 연도 기준 변환

In [8]:
living_popul_year = living_popul.groupby(['상권_구분_코드_명','상권_코드_명'],as_index = False)\
                                .mean()\
                                .drop('기준_년분기_코드', axis = 1)

In [9]:
living_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   상권_구분_코드_명        1575 non-null   object 
 1   상권_코드_명           1575 non-null   object 
 2   총_유동인구_수          1575 non-null   float64
 3   남성_유동인구_수         1575 non-null   float64
 4   여성_유동인구_수         1575 non-null   float64
 5   연령대_10_유동인구_수     1575 non-null   float64
 6   연령대_20_유동인구_수     1575 non-null   float64
 7   시간대_06_11_유동인구_수  1575 non-null   float64
 8   시간대_11_14_유동인구_수  1575 non-null   float64
 9   시간대_14_17_유동인구_수  1575 non-null   float64
 10  시간대_17_21_유동인구_수  1575 non-null   float64
 11  시간대_21_06_유동인구_수  1575 non-null   float64
 12  주중_유동인구_수         1575 non-null   float64
 13  주말_유동인구_수         1575 non-null   float64
 14  연령대_30_40_유동인구_수  1575 non-null   float64
 15  연령대_50_60_유동인구_수  1575 non-null   float64
dtypes: float64(14), object(2)
memory usage: 19

## 2-2. 상주인구 전처리

### 2-2-1. 변수 처리

In [10]:
# 파생변수 생성
resident_popul['연령대_30_40_상주인구_수'] = resident_popul['연령대_30_상주인구_수']+resident_popul['연령대_40_상주인구_수']
resident_popul['연령대_50_60_상주인구_수'] = resident_popul['연령대_50_상주인구_수']+resident_popul['연령대_60_이상_상주인구_수']

# 불필요한 변수 제거
resident_popul = resident_popul[['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명',
                                 '총_상주인구_수', '남성_상주인구_수','여성_상주인구_수', 
                                 '연령대_10_상주인구_수', '연령대_20_상주인구_수', '연령대_30_40_상주인구_수', '연령대_50_60_상주인구_수']]

### 2-2-2. 결측치/이상치/중복

In [11]:
# 결측값 확인
resident_popul.isnull().sum()

기준_년분기_코드           0
상권_구분_코드_명          0
상권_코드_명             0
총_상주인구_수            0
남성_상주인구_수           0
여성_상주인구_수           0
연령대_10_상주인구_수       0
연령대_20_상주인구_수       0
연령대_30_40_상주인구_수    0
연령대_50_60_상주인구_수    0
dtype: int64

In [12]:
# 통계량 확인
resident_popul.describe()

Unnamed: 0,기준_년분기_코드,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수
count,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0
mean,20227.495,2361.003,1166.658,1194.345,230.155,300.171,789.651,1041.026
std,4.032,2349.098,1159.233,1195.283,262.098,340.145,834.359,1031.672
min,20223.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,656.75,328.0,329.75,52.0,70.0,207.0,281.0
50%,20224.0,1621.0,811.5,817.0,147.0,195.0,528.0,721.0
75%,20231.0,3306.25,1639.0,1677.0,318.0,413.25,1083.0,1466.0
max,20232.0,21341.0,10459.0,10882.0,3078.0,4457.0,7626.0,8342.0


In [13]:
# 중복 확인
resident_popul[resident_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수


### 2-2-3. 분기별 데이터 연도 기준 변환

In [14]:
resident_popul_year = resident_popul[resident_popul['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'],axis=1)

In [15]:
resident_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1625 entries, 19887 to 24485
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   상권_구분_코드_명        1625 non-null   object
 1   상권_코드_명           1625 non-null   object
 2   총_상주인구_수          1625 non-null   int64 
 3   남성_상주인구_수         1625 non-null   int64 
 4   여성_상주인구_수         1625 non-null   int64 
 5   연령대_10_상주인구_수     1625 non-null   int64 
 6   연령대_20_상주인구_수     1625 non-null   int64 
 7   연령대_30_40_상주인구_수  1625 non-null   int64 
 8   연령대_50_60_상주인구_수  1625 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 127.0+ KB


## 2-3. 점포 전처리

### 2-3-1. 변수 처리

In [16]:
store.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304774 entries, 1055496 to 1360269
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   기준_년분기_코드    304774 non-null  int64 
 1   상권_구분_코드     304774 non-null  object
 2   상권_구분_코드_명   304774 non-null  object
 3   상권_코드        304774 non-null  int64 
 4   상권_코드_명      304774 non-null  object
 5   서비스_업종_코드    304774 non-null  object
 6   서비스_업종_코드_명  304774 non-null  object
 7   점포_수         304774 non-null  int64 
 8   유사_업종_점포_수   304774 non-null  int64 
 9   개업_율         304774 non-null  int64 
 10  개업_점포_수      304774 non-null  int64 
 11  폐업_률         304774 non-null  int64 
 12  폐업_점포_수      304774 non-null  int64 
 13  프랜차이즈_점포_수   304774 non-null  int64 
dtypes: int64(9), object(5)
memory usage: 34.9+ MB


In [17]:
store['서비스_업종_중분류'] = store['서비스_업종_코드'].str[:7]
store['서비스_업종_대분류'] = store['서비스_업종_코드'].str[:3]

In [18]:
store['서비스_업종_중분류'].unique()

array(['CS20004', 'CS10000', 'CS10001', 'CS30002', 'CS30000', 'CS30001',
       'CS20003', 'CS20001', 'CS20002', 'CS30004', 'CS20000', 'CS30003'],
      dtype=object)

In [19]:
store[store['서비스_업종_대분류'] == 'CS10000']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [20]:
store[store['서비스_업종_대분류'] == 'CS10001']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [21]:
store[store['서비스_업종_대분류'] == 'CS20000']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [22]:
store[store['서비스_업종_대분류'] == 'CS20001']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [23]:
store[store['서비스_업종_대분류'] == 'CS20002']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [24]:
store[store['서비스_업종_대분류'] == 'CS20003']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [25]:
store[store['서비스_업종_대분류'] == 'CS20004']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [26]:
store[store['서비스_업종_대분류'] == 'CS30000']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [27]:
store[store['서비스_업종_대분류'] == 'CS30001']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [28]:
store[store['서비스_업종_대분류'] == 'CS30002']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [29]:
store[store['서비스_업종_대분류'] == 'CS30003']['서비스_업종_코드_명'].unique()

array([], dtype=object)

In [30]:
store[store['서비스_업종_대분류'] == 'CS30004']['서비스_업종_코드_명'].unique()

array([], dtype=object)

### 2-3-1. 결측치/이상치/중복

In [31]:
# 결측값 확인
store.isnull().sum()

기준_년분기_코드      0
상권_구분_코드       0
상권_구분_코드_명     0
상권_코드          0
상권_코드_명        0
서비스_업종_코드      0
서비스_업종_코드_명    0
점포_수           0
유사_업종_점포_수     0
개업_율           0
개업_점포_수        0
폐업_률           0
폐업_점포_수        0
프랜차이즈_점포_수     0
서비스_업종_중분류     0
서비스_업종_대분류     0
dtype: int64

In [32]:
# 통계량 확인
store.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수
count,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0
mean,20227.497,3114874.594,6.056,6.506,2.292,0.138,2.479,0.179,0.45
std,4.031,11742.077,42.503,42.878,10.9,0.615,11.805,1.158,2.232
min,20223.0,3001491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,3110436.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,20224.0,3110871.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,20231.0,3120141.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0
max,20232.0,3130327.0,9002.0,9018.0,200.0,29.0,400.0,288.0,127.0


In [33]:
# 중복 확인
store[store.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,서비스_업종_중분류,서비스_업종_대분류


### 2-3-2. 분기별 데이터 연도 기준 변환

In [34]:
store_year = store[store['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드', '상권_코드', '상권_구분_코드', '서비스_업종_코드', '서비스_업종_중분류', '서비스_업종_코드_명'], axis = 1)

In [35]:
store_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76099 entries, 1284171 to 1360269
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   상권_구분_코드_명  76099 non-null  object
 1   상권_코드_명     76099 non-null  object
 2   점포_수        76099 non-null  int64 
 3   유사_업종_점포_수  76099 non-null  int64 
 4   개업_율        76099 non-null  int64 
 5   개업_점포_수     76099 non-null  int64 
 6   폐업_률        76099 non-null  int64 
 7   폐업_점포_수     76099 non-null  int64 
 8   프랜차이즈_점포_수  76099 non-null  int64 
 9   서비스_업종_대분류  76099 non-null  object
dtypes: int64(7), object(3)
memory usage: 6.4+ MB


## 2-4. 직장인구 전처리

### 2-4-1. 변수 처리

In [36]:
working_popul = working_popul[['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '총_직장_인구_수']]

### 2-4-2. 결측치/이상치/중복

In [37]:
# 결측값 확인
working_popul.isnull().sum()

기준_년분기_코드     0
상권_구분_코드_명    0
상권_코드_명       0
총_직장_인구_수     0
dtype: int64

In [38]:
# 통계량 확인
working_popul.describe()

Unnamed: 0,기준_년분기_코드,총_직장_인구_수
count,6492.0,6492.0
mean,20227.5,1771.425
std,4.031,6246.21
min,20223.0,1.0
25%,20223.75,107.0
50%,20227.5,317.0
75%,20231.25,892.0
max,20232.0,104830.0


In [39]:
# 중복 확인
working_popul[working_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_직장_인구_수


### 2-4-3. 분기별 데이터 연도 기준 변환

In [40]:
working_popul_year = working_popul[working_popul['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'],axis=1)

In [41]:
working_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1623 entries, 11546 to 25988
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   상권_구분_코드_명  1623 non-null   object
 1   상권_코드_명     1623 non-null   object
 2   총_직장_인구_수   1623 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 50.7+ KB


## 2-5. 집객시설 전처리

### 2-5-1. 변수 처리

In [42]:
# 파생변수 생성
facilitie['초중고_수'] = facilitie['초등학교_수']+facilitie['중학교_수']+facilitie['고등학교_수']

# 불필요한 변수 제거
facilitie = facilitie.drop(['집객시설_수','철도_역_수', '초등학교_수', '중학교_수', '고등학교_수'], axis=1)

### 2-5-2. 결측치/이상치/중복

In [43]:
# 결측값 확인
facilitie.isnull().sum()

기준_년분기_코드        0
상권_구분_코드_명       0
상권_코드_명          0
관공서_수         3792
은행_수          4364
종합병원_수        6220
일반_병원_수       5784
약국_수          2276
유치원_수         5708
대학교_수         5976
백화점_수         6268
슈퍼마켓_수        5892
극장_수          5976
숙박_시설_수       5732
공항_수          6308
버스_터미널_수      6304
지하철_역_수       5532
버스_정거장_수      1232
초중고_수         6308
dtype: int64

In [44]:
facilitie = facilitie.fillna(0)

In [45]:
# 통계량 확인
facilitie.describe()

Unnamed: 0,기준_년분기_코드,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수
count,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0
mean,20227.5,0.695,1.015,0.015,0.107,2.359,0.112,0.087,0.008,0.102,0.072,0.213,0.003,0.001,0.176,3.896,0.003
std,4.031,1.172,3.01,0.127,0.396,4.034,0.385,0.466,0.094,0.495,0.352,1.027,0.101,0.036,0.535,4.959,0.126
min,20223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,20227.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
75%,20231.25,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
max,20232.0,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,4.0,1.0,5.0,85.0,5.0


In [46]:
# 중복 확인
facilitie[facilitie.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수


### 2-5-3. 분기별 데이터 연도 기준 변환

In [47]:
facilitie_year = facilitie[facilitie['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'], axis = 1)

In [48]:
facilitie_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1578 entries, 0 to 17356
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   상권_구분_코드_명  1578 non-null   object 
 1   상권_코드_명     1578 non-null   object 
 2   관공서_수       1578 non-null   float64
 3   은행_수        1578 non-null   float64
 4   종합병원_수      1578 non-null   float64
 5   일반_병원_수     1578 non-null   float64
 6   약국_수        1578 non-null   float64
 7   유치원_수       1578 non-null   float64
 8   대학교_수       1578 non-null   float64
 9   백화점_수       1578 non-null   float64
 10  슈퍼마켓_수      1578 non-null   float64
 11  극장_수        1578 non-null   float64
 12  숙박_시설_수     1578 non-null   float64
 13  공항_수        1578 non-null   float64
 14  버스_터미널_수    1578 non-null   float64
 15  지하철_역_수     1578 non-null   float64
 16  버스_정거장_수    1578 non-null   float64
 17  초중고_수       1578 non-null   float64
dtypes: float64(16), object(2)
memory usage: 234.2+ KB


In [49]:
facilitie_year.describe()

Unnamed: 0,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수
count,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0
mean,0.695,1.015,0.015,0.107,2.359,0.112,0.087,0.008,0.102,0.072,0.213,0.003,0.001,0.176,3.896,0.003
std,1.172,3.01,0.127,0.396,4.035,0.386,0.466,0.094,0.495,0.352,1.027,0.101,0.036,0.536,4.96,0.126
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
75%,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
max,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,4.0,1.0,5.0,85.0,5.0


## 2-6. 추정매출 전처리

### 2-6-1. 변수 처리

In [50]:
# 파생변수 생성
estimated_sales['시간대_21~06_매출_금액'] = estimated_sales['시간대_00~06_매출_금액'] + estimated_sales['시간대_21~24_매출_금액']
estimated_sales['시간대_21~06_매출_건수'] = estimated_sales['시간대_건수~06_매출_건수'] + estimated_sales['시간대_건수~24_매출_건수']
estimated_sales['연령대_30_40_매출_금액'] = estimated_sales['연령대_30_매출_금액'] + estimated_sales['연령대_40_매출_금액']
estimated_sales['연령대_50_60_매출_금액'] = estimated_sales['연령대_50_매출_금액'] + estimated_sales['연령대_60_이상_매출_금액']
estimated_sales['연령대_30_40_매출_건수'] = estimated_sales['연령대_30_매출_건수'] + estimated_sales['연령대_40_매출_건수']
estimated_sales['연령대_50_60_매출_건수'] = estimated_sales['연령대_50_매출_건수'] + estimated_sales['연령대_60_이상_매출_건수']

estimated_sales['서비스_업종_대분류'] = estimated_sales['서비스_업종_코드'].str[:3]

# 불필요한 변수 제거
estimated_sales = estimated_sales.drop(columns = ['서비스_업종_코드'],
                                       axis = 1)

estimated_sales = estimated_sales.drop(columns = ['월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액', '목요일_매출_금액', '금요일_매출_금액', '토요일_매출_금액', '일요일_매출_금액',
                                                  '월요일_매출_건수', '화요일_매출_건수', '수요일_매출_건수', '목요일_매출_건수', '금요일_매출_건수', '토요일_매출_건수', '일요일_매출_건수'],
                                       axis = 1)
estimated_sales = estimated_sales.drop(columns = ['시간대_00~06_매출_금액', '시간대_21~24_매출_금액', '시간대_건수~06_매출_건수', '시간대_건수~24_매출_건수',
                                                 '연령대_30_매출_금액','연령대_40_매출_금액','연령대_50_매출_금액','연령대_60_이상_매출_금액',
                                                 '연령대_30_매출_건수','연령대_40_매출_건수','연령대_50_매출_건수','연령대_60_이상_매출_건수'],
                                       axis = 1)

# 변수 이름 변경
estimated_sales = estimated_sales.rename(columns = {'시간대_건수~11_매출_건수' : '시간대_06~11_매출_건수',
                                                         '시간대_건수~14_매출_건수' : '시간대_11~14_매출_건수',
                                                         '시간대_건수~17_매출_건수' : '시간대_14~17_매출_건수',
                                                         '시간대_건수~21_매출_건수' : '시간대_17~21_매출_건수'})



### 2-6-2. 결측치/이상치/중복

In [51]:
# 결측값 확인
estimated_sales.isnull().sum()

기준_년분기_코드          0
상권_구분_코드           0
상권_구분_코드_명         0
상권_코드              0
상권_코드_명            0
서비스_업종_코드_명        0
당월_매출_금액           0
당월_매출_건수           0
주중_매출_금액           0
주말_매출_금액           0
시간대_06~11_매출_금액    0
시간대_11~14_매출_금액    0
시간대_14~17_매출_금액    0
시간대_17~21_매출_금액    0
남성_매출_금액           0
여성_매출_금액           0
연령대_10_매출_금액       0
연령대_20_매출_금액       0
주중_매출_건수           0
주말_매출_건수           0
시간대_06~11_매출_건수    0
시간대_11~14_매출_건수    0
시간대_14~17_매출_건수    0
시간대_17~21_매출_건수    0
남성_매출_건수           0
여성_매출_건수           0
연령대_10_매출_건수       0
연령대_20_매출_건수       0
시간대_21~06_매출_금액    0
시간대_21~06_매출_건수    0
연령대_30_40_매출_금액    0
연령대_50_60_매출_금액    0
연령대_30_40_매출_건수    0
연령대_50_60_매출_건수    0
서비스_업종_대분류         0
dtype: int64

In [52]:
# 통계량 확인
estimated_sales.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수
count,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0
mean,20227.506,3115479.183,1116599271.468,39184.849,842969184.583,273630043.386,116413408.602,286186909.051,280424841.016,306646276.19,506115717.968,483588097.551,5734552.208,135382675.504,29499.636,9685.213,5416.564,10204.191,8307.715,10443.374,19337.49,18168.507,577.372,7898.857,126927760.944,4813.006,431464849.292,417119530.685,16555.601,12474.165
std,4.031,14347.76,9371981938.469,168454.273,6458423614.17,3509552781.251,1172352096.256,2477461950.285,3124214644.465,2771104568.472,4449083121.756,3616311150.276,31894990.608,909570386.425,117929.298,57711.259,29263.631,45978.917,44186.686,47735.342,83502.247,83244.346,3617.461,42063.066,1000325381.326,23992.316,3469029992.731,3773146796.252,73433.475,61385.401
min,20223.0,3001491.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20224.0,3110547.0,48560077.0,868.0,36205605.5,7456979.5,145574.5,5491691.0,7869005.5,11816467.5,19597595.5,18111528.5,0.0,1369387.5,654.0,153.0,7.0,109.0,173.0,226.0,390.0,342.0,0.0,43.0,0.0,0.0,15857293.0,15467433.0,324.0,297.0
50%,20231.0,3111085.0,188164542.0,4646.0,140846234.0,38302621.0,6268042.0,34963031.0,36234148.0,50583967.0,84295031.0,75951653.0,166368.0,12428462.0,3446.0,1036.0,169.0,985.0,983.0,1304.0,2286.0,1968.0,12.0,446.0,4061538.0,77.0,69746034.0,63905807.0,1783.0,1642.0
75%,20232.0,3120185.0,686193977.0,25492.5,520555159.0,150576100.0,48504462.5,153603359.0,143040793.5,186412033.0,314344883.5,290361639.5,2273891.5,64077848.5,19055.5,5839.5,1996.0,6405.5,5206.5,6992.5,12096.0,11579.0,174.0,3306.0,51155789.0,1497.0,260456794.5,241524127.5,9897.0,8588.0
max,20232.0,3130327.0,995677000000.0,12090273.0,669799000000.0,485357000000.0,107916000000.0,242249000000.0,326543000000.0,386343000000.0,537243000000.0,304162000000.0,1549259735.0,90669553834.0,5827863.0,6262410.0,1474148.0,2611826.0,3245770.0,4287338.0,6708993.0,4615812.0,329040.0,3087414.0,126314287024.0,1514884.0,354347000000.0,411544000000.0,4519958.0,5329793.0


In [53]:
# 중복 확인
estimated_sales[estimated_sales.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드_명,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수,서비스_업종_대분류


In [54]:
def sales_tran(df):
    if df['당월_법인_매출_금액'] < 0:
        return df['당월_매출_금액'] - df['당월_법인_매출_금액']
    else :
        return df['당월_매출_금액']
estimated_sales = estimated_sales[estimated_sales['당월_매출_금액'] > estimated_sales['당월_매출_금액'].quantile(0.05)]
estimated_sales['당월_개인_매출_금액'] = estimated_sales['남성_매출_금액'] + estimated_sales['여성_매출_금액']
estimated_sales['당월_법인_매출_금액'] =  estimated_sales['당월_매출_금액'] - estimated_sales['당월_개인_매출_금액']

estimated_sales['당월_매출_금액'] = estimated_sales.apply(lambda x : sales_tran(x) , axis=1)

estimated_sales.loc[estimated_sales['당월_법인_매출_금액']<0, '당월_법인_매출_금액'] = 0

In [55]:
estimated_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79910 entries, 119954 to 204096
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   기준_년분기_코드        79910 non-null  int64  
 1   상권_구분_코드         79910 non-null  object 
 2   상권_구분_코드_명       79910 non-null  object 
 3   상권_코드            79910 non-null  int64  
 4   상권_코드_명          79910 non-null  object 
 5   서비스_업종_코드_명      79910 non-null  object 
 6   당월_매출_금액         79910 non-null  float64
 7   당월_매출_건수         79910 non-null  int64  
 8   주중_매출_금액         79910 non-null  float64
 9   주말_매출_금액         79910 non-null  float64
 10  시간대_06~11_매출_금액  79910 non-null  float64
 11  시간대_11~14_매출_금액  79910 non-null  float64
 12  시간대_14~17_매출_금액  79910 non-null  float64
 13  시간대_17~21_매출_금액  79910 non-null  float64
 14  남성_매출_금액         79910 non-null  float64
 15  여성_매출_금액         79910 non-null  float64
 16  연령대_10_매출_금액     79910 non-null  int64  
 17  연령대_20_매출_금

### 2-6-3. 분기별 데이터 연도 기준 변환

In [56]:
estimated_sales_year = estimated_sales.drop(columns = ['기준_년분기_코드','상권_구분_코드','상권_코드', '서비스_업종_코드_명'])\
                                      .groupby(['상권_구분_코드_명', '상권_코드_명', '서비스_업종_대분류'])\
                                      .mean()\
                                      .reset_index()

In [57]:
estimated_sales_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4132 entries, 0 to 4131
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   상권_구분_코드_명       4132 non-null   object 
 1   상권_코드_명          4132 non-null   object 
 2   서비스_업종_대분류       4132 non-null   object 
 3   당월_매출_금액         4132 non-null   float64
 4   당월_매출_건수         4132 non-null   float64
 5   주중_매출_금액         4132 non-null   float64
 6   주말_매출_금액         4132 non-null   float64
 7   시간대_06~11_매출_금액  4132 non-null   float64
 8   시간대_11~14_매출_금액  4132 non-null   float64
 9   시간대_14~17_매출_금액  4132 non-null   float64
 10  시간대_17~21_매출_금액  4132 non-null   float64
 11  남성_매출_금액         4132 non-null   float64
 12  여성_매출_금액         4132 non-null   float64
 13  연령대_10_매출_금액     4132 non-null   float64
 14  연령대_20_매출_금액     4132 non-null   float64
 15  주중_매출_건수         4132 non-null   float64
 16  주말_매출_건수         4132 non-null   float64
 17  시간대_06~11_매출_건

## 2-7. 데이터 병합

In [58]:
df = pd.merge(living_popul_year, resident_popul_year)
df = pd.merge(df, store_year)
df = pd.merge(df, working_popul_year)
df = pd.merge(df, facilitie_year)
df = pd.merge(df, estimated_sales_year)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68126 entries, 0 to 68125
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   상권_구분_코드_명        68126 non-null  object 
 1   상권_코드_명           68126 non-null  object 
 2   총_유동인구_수          68126 non-null  float64
 3   남성_유동인구_수         68126 non-null  float64
 4   여성_유동인구_수         68126 non-null  float64
 5   연령대_10_유동인구_수     68126 non-null  float64
 6   연령대_20_유동인구_수     68126 non-null  float64
 7   시간대_06_11_유동인구_수  68126 non-null  float64
 8   시간대_11_14_유동인구_수  68126 non-null  float64
 9   시간대_14_17_유동인구_수  68126 non-null  float64
 10  시간대_17_21_유동인구_수  68126 non-null  float64
 11  시간대_21_06_유동인구_수  68126 non-null  float64
 12  주중_유동인구_수         68126 non-null  float64
 13  주말_유동인구_수         68126 non-null  float64
 14  연령대_30_40_유동인구_수  68126 non-null  float64
 15  연령대_50_60_유동인구_수  68126 non-null  float64
 16  총_상주인구_수          68126 non-null  int64 

In [60]:
df.describe()

Unnamed: 0,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,총_직장_인구_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수,당월_개인_매출_금액,당월_법인_매출_금액
count,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0,68126.0
mean,1179725.397,564212.524,615512.884,146694.859,216129.446,238269.236,156306.375,158498.929,208834.437,417816.506,854306.968,325418.511,407336.59,409564.572,3099.567,1534.172,1565.395,291.923,333.736,1080.669,1393.239,6.359,6.841,2.525,0.165,2.452,0.179,0.482,3001.683,0.97,1.727,0.021,0.172,3.644,0.137,0.103,0.013,0.114,0.121,0.297,0.0,0.002,0.278,5.329,0.006,851712215.892,32144.429,653650818.791,198061359.304,102221351.909,217059190.474,230316415.052,222713167.447,392258566.433,382264922.651,4122408.515,95301491.432,24149.305,7995.123,4740.722,7334.598,6862.03,8677.602,16533.787,14706.792,499.248,6412.045,79402035.056,4529.476,331458829.842,343639381.08,13577.204,10752.087,774523489.084,77188726.808
std,1056359.261,518767.771,542854.035,126490.845,272539.119,207753.951,168826.473,173131.493,204455.666,359681.593,782450.502,283359.198,402828.156,351275.48,2597.697,1283.956,1320.374,288.651,293.051,961.77,1159.314,42.897,43.301,11.15,0.695,11.318,0.938,2.33,8749.448,1.395,4.078,0.152,0.503,5.182,0.423,0.502,0.122,0.582,0.466,1.272,0.0,0.047,0.669,6.156,0.18,2588463969.127,56405.73,1925356488.892,784315675.114,393017736.604,763620236.9,876558889.884,653239418.777,1205804839.0,1061708663.961,11562854.845,269770729.469,40655.07,17149.102,8730.37,15204.334,14079.85,15909.133,27222.933,28559.727,1323.295,15173.256,190032002.341,8135.613,909508425.773,1202753578.667,24187.35,20943.909,2209494891.521,563671141.01
min,41356.5,18580.5,18881.0,1763.5,2419.5,7723.0,5020.25,4708.5,5842.0,5784.25,29058.0,9312.0,9347.5,11240.75,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6102740.0,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2411357.75,0.0
25%,488067.25,229748.25,253862.0,56427.25,67371.0,98745.0,61057.75,61121.75,82472.0,159589.75,350820.75,133935.5,157429.0,169383.25,1175.0,585.0,596.0,92.0,110.0,380.0,510.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,219.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,174018992.5,4429.429,134953139.438,34293616.818,10974014.812,34144998.75,37264093.667,46593757.583,84941164.5,70691835.333,306493.75,10377385.333,3354.0,899.469,363.063,1022.167,1075.231,1007.444,2256.375,1878.5,19.0,395.721,10053786.5,154.706,68953418.429,63991347.625,1667.389,1677.453,160945931.818,1676480.857
50%,893965.25,425124.25,469631.0,110989.25,138806.25,183495.25,110890.75,111359.0,150519.25,320846.75,642653.25,250898.25,289882.0,318494.5,2426.0,1175.0,1231.0,209.0,258.0,823.0,1082.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,515.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,396590305.543,17201.25,304026225.75,84018751.429,41910574.125,85510346.154,86822666.083,107080295.344,199259302.611,168124981.0,1184377.75,31940219.826,13059.0,3810.409,2212.667,3779.167,3498.378,4394.161,9064.182,7237.421,124.278,2103.606,35405952.829,1464.1,148994451.767,163180405.325,6851.8,6258.833,371488454.615,9124794.074
75%,1546304.0,735782.25,820856.25,199104.5,256581.75,318563.75,198872.0,198742.5,266156.25,581638.75,1110182.5,440799.75,519924.75,551085.25,4330.0,2140.0,2185.0,407.0,472.0,1476.0,1961.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,1618.0,1.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,812856248.866,39029.268,624662649.0,174656639.061,93272263.306,192242548.067,195671110.5,222105723.291,381263252.388,362607820.25,3301734.583,75599939.5,29269.667,9669.018,5887.429,8048.357,7873.519,10797.0,20848.222,17124.156,463.87,6470.867,80596092.0,5710.713,312862126.176,343838640.975,16275.102,13605.583,753163382.8,36860425.981
max,8536354.5,4598446.75,3937907.75,804173.75,3262037.5,1648813.0,1805141.25,1830343.5,1719183.0,2680625.0,6772121.5,2029885.0,3405217.25,3480277.5,21341.0,10459.0,10882.0,3078.0,2295.0,7626.0,8342.0,8493.0,8509.0,200.0,23.0,200.0,146.0,127.0,104830.0,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,0.0,1.0,5.0,85.0,5.0,68024306117.182,852226.5,55713424845.909,24877617673.698,15786387665.636,24961579454.727,24178256692.707,19388654750.667,32814948181.818,34181131814.273,287529327.925,4416535865.397,608724.75,341092.841,157273.75,315635.025,272630.25,279350.25,499177.0,415973.069,27591.133,290023.026,4403414692.95,108291.575,19083277742.778,49866347917.636,340535.941,604526.292,66996079996.091,20813538745.672


In [61]:
df.to_csv("final_df.csv")

# 3. 모델 학습

## 3-1. 분석에 적합한 데이터로 변환

In [62]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_df = df.drop(columns = ['상권_구분_코드_명', '상권_코드_명'])
label_df_ID = df[['상권_구분_코드_명', '상권_코드_명']]

scaler = StandardScaler()
col_num = list(label_df.select_dtypes(include = 'number').columns)
label_df[col_num] = scaler.fit_transform(label_df[col_num])

col_cat = list(label_df.select_dtypes(include = 'object').columns)
le = LabelEncoder()
label_df[col_cat] = label_df[col_cat].apply(le.fit_transform)

In [63]:
label_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68126 entries, 0 to 68125
Data columns (total 76 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   총_유동인구_수          68126 non-null  float64
 1   남성_유동인구_수         68126 non-null  float64
 2   여성_유동인구_수         68126 non-null  float64
 3   연령대_10_유동인구_수     68126 non-null  float64
 4   연령대_20_유동인구_수     68126 non-null  float64
 5   시간대_06_11_유동인구_수  68126 non-null  float64
 6   시간대_11_14_유동인구_수  68126 non-null  float64
 7   시간대_14_17_유동인구_수  68126 non-null  float64
 8   시간대_17_21_유동인구_수  68126 non-null  float64
 9   시간대_21_06_유동인구_수  68126 non-null  float64
 10  주중_유동인구_수         68126 non-null  float64
 11  주말_유동인구_수         68126 non-null  float64
 12  연령대_30_40_유동인구_수  68126 non-null  float64
 13  연령대_50_60_유동인구_수  68126 non-null  float64
 14  총_상주인구_수          68126 non-null  float64
 15  남성_상주인구_수         68126 non-null  float64
 16  여성_상주인구_수         68126 non-null  float6

In [96]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

# DBSCAN 모델 생성 및 학습
dbscan = DBSCAN(eps=0.5, min_samples=200)
labels = dbscan.fit_predict(label_df)

label_df['Cluster_Label'] = labels

label_df['Cluster_Label'].value_counts()

Cluster_Label
-1    67541
 0      585
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score

def custom_silhouette_score(estimator, X):
    labels = estimator.fit_predict(X)
    return silhouette_score(X, labels)

param_grid = {
    'eps': np.arange(0.1, 1.0, 0.05),
    'min_samples': np.arange(10, 300, 10)
}

grid_search = GridSearchCV(dbscan, param_grid, cv=5, scoring=custom_silhouette_score)
grid_search.fit(label_df)

print("최적의 하이퍼파라미터:", grid_search.best_params_)

Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_13060\2532922178.py", line 6, in custom_silhouette_score
    return silhouette_score(X, labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 130, in silhouette_score
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppData\Local\Programs\Python\P