# 1. 데이터 불러오기

In [212]:
import pandas as pd
import os

pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

PATH = './data/'

file_list= os.listdir(PATH)
csv_list = list()

for file in file_list:
    if file.split(".")[-1] == 'csv':
        csv_list.append(file)
data = ['living_popul', 'resident_popul', 'area', 'store', 'working_popul', 'facilitie', 'estimated_sales']

for name, file in zip(data, csv_list):
    globals()[name] = pd.read_csv(PATH + file, encoding = "ansi")

store = store.drop("Unnamed: 0", axis = 1)
estimated_sales = estimated_sales.drop("Unnamed: 0", axis = 1)

# 2. 데이터 전처리

## 2-0. 시점 통일

In [213]:
years = [20223, 20224, 20231, 20232]

living_popul = living_popul[living_popul['기준_년분기_코드'].isin(years)]
resident_popul = resident_popul[resident_popul['기준_년분기_코드'].isin(years)]
store = store[store['기준_년분기_코드'].isin(years)]
working_popul = working_popul[working_popul['기준_년분기_코드'].isin(years)]
facilitie = facilitie[facilitie['기준_년분기_코드'].isin(years)]
estimated_sales = estimated_sales[estimated_sales['기준_년분기_코드'].isin(years)]

## 2-1. 길단위인구 전처리

### 2-1-1 변수 처리

In [214]:
# 파생변수 생성
living_popul['시간대_21_06_유동인구_수'] = living_popul['시간대_00_06_유동인구_수'] + living_popul['시간대_21_24_유동인구_수']
living_popul['주중_유동인구_수'] = living_popul['월요일_유동인구_수'] + living_popul['화요일_유동인구_수'] + living_popul['수요일_유동인구_수'] + living_popul['목요일_유동인구_수'] + living_popul['금요일_유동인구_수']
living_popul['주말_유동인구_수'] = living_popul['토요일_유동인구_수'] + living_popul['일요일_유동인구_수']
living_popul['연령대_30_40_유동인구_수'] = living_popul['연령대_30_유동인구_수'] + living_popul['연령대_40_유동인구_수']
living_popul['연령대_50_60_유동인구_수'] = living_popul['연령대_50_유동인구_수'] + living_popul['연령대_60_이상_유동인구_수']

# 불필요한 변수 제거
living_popul = living_popul.drop(['월요일_유동인구_수', '화요일_유동인구_수', '수요일_유동인구_수', '목요일_유동인구_수', '금요일_유동인구_수', '토요일_유동인구_수', '일요일_유동인구_수', 
                                  '시간대_00_06_유동인구_수', '시간대_21_24_유동인구_수',
                                  '연령대_30_유동인구_수','연령대_40_유동인구_수','연령대_50_유동인구_수','연령대_60_이상_유동인구_수'], axis = 1)

### 2-1-2. 결측치/이상치/중복

In [215]:
# 결측값 확인
living_popul.isnull().sum()

기준_년분기_코드           0
상권_구분_코드_명          0
상권_코드_명             0
총_유동인구_수            0
남성_유동인구_수           0
여성_유동인구_수           0
연령대_10_유동인구_수       0
연령대_20_유동인구_수       0
시간대_06_11_유동인구_수    0
시간대_11_14_유동인구_수    0
시간대_14_17_유동인구_수    0
시간대_17_21_유동인구_수    0
시간대_21_06_유동인구_수    0
주중_유동인구_수           0
주말_유동인구_수           0
연령대_30_40_유동인구_수    0
연령대_50_60_유동인구_수    0
dtype: int64

In [216]:
# 통계량 확인
living_popul.describe()

Unnamed: 0,기준_년분기_코드,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수
count,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0
mean,20227.499,831723.936,396454.429,435269.521,106249.107,148852.466,169196.692,107398.432,108468.099,144295.668,302365.121,600110.514,231613.502,281556.79,295065.647
std,4.032,900980.836,437696.274,466905.782,113447.145,216584.734,179874.328,134146.79,137040.744,167412.286,321245.522,660890.833,246084.966,332193.428,308808.842
min,20223.0,12.0,12.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,221703.0,106884.0,113949.0,25620.0,32184.0,45807.0,28462.0,28587.0,38158.0,75590.0,159214.0,60098.0,74298.0,79315.0
50%,20224.0,568553.0,269829.0,297045.0,71540.0,82940.0,117002.0,72028.0,72359.0,96289.0,205750.0,406266.0,159173.0,185168.0,206051.0
75%,20231.0,1138476.0,537610.0,593960.0,147393.0,184525.0,234575.0,140269.0,140608.0,194982.0,417273.0,828500.0,314287.0,370667.0,412023.0
max,20232.0,8657826.0,4693188.0,4032096.0,829062.0,3487417.0,1678321.0,1853219.0,1861965.0,1793125.0,2942275.0,6833618.0,2124591.0,3506912.0,3565244.0


In [217]:
# 중복 확인
living_popul[living_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수


In [218]:
# 하위 5% 삭제
living_popul = living_popul[living_popul['총_유동인구_수']>living_popul['총_유동인구_수'].quantile(0.05)]

### 2-1-3. 분기별 데이터 연도 기준 변환

In [219]:
living_popul_year = living_popul.groupby(['상권_구분_코드_명','상권_코드_명'],as_index = False)\
                                .mean()\
                                .drop('기준_년분기_코드', axis = 1)

In [220]:
living_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   상권_구분_코드_명        1575 non-null   object 
 1   상권_코드_명           1575 non-null   object 
 2   총_유동인구_수          1575 non-null   float64
 3   남성_유동인구_수         1575 non-null   float64
 4   여성_유동인구_수         1575 non-null   float64
 5   연령대_10_유동인구_수     1575 non-null   float64
 6   연령대_20_유동인구_수     1575 non-null   float64
 7   시간대_06_11_유동인구_수  1575 non-null   float64
 8   시간대_11_14_유동인구_수  1575 non-null   float64
 9   시간대_14_17_유동인구_수  1575 non-null   float64
 10  시간대_17_21_유동인구_수  1575 non-null   float64
 11  시간대_21_06_유동인구_수  1575 non-null   float64
 12  주중_유동인구_수         1575 non-null   float64
 13  주말_유동인구_수         1575 non-null   float64
 14  연령대_30_40_유동인구_수  1575 non-null   float64
 15  연령대_50_60_유동인구_수  1575 non-null   float64
dtypes: float64(14), object(2)
memory usage: 19

## 2-2. 상주인구 전처리

### 2-2-1. 변수 처리

In [221]:
# 파생변수 생성
resident_popul['연령대_30_40_상주인구_수'] = resident_popul['연령대_30_상주인구_수']+resident_popul['연령대_40_상주인구_수']
resident_popul['연령대_50_60_상주인구_수'] = resident_popul['연령대_50_상주인구_수']+resident_popul['연령대_60_이상_상주인구_수']

# 불필요한 변수 제거
resident_popul = resident_popul[['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명',
                                 '총_상주인구_수', '남성_상주인구_수','여성_상주인구_수', 
                                 '연령대_10_상주인구_수', '연령대_20_상주인구_수', '연령대_30_40_상주인구_수', '연령대_50_60_상주인구_수']]

### 2-2-2. 결측치/이상치/중복

In [222]:
# 결측값 확인
resident_popul.isnull().sum()

기준_년분기_코드           0
상권_구분_코드_명          0
상권_코드_명             0
총_상주인구_수            0
남성_상주인구_수           0
여성_상주인구_수           0
연령대_10_상주인구_수       0
연령대_20_상주인구_수       0
연령대_30_40_상주인구_수    0
연령대_50_60_상주인구_수    0
dtype: int64

In [223]:
# 통계량 확인
resident_popul.describe()

Unnamed: 0,기준_년분기_코드,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수
count,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0
mean,20227.495,2361.003,1166.658,1194.345,230.155,300.171,789.651,1041.026
std,4.032,2349.098,1159.233,1195.283,262.098,340.145,834.359,1031.672
min,20223.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,656.75,328.0,329.75,52.0,70.0,207.0,281.0
50%,20224.0,1621.0,811.5,817.0,147.0,195.0,528.0,721.0
75%,20231.0,3306.25,1639.0,1677.0,318.0,413.25,1083.0,1466.0
max,20232.0,21341.0,10459.0,10882.0,3078.0,4457.0,7626.0,8342.0


In [224]:
# 중복 확인
resident_popul[resident_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수


### 2-2-3. 분기별 데이터 연도 기준 변환

In [225]:
resident_popul_year = resident_popul[resident_popul['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'],axis=1)

In [226]:
resident_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1625 entries, 19887 to 24485
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   상권_구분_코드_명        1625 non-null   object
 1   상권_코드_명           1625 non-null   object
 2   총_상주인구_수          1625 non-null   int64 
 3   남성_상주인구_수         1625 non-null   int64 
 4   여성_상주인구_수         1625 non-null   int64 
 5   연령대_10_상주인구_수     1625 non-null   int64 
 6   연령대_20_상주인구_수     1625 non-null   int64 
 7   연령대_30_40_상주인구_수  1625 non-null   int64 
 8   연령대_50_60_상주인구_수  1625 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 127.0+ KB


## 2-3. 점포 전처리

### 2-3-1. 변수 처리

In [227]:
store.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304774 entries, 1055496 to 1360269
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   기준_년분기_코드    304774 non-null  int64 
 1   상권_구분_코드     304774 non-null  object
 2   상권_구분_코드_명   304774 non-null  object
 3   상권_코드        304774 non-null  int64 
 4   상권_코드_명      304774 non-null  object
 5   서비스_업종_코드    304774 non-null  object
 6   서비스_업종_코드_명  304774 non-null  object
 7   점포_수         304774 non-null  int64 
 8   유사_업종_점포_수   304774 non-null  int64 
 9   개업_율         304774 non-null  int64 
 10  개업_점포_수      304774 non-null  int64 
 11  폐업_률         304774 non-null  int64 
 12  폐업_점포_수      304774 non-null  int64 
 13  프랜차이즈_점포_수   304774 non-null  int64 
dtypes: int64(9), object(5)
memory usage: 34.9+ MB


In [228]:
store['서비스_업종_대분류'] = store['서비스_업종_코드'].str[:3]

In [229]:
store.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304774 entries, 1055496 to 1360269
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   기준_년분기_코드    304774 non-null  int64 
 1   상권_구분_코드     304774 non-null  object
 2   상권_구분_코드_명   304774 non-null  object
 3   상권_코드        304774 non-null  int64 
 4   상권_코드_명      304774 non-null  object
 5   서비스_업종_코드    304774 non-null  object
 6   서비스_업종_코드_명  304774 non-null  object
 7   점포_수         304774 non-null  int64 
 8   유사_업종_점포_수   304774 non-null  int64 
 9   개업_율         304774 non-null  int64 
 10  개업_점포_수      304774 non-null  int64 
 11  폐업_률         304774 non-null  int64 
 12  폐업_점포_수      304774 non-null  int64 
 13  프랜차이즈_점포_수   304774 non-null  int64 
 14  서비스_업종_대분류   304774 non-null  object
dtypes: int64(9), object(6)
memory usage: 37.2+ MB


### 2-3-1. 결측치/이상치/중복

In [230]:
# 결측값 확인
store.isnull().sum()

기준_년분기_코드      0
상권_구분_코드       0
상권_구분_코드_명     0
상권_코드          0
상권_코드_명        0
서비스_업종_코드      0
서비스_업종_코드_명    0
점포_수           0
유사_업종_점포_수     0
개업_율           0
개업_점포_수        0
폐업_률           0
폐업_점포_수        0
프랜차이즈_점포_수     0
서비스_업종_대분류     0
dtype: int64

In [231]:
# 통계량 확인
store.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수
count,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0,304774.0
mean,20227.497,3114874.594,6.056,6.506,2.292,0.138,2.479,0.179,0.45
std,4.031,11742.077,42.503,42.878,10.9,0.615,11.805,1.158,2.232
min,20223.0,3001491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,3110436.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,20224.0,3110871.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,20231.0,3120141.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0
max,20232.0,3130327.0,9002.0,9018.0,200.0,29.0,400.0,288.0,127.0


In [232]:
# 중복 확인
store[store.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,서비스_업종_대분류


### 2-3-2. 분기별 데이터 연도 기준 변환

In [233]:
store_year = store[store['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드', '상권_코드', '상권_구분_코드', '서비스_업종_코드', '서비스_업종_코드_명'], axis = 1)\
                                                    .groupby(['상권_구분_코드_명', '상권_코드_명', '서비스_업종_대분류'], as_index = False)\
                                                    .sum()

In [234]:
store_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4912 entries, 0 to 4911
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   상권_구분_코드_명  4912 non-null   object
 1   상권_코드_명     4912 non-null   object
 2   서비스_업종_대분류  4912 non-null   object
 3   점포_수        4912 non-null   int64 
 4   유사_업종_점포_수  4912 non-null   int64 
 5   개업_율        4912 non-null   int64 
 6   개업_점포_수     4912 non-null   int64 
 7   폐업_률        4912 non-null   int64 
 8   폐업_점포_수     4912 non-null   int64 
 9   프랜차이즈_점포_수  4912 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 383.9+ KB


## 2-4. 직장인구 전처리

### 2-4-1. 변수 처리

In [235]:
working_popul = working_popul[['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '총_직장_인구_수']]

### 2-4-2. 결측치/이상치/중복

In [236]:
# 결측값 확인
working_popul.isnull().sum()

기준_년분기_코드     0
상권_구분_코드_명    0
상권_코드_명       0
총_직장_인구_수     0
dtype: int64

In [237]:
# 통계량 확인
working_popul.describe()

Unnamed: 0,기준_년분기_코드,총_직장_인구_수
count,6492.0,6492.0
mean,20227.5,1771.425
std,4.031,6246.21
min,20223.0,1.0
25%,20223.75,107.0
50%,20227.5,317.0
75%,20231.25,892.0
max,20232.0,104830.0


In [238]:
# 중복 확인
working_popul[working_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_직장_인구_수


### 2-4-3. 분기별 데이터 연도 기준 변환

In [239]:
working_popul_year = working_popul[working_popul['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'],axis=1)

In [240]:
working_popul_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1623 entries, 11546 to 25988
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   상권_구분_코드_명  1623 non-null   object
 1   상권_코드_명     1623 non-null   object
 2   총_직장_인구_수   1623 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 50.7+ KB


## 2-5. 집객시설 전처리

### 2-5-1. 변수 처리

In [241]:
# 파생변수 생성
facilitie['초중고_수'] = facilitie['초등학교_수']+facilitie['중학교_수']+facilitie['고등학교_수']

# 불필요한 변수 제거
facilitie = facilitie.drop(['집객시설_수','철도_역_수', '초등학교_수', '중학교_수', '고등학교_수'], axis=1)

### 2-5-2. 결측치/이상치/중복

In [242]:
# 결측값 확인
facilitie.isnull().sum()

기준_년분기_코드        0
상권_구분_코드_명       0
상권_코드_명          0
관공서_수         3792
은행_수          4364
종합병원_수        6220
일반_병원_수       5784
약국_수          2276
유치원_수         5708
대학교_수         5976
백화점_수         6268
슈퍼마켓_수        5892
극장_수          5976
숙박_시설_수       5732
공항_수          6308
버스_터미널_수      6304
지하철_역_수       5532
버스_정거장_수      1232
초중고_수         6308
dtype: int64

In [243]:
facilitie = facilitie.fillna(0)

In [244]:
# 통계량 확인
facilitie.describe()

Unnamed: 0,기준_년분기_코드,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수
count,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0
mean,20227.5,0.695,1.015,0.015,0.107,2.359,0.112,0.087,0.008,0.102,0.072,0.213,0.003,0.001,0.176,3.896,0.003
std,4.031,1.172,3.01,0.127,0.396,4.034,0.385,0.466,0.094,0.495,0.352,1.027,0.101,0.036,0.535,4.959,0.126
min,20223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,20227.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
75%,20231.25,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
max,20232.0,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,4.0,1.0,5.0,85.0,5.0


In [245]:
# 중복 확인
facilitie[facilitie.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수


### 2-5-3. 분기별 데이터 연도 기준 변환

In [246]:
facilitie_year = facilitie[facilitie['기준_년분기_코드'] == 20232].drop(['기준_년분기_코드'], axis = 1)

In [247]:
facilitie_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1578 entries, 0 to 17356
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   상권_구분_코드_명  1578 non-null   object 
 1   상권_코드_명     1578 non-null   object 
 2   관공서_수       1578 non-null   float64
 3   은행_수        1578 non-null   float64
 4   종합병원_수      1578 non-null   float64
 5   일반_병원_수     1578 non-null   float64
 6   약국_수        1578 non-null   float64
 7   유치원_수       1578 non-null   float64
 8   대학교_수       1578 non-null   float64
 9   백화점_수       1578 non-null   float64
 10  슈퍼마켓_수      1578 non-null   float64
 11  극장_수        1578 non-null   float64
 12  숙박_시설_수     1578 non-null   float64
 13  공항_수        1578 non-null   float64
 14  버스_터미널_수    1578 non-null   float64
 15  지하철_역_수     1578 non-null   float64
 16  버스_정거장_수    1578 non-null   float64
 17  초중고_수       1578 non-null   float64
dtypes: float64(16), object(2)
memory usage: 234.2+ KB


In [248]:
facilitie_year.describe()

Unnamed: 0,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수
count,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0,1578.0
mean,0.695,1.015,0.015,0.107,2.359,0.112,0.087,0.008,0.102,0.072,0.213,0.003,0.001,0.176,3.896,0.003
std,1.172,3.01,0.127,0.396,4.035,0.386,0.466,0.094,0.495,0.352,1.027,0.101,0.036,0.536,4.96,0.126
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
75%,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
max,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,4.0,1.0,5.0,85.0,5.0


## 2-6. 추정매출 전처리

### 2-6-1. 변수 처리

In [249]:
# 파생변수 생성
estimated_sales['시간대_21~06_매출_금액'] = estimated_sales['시간대_00~06_매출_금액'] + estimated_sales['시간대_21~24_매출_금액']
estimated_sales['시간대_21~06_매출_건수'] = estimated_sales['시간대_건수~06_매출_건수'] + estimated_sales['시간대_건수~24_매출_건수']
estimated_sales['연령대_30_40_매출_금액'] = estimated_sales['연령대_30_매출_금액'] + estimated_sales['연령대_40_매출_금액']
estimated_sales['연령대_50_60_매출_금액'] = estimated_sales['연령대_50_매출_금액'] + estimated_sales['연령대_60_이상_매출_금액']
estimated_sales['연령대_30_40_매출_건수'] = estimated_sales['연령대_30_매출_건수'] + estimated_sales['연령대_40_매출_건수']
estimated_sales['연령대_50_60_매출_건수'] = estimated_sales['연령대_50_매출_건수'] + estimated_sales['연령대_60_이상_매출_건수']

estimated_sales['서비스_업종_대분류'] = estimated_sales['서비스_업종_코드'].str[:3]

# 불필요한 변수 제거
estimated_sales = estimated_sales.drop(columns = ['서비스_업종_코드'],
                                       axis = 1)

estimated_sales = estimated_sales.drop(columns = ['월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액', '목요일_매출_금액', '금요일_매출_금액', '토요일_매출_금액', '일요일_매출_금액',
                                                  '월요일_매출_건수', '화요일_매출_건수', '수요일_매출_건수', '목요일_매출_건수', '금요일_매출_건수', '토요일_매출_건수', '일요일_매출_건수'],
                                       axis = 1)
estimated_sales = estimated_sales.drop(columns = ['시간대_00~06_매출_금액', '시간대_21~24_매출_금액', '시간대_건수~06_매출_건수', '시간대_건수~24_매출_건수',
                                                 '연령대_30_매출_금액','연령대_40_매출_금액','연령대_50_매출_금액','연령대_60_이상_매출_금액',
                                                 '연령대_30_매출_건수','연령대_40_매출_건수','연령대_50_매출_건수','연령대_60_이상_매출_건수'],
                                       axis = 1)

# 변수 이름 변경
estimated_sales = estimated_sales.rename(columns = {'시간대_건수~11_매출_건수' : '시간대_06~11_매출_건수',
                                                         '시간대_건수~14_매출_건수' : '시간대_11~14_매출_건수',
                                                         '시간대_건수~17_매출_건수' : '시간대_14~17_매출_건수',
                                                         '시간대_건수~21_매출_건수' : '시간대_17~21_매출_건수'})



### 2-6-2. 결측치/이상치/중복

In [250]:
# 결측값 확인
estimated_sales.isnull().sum()

기준_년분기_코드          0
상권_구분_코드           0
상권_구분_코드_명         0
상권_코드              0
상권_코드_명            0
서비스_업종_코드_명        0
당월_매출_금액           0
당월_매출_건수           0
주중_매출_금액           0
주말_매출_금액           0
시간대_06~11_매출_금액    0
시간대_11~14_매출_금액    0
시간대_14~17_매출_금액    0
시간대_17~21_매출_금액    0
남성_매출_금액           0
여성_매출_금액           0
연령대_10_매출_금액       0
연령대_20_매출_금액       0
주중_매출_건수           0
주말_매출_건수           0
시간대_06~11_매출_건수    0
시간대_11~14_매출_건수    0
시간대_14~17_매출_건수    0
시간대_17~21_매출_건수    0
남성_매출_건수           0
여성_매출_건수           0
연령대_10_매출_건수       0
연령대_20_매출_건수       0
시간대_21~06_매출_금액    0
시간대_21~06_매출_건수    0
연령대_30_40_매출_금액    0
연령대_50_60_매출_금액    0
연령대_30_40_매출_건수    0
연령대_50_60_매출_건수    0
서비스_업종_대분류         0
dtype: int64

In [251]:
# 통계량 확인
estimated_sales.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수
count,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0
mean,20227.506,3115479.183,1116599271.468,39184.849,842969184.583,273630043.386,116413408.602,286186909.051,280424841.016,306646276.19,506115717.968,483588097.551,5734552.208,135382675.504,29499.636,9685.213,5416.564,10204.191,8307.715,10443.374,19337.49,18168.507,577.372,7898.857,126927760.944,4813.006,431464849.292,417119530.685,16555.601,12474.165
std,4.031,14347.76,9371981938.469,168454.273,6458423614.17,3509552781.251,1172352096.256,2477461950.285,3124214644.465,2771104568.472,4449083121.756,3616311150.276,31894990.608,909570386.425,117929.298,57711.259,29263.631,45978.917,44186.686,47735.342,83502.247,83244.346,3617.461,42063.066,1000325381.326,23992.316,3469029992.731,3773146796.252,73433.475,61385.401
min,20223.0,3001491.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20224.0,3110547.0,48560077.0,868.0,36205605.5,7456979.5,145574.5,5491691.0,7869005.5,11816467.5,19597595.5,18111528.5,0.0,1369387.5,654.0,153.0,7.0,109.0,173.0,226.0,390.0,342.0,0.0,43.0,0.0,0.0,15857293.0,15467433.0,324.0,297.0
50%,20231.0,3111085.0,188164542.0,4646.0,140846234.0,38302621.0,6268042.0,34963031.0,36234148.0,50583967.0,84295031.0,75951653.0,166368.0,12428462.0,3446.0,1036.0,169.0,985.0,983.0,1304.0,2286.0,1968.0,12.0,446.0,4061538.0,77.0,69746034.0,63905807.0,1783.0,1642.0
75%,20232.0,3120185.0,686193977.0,25492.5,520555159.0,150576100.0,48504462.5,153603359.0,143040793.5,186412033.0,314344883.5,290361639.5,2273891.5,64077848.5,19055.5,5839.5,1996.0,6405.5,5206.5,6992.5,12096.0,11579.0,174.0,3306.0,51155789.0,1497.0,260456794.5,241524127.5,9897.0,8588.0
max,20232.0,3130327.0,995677000000.0,12090273.0,669799000000.0,485357000000.0,107916000000.0,242249000000.0,326543000000.0,386343000000.0,537243000000.0,304162000000.0,1549259735.0,90669553834.0,5827863.0,6262410.0,1474148.0,2611826.0,3245770.0,4287338.0,6708993.0,4615812.0,329040.0,3087414.0,126314287024.0,1514884.0,354347000000.0,411544000000.0,4519958.0,5329793.0


In [252]:
# 중복 확인
estimated_sales[estimated_sales.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드_명,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수,서비스_업종_대분류


In [253]:
def sales_tran(df):
    if df['당월_법인_매출_금액'] < 0:
        return df['당월_매출_금액'] - df['당월_법인_매출_금액']
    else :
        return df['당월_매출_금액']
estimated_sales = estimated_sales[estimated_sales['당월_매출_금액'] > estimated_sales['당월_매출_금액'].quantile(0.05)]
estimated_sales['당월_개인_매출_금액'] = estimated_sales['남성_매출_금액'] + estimated_sales['여성_매출_금액']
estimated_sales['당월_법인_매출_금액'] =  estimated_sales['당월_매출_금액'] - estimated_sales['당월_개인_매출_금액']

estimated_sales['당월_매출_금액'] = estimated_sales.apply(lambda x : sales_tran(x) , axis=1)

estimated_sales.loc[estimated_sales['당월_법인_매출_금액']<0, '당월_법인_매출_금액'] = 0

In [254]:
estimated_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79910 entries, 119954 to 204096
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   기준_년분기_코드        79910 non-null  int64  
 1   상권_구분_코드         79910 non-null  object 
 2   상권_구분_코드_명       79910 non-null  object 
 3   상권_코드            79910 non-null  int64  
 4   상권_코드_명          79910 non-null  object 
 5   서비스_업종_코드_명      79910 non-null  object 
 6   당월_매출_금액         79910 non-null  float64
 7   당월_매출_건수         79910 non-null  int64  
 8   주중_매출_금액         79910 non-null  float64
 9   주말_매출_금액         79910 non-null  float64
 10  시간대_06~11_매출_금액  79910 non-null  float64
 11  시간대_11~14_매출_금액  79910 non-null  float64
 12  시간대_14~17_매출_금액  79910 non-null  float64
 13  시간대_17~21_매출_금액  79910 non-null  float64
 14  남성_매출_금액         79910 non-null  float64
 15  여성_매출_금액         79910 non-null  float64
 16  연령대_10_매출_금액     79910 non-null  int64  
 17  연령대_20_매출_금

### 2-6-3. 분기별 데이터 연도 기준 변환

In [255]:
estimated_sales_year = estimated_sales.drop(columns = ['기준_년분기_코드','상권_구분_코드','상권_코드', '서비스_업종_코드_명'])\
                                      .groupby(['상권_구분_코드_명', '상권_코드_명', '서비스_업종_대분류'])\
                                      .mean()\
                                      .reset_index()

In [256]:
estimated_sales_year.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4132 entries, 0 to 4131
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   상권_구분_코드_명       4132 non-null   object 
 1   상권_코드_명          4132 non-null   object 
 2   서비스_업종_대분류       4132 non-null   object 
 3   당월_매출_금액         4132 non-null   float64
 4   당월_매출_건수         4132 non-null   float64
 5   주중_매출_금액         4132 non-null   float64
 6   주말_매출_금액         4132 non-null   float64
 7   시간대_06~11_매출_금액  4132 non-null   float64
 8   시간대_11~14_매출_금액  4132 non-null   float64
 9   시간대_14~17_매출_금액  4132 non-null   float64
 10  시간대_17~21_매출_금액  4132 non-null   float64
 11  남성_매출_금액         4132 non-null   float64
 12  여성_매출_금액         4132 non-null   float64
 13  연령대_10_매출_금액     4132 non-null   float64
 14  연령대_20_매출_금액     4132 non-null   float64
 15  주중_매출_건수         4132 non-null   float64
 16  주말_매출_건수         4132 non-null   float64
 17  시간대_06~11_매출_건

## 2-7. 데이터 병합

In [257]:
df = pd.merge(living_popul_year, resident_popul_year)
df = pd.merge(df, store_year)
df = pd.merge(df, working_popul_year)
df = pd.merge(df, facilitie_year)
df = pd.merge(df, estimated_sales_year)

In [258]:
df.loc[df['서비스_업종_대분류'] == "CS1", '서비스_업종_대분류'] = '요식업'
df.loc[df['서비스_업종_대분류'] == "CS2", '서비스_업종_대분류'] = '서비스업'
df.loc[df['서비스_업종_대분류'] == "CS3", '서비스_업종_대분류'] = '도매 및 소매업'

In [259]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3923 entries, 0 to 3922
Data columns (total 78 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   상권_구분_코드_명        3923 non-null   object 
 1   상권_코드_명           3923 non-null   object 
 2   총_유동인구_수          3923 non-null   float64
 3   남성_유동인구_수         3923 non-null   float64
 4   여성_유동인구_수         3923 non-null   float64
 5   연령대_10_유동인구_수     3923 non-null   float64
 6   연령대_20_유동인구_수     3923 non-null   float64
 7   시간대_06_11_유동인구_수  3923 non-null   float64
 8   시간대_11_14_유동인구_수  3923 non-null   float64
 9   시간대_14_17_유동인구_수  3923 non-null   float64
 10  시간대_17_21_유동인구_수  3923 non-null   float64
 11  시간대_21_06_유동인구_수  3923 non-null   float64
 12  주중_유동인구_수         3923 non-null   float64
 13  주말_유동인구_수         3923 non-null   float64
 14  연령대_30_40_유동인구_수  3923 non-null   float64
 15  연령대_50_60_유동인구_수  3923 non-null   float64
 16  총_상주인구_수          3923 non-null   int64  


In [260]:
df.describe()

Unnamed: 0,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,총_직장_인구_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수,당월_개인_매출_금액,당월_법인_매출_금액
count,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0,3923.0
mean,990662.923,471951.017,518711.919,126706.627,177990.6,201128.885,127850.164,129232.237,172422.284,360029.433,714476.252,276186.753,335048.336,350917.435,2750.341,1362.167,1388.174,260.154,299.233,943.378,1247.577,110.426,118.797,43.844,2.858,42.576,3.112,8.371,2081.319,0.789,1.185,0.017,0.127,2.767,0.128,0.09,0.008,0.096,0.081,0.238,0.0,0.002,0.196,4.347,0.004,716658818.316,28194.005,544676659.106,171982131.686,76625819.809,183894450.087,167653034.862,200391532.903,336735886.689,303762313.096,3648645.657,82712757.852,21170.304,7023.701,3864.181,7179.614,5722.842,7621.76,14430.05,12671.629,414.505,5452.889,88093943.676,3805.609,270131484.319,284004767.417,11690.293,9544.004,640498199.785,76160618.531
std,932634.453,454400.472,482522.101,116934.9,232061.831,185262.826,141562.786,144795.256,175292.158,331467.574,685232.623,254389.107,347161.178,318155.46,2443.832,1208.957,1240.488,269.158,276.678,891.059,1097.924,284.342,292.492,49.811,5.154,51.772,5.955,17.833,6871.566,1.241,3.267,0.135,0.429,4.304,0.413,0.469,0.099,0.505,0.375,1.111,0.0,0.039,0.563,5.211,0.138,2174786763.807,52068.062,1657272497.681,614689342.021,373350007.471,668927621.661,697568241.978,517324334.397,1013487229.373,917962999.583,11267898.599,234556263.561,38451.902,14973.397,8391.824,15675.837,12534.01,13864.152,25397.811,25383.649,1188.485,13289.973,218474280.247,7621.159,718031359.241,1120052415.546,21595.427,21054.942,1889246998.319,451482268.354
min,41356.5,18580.5,18881.0,1763.5,2419.5,7723.0,5020.25,4708.5,5842.0,5784.25,29058.0,9312.0,9347.5,11240.75,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6102740.0,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2411357.75,0.0
25%,369325.0,176654.25,192909.625,43398.5,51955.0,75427.75,47155.25,47247.125,63211.5,123050.5,267541.5,102081.0,116936.25,129015.125,969.0,487.0,489.0,77.0,97.0,324.0,438.0,30.0,33.0,0.0,0.0,0.0,1.0,1.0,158.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,142839539.071,3544.405,108274069.548,29395039.417,5412530.756,27120231.896,25933295.268,40101665.375,69544279.708,54203932.458,215657.875,8720095.254,2638.95,754.362,179.938,860.583,774.299,895.925,1805.884,1408.137,13.0,304.333,7989040.917,118.208,55190775.223,52189851.792,1359.9,1315.85,130166607.5,1679940.814
50%,747268.25,350414.0,388178.0,92683.0,111250.75,151488.0,92263.75,92405.0,127566.5,266283.75,532684.5,208446.25,240681.25,267295.0,2055.0,1025.0,1038.0,176.0,219.0,704.0,930.0,55.0,60.0,30.0,1.0,25.0,2.0,4.0,388.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,338018425.211,13641.35,255444913.938,76603799.2,23143404.5,74640658.421,62905606.261,99210136.409,173358119.769,132950262.0,956198.941,26850257.162,10274.083,3136.043,1328.615,3405.167,2590.75,3725.375,7156.45,5567.667,82.697,1609.167,33458818.13,1066.783,126697827.667,134495749.0,5309.18,5139.975,314053949.375,10791410.278
75%,1331112.0,635243.25,697998.75,172265.625,218486.75,274680.375,163364.75,163528.5,224858.5,502414.5,954800.125,377188.0,431610.25,477765.25,3794.0,1889.0,1939.0,354.0,417.0,1291.0,1719.0,105.0,113.5,66.0,3.0,61.0,4.0,8.0,1026.5,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,689076477.938,33115.689,523726062.923,164028090.455,66227401.972,167929352.899,144690296.641,208506549.229,342598915.633,294279055.221,2764112.225,67220402.77,24906.353,8178.427,4580.534,7653.833,6494.127,9390.212,17319.289,14340.984,354.881,5192.403,84226583.904,4203.921,263670282.967,280683908.5,13468.867,11656.639,635607696.653,43165086.458
max,8536354.5,4598446.75,3937907.75,804173.75,3262037.5,1648813.0,1805141.25,1830343.5,1719183.0,2680625.0,6772121.5,2029885.0,3405217.25,3480277.5,21341.0,10459.0,10882.0,3078.0,2295.0,7626.0,8342.0,11277.0,11356.0,361.0,74.0,380.0,191.0,340.0,104830.0,14.0,57.0,2.0,4.0,52.0,6.0,7.0,2.0,8.0,5.0,18.0,0.0,1.0,5.0,85.0,5.0,68024306117.182,852226.5,55713424845.909,24877617673.698,15786387665.636,24961579454.727,24178256692.707,19388654750.667,32814948181.818,34181131814.273,287529327.925,4416535865.397,608724.75,341092.841,157273.75,315635.025,272630.25,279350.25,499177.0,415973.069,27591.133,290023.026,4403414692.95,108291.575,19083277742.778,49866347917.636,340535.941,604526.292,66996079996.091,20813538745.672


In [261]:
df.head()

Unnamed: 0,상권_구분_코드_명,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,연령대_30_40_유동인구_수,연령대_50_60_유동인구_수,총_상주인구_수,남성_상주인구_수,여성_상주인구_수,연령대_10_상주인구_수,연령대_20_상주인구_수,연령대_30_40_상주인구_수,연령대_50_60_상주인구_수,서비스_업종_대분류,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,총_직장_인구_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,버스_터미널_수,지하철_역_수,버스_정거장_수,초중고_수,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,연령대_30_40_매출_금액,연령대_50_60_매출_금액,연령대_30_40_매출_건수,연령대_50_60_매출_건수,당월_개인_매출_금액,당월_법인_매출_금액
0,골목상권,4.19민주묘지역 2번,868030.0,361490.25,506539.25,128381.0,163223.25,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,221593.25,354833.0,1247,596,651,135,121,324,667,요식업,38,48,8,2,4,1,10,96,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,411361983.913,20467.522,249668438.609,161693545.304,15936808.13,104889853.435,75817345.826,160273338.13,223277977.609,169226593.739,2611829.304,44509673.87,13399.13,7068.391,1599.739,5434.174,4205.0,6571.0,10135.13,10052.696,309.348,4043.435,54444638.391,2657.609,135490587.522,209892480.783,7626.87,8208.043,392504571.348,18857412.565
1,골목상권,4.19민주묘지역 2번,868030.0,361490.25,506539.25,128381.0,163223.25,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,221593.25,354833.0,1247,596,651,135,121,324,667,서비스업,17,17,0,0,0,0,0,96,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,34384167.0,1306.5,21903351.0,12480816.0,862554.5,9952889.75,10859155.0,12330743.0,20974955.0,13117808.25,136012.5,3947675.5,824.5,482.0,55.0,368.5,438.25,440.25,919.5,384.0,10.5,177.5,378824.75,4.5,20604973.0,9404102.25,788.75,326.5,34092763.25,291403.75
2,골목상권,4.19민주묘지역 2번,868030.0,361490.25,506539.25,128381.0,163223.25,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,221593.25,354833.0,1247,596,651,135,121,324,667,도매 및 소매업,20,23,33,1,50,1,3,96,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,304908427.0,44877.0,212658965.0,92249462.0,45252762.0,30557113.0,39478507.0,86471437.0,191164903.0,113743524.0,557091.0,62379154.0,31545.0,13332.0,7512.0,5310.0,6319.0,11742.0,28457.0,16420.0,165.0,10557.0,103148608.0,13994.0,131357183.0,110614999.0,18677.0,15478.0,304908427.0,0.0
3,골목상권,GS강동자이아파트,839601.25,383011.75,456589.75,175441.75,89926.5,169725.75,98596.5,101511.0,145801.5,323967.25,576675.25,262926.25,235045.0,339189.0,3443,1725,1718,404,413,1066,1560,요식업,32,42,150,4,31,3,10,411,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,286289020.0,17363.625,185667042.375,100621977.625,14879907.25,53871970.812,47333706.0,108390689.188,153030456.062,114711833.625,1023351.062,31041859.25,11307.812,6055.812,1677.688,4133.75,4211.875,5266.438,8529.938,8210.875,87.625,2876.0,61812746.75,2073.875,125058005.5,110619074.062,8390.5,5385.375,267742289.688,18546730.312
4,골목상권,GS강동자이아파트,839601.25,383011.75,456589.75,175441.75,89926.5,169725.75,98596.5,101511.0,145801.5,323967.25,576675.25,262926.25,235045.0,339189.0,3443,1725,1718,404,413,1066,1560,서비스업,71,75,133,4,7,1,4,411,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,107339195.182,1676.727,96099351.455,11239843.727,3162804.364,13415256.318,39256329.091,46620649.727,37947121.136,52851738.591,214350.318,1305563.318,1230.318,446.409,155.409,449.545,571.727,487.273,984.773,651.091,5.636,96.682,4884155.682,12.773,66180159.682,23098786.5,784.182,749.227,90798859.727,16540335.455


In [262]:
df.to_csv("final_df.csv")

# 3. 모델 학습

## 3-1. 분석에 적합한 데이터로 변환

In [263]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_df = df.drop(columns = ['상권_구분_코드_명', '상권_코드_명'])
label_df_ID = df[['상권_구분_코드_명', '상권_코드_명']]

scaler = StandardScaler()
col_num = list(label_df.select_dtypes(include = 'number').columns)
label_df[col_num] = scaler.fit_transform(label_df[col_num])

col_cat = list(label_df.select_dtypes(include = 'object').columns)
le = LabelEncoder()
label_df[col_cat] = label_df[col_cat].apply(le.fit_transform)

In [264]:
label_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3923 entries, 0 to 3922
Data columns (total 76 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   총_유동인구_수          3923 non-null   float64
 1   남성_유동인구_수         3923 non-null   float64
 2   여성_유동인구_수         3923 non-null   float64
 3   연령대_10_유동인구_수     3923 non-null   float64
 4   연령대_20_유동인구_수     3923 non-null   float64
 5   시간대_06_11_유동인구_수  3923 non-null   float64
 6   시간대_11_14_유동인구_수  3923 non-null   float64
 7   시간대_14_17_유동인구_수  3923 non-null   float64
 8   시간대_17_21_유동인구_수  3923 non-null   float64
 9   시간대_21_06_유동인구_수  3923 non-null   float64
 10  주중_유동인구_수         3923 non-null   float64
 11  주말_유동인구_수         3923 non-null   float64
 12  연령대_30_40_유동인구_수  3923 non-null   float64
 13  연령대_50_60_유동인구_수  3923 non-null   float64
 14  총_상주인구_수          3923 non-null   float64
 15  남성_상주인구_수         3923 non-null   float64
 16  여성_상주인구_수         3923 non-null   float64


In [285]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

# DBSCAN 모델 생성 및 학습
dbscan = DBSCAN(eps=5, min_samples=15)
labels = dbscan.fit_predict(label_df)

label_df['Cluster_Label'] = labels

label_df['Cluster_Label'].value_counts()

Cluster_Label
 0    3423
-1     484
 1      16
Name: count, dtype: int64