# 데이터 불러오기

In [24]:
import pandas as pd
import os

pd.options.display.float_format = '{:.3f}'.format
pd.set_option('display.max_columns', None)

PATH = './data/서울시 상권분석서비스(오탈자 수정)/'

file_list= os.listdir(PATH)
csv_list = list()

for file in file_list:
    if file.split(".")[-1] == 'csv':
        csv_list.append(file)
data = ['rent','small_rent','big_rent','living_popul','indicator','resident_popul', 'income','apartment', 'area', 'store', 'working_popul', 'facilitie', 'estimated_sales']

for name, file in zip(data, csv_list):
    if name == 'rent':
        globals()[name] = pd.read_csv(PATH + file, names = ['기준년코드', '기준분기코드', '행정동코드', '행정동코드명', '임대시세층구분코드', '임대시세층구분명', '보증금평균','월임대료평균','환산임대료평균','임대건수'],encoding = "ansi")
    else:
        globals()[name] = pd.read_csv(PATH + file, encoding = "ansi")

store = store.drop("Unnamed: 0", axis = 1)
estimated_sales = estimated_sales.drop("Unnamed: 0", axis = 1)

# 데이터 전처리

## 시점 통일

In [25]:
years = [20223, 20224, 20231, 20232]

living_popul = living_popul[living_popul['기준_년분기_코드'].isin(years)]
resident_popul = resident_popul[resident_popul['기준_년분기_코드'].isin(years)]
store = store[store['기준_년분기_코드'].isin(years)]
working_popul = working_popul[working_popul['기준_년분기_코드'].isin(years)]
facilitie = facilitie[facilitie['기준_년분기_코드'].isin(years)]
estimated_sales = estimated_sales[estimated_sales['기준_년분기_코드'].isin(years)]

## 길단위인구 전처리

### 파생변수 생성
1.시간대_21_06_유동인구_수
  - 21_00시 유동인구의 특징과 00_06시 유동인구의 특징이 비슷하다고 생각하여 합침

2.주중_유동인구_수
  - 월~금 유동인구 수의 합

3.주말_유동인구_수
  - 토,일 유동인구 수의 합

In [3]:
living_popul['시간대_21_06_유동인구_수'] = living_popul['시간대_00_06_유동인구_수'] + living_popul['시간대_21_24_유동인구_수']
living_popul['주중_유동인구_수'] = living_popul['월요일_유동인구_수'] + living_popul['화요일_유동인구_수'] + living_popul['수요일_유동인구_수'] + living_popul['목요일_유동인구_수'] + living_popul['금요일_유동인구_수']
living_popul['주말_유동인구_수'] = living_popul['토요일_유동인구_수'] + living_popul['일요일_유동인구_수']

### 불필요한 feature 제거
- 상권_구분_코드
- 상권_코드
- 월~일 유동인구 수
- 21~06시 유동인구수

In [4]:
living_popul = living_popul.drop(['상권_구분_코드','상권_코드','월요일_유동인구_수', '화요일_유동인구_수', '수요일_유동인구_수', '목요일_유동인구_수', '금요일_유동인구_수', '토요일_유동인구_수', '일요일_유동인구_수', '시간대_00_06_유동인구_수', '시간대_21_24_유동인구_수'], axis = 1)


### 결측치 확인 및 제거
- 결측치 없음

In [5]:
living_popul.isnull().sum()

기준_년분기_코드           0
상권_구분_코드_명          0
상권_코드_명             0
총_유동인구_수            0
남성_유동인구_수           0
여성_유동인구_수           0
연령대_10_유동인구_수       0
연령대_20_유동인구_수       0
연령대_30_유동인구_수       0
연령대_40_유동인구_수       0
연령대_50_유동인구_수       0
연령대_60_이상_유동인구_수    0
시간대_06_11_유동인구_수    0
시간대_11_14_유동인구_수    0
시간대_14_17_유동인구_수    0
시간대_17_21_유동인구_수    0
시간대_21_06_유동인구_수    0
주중_유동인구_수           0
주말_유동인구_수           0
dtype: int64

### 이상치 확인 및 제거
- 총유동인구수 8백만명? 이상치인듯

In [6]:
living_popul.describe()

Unnamed: 0,기준_년분기_코드,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수
count,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0,6597.0
mean,20227.499,831723.936,396454.429,435269.521,106249.107,148852.466,147395.794,134160.996,121115.739,173949.908,169196.692,107398.432,108468.099,144295.668,302365.121,600110.514,231613.502
std,4.032,900980.836,437696.274,466905.782,113447.145,216584.734,182996.722,151853.2,128640.435,182778.57,179874.328,134146.79,137040.744,167412.286,321245.522,660890.833,246084.966
min,20223.0,12.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,221703.0,106884.0,113949.0,25620.0,32184.0,36292.0,36453.0,32922.0,45849.0,45807.0,28462.0,28587.0,38158.0,75590.0,159214.0,60098.0
50%,20224.0,568553.0,269829.0,297045.0,71540.0,82940.0,93288.0,90968.0,83512.0,120170.0,117002.0,72028.0,72359.0,96289.0,205750.0,406266.0,159173.0
75%,20231.0,1138476.0,537610.0,593960.0,147393.0,184525.0,190351.0,179976.0,167648.0,240410.0,234575.0,140269.0,140608.0,194982.0,417273.0,828500.0,314287.0
max,20232.0,8657826.0,4693188.0,4032096.0,829062.0,3487417.0,2024145.0,1667878.0,1555937.0,2009307.0,1678321.0,1853219.0,1861965.0,1793125.0,2942275.0,6833618.0,2124591.0


In [19]:
living_popul[living_popul['여성_유동인구_수'] == 0]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수
702,20232,골목상권,헌인가구단지,15,15,0,0,15,0,0,0,0,9,5,1,0,0,15,0
2352,20231,골목상권,청계산원터골,12,12,0,0,0,0,0,0,12,4,8,0,0,0,0,12
4000,20224,골목상권,헌인가구단지,43,43,0,0,43,0,0,0,0,14,21,8,0,0,43,0


In [8]:
area[area['상권_코드_명']=='헌인가구단지']

Unnamed: 0,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,자치구_코드,자치구_코드_명,행정동_코드,행정동_코드_명,영역_면적
955,A,골목상권,3110948,헌인가구단지,207939,439987,11650,서초구,11650660,내곡동,198674


In [28]:
drop_list = ['헌인가구단지','청계산원터골']
living_popul = living_popul[~living_popul['상권_코드_명'].isin(drop_list)]

In [29]:
living_popul.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_00_06_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
count,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0,6592.0
mean,20227.5,3115227.093,832354.434,396754.899,435599.55,106329.674,148965.342,147507.534,134262.677,121207.537,174081.744,199532.344,169324.923,107479.822,108550.304,144405.047,103062.07,119108.69,120175.212,120554.79,120558.28,120168.473,116733.269,115055.801
std,4.031,10324.498,901031.466,437726.198,466928.964,113452.424,216628.068,183021.114,151865.875,128646.007,182785.156,213574.206,179882.239,134165.09,137060.187,167428.635,109857.073,129253.693,132141.931,133063.56,133531.976,133200.781,125605.811,120906.981
min,20223.0,3001491.0,2208.0,1180.0,336.0,171.0,0.0,0.0,289.0,392.0,508.0,0.0,454.0,350.0,298.0,156.0,44.0,260.0,324.0,357.0,340.0,320.0,200.0,196.0
25%,20223.75,3110406.75,221944.75,107104.75,114677.5,25661.75,32221.5,36435.25,36561.25,33025.75,45946.0,48230.5,45986.5,28511.5,28704.0,38284.5,26147.75,31674.5,31813.75,31957.75,32042.5,31838.75,30642.75,29696.5
50%,20227.5,3110818.5,569426.0,270193.0,297292.5,71593.5,83095.5,93373.0,91013.0,83573.5,120242.0,132917.5,117016.0,72188.5,72446.0,96368.5,69991.5,81442.5,81673.5,81625.5,81317.5,81255.5,79957.5,79579.5
75%,20231.25,3120142.25,1139881.0,537663.25,595211.75,147670.0,184660.5,190569.25,180432.5,167700.75,240527.0,277825.5,234715.0,140328.0,140697.25,195004.25,140674.25,164928.25,166089.0,167224.5,166299.5,165254.75,157322.25,156711.5
max,20232.0,3130327.0,8657826.0,4693188.0,4032096.0,829062.0,3487417.0,2024145.0,1667878.0,1555937.0,2009307.0,2066410.0,1678321.0,1853219.0,1861965.0,1793125.0,932288.0,1305495.0,1381468.0,1393885.0,1401446.0,1396973.0,1121321.0,1072841.0


In [30]:
living_popul[living_popul['연령대_20_유동인구_수'] == 0]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_00_06_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
648,20232,A,골목상권,3111002,한국교통안전공단 강남자동차검사소,2232,1896,336,200,0,0,680,540,812,0,724,616,660,188,44,336,376,388,400,336,200,196
2297,20231,A,골목상권,3111002,한국교통안전공단 강남자동차검사소,2308,1904,404,448,0,0,496,596,768,0,644,616,736,240,72,344,324,412,364,412,208,244
3946,20224,A,골목상권,3111002,한국교통안전공단 강남자동차검사소,2208,1696,512,336,0,0,492,528,852,0,656,580,668,220,84,260,388,364,340,320,232,304


In [31]:
drop_list = ['한국교통안전공단 강남자동차검사소']
living_popul = living_popul[~living_popul['상권_코드_명'].isin(drop_list)]

In [32]:
living_popul.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_00_06_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
count,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0,6588.0
mean,20227.5,3115229.658,832858.038,396994.531,435863.521,106393.944,149055.728,147596.993,134343.804,121280.745,174186.898,199653.395,169427.274,107544.679,108615.756,144492.479,103124.529,119180.774,120247.901,120627.695,120631.196,120241.172,116803.94,115125.44
std,4.031,10327.107,901073.073,437750.989,466947.75,113456.866,216662.759,183040.647,151876.269,128650.735,182790.801,213582.51,179888.862,134179.986,137076.042,167441.842,109861.162,129259.807,132149.101,133071.043,133539.706,133208.524,125611.179,120910.635
min,20223.0,3001491.0,2412.0,1180.0,1086.0,171.0,57.0,38.0,289.0,392.0,508.0,236.0,454.0,350.0,298.0,156.0,164.0,358.0,368.0,357.0,349.0,354.0,350.0,257.0
25%,20223.75,3110406.0,223100.25,107382.25,115069.5,25708.75,32334.25,36631.75,36748.25,33085.25,46091.0,48326.0,46082.75,28561.0,28773.25,38324.25,26202.25,31759.25,31962.75,32083.75,32084.0,31945.0,30716.0,29741.25
50%,20227.5,3110818.0,569686.0,270305.5,297377.0,71625.5,83144.5,93469.0,91078.5,83658.0,120378.5,133072.0,117055.0,72287.5,72632.5,96502.0,70117.5,81501.0,81721.5,81819.5,81422.0,81317.0,80031.0,79654.5
75%,20231.25,3120143.0,1140100.5,537673.75,595809.75,147759.0,184898.5,190695.0,180467.25,167755.0,240585.25,277920.0,234930.5,140484.0,140719.25,195039.75,140719.0,164976.25,166115.0,167290.75,166778.75,165368.5,157436.75,156908.25
max,20232.0,3130327.0,8657826.0,4693188.0,4032096.0,829062.0,3487417.0,2024145.0,1667878.0,1555937.0,2009307.0,2066410.0,1678321.0,1853219.0,1861965.0,1793125.0,932288.0,1305495.0,1381468.0,1393885.0,1401446.0,1396973.0,1121321.0,1072841.0


In [34]:
living_popul[living_popul['총_유동인구_수'] > 7000000]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_00_06_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
2,20232,U,관광특구,3001494,종로·청계 관광특구,8580607,4570612,4009996,322394,1490540,1649310,1625177,1528185,1965000,880985,1678321,1815991,1823148,1715164,666997,1305495,1351576,1393885,1401446,1371576,998401,758227
4,20232,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,7258225,3500135,3758090,304018,1221932,1764290,1635316,1174656,1158013,406335,1528984,1820261,1785956,1355004,361685,1104058,1189462,1242379,1237291,1174209,733393,577432
371,20232,D,발달상권,3120189,강남역,7823728,3955268,3868460,769592,2195122,2024145,1406941,784087,643841,970217,1476879,1408434,1453614,1726038,788548,1152231,1211168,1250178,1265723,1252864,956079,735485
466,20232,D,발달상권,3120094,"신촌역(신촌역, 신촌로터리)",7280557,3248459,4032096,797975,3487417,1133647,668219,549830,643468,1285465,1254635,1041957,1108082,1658129,932288,995707,1013201,1025897,1038732,1082427,1121321,1003270
1651,20231,U,관광특구,3001494,종로·청계 관광특구,8502972,4590859,3912112,351982,1513170,1606018,1602118,1515462,1914224,855100,1614425,1819068,1856681,1707645,650054,1296243,1339484,1351829,1392225,1377047,1009101,737044
1653,20231,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,7334315,3588578,3745738,317233,1266022,1793576,1648217,1171566,1137703,432967,1526756,1814899,1808164,1377142,374389,1144424,1199754,1200975,1248387,1209872,762858,568045
2020,20231,D,발달상권,3120189,강남역,7942915,4000287,3942628,816174,2253035,2021207,1412213,787438,652848,963989,1455555,1429461,1491003,1793125,809780,1185229,1238518,1236835,1279921,1296327,983786,722300
2642,20231,A,골목상권,3110656,까치산역 3번,7012938,3305441,3707497,605814,1273343,1533725,1136200,886534,1577319,2066410,1529921,757403,731697,1051641,875865,985368,988658,984150,970849,971426,1039645,1072841
3300,20224,U,관광특구,3001494,종로·청계 관광특구,8657826,4693188,3964638,334742,1500979,1638219,1618642,1555937,2009307,903657,1653054,1825114,1861965,1729022,685015,1272457,1381468,1386192,1396528,1396973,1047401,776808
3302,20224,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,7629595,3717728,3911867,365427,1301955,1839034,1667878,1216256,1239044,481664,1556366,1853219,1851308,1478804,408233,1103048,1241705,1251450,1267499,1240066,864591,661235


### 중복행 확인 및 제거
- 중복행 없음

In [38]:
living_popul[living_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수


### 분기데이터 1년치 데이터로 변환
- 유동인구 -> 4개 분기의 평균

In [36]:
living_popul_year = living_popul.groupby(['상권_구분_코드_명','상권_코드_명'],as_index=False).mean()
living_popul_year = living_popul_year.drop('기준_년분기_코드', axis = 1)

## 상주인구 전처리

### 불필요한 feature 제거
- 상권_구분_코드
- 상권_코드
- 총 상주인구 수
- 남성 상주인구 수
- 여성 상주인구 수
- 연령대 10 ~ 60 이상상주인구 수
- 총 가구 수, 아파트 가구 수, 비아파트 가구수

In [39]:
resident_popul.drop(['상권_구분_코드','상권_코드','총_상주인구_수','남성_상주인구_수','여성_상주인구_수','연령대_10_상주인구_수','연령대_20_상주인구_수','연령대_30_상주인구_수','연령대_40_상주인구_수','연령대_50_상주인구_수','연령대_60_이상_상주인구_수','총_가구_수','아파트_가구_수','비_아파트_가구_수'],axis=1,inplace=True)

### 결측치 확인 및 제거
- 결측치 없음

In [40]:
resident_popul.isnull().sum()

기준_년분기_코드             0
상권_구분_코드_명            0
상권_코드_명               0
남성연령대_10_상주인구_수       0
남성연령대_20_상주인구_수       0
남성연령대_30_상주인구_수       0
남성연령대_40_상주인구_수       0
남성연령대_50_상주인구_수       0
남성연령대_60_이상_상주인구_수    0
여성연령대_10_상주인구_수       0
여성연령대_20_상주인구_수       0
여성연령대_30_상주인구_수       0
여성연령대_40_상주인구_수       0
여성연령대_50_상주인구_수       0
여성연령대_60_이상_상주인구_수    0
dtype: int64

### 이상치 확인 및 제거
- 이상치가 없다고 판단됨

In [41]:
resident_popul.describe()

Unnamed: 0,기준_년분기_코드,남성연령대_10_상주인구_수,남성연령대_20_상주인구_수,남성연령대_30_상주인구_수,남성연령대_40_상주인구_수,남성연령대_50_상주인구_수,남성연령대_60_이상_상주인구_수,여성연령대_10_상주인구_수,여성연령대_20_상주인구_수,여성연령대_30_상주인구_수,여성연령대_40_상주인구_수,여성연령대_50_상주인구_수,여성연령대_60_이상_상주인구_수
count,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0
mean,20227.495,118.175,147.964,210.038,193.31,188.114,309.056,111.979,152.207,206.473,179.83,184.968,358.887
std,4.032,134.548,165.438,231.487,204.475,187.631,305.688,128.133,179.216,231.099,193.03,190.433,359.223
min,20223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.0,26.0,34.0,52.0,50.0,51.0,84.0,25.0,35.0,49.0,44.0,47.0,95.0
50%,20224.0,75.5,95.0,138.0,127.0,130.0,213.0,71.0,96.0,135.0,118.0,124.0,247.0
75%,20231.0,164.0,208.0,290.0,270.0,266.0,434.0,155.0,207.0,285.25,249.0,263.25,507.0
max,20232.0,1586.0,2161.0,2148.0,2235.0,1577.0,2260.0,1492.0,2812.0,2070.0,2045.0,1584.0,2921.0


### 중복행 확인 및 제거
- 중복행 없음

In [42]:
resident_popul[resident_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,남성연령대_10_상주인구_수,남성연령대_20_상주인구_수,남성연령대_30_상주인구_수,남성연령대_40_상주인구_수,남성연령대_50_상주인구_수,남성연령대_60_이상_상주인구_수,여성연령대_10_상주인구_수,여성연령대_20_상주인구_수,여성연령대_30_상주인구_수,여성연령대_40_상주인구_수,여성연령대_50_상주인구_수,여성연령대_60_이상_상주인구_수


### 분기데이터 1년치 데이터로 변환
- 마지막 분기인 2023년 2분기의 상주인구만 추출

In [43]:
resident_popul_year = resident_popul[resident_popul['기준_년분기_코드'] == 20232]
resident_popul_year = resident_popul_year.drop(['기준_년분기_코드'],axis=1)

## 점포 전처리

### 불필요한 feature 제거
- 상권_구분_코드
- 상권_코드

In [49]:
store.drop(['상권_구분_코드','상권_코드',],axis=1)

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수
1055496,20223,골목상권,덕산중학교,CS200043,건축물청소,1,1,0,0,0,0,0
1055497,20223,전통시장,수유중앙골목시장(수유중앙시장),CS100009,호프-간이주점,3,3,0,0,0,0,0
1055498,20223,발달상권,서울시흥동우체국(홈플러스시흥점),CS100010,커피-음료,5,9,0,0,0,0,4
1055499,20223,골목상권,문정역 1번,CS100002,중식음식점,3,3,0,0,0,0,0
1055500,20223,골목상권,언북초등학교,CS300029,애완동물,2,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1360265,20232,전통시장,"평화시장(남평화시장, 제일평화시장, 신평화패션타운)",CS300024,운동/경기용품,7,7,0,0,0,0,0
1360266,20232,전통시장,"평화시장(남평화시장, 제일평화시장, 신평화패션타운)",CS300027,섬유제품,10,10,0,0,0,0,0
1360267,20232,전통시장,"평화시장(남평화시장, 제일평화시장, 신평화패션타운)",CS300029,애완동물,1,1,0,0,0,0,0
1360268,20232,전통시장,"평화시장(남평화시장, 제일평화시장, 신평화패션타운)",CS300035,인테리어,2,2,0,0,0,0,0


### 중복행 확인 및 제거
- 중복행 없음

In [50]:
store[store.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수


### 결측치 확인 및 제거
- 결측치 없음

In [51]:
store.isnull().sum()

기준_년분기_코드      0
상권_구분_코드       0
상권_구분_코드_명     0
상권_코드          0
상권_코드_명        0
서비스_업종_코드      0
서비스_업종_코드_명    0
점포_수           0
유사_업종_점포_수     0
개업_율           0
개업_점포_수        0
폐업_률           0
폐업_점포_수        0
프랜차이즈_점포_수     0
dtype: int64

### 이상치 확인 및 제거
- 점포수 -> 8500개? 이상치 인듯?

In [52]:
store_year.describe()

Unnamed: 0,상권_코드,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수
count,76099.0,76099.0,76099.0,76099.0,76099.0,76099.0,76099.0,76099.0
mean,3114873.204,6.019,6.464,2.503,0.153,2.367,0.168,0.445
std,11730.472,41.496,41.876,11.382,0.667,11.33,0.899,2.22
min,3001491.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3110436.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,3110871.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,3120141.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0
max,3130327.0,8493.0,8509.0,200.0,23.0,200.0,146.0,127.0


### 분기데이터 1년치 데이터로 변환
- 마지막 분기인 2023년 2분기의 데이터만 추출

In [53]:
store_year = store[store['기준_년분기_코드'] == 20232]
store_year = store_year.drop(columns = ['기준_년분기_코드'])

## 직장인구 전처리

### 필요한 컬럼만 추출
- 기준_년분기_코드
- 상권_구분_코드_명
- 상권_코드_명
- 총_직장_인구_수

In [54]:
working_popul = working_popul[['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '총_직장_인구_수']]


### 결측치 확인 및 제거
- 결측치 없음

In [57]:
working_popul.isnull().sum()

기준_년분기_코드     0
상권_구분_코드_명    0
상권_코드_명       0
총_직장_인구_수     0
dtype: int64

### 이상치 확인 및 제거
- 이상치 없음

In [58]:
working_popul.describe()

Unnamed: 0,기준_년분기_코드,총_직장_인구_수
count,6492.0,6492.0
mean,20227.5,1771.425
std,4.031,6246.21
min,20223.0,1.0
25%,20223.75,107.0
50%,20227.5,317.0
75%,20231.25,892.0
max,20232.0,104830.0


### 중복행 확인 및 제거
- 중복행 없음

In [59]:
working_popul[working_popul.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,총_직장_인구_수


### 분기데이터 1년치 데이터로 변환
- 마지막 분기인 2023년 2분기의 데이터만 추출

In [60]:
working_popul_year = working_popul[working_popul['기준_년분기_코드'] == 20232]
working_popul_year = working_popul_year.drop(['기준_년분기_코드'],axis=1)

## 집객시설 전처리

### 불필요한 feature 제거
- 상권_구분_코드
- 상권_코드
- 집객시설_수

In [62]:
facilitie.drop(['상권_구분_코드','상권_코드','집객시설_수'],axis=1)

Unnamed: 0,기준_년분기_코드,상권_구분_코드_명,상권_코드_명,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,철도_역_수,버스_터미널_수,지하철_역_수,버스_정거장_수
0,20232,골목상권,동묘앞역 2번,1.000,,,,,,,,,,,,,2.000,,,,,4.000
1,20232,골목상권,목동로데오거리,,,,,,,,,,,,,,,,,,,
2,20231,골목상권,동묘앞역 2번,1.000,,,,,,,,,,,,,2.000,,,,,4.000
3,20232,골목상권,청운초등학교,,,,,,,,,,,,,,,,,,,1.000
4,20231,골목상권,청운초등학교,,,,,,,,,,,,,,,,,,,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17353,20231,관광특구,종로·청계 관광특구,5.000,17.000,,,32.000,,,,,,,5.000,2.000,15.000,,,,3.000,28.000
17354,20232,관광특구,잠실 관광특구,3.000,13.000,,,14.000,,1.000,,,,,,1.000,,,,,3.000,20.000
17355,20231,관광특구,잠실 관광특구,3.000,13.000,,,14.000,,1.000,,,,,,1.000,,,,,3.000,20.000
17356,20232,관광특구,강남 마이스 관광특구,1.000,8.000,,,3.000,,,,,,1.000,,1.000,,,,,,8.000


### 결측치 확인 및 제거
- 결측치 -> 0으로 처리

In [63]:
facilitie.isnull().sum()

기준_년분기_코드        0
상권_구분_코드         0
상권_구분_코드_명       0
상권_코드            0
상권_코드_명          0
집객시설_수           0
관공서_수         3792
은행_수          4364
종합병원_수        6220
일반_병원_수       5784
약국_수          2276
유치원_수         5708
초등학교_수        6260
중학교_수         6296
고등학교_수        6268
대학교_수         5976
백화점_수         6268
슈퍼마켓_수        5892
극장_수          5976
숙박_시설_수       5732
공항_수          6308
철도_역_수        6312
버스_터미널_수      6304
지하철_역_수       5532
버스_정거장_수      1232
dtype: int64

In [66]:
facilitie = facilitie.fillna(0)
facilitie.isnull().sum()

기준_년분기_코드     0
상권_구분_코드      0
상권_구분_코드_명    0
상권_코드         0
상권_코드_명       0
집객시설_수        0
관공서_수         0
은행_수          0
종합병원_수        0
일반_병원_수       0
약국_수          0
유치원_수         0
초등학교_수        0
중학교_수         0
고등학교_수        0
대학교_수         0
백화점_수         0
슈퍼마켓_수        0
극장_수          0
숙박_시설_수       0
공항_수          0
철도_역_수        0
버스_터미널_수      0
지하철_역_수       0
버스_정거장_수      0
dtype: int64

In [220]:
facilitie = facilitie.drop('집객시설_수', axis = 1)
facilitie = facilitie.fillna(0)

# facilitie_year = facilitie[facilitie['기준_년분기_코드'] == 20232]
# facilitie_year = facilitie_year.drop('기준_년분기_코드', axis = 1)

### 이상치 확인 및 제거
- 은행 52개 버스정거장수 85 집객 시설 수 594개 이상한데

In [68]:
facilitie.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,철도_역_수,버스_터미널_수,지하철_역_수,버스_정거장_수
count,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0,6312.0
mean,20227.5,3115009.244,20.871,0.695,1.015,0.015,0.107,2.359,0.112,0.008,0.003,0.008,0.087,0.008,0.102,0.072,0.213,0.003,0.0,0.001,0.176,3.896
std,4.031,10294.271,32.262,1.172,3.01,0.127,0.396,4.034,0.385,0.09,0.05,0.109,0.466,0.094,0.495,0.352,1.027,0.101,0.0,0.036,0.535,4.959
min,20223.0,3001491.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20223.75,3110400.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,20227.5,3110802.5,12.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
75%,20231.25,3120126.0,23.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
max,20232.0,3130327.0,594.0,14.0,57.0,2.0,4.0,52.0,6.0,1.0,1.0,3.0,7.0,2.0,8.0,5.0,18.0,4.0,0.0,1.0,5.0,85.0


### 중복행 확인 및 제거
- 중복행 없음


In [69]:
facilitie[facilitie.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,철도_역_수,버스_터미널_수,지하철_역_수,버스_정거장_수


### 분기데이터 1년치 데이터로 변환
- 마지막 분기인 2023년 2분기의 데이터만 추출

In [70]:
facilitie_year = facilitie[facilitie['기준_년분기_코드'] == 20232]
facilitie_year = facilitie_year.drop(['기준_년분기_코드'],axis=1)

## 추정매출 전처리

### 파생변수 생성
1.시간대_21_06_매출_건수, 시간대_21_06_매출_금액
- 21_00시 유동인구의 특징과 00_06시 유동인구의 특징이 비슷하다고 판단함

2.건당 매출금액
- 건당 매출금액이 중요한 지표일 것이라고 생각하여 파생변수 생성

In [91]:
estimated_sales['시간대_21~06_매출_금액'] = estimated_sales['시간대_00~06_매출_금액'] + estimated_sales['시간대_21~24_매출_금액']
estimated_sales['시간대_21~06_매출_건수'] = estimated_sales['시간대_건수~06_매출_건수'] + estimated_sales['시간대_건수~24_매출_건수']

# 건당 매출 금액 컬럼 생성
estimated_sales['주말_건당_매출_금액'] = estimated_sales[(estimated_sales['주말_매출_금액'] != 0) & (estimated_sales['주말_매출_건수'] != 0 )]['주말_매출_금액'] / estimated_sales[(estimated_sales['주말_매출_금액'] != 0) & (estimated_sales['주말_매출_건수'] != 0) ]['주말_매출_건수']
estimated_sales['당월_건당_매출_금액'] = estimated_sales[(estimated_sales['당월_매출_금액'] != 0) & (estimated_sales['당월_매출_건수'] != 0 )]['당월_매출_금액'] / estimated_sales[(estimated_sales['당월_매출_금액'] != 0) & (estimated_sales['당월_매출_건수'] != 0 )]['당월_매출_건수']
estimated_sales['주중_건당_매출_금액'] = estimated_sales[(estimated_sales['주중_매출_금액'] != 0) & (estimated_sales['주중_매출_건수'] != 0 )]['주중_매출_금액'] / estimated_sales[(estimated_sales['주중_매출_금액'] != 0) & (estimated_sales['주중_매출_건수'] != 0 )]['주중_매출_건수']

estimated_sales['시간대_21~06_건당_매출_금액'] =  estimated_sales[(estimated_sales['시간대_21~06_매출_금액'] != 0) & (estimated_sales['시간대_21~06_매출_건수'] != 0) ]['시간대_21~06_매출_금액'] / estimated_sales[(estimated_sales['시간대_21~06_매출_금액'] != 0) & (estimated_sales['시간대_21~06_매출_건수'] != 0 )]['시간대_21~06_매출_건수']
estimated_sales['시간대_06~11_건당_매출_금액'] =  estimated_sales[(estimated_sales['시간대_06~11_매출_금액'] != 0) & (estimated_sales['시간대_건수~11_매출_건수'] != 0) ]['시간대_06~11_매출_금액'] / estimated_sales[(estimated_sales['시간대_06~11_매출_금액'] != 0) & (estimated_sales['시간대_건수~11_매출_건수'] != 0) ]['시간대_건수~11_매출_건수']
estimated_sales['시간대_11~14_건당_매출_금액'] =  estimated_sales[(estimated_sales['시간대_11~14_매출_금액'] != 0) & (estimated_sales['시간대_건수~14_매출_건수'] != 0) ]['시간대_11~14_매출_금액'] / estimated_sales[(estimated_sales['시간대_11~14_매출_금액'] != 0) & (estimated_sales['시간대_건수~14_매출_건수'] != 0 )]['시간대_건수~14_매출_건수']
estimated_sales['시간대_14~17_건당_매출_금액'] =  estimated_sales[(estimated_sales['시간대_14~17_매출_금액'] != 0) & (estimated_sales['시간대_건수~17_매출_건수'] != 0) ]['시간대_14~17_매출_금액'] / estimated_sales[(estimated_sales['시간대_14~17_매출_금액'] != 0) & (estimated_sales['시간대_건수~17_매출_건수'] != 0) ]['시간대_건수~17_매출_건수']
estimated_sales['시간대_17~21_건당_매출_금액'] =  estimated_sales[(estimated_sales['시간대_17~21_매출_금액'] != 0) & (estimated_sales['시간대_건수~21_매출_건수'] != 0) ]['시간대_17~21_매출_금액'] / estimated_sales[(estimated_sales['시간대_17~21_매출_금액'] != 0) & (estimated_sales['시간대_건수~21_매출_건수'] != 0 )]['시간대_건수~21_매출_건수']

estimated_sales['남성_건당_매출_금액'] = estimated_sales[(estimated_sales['남성_매출_금액'] != 0) & (estimated_sales['남성_매출_건수'] != 0)]['남성_매출_금액'] / estimated_sales[(estimated_sales['남성_매출_금액'] != 0) & (estimated_sales['남성_매출_건수'] != 0)]['남성_매출_건수']
estimated_sales['여성_건당_매출_금액'] = estimated_sales[(estimated_sales['여성_매출_금액'] != 0) & (estimated_sales['여성_매출_건수'] != 0)]['여성_매출_금액'] / estimated_sales[(estimated_sales['여성_매출_금액'] != 0) & (estimated_sales['여성_매출_건수'] != 0)]['여성_매출_건수']

estimated_sales['연령대_10_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_10_매출_금액'] != 0) & (estimated_sales['연령대_10_매출_건수'] != 0)]['연령대_10_매출_금액'] / estimated_sales[(estimated_sales['연령대_10_매출_금액'] != 0) & (estimated_sales['연령대_10_매출_건수'] != 0)]['연령대_10_매출_건수']
estimated_sales['연령대_20_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_20_매출_금액'] != 0) & (estimated_sales['연령대_20_매출_건수'] != 0)]['연령대_20_매출_금액'] / estimated_sales[(estimated_sales['연령대_20_매출_금액'] != 0) & (estimated_sales['연령대_20_매출_건수'] != 0)]['연령대_20_매출_건수']
estimated_sales['연령대_30_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_30_매출_금액'] != 0) & (estimated_sales['연령대_30_매출_건수'] != 0)]['연령대_30_매출_금액'] / estimated_sales[(estimated_sales['연령대_30_매출_금액'] != 0) & (estimated_sales['연령대_30_매출_건수'] != 0)]['연령대_30_매출_건수']
estimated_sales['연령대_40_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_40_매출_금액'] != 0) & (estimated_sales['연령대_40_매출_건수'] != 0)]['연령대_40_매출_금액'] / estimated_sales[(estimated_sales['연령대_40_매출_금액'] != 0) & (estimated_sales['연령대_40_매출_건수'] != 0)]['연령대_40_매출_건수']
estimated_sales['연령대_50_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_50_매출_금액'] != 0) & (estimated_sales['연령대_50_매출_건수'] != 0)]['연령대_50_매출_금액'] / estimated_sales[(estimated_sales['연령대_50_매출_금액'] != 0) & (estimated_sales['연령대_50_매출_건수'] != 0)]['연령대_50_매출_건수']
estimated_sales['연령대_60_이상_건당_매출_금액'] = estimated_sales[(estimated_sales['연령대_60_이상_매출_금액'] != 0) & (estimated_sales['연령대_60_이상_매출_건수'] != 0)]['연령대_60_이상_매출_금액'] / estimated_sales[(estimated_sales['연령대_60_이상_매출_금액'] != 0) & (estimated_sales['연령대_60_이상_매출_건수'] != 0)]['연령대_60_이상_매출_건수']

### 불필요한 feature제거
- 상권_구분_코드- 
상권_코
- 서비스_업종_코드
- 21~06 매출금액, 매출건수
- 월~일 매출금액, 매출건수드

In [92]:
estimated_sales = estimated_sales.drop(columns = ['월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액', '목요일_매출_금액', '금요일_매출_금액', '토요일_매출_금액', '일요일_매출_금액',
                                                  '월요일_매출_건수', '화요일_매출_건수', '수요일_매출_건수', '목요일_매출_건수', '금요일_매출_건수', '토요일_매출_건수', '일요일_매출_건수'],
                                       axis = 1)
estimated_sales = estimated_sales.drop(columns = ['시간대_00~06_매출_금액', '시간대_21~24_매출_금액', '시간대_건수~06_매출_건수', '시간대_건수~24_매출_건수'],
                                       axis = 1)

### 결측치 확인 및 제거
- 결측치 어떻게 처리하지

In [93]:
estimated_sales.isnull().sum()

기준_년분기_코드                 0
상권_구분_코드                  0
상권_구분_코드_명                0
상권_코드                     0
상권_코드_명                   0
서비스_업종_코드                 0
서비스_업종_코드_명               0
당월_매출_금액                  0
당월_매출_건수                  0
주중_매출_금액                  0
주말_매출_금액                  0
시간대_06~11_매출_금액           0
시간대_11~14_매출_금액           0
시간대_14~17_매출_금액           0
시간대_17~21_매출_금액           0
남성_매출_금액                  0
여성_매출_금액                  0
연령대_10_매출_금액              0
연령대_20_매출_금액              0
연령대_30_매출_금액              0
연령대_40_매출_금액              0
연령대_50_매출_금액              0
연령대_60_이상_매출_금액           0
주중_매출_건수                  0
주말_매출_건수                  0
시간대_건수~11_매출_건수           0
시간대_건수~14_매출_건수           0
시간대_건수~17_매출_건수           0
시간대_건수~21_매출_건수           0
남성_매출_건수                  0
여성_매출_건수                  0
연령대_10_매출_건수              0
연령대_20_매출_건수              0
연령대_30_매출_건수              0
연령대_40_매출_건수              0
연령대_50_매출_건수        

### 이상치 확인 및 제거
- 당원매출금액 천억 이상치인듯

In [86]:
estimated_sales.describe()

Unnamed: 0,기준_년분기_코드,상권_코드,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,연령대_30_매출_금액,연령대_40_매출_금액,연령대_50_매출_금액,연령대_60_이상_매출_금액,주중_매출_건수,주말_매출_건수,시간대_건수~11_매출_건수,시간대_건수~14_매출_건수,시간대_건수~17_매출_건수,시간대_건수~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,연령대_30_매출_건수,연령대_40_매출_건수,연령대_50_매출_건수,연령대_60_이상_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,주말_건당_매출_금액,당월_건당_매출_금액,주중_건당_매출_금액,시간대_21~06_건당_매출_금액,시간대_06~11_건당_매출_금액,시간대_11~14_건당_매출_금액,시간대_14~17_건당_매출_금액,시간대_17~21_건당_매출_금액,남성_건당_매출_금액,여성_건당_매출_금액,연령대_10_건당_매출_금액,연령대_20_건당_매출_금액,연령대_30_건당_매출_금액,연령대_40_건당_매출_금액,연령대_50_건당_매출_금액,연령대_60_이상_건당_매출_금액
count,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,84143.0,77279.0,84143.0,83774.0,56349.0,64785.0,76612.0,80184.0,80538.0,81833.0,82357.0,47588.0,71546.0,77474.0,79629.0,79686.0,76841.0
mean,20227.506,3115479.183,1116599271.468,39184.849,842969184.583,273630043.386,116413408.602,286186909.051,280424841.016,306646276.19,506115717.968,483588097.551,5734552.208,135382675.504,205194817.011,226270032.281,217404282.219,199715248.466,29499.636,9685.213,5416.564,10204.191,8307.715,10443.374,19337.49,18168.507,577.372,7898.857,9086.808,7468.793,6750.112,5724.053,126927760.944,4813.006,83132.047,109621.121,109041.418,93690.484,93743.594,98326.29,100626.167,96620.709,102187.413,97906.973,23466.79,57017.362,81133.588,98027.47,102809.128,95077.685
std,4.031,14347.76,9371981938.469,168454.273,6458423614.17,3509552781.251,1172352096.256,2477461950.285,3124214644.465,2771104568.472,4449083121.756,3616311150.276,31894990.608,909570386.425,1708256766.645,1844309119.926,1920483460.46,1994032159.984,117929.298,57711.259,29263.631,45978.917,44186.686,47735.342,83502.247,83244.346,3617.461,42063.066,42550.8,32101.841,28616.901,35747.1,1000325381.326,23992.316,228405.297,331781.307,324431.012,239620.241,445048.339,322100.088,346651.97,306060.891,351204.358,277196.776,59250.907,203798.638,267730.902,314884.804,334354.107,503888.622
min,20223.0,3001491.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-144509.0,1.0,1.0,9.833,10.091,1.0,1.0,1.0,9.741,3.0
25%,20224.0,3110547.0,48560077.0,868.0,36205605.5,7456979.5,145574.5,5491691.0,7869005.5,11816467.5,19597595.5,18111528.5,0.0,1369387.5,4899653.0,8000895.0,8424536.0,4355162.0,654.0,153.0,7.0,109.0,173.0,226.0,390.0,342.0,0.0,43.0,115.0,170.5,167.0,100.0,0.0,0.0,13796.053,15043.424,14829.54,11398.98,10440.423,12987.548,13593.591,13385.873,13837.129,13489.028,5788.062,9265.385,11262.564,13211.44,14123.821,14226.924
50%,20231.0,3111085.0,188164542.0,4646.0,140846234.0,38302621.0,6268042.0,34963031.0,36234148.0,50583967.0,84295031.0,75951653.0,166368.0,12428462.0,26992718.0,36596239.0,36308017.0,22326341.0,3446.0,1036.0,169.0,985.0,983.0,1304.0,2286.0,1968.0,12.0,446.0,796.0,910.0,936.0,621.0,4061538.0,77.0,30379.97,33330.238,32587.181,35715.773,20028.121,25417.26,28107.967,31259.496,28974.456,30823.493,11102.329,20282.307,25799.753,29738.885,30278.494,29165.246
75%,20232.0,3120185.0,686193977.0,25492.5,520555159.0,150576100.0,48504462.5,153603359.0,143040793.5,186412033.0,314344883.5,290361639.5,2273891.5,64077848.5,114128620.0,136065461.5,132552681.0,98029973.5,19055.5,5839.5,1996.0,6405.5,5206.5,6992.5,12096.0,11579.0,174.0,3306.0,4845.5,4833.0,4731.5,3523.5,51155789.0,1497.0,67503.758,91892.818,91393.093,81064.218,58389.83,69939.543,77748.635,76644.914,77807.355,77595.43,22556.146,43082.098,59023.138,76601.734,78592.973,67687.754
max,20232.0,3130327.0,995677000000.0,12090273.0,669799000000.0,485357000000.0,107916000000.0,242249000000.0,326543000000.0,386343000000.0,537243000000.0,304162000000.0,1549259735.0,90669553834.0,185968000000.0,168379000000.0,179689000000.0,238248000000.0,5827863.0,6262410.0,1474148.0,2611826.0,3245770.0,4287338.0,6708993.0,4615812.0,329040.0,3087414.0,2413132.0,2106826.0,2297953.0,3188920.0,126314287024.0,1514884.0,19671457.896,27447230.947,27447230.947,19555555.667,56064225.231,23291042.833,27447230.947,21438295.142,27447230.947,14000000.0,4787225.5,29928546.065,18138461.538,18989769.821,27447230.947,95937160.6


### 중복행 확인 및 제거
- 중복행 없음

In [87]:
estimated_sales[estimated_sales.duplicated()]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,당월_매출_금액,당월_매출_건수,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,연령대_30_매출_금액,연령대_40_매출_금액,연령대_50_매출_금액,연령대_60_이상_매출_금액,주중_매출_건수,주말_매출_건수,시간대_건수~11_매출_건수,시간대_건수~14_매출_건수,시간대_건수~17_매출_건수,시간대_건수~21_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,연령대_30_매출_건수,연령대_40_매출_건수,연령대_50_매출_건수,연령대_60_이상_매출_건수,시간대_21~06_매출_금액,시간대_21~06_매출_건수,주말_건당_매출_금액,당월_건당_매출_금액,주중_건당_매출_금액,시간대_21~06_건당_매출_금액,시간대_06~11_건당_매출_금액,시간대_11~14_건당_매출_금액,시간대_14~17_건당_매출_금액,시간대_17~21_건당_매출_금액,남성_건당_매출_금액,여성_건당_매출_금액,연령대_10_건당_매출_금액,연령대_20_건당_매출_금액,연령대_30_건당_매출_금액,연령대_40_건당_매출_금액,연령대_50_건당_매출_금액,연령대_60_이상_건당_매출_금액


### feature순서 재배치

In [94]:
cols_1 = ['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '서비스_업종_코드_명']

cols_2 = ['당월_매출_금액', '주중_매출_금액', '주말_매출_금액',
         '시간대_06~11_매출_금액', '시간대_11~14_매출_금액', '시간대_14~17_매출_금액', '시간대_17~21_매출_금액', '시간대_21~06_매출_금액',
         '남성_매출_금액', '여성_매출_금액', 
         '연령대_10_매출_금액', '연령대_20_매출_금액', '연령대_30_매출_금액', '연령대_40_매출_금액', '연령대_50_매출_금액', '연령대_60_이상_매출_금액']

cols_3 = ['당월_매출_건수', '주중_매출_건수', '주말_매출_건수',
         '시간대_건수~11_매출_건수', '시간대_건수~14_매출_건수', '시간대_건수~17_매출_건수', '시간대_건수~21_매출_건수', '시간대_21~06_매출_건수',
         '남성_매출_건수', '여성_매출_건수',
         '연령대_10_매출_건수', '연령대_20_매출_건수', '연령대_30_매출_건수', '연령대_40_매출_건수', '연령대_50_매출_건수', '연령대_60_이상_매출_건수']

cols_4 = ['당월_건당_매출_금액', '주중_건당_매출_금액', '주말_건당_매출_금액',
         '시간대_06~11_건당_매출_금액', '시간대_11~14_건당_매출_금액', '시간대_14~17_건당_매출_금액', '시간대_17~21_건당_매출_금액', '시간대_21~06_건당_매출_금액',
         '남성_건당_매출_금액', '여성_건당_매출_금액',
         '연령대_10_건당_매출_금액', '연령대_20_건당_매출_금액', '연령대_30_건당_매출_금액', '연령대_40_건당_매출_금액', '연령대_50_건당_매출_금액', '연령대_60_이상_건당_매출_금액']

cols = [*cols_1, *cols_2, *cols_3, *cols_4]

estimated_sales = estimated_sales[cols].rename(columns = {'시간대_건수~11_매출_건수' : '시간대_06~11_매출_건수',
                                                         '시간대_건수~14_매출_건수' : '시간대_11~14_매출_건수',
                                                         '시간대_건수~17_매출_건수' : '시간대_14~17_매출_건수',
                                                         '시간대_건수~21_매출_건수' : '시간대_17~21_매출_건수'})

### 분기데이터 1년치 데이터로 변환

In [95]:
estimated_sales_year = estimated_sales.drop(columns = ['기준_년분기_코드']).groupby(['상권_구분_코드_명', '상권_코드_명', '서비스_업종_코드_명']).mean().reset_index()

## 임대료 전처리

### 불필요한 feature 제거
- 행정동코드
- 임대시세층구분코드
- 기준년코드
- 기준분기코드
- 임대건수
- 환산임대료평균

In [96]:
rent = rent.rename(columns = {"행정동코드명" : "행정동_코드_명"}).drop(columns = ['행정동코드', '임대시세층구분코드', '기준년코드', '기준분기코드', '임대건수', '환산임대료평균'])[rent['임대시세층구분명'] != '전체층']

In [97]:
rent.head()

Unnamed: 0,행정동_코드_명,임대시세층구분명,보증금평균,월임대료평균
0,내곡동,1층,3286492,124345
1,내곡동,1층외,17778757,110169
3,방배3동,1층외,3590641,92720
5,방배3동,1층,1093228,94289
6,삼성2동,1층외,1312205,164525


### 결측치 확인 및 제거
- 결측치 없음

In [100]:
rent.isnull().sum()

행정동_코드_명    0
임대시세층구분명    0
보증금평균       0
월임대료평균      0
dtype: int64

### 이상치 처리

In [101]:
rent.describe()

Unnamed: 0,보증금평균,월임대료평균
count,849.0,849.0
mean,2290233.134,104758.296
std,1368703.396,54340.249
min,0.0,0.0
25%,1466278.0,66164.0
50%,2003512.0,98072.0
75%,2817322.0,130993.0
max,17778757.0,424738.0


## 데이터 병합

In [228]:
df = pd.merge(living_popul_year, resident_popul_year)
df = pd.merge(df, area)
df = pd.merge(df, store_year)
df = pd.merge(df, working_popul_year)
df = pd.merge(df, facilitie_year)
df = pd.merge(df, estimated_sales_year)
df = pd.merge(df, rent)

In [229]:
cols_1 = ['자치구_코드_명', '행정동_코드_명', '임대시세층구분명', '보증금평균', '월임대료평균']
cols_2 = ['상권_구분_코드_명', '상권_코드_명', '영역_면적', '엑스좌표_값', '와이좌표_값']
cols_3 = ['총_유동인구_수', '남성_유동인구_수', '여성_유동인구_수',
          '연령대_10_유동인구_수', '연령대_20_유동인구_수', '연령대_30_유동인구_수', '연령대_40_유동인구_수', '연령대_50_유동인구_수', '연령대_60_이상_유동인구_수',
          '시간대_06_11_유동인구_수', '시간대_11_14_유동인구_수', '시간대_14_17_유동인구_수', '시간대_17_21_유동인구_수', '시간대_21_06_유동인구_수',
          '주중_유동인구_수', '주말_유동인구_수']
cols_4 = ['남성연령대_10_상주인구_수', '남성연령대_20_상주인구_수', '남성연령대_30_상주인구_수', '남성연령대_40_상주인구_수', '남성연령대_50_상주인구_수', '남성연령대_60_이상_상주인구_수', 
          '여성연령대_10_상주인구_수', '여성연령대_20_상주인구_수', '여성연령대_30_상주인구_수', '여성연령대_40_상주인구_수', '여성연령대_50_상주인구_수', '여성연령대_60_이상_상주인구_수',
          '총_직장_인구_수']
cols_5 = ['관공서_수', '은행_수', '종합병원_수', '일반_병원_수', '약국_수', '유치원_수', '초등학교_수', '중학교_수', '고등학교_수',
          '대학교_수', '백화점_수', '슈퍼마켓_수', '극장_수', '숙박_시설_수', '공항_수', '철도_역_수', '버스_터미널_수', '지하철_역_수', '버스_정거장_수']
cols_6 = ['서비스_업종_코드_명', '점포_수', '유사_업종_점포_수', '개업_율', '개업_점포_수', '폐업_률', '폐업_점포_수', '프랜차이즈_점포_수']
cols_7 = ['당월_매출_금액', '주중_매출_금액', '주말_매출_금액', 
          '시간대_06~11_매출_금액', '시간대_11~14_매출_금액', '시간대_14~17_매출_금액', '시간대_17~21_매출_금액', '시간대_21~06_매출_금액', 
          '남성_매출_금액', '여성_매출_금액', 
          '연령대_10_매출_금액', '연령대_20_매출_금액', '연령대_30_매출_금액', '연령대_40_매출_금액', '연령대_50_매출_금액', '연령대_60_이상_매출_금액', 
          '당월_매출_건수', '주중_매출_건수', '주말_매출_건수', 
          '시간대_06~11_매출_건수', '시간대_11~14_매출_건수', '시간대_14~17_매출_건수', '시간대_17~21_매출_건수', '시간대_21~06_매출_건수', 
          '남성_매출_건수', '여성_매출_건수', 
          '연령대_10_매출_건수', '연령대_20_매출_건수', '연령대_30_매출_건수', '연령대_40_매출_건수', '연령대_50_매출_건수', '연령대_60_이상_매출_건수', 
          '당월_건당_매출_금액', '주중_건당_매출_금액', '주말_건당_매출_금액', 
          '시간대_06~11_건당_매출_금액', '시간대_11~14_건당_매출_금액', '시간대_14~17_건당_매출_금액', '시간대_17~21_건당_매출_금액', '시간대_21~06_건당_매출_금액', 
          '남성_건당_매출_금액', '여성_건당_매출_금액', 
          '연령대_10_건당_매출_금액', '연령대_20_건당_매출_금액', '연령대_30_건당_매출_금액', '연령대_40_건당_매출_금액', '연령대_50_건당_매출_금액', '연령대_60_이상_건당_매출_금액']


cols = [*cols_1, *cols_2, *cols_3, *cols_4, *cols_5, *cols_6, *cols_7]

df = df[cols]

In [230]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26970 entries, 0 to 26969
Columns: 114 entries, 자치구_코드_명 to 월임대료평균
dtypes: float64(83), int64(25), object(6)
memory usage: 23.5+ MB


In [231]:
df.head()

Unnamed: 0,자치구_코드_명,행정동_코드_명,상권_구분_코드_명,상권_코드_명,영역_면적,엑스좌표_값,와이좌표_값,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,연령대_30_유동인구_수,연령대_40_유동인구_수,연령대_50_유동인구_수,연령대_60_이상_유동인구_수,시간대_06_11_유동인구_수,시간대_11_14_유동인구_수,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_06_유동인구_수,주중_유동인구_수,주말_유동인구_수,남성연령대_10_상주인구_수,남성연령대_20_상주인구_수,남성연령대_30_상주인구_수,남성연령대_40_상주인구_수,남성연령대_50_상주인구_수,남성연령대_60_이상_상주인구_수,여성연령대_10_상주인구_수,여성연령대_20_상주인구_수,여성연령대_30_상주인구_수,여성연령대_40_상주인구_수,여성연령대_50_상주인구_수,여성연령대_60_이상_상주인구_수,총_직장_인구_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,철도_역_수,버스_터미널_수,지하철_역_수,버스_정거장_수,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수,당월_매출_금액,주중_매출_금액,주말_매출_금액,시간대_06~11_매출_금액,시간대_11~14_매출_금액,시간대_14~17_매출_금액,시간대_17~21_매출_금액,시간대_21~06_매출_금액,남성_매출_금액,여성_매출_금액,연령대_10_매출_금액,연령대_20_매출_금액,연령대_30_매출_금액,연령대_40_매출_금액,연령대_50_매출_금액,연령대_60_이상_매출_금액,당월_매출_건수,주중_매출_건수,주말_매출_건수,시간대_06~11_매출_건수,시간대_11~14_매출_건수,시간대_14~17_매출_건수,시간대_17~21_매출_건수,시간대_21~06_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,연령대_30_매출_건수,연령대_40_매출_건수,연령대_50_매출_건수,연령대_60_이상_매출_건수,당월_건당_매출_금액,주중_건당_매출_금액,주말_건당_매출_금액,시간대_06~11_건당_매출_금액,시간대_11~14_건당_매출_금액,시간대_14~17_건당_매출_금액,시간대_17~21_건당_매출_금액,시간대_21~06_건당_매출_금액,남성_건당_매출_금액,여성_건당_매출_금액,연령대_10_건당_매출_금액,연령대_20_건당_매출_금액,연령대_30_건당_매출_금액,연령대_40_건당_매출_금액,연령대_50_건당_매출_금액,연령대_60_이상_건당_매출_금액,임대시세층구분명,보증금평균,월임대료평균
0,강북구,우이동,골목상권,4.19민주묘지역 2번,60794,201112,461090,868030.0,361490.25,506539.25,128381.0,163223.25,107811.5,113781.75,121303.5,233529.5,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,73,57,62,93,95,216,62,64,72,97,95,261,96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,한식음식점,23,24,8,2,4,1,1,1739897842.0,1033559973.0,706337869.0,34758423.75,481068145.75,310010178.0,724466602.75,189594491.75,989044375.25,651314594.25,5154289.25,116807695.25,212122359.0,294670997.5,521347174.25,490256455.0,51956.5,33236.75,18719.75,1961.25,17455.5,10006.5,18257.75,4275.5,29305.25,21307.25,335.5,5561.5,6832.0,9084.25,14829.25,13972.0,33572.889,31134.794,37937.946,17773.801,27381.008,31010.383,39874.189,44545.446,33800.232,30709.071,15353.105,21074.157,31275.325,32766.235,35019.042,35398.899,1층외,814856,35822
1,강북구,우이동,골목상권,4.19민주묘지역 2번,60794,201112,461090,868030.0,361490.25,506539.25,128381.0,163223.25,107811.5,113781.75,121303.5,233529.5,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,73,57,62,93,95,216,62,64,72,97,95,261,96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,한식음식점,23,24,8,2,4,1,1,1739897842.0,1033559973.0,706337869.0,34758423.75,481068145.75,310010178.0,724466602.75,189594491.75,989044375.25,651314594.25,5154289.25,116807695.25,212122359.0,294670997.5,521347174.25,490256455.0,51956.5,33236.75,18719.75,1961.25,17455.5,10006.5,18257.75,4275.5,29305.25,21307.25,335.5,5561.5,6832.0,9084.25,14829.25,13972.0,33572.889,31134.794,37937.946,17773.801,27381.008,31010.383,39874.189,44545.446,33800.232,30709.071,15353.105,21074.157,31275.325,32766.235,35019.042,35398.899,1층,1600781,74305
2,강북구,우이동,골목상권,4.19민주묘지역 2번,60794,201112,461090,868030.0,361490.25,506539.25,128381.0,163223.25,107811.5,113781.75,121303.5,233529.5,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,73,57,62,93,95,216,62,64,72,97,95,261,96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,일식음식점,3,3,0,0,0,0,0,18435727.0,15543098.25,2892628.75,0.0,9658447.5,3074001.75,5623422.5,79855.25,7005386.75,11430340.25,561557.25,6405609.5,2608330.25,3453367.25,3140322.75,2266540.0,1519.25,1359.25,160.0,0.0,821.5,247.0,441.75,9.0,463.25,1056.0,55.25,601.25,198.5,296.75,264.0,103.5,12216.994,11513.839,18130.938,0.0,11779.065,12683.103,12867.929,8648.625,15253.336,10867.332,10839.657,10668.088,13203.072,11725.296,12048.076,21269.099,1층외,814856,35822
3,강북구,우이동,골목상권,4.19민주묘지역 2번,60794,201112,461090,868030.0,361490.25,506539.25,128381.0,163223.25,107811.5,113781.75,121303.5,233529.5,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,73,57,62,93,95,216,62,64,72,97,95,261,96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,일식음식점,3,3,0,0,0,0,0,18435727.0,15543098.25,2892628.75,0.0,9658447.5,3074001.75,5623422.5,79855.25,7005386.75,11430340.25,561557.25,6405609.5,2608330.25,3453367.25,3140322.75,2266540.0,1519.25,1359.25,160.0,0.0,821.5,247.0,441.75,9.0,463.25,1056.0,55.25,601.25,198.5,296.75,264.0,103.5,12216.994,11513.839,18130.938,0.0,11779.065,12683.103,12867.929,8648.625,15253.336,10867.332,10839.657,10668.088,13203.072,11725.296,12048.076,21269.099,1층,1600781,74305
4,강북구,우이동,골목상권,4.19민주묘지역 2번,60794,201112,461090,868030.0,361490.25,506539.25,128381.0,163223.25,107811.5,113781.75,121303.5,233529.5,169905.0,105148.0,103902.5,145960.75,343114.0,619012.25,249017.5,73,57,62,93,95,216,62,64,72,97,95,261,96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,패스트푸드점,0,3,0,0,0,0,3,86378566.5,59122514.5,27256052.0,3169532.25,24443206.75,20407827.5,33587803.75,4770196.25,41442797.25,42919979.0,1285185.75,17418838.0,19863153.25,21952675.5,14165818.5,9677105.5,9085.25,6316.5,2768.75,327.0,2693.5,2068.0,3472.75,524.0,4168.75,4881.0,166.25,2228.0,2101.0,2142.0,1393.5,1017.5,9524.851,9389.652,9837.983,9698.443,9090.27,9903.015,9676.0,9051.212,9972.561,8805.697,7651.68,7839.99,9505.546,10256.964,10182.59,9459.499,1층외,814856,35822
