# 데이터 수집 및 전처리

### [활용 데이터]

- 1.[상권영역] 서울시 상권분석서비스(영역-상권) 2023.10월 기준
https://data.seoul.go.kr/dataList/OA-15560/S/1/datasetView.do

- 2.[개폐업 수] 서울시 상권분석서비스(점포-상권),
[프랜차이즈 수] 서울시 상권분석서비스(점포-상권) 2019~2022 기준
https://data.seoul.go.kr/dataList/OA-15577/S/1/datasetView.do

- 3.[매출금액] 서울시 상권분석서비스(추정매출-상권) 2019~2022 기준
https://data.seoul.go.kr/dataList/OA-15572/S/1/datasetView.do

- 4.[길단위인구 수] 서울시 상권분석서비스(길단위인구-상권) 2019~2022 기준
https://data.seoul.go.kr/dataList/OA-15568/S/1/datasetView.do

In [1]:
# 필요한 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# 한글 폰트 깨짐 현상 해결(jupyter notebook)
plt.rcParams['font.family'] = 'Malgun Gothic'
# 마이너스 부호 출력 설정
plt.rcParams['axes.unicode_minus']=False

# 한글 폰트 깨짐 현상 해결(Colab)

## 1.상권영역 데이터 전처리
- 서울시 상권영역: 1650개 (2023년 기준)


In [3]:
# 파일 읽어오기, 한글csv file이므로 encoding cp949
commer_district = pd.read_csv('./data/commer_district.csv', encoding='cp949')
commer_district

Unnamed: 0,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,자치구_코드,자치구_코드_명,행정동_코드,행정동_코드_명,영역_면적
0,A,골목상권,3110008,배화여자대학교(박노수미술관),197093,453418,11110,종로구,11110515,청운효자동,149264
1,A,골목상권,3110009,자하문터널,196991,455057,11110,종로구,11110550,부암동,178306
2,A,골목상권,3110010,평창동서측,197064,456643,11110,종로구,11110560,평창동,369415
3,A,골목상권,3110017,정독도서관,198581,453781,11110,종로구,11110600,가회동,83855
4,A,골목상권,3110018,중앙고등학교,198883,453690,11110,종로구,11110600,가회동,166872
...,...,...,...,...,...,...,...,...,...,...,...
1645,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,198397,451614,11140,중구,11140520,소공동,983618
1646,U,관광특구,3001493,동대문패션타운 관광특구,200996,451976,11140,중구,11140590,광희동,606058
1647,U,관광특구,3001494,종로?청계 관광특구,199796,452274,11110,종로구,11110615,종로1?2?3?4가동,653127
1648,U,관광특구,3001495,잠실 관광특구,210188,446334,11710,송파구,11710562,방이2동,2462734


In [4]:
# 상권_코드, 상권_코드_명 칼럼만 남기고 전부 삭제
commer_district.drop(commer_district.columns[4:11], axis=1, inplace=True)
commer_district.drop(commer_district.columns[0:2], axis=1, inplace=True)

# 합친 파일 저장
commer_district.to_csv('../gentrification/data_finish/commer_district_finish.csv', index=False)
commer_district

Unnamed: 0,상권_코드,상권_코드_명
0,3110008,배화여자대학교(박노수미술관)
1,3110009,자하문터널
2,3110010,평창동서측
3,3110017,정독도서관
4,3110018,중앙고등학교
...,...,...
1645,3001492,명동 남대문 북창동 다동 무교동 관광특구
1646,3001493,동대문패션타운 관광특구
1647,3001494,종로?청계 관광특구
1648,3001495,잠실 관광특구


## 2.개폐업수 데이터 전처리
- 2019~2022 프랜차이즈수, 개폐업수 파일 합치기


In [5]:
# 프랜차이즈수, 개폐업수 파일 전부 읽어오기, 한글csv file이므로 encoding cp949
fran_openclose_2019 = pd.read_csv('./data/fran_openclose_2019.csv', encoding='cp949')
fran_openclose_2020 = pd.read_csv('./data/fran_openclose_2020.csv', encoding='cp949')
fran_openclose_2021 = pd.read_csv('./data/fran_openclose_2021.csv', encoding='cp949')
fran_openclose_2022 = pd.read_csv('./data/fran_openclose_2022.csv', encoding='cp949')
fran_openclose = pd.concat([fran_openclose_2019,fran_openclose_2020,fran_openclose_2021,fran_openclose_2022])

# 합친 파일 저장
fran_openclose.to_csv('../gentrification/data_finish/fran_openclose_finish.csv', index=False)
fran_openclose

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,점포_수,유사_업종_점포_수,개업_율,개업_점포_수,폐업_률,폐업_점포_수,프랜차이즈_점포_수
0,20191,R,전통시장,3130018,동대문문구완구거리(동대문문구완구시장),CS300043,전자상거래업,10,10,10,1,0,0,0
1,20191,D,발달상권,3120165,서래마을카페거리(서래마을),CS100007,치킨전문점,0,1,0,0,100,1,1
2,20191,D,발달상권,3120177,신논현역 1번,CS300015,가방,3,3,0,0,0,0,0
3,20191,A,골목상권,3110508,서연중학교,CS200041,사진관,6,6,17,1,0,0,0
4,20191,A,골목상권,3110241,먹골역 5번,CS300036,조명용품,2,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304750,20224,A,골목상권,3110631,방화초등학교,CS200038,독서실,0,0,0,0,0,1,0
304751,20224,A,골목상권,3110954,국기원,CS200010,변호사사무소,0,0,0,0,0,1,0
304752,20224,A,골목상권,3110687,"천왕역 3번(오류2동주민센터, 서울오류남초등학교)",CS100008,분식전문점,0,0,0,0,0,1,0
304753,20224,R,전통시장,3130054,황학동주방가구거리상점가,CS200034,여관,0,0,0,0,0,1,0


## 3. 매출액 데이터 전처리
- 2019~2022 매출액 파일 합치기


In [6]:
# 매출액 파일 전부 읽어오기, 한글csv file이므로 encoding cp949
money_2019 = pd.read_csv('./data/money_2019.csv', encoding='cp949')
money_2020 = pd.read_csv('./data/money_2020.csv', encoding='cp949')
money_2021 = pd.read_csv('./data/money_2021.csv', encoding='cp949')
money_2022 = pd.read_csv('./data/money_2022.csv', encoding='cp949')
money = pd.concat([money_2019,money_2020,money_2021,money_2022])

# 합친 파일 저장
money.to_csv('../gentrification/data_finish/money_finish.csv', index=False)
money

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,서비스_업종_코드,서비스_업종_코드_명,당월_매출_금액,당월_매출_건수,주중_매출_금액,...,시간대_건수~21_매출_건수,시간대_건수~24_매출_건수,남성_매출_건수,여성_매출_건수,연령대_10_매출_건수,연령대_20_매출_건수,연령대_30_매출_건수,연령대_40_매출_건수,연령대_50_매출_건수,연령대_60_이상_매출_건수
0,20191,D,발달상권,3120037,동대문역사문화공원역,CS200029,네일숍,20393290,208,17051978,...,86,27,0,208,0,56,21,0,123,8
1,20191,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,CS200029,네일숍,73147711,2136,57025458,...,1270,35,59,2077,0,715,995,236,157,33
2,20191,A,골목상권,3110085,경리단길남측,CS200029,네일숍,3000000,40,2337081,...,14,9,0,40,0,8,28,5,0,0
3,20191,A,골목상권,3110166,구의1동주민센터,CS200029,네일숍,8443798,153,4984275,...,64,13,13,140,0,52,88,13,0,0
4,20191,A,골목상권,3110161,중곡역 3번,CS200029,네일숍,20756990,305,16694436,...,196,36,4,301,0,79,28,68,91,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83568,20224,R,전통시장,3130051,동화동 골목형상점가,CS100010,커피-음료,74566607,11618,46002960,...,811,56,4239,7016,41,1539,2107,2831,2696,2037
83569,20224,A,골목상권,3110121,한양대역 4번,CS100010,커피-음료,496341232,74976,343713652,...,22056,9747,37372,37498,6632,45799,6409,5767,9168,1094
83570,20224,A,골목상권,3110163,성자초등학교,CS100010,커피-음료,60999234,10668,39994099,...,2205,349,5054,5440,212,2459,2483,2900,1459,980
83571,20224,A,골목상권,3110110,상왕십리역 6번,CS100010,커피-음료,35826724,4681,17309483,...,361,0,1811,2870,0,1966,1784,672,155,104


In [7]:
# 모든 칼럼이 보이지 않아서 칼럼을 바꿈
money.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83563,83564,83565,83566,83567,83568,83569,83570,83571,83572
기준_년분기_코드,20191,20191,20191,20191,20191,20191,20191,20191,20191,20191,...,20224,20224,20224,20224,20224,20224,20224,20224,20224,20224
상권_구분_코드,D,U,A,A,A,D,R,A,D,A,...,D,R,A,D,A,R,A,A,A,D
상권_구분_코드_명,발달상권,관광특구,골목상권,골목상권,골목상권,발달상권,전통시장,골목상권,발달상권,골목상권,...,발달상권,전통시장,골목상권,발달상권,골목상권,전통시장,골목상권,골목상권,골목상권,발달상권
상권_코드,3120037,3001492,3110085,3110166,3110161,3120072,3130147,3110438,3120158,3110850,...,3120024,3130182,3110591,3120056,3110043,3130051,3110121,3110163,3110110,3120065
상권_코드_명,동대문역사문화공원역,명동 남대문 북창동 다동 무교동 관광특구,경리단길남측,구의1동주민센터,중곡역 3번,성신여대,쌍문역동측상점가,경춘선숲길 우측,녹두거리(대학동),사당역 10번,...,회현역,영천시장,충정로역 6번,구의역,충무초등학교,동화동 골목형상점가,한양대역 4번,성자초등학교,상왕십리역 6번,회기역
서비스_업종_코드,CS200029,CS200029,CS200029,CS200029,CS200029,CS200029,CS200029,CS200029,CS200029,CS200029,...,CS100010,CS200001,CS100010,CS100010,CS100010,CS100010,CS100010,CS100010,CS100010,CS100010
서비스_업종_코드_명,네일숍,네일숍,네일숍,네일숍,네일숍,네일숍,네일숍,네일숍,네일숍,네일숍,...,커피-음료,일반교습학원,커피-음료,커피-음료,커피-음료,커피-음료,커피-음료,커피-음료,커피-음료,커피-음료
당월_매출_금액,20393290,73147711,3000000,8443798,20756990,122719691,11384929,3542727,27950110,25952543,...,663360129,3287672,23208485,717270334,126493832,74566607,496341232,60999234,35826724,340596205
당월_매출_건수,208,2136,40,153,305,1931,173,55,380,548,...,45590,16,5104,111612,22085,11618,74976,10668,4681,67496
주중_매출_금액,17051978,57025458,2337081,4984275,16694436,89162313,9896703,2448866,14764854,18495846,...,559002499,1643836,18042474,515200032,108671002,46002960,343713652,39994099,17309483,206793777


## 4. 유동인구수 데이터 전처리
- 2019~2022 유동인구수 데이터만 남기고 2023년도꺼는 삭제한 데이터셋 만들기


In [8]:
# 유동인구수 파일 읽어오기, 한글csv file이므로 encoding cp949
# 그런데 유동인구수 파일에는 2023년 3분기 데이터셋이 있음을 확인함.
moving_people = pd.read_csv('./data/moving_people.csv', encoding='cp949')
moving_people

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,...,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
0,20233,U,관광특구,3001496,강남 마이스 관광특구,116820,56546,60274,9741,24430,...,28982,24541,6292,16713,17686,18378,18560,18293,14662,12527
1,20233,U,관광특구,3001495,잠실 관광특구,4005509,1937145,2068365,413439,892838,...,633309,864839,465448,566137,566857,573760,570159,579731,595722,553142
2,20233,U,관광특구,3001494,종로?청계 관광특구,8353018,4439304,3913714,321655,1531086,...,1773540,1659889,647989,1344871,1344464,1353279,1316337,1316317,962386,715364
3,20233,U,관광특구,3001493,동대문패션타운 관광특구,3189182,1510236,1678946,173200,566457,...,480423,562444,384970,505715,509594,510428,496596,464684,354613,347552
4,20233,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,6822274,3317776,3504498,284079,1188425,...,1665421,1296312,342464,1117683,1129837,1153089,1116744,1090865,678392,535663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31341,20191,A,골목상권,3110005,세검정,129171,55368,73803,19624,15305,...,13667,21367,20706,18686,18746,18727,18736,18576,17782,17918
31342,20191,A,골목상권,3110004,대신고등학교,495628,226246,269381,94335,57494,...,70258,94736,65414,69892,70158,71562,71577,70647,70556,71236
31343,20191,A,골목상권,3110003,세검정초등학교,706686,310470,396215,128431,82037,...,74478,96768,85012,100783,100429,100067,99815,98912,102760,103921
31344,20191,A,골목상권,3110002,독립문역 1번,540585,246868,293717,96885,65609,...,75590,96631,70361,76301,77283,79044,78826,78583,75041,75508


In [9]:
# 유동인구수 데이터셋은 2023년도의 데이터도 있으므로 다른 데이터셋과 맞추기 위해 2019~2022만 남겨놓고 2023을 전부 삭제해야함
# 기준_년분기_코드 중에 '20233' 행 존재 확인하기

moving_people[moving_people['기준_년분기_코드'] == 20233]

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,...,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
0,20233,U,관광특구,3001496,강남 마이스 관광특구,116820,56546,60274,9741,24430,...,28982,24541,6292,16713,17686,18378,18560,18293,14662,12527
1,20233,U,관광특구,3001495,잠실 관광특구,4005509,1937145,2068365,413439,892838,...,633309,864839,465448,566137,566857,573760,570159,579731,595722,553142
2,20233,U,관광특구,3001494,종로?청계 관광특구,8353018,4439304,3913714,321655,1531086,...,1773540,1659889,647989,1344871,1344464,1353279,1316337,1316317,962386,715364
3,20233,U,관광특구,3001493,동대문패션타운 관광특구,3189182,1510236,1678946,173200,566457,...,480423,562444,384970,505715,509594,510428,496596,464684,354613,347552
4,20233,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,6822274,3317776,3504498,284079,1188425,...,1665421,1296312,342464,1117683,1129837,1153089,1116744,1090865,678392,535663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,20233,A,골목상권,3110005,세검정,110053,49414,60639,22217,9713,...,12468,17785,15014,15942,15859,15811,15695,15593,15524,15628
1645,20233,A,골목상권,3110004,대신고등학교,642932,298259,344674,123865,75286,...,91388,125700,80105,92747,93200,92953,91929,91969,90272,89864
1646,20233,A,골목상권,3110003,세검정초등학교,552786,250919,301867,123081,60841,...,47678,67875,71892,79800,77901,77674,77580,77442,79930,82458
1647,20233,A,골목상권,3110002,독립문역 1번,547929,244018,303911,102436,62836,...,75714,97709,64942,79498,80036,79966,79213,78982,76009,74227


In [10]:
# 전체 유동인구 수 데이터셋에서 20231, 20232, 20233 행 삭제
moving_people.drop(moving_people[moving_people['기준_년분기_코드'] == 20231].index, axis=0, inplace=True)
moving_people.drop(moving_people[moving_people['기준_년분기_코드'] == 20232].index, axis=0, inplace=True)
moving_people.drop(moving_people[moving_people['기준_년분기_코드'] == 20233].index, axis=0, inplace=True)

# 합친 파일 저장
moving_people.to_csv('../gentrification/data_finish/moving_people_finish.csv', index=False)
moving_people

Unnamed: 0,기준_년분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,총_유동인구_수,남성_유동인구_수,여성_유동인구_수,연령대_10_유동인구_수,연령대_20_유동인구_수,...,시간대_14_17_유동인구_수,시간대_17_21_유동인구_수,시간대_21_24_유동인구_수,월요일_유동인구_수,화요일_유동인구_수,수요일_유동인구_수,목요일_유동인구_수,금요일_유동인구_수,토요일_유동인구_수,일요일_유동인구_수
4947,20224,U,관광특구,3001496,강남 마이스 관광특구,113561,56267,57295,8423,23526,...,27583,24323,6411,15349,17220,18047,18597,18288,13967,12091
4948,20224,U,관광특구,3001495,잠실 관광특구,4179934,2015767,2164168,490222,895245,...,692222,918306,473577,571728,588228,595830,598401,615406,633869,576473
4949,20224,U,관광특구,3001494,종로?청계 관광특구,8657826,4693188,3964638,334742,1500979,...,1861965,1729022,685015,1272457,1381468,1386192,1396528,1396973,1047401,776808
4950,20224,U,관광특구,3001493,동대문패션타운 관광특구,3418870,1617046,1801825,182494,618819,...,512176,587790,412248,520582,540938,546081,547095,512377,377450,374347
4951,20224,U,관광특구,3001492,명동 남대문 북창동 다동 무교동 관광특구,7629595,3717728,3911867,365427,1301955,...,1851308,1478804,408233,1103048,1241705,1251450,1267499,1240066,864591,661235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31341,20191,A,골목상권,3110005,세검정,129171,55368,73803,19624,15305,...,13667,21367,20706,18686,18746,18727,18736,18576,17782,17918
31342,20191,A,골목상권,3110004,대신고등학교,495628,226246,269381,94335,57494,...,70258,94736,65414,69892,70158,71562,71577,70647,70556,71236
31343,20191,A,골목상권,3110003,세검정초등학교,706686,310470,396215,128431,82037,...,74478,96768,85012,100783,100429,100067,99815,98912,102760,103921
31344,20191,A,골목상권,3110002,독립문역 1번,540585,246868,293717,96885,65609,...,75590,96631,70361,76301,77283,79044,78826,78583,75041,75508


In [11]:
moving_people.shape

(26399, 27)

In [12]:
moving_people.transpose()

Unnamed: 0,4947,4948,4949,4950,4951,4952,4953,4954,4955,4956,...,31336,31337,31338,31339,31340,31341,31342,31343,31344,31345
기준_년분기_코드,20224,20224,20224,20224,20224,20224,20224,20224,20224,20224,...,20191,20191,20191,20191,20191,20191,20191,20191,20191,20191
상권_구분_코드,U,U,U,U,U,U,R,R,R,R,...,A,A,A,A,A,A,A,A,A,A
상권_구분_코드_명,관광특구,관광특구,관광특구,관광특구,관광특구,관광특구,전통시장,전통시장,전통시장,전통시장,...,골목상권,골목상권,골목상권,골목상권,골목상권,골목상권,골목상권,골목상권,골목상권,골목상권
상권_코드,3001496,3001495,3001494,3001493,3001492,3001491,3130327,3130326,3130325,3130324,...,3110010,3110009,3110008,3110007,3110006,3110005,3110004,3110003,3110002,3110001
상권_코드_명,강남 마이스 관광특구,잠실 관광특구,종로?청계 관광특구,동대문패션타운 관광특구,명동 남대문 북창동 다동 무교동 관광특구,이태원 관광특구,"평화시장(남평화시장, 제일평화시장, 신평화패션타운)",고덕 골목형상점가,명일전통시장,길동복조리시장,...,평창동서측,자하문터널,배화여자대학교(박노수미술관),사직공원(한국사회과학도서관),부암동주민센터,세검정,대신고등학교,세검정초등학교,독립문역 1번,이북5도청사
총_유동인구_수,113561,4179934,8657826,3418870,7629595,2161642,62192,147922,296251,555117,...,243528,188048,1577188,581385,21363,129171,495628,706686,540585,155680
남성_유동인구_수,56267,2015767,4693188,1617046,3717728,1085212,29346,68449,130055,251495,...,100741,87427,672613,257684,9858,55368,226246,310470,246868,68432
여성_유동인구_수,57295,2164168,3964638,1801825,3911867,1076430,32848,79473,166196,303620,...,142787,100622,904573,323701,11505,73803,269381,396215,293717,87248
연령대_10_유동인구_수,8423,490222,334742,182494,365427,141845,2326,29457,53697,86062,...,36996,26330,290718,121015,3160,19624,94335,128431,96885,27743
연령대_20_유동인구_수,23526,895245,1500979,618819,1301955,531363,9614,13415,30849,61442,...,24990,24437,206254,63921,2691,15305,57494,82037,65609,15216
