# 03-1. Sentiment Analysis Crawling
> 네이버 블로그 API를 활용한 행정동 키워드 크롤링

## 환경설정

라이브러리

In [None]:
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd

In [None]:
# Naver Open API credentials, sent as request headers by the crawl cells.
# They are read from the environment so real keys need not be saved in the
# notebook; when the variables are unset both fall back to an empty string.
client_id = os.environ.get('NAVER_CLIENT_ID', '')
client_secret = os.environ.get('NAVER_CLIENT_SECRET', '')

<br>
<br>

## 1. 상권 행정동 코드 매핑

크롤링 키워드 추출을 위해 상권 코드 명에 대응하는 행정동 코드를 매핑하여 행정동 이름(문자열)으로 변환한다. [행정안전부 주민등록 행정구역 코드](https://www.mois.go.kr/frt/bbs/type001/commonSelectBoardArticle.do?bbsId=BBSMSTR_000000000052&nttId=78739)

In [None]:
# Administrative-dong code table (Excel); read below through its
# '행정동코드' and '읍면동명' columns.
mapping = pd.read_excel(
    "/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/행정동코드.xlsx"
)

# Commercial-area boundary file; decoded as cp949.
area_data = pd.read_csv(
    '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/서울시 우리마을가게 상권분석서비스(상권영역).csv',
    encoding="cp949",
)

# Clustered commercial-area features (includes the 'cluster' label column).
data = pd.read_csv(
    '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/cluster_data.csv'
)

In [None]:
# Keep only the area name and its administrative-dong code.
area_data = area_data.loc[:, ['상권_코드_명', '행정동_코드']]

In [None]:
# Inner join: with no `on=` given, pandas joins on the columns the two
# frames have in common, and rows without a match are dropped.
data = data.merge(area_data, how='inner')
data.head(5)

Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드
0,가로공원로58길,0.093507,0.002016,0.012739,0.007389,0.0,0.0,0.235266,1,11470580
1,가로공원로76가길,0.136968,0.00378,0.161359,0.019704,0.0,0.0,0.312747,1,11500540
2,가로공원로80길,0.11764,0.006117,0.180467,0.007389,0.0,0.0,0.281615,1,11500540
3,가마산로61길,0.148102,0.001008,0.016985,0.009852,0.022727,0.0,0.161983,2,11560650
4,가산디지털단지역_2,0.023777,0.577087,0.006369,0.152709,0.431818,0.0,0.057139,0,11545510


In [None]:
# Append "00" to every code and convert back to int — presumably to match the
# longer codes used in `mapping` (TODO confirm).
# NOTE: this is not idempotent; running the cell again appends another "00".
data['행정동_코드'] = [int(f"{code}00") for code in data['행정동_코드']]

In [None]:
# Resolve each row's administrative-dong code to its dong name.
#
# The lookup table is built once rather than filtering `mapping` for every
# row. `setdefault` keeps the first name listed for a code, so a code that
# appears more than once in `mapping` still resolves to a single name.
dong_by_code = {}
for code, name in zip(mapping['행정동코드'], mapping['읍면동명']):
    dong_by_code.setdefault(code, name)

# Codes with no entry get the 'none' sentinel that the later cells filter on.
# The results are plain values, one per row of `data`, in row order.
dongs = [dong_by_code.get(code, 'none') for code in data['행정동_코드']]

In [None]:
# Attach the resolved dong names as the '행정동' column (`dongs` holds one
# entry per row of `data`, in row order).
data['행정동'] = dongs

In [None]:
# Preview the first five rows to check the new '행정동' column.
data.head(5)

Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동
0,가로공원로58길,0.093507,0.002016,0.012739,0.007389,0.0,0.0,0.235266,1,1147058000,신월3동
1,가로공원로76가길,0.136968,0.00378,0.161359,0.019704,0.0,0.0,0.312747,1,1150054000,화곡제1동
2,가로공원로80길,0.11764,0.006117,0.180467,0.007389,0.0,0.0,0.281615,1,1150054000,화곡제1동
3,가마산로61길,0.148102,0.001008,0.016985,0.009852,0.022727,0.0,0.161983,2,1156065000,신길제3동
4,가산디지털단지역_2,0.023777,0.577087,0.006369,0.152709,0.431818,0.0,0.057139,0,1154551000,가산동


행정동 이름으로 변환되지 않은 행정동 코드는 하나씩 직접 처리해준다.

In [None]:
# List the distinct codes that still carry the 'none' sentinel.
data.loc[data['행정동'] == 'none', '행정동_코드'].unique()

array([1130563000, 1130562000, 1130560000, 1130559000, 1130561000,
       1130560600])

In [None]:
# Assign dong names by hand to the rows the mapping table could not resolve.
#
# The writes go through a single `.loc[mask, column]` call each. Chained
# assignment (`data[col][i] = value`) may write to a temporary copy and has
# no effect when pandas Copy-on-Write is enabled.
unmapped = data['행정동'] == 'none'
suyu_codes = [1130563000, 1130562000, 1130561000]
in_suyu = data['행정동_코드'].isin(suyu_codes)

data.loc[unmapped & in_suyu, '행정동'] = '수유동'
# Every other unmapped code falls back to '번동'.
data.loc[unmapped & ~in_suyu, '행정동'] = '번동'

In [None]:
# Re-check: this should be empty once every 'none' row has been filled in.
data.loc[data['행정동'] == 'none', '행정동_코드'].unique()

array([], dtype=int64)

In [None]:
# Save the full table with dong names.
# NOTE(review): the row index is written as well (no index=False), so it
# comes back as an extra unnamed column when the file is read again.
data.to_csv(
    '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data.csv'
)

In [None]:
# Write one CSV per cluster label (0-3), each holding only that cluster's rows.
cluster_paths = {
    0: '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area0.csv',
    1: '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area1.csv',
    2: '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area2.csv',
    3: '/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area3.csv',
}
for cluster_id, cluster_path in cluster_paths.items():
    data[data['cluster'] == cluster_id].to_csv(cluster_path)

<br>
<br>

## 2. area0 상권 키워드 크롤링

In [None]:
# Load the cluster-0 rows from the working-directory-relative path.
# NOTE(review): the export cell writes under /content/gdrive/..., and the
# recorded output of this cell shows no '행정동' column — confirm this file is
# the exported one before the next cell reads area0['행정동'].
area0 = pd.read_csv(
    'data/dong_data_area0.csv'
)
area0.head(5)

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,구로디지탈1단지,구로디지털단지,16.3,0.135935,1.0,0.019108,0.204433,0.363636,0.0,0.0662,0
1,서울 중구 충무로역_2,충무로역,27.8,0.196221,0.374003,0.002123,0.157635,0.113636,0.0,0.092767,0
2,영등포전통시장,영등포전통시장,34.4,0.253975,0.048658,0.006369,0.32266,0.454545,0.0,0.196497,0
3,종암로19길,종암동,36.0,0.158126,0.418446,0.048832,0.061576,0.227273,0.0,0.162453,0
4,압구정 로데오거리_1,압구정로데오거리,40.3,0.168727,0.33357,0.029724,0.29064,0.113636,0.0,0.058965,0


In [None]:
# Distinct dong names to use as search keywords for cluster 0.
# unique() keeps first-appearance order, so the crawl order is the same on
# every run (set() ordering is not). dropna() keeps missing names out of the
# list, since the crawl cell passes each entry to urllib.parse.quote, which
# needs a string.
area_type = area0['행정동'].dropna().unique().tolist()
area_type

In [None]:
# Collect blog-post titles from the Naver blog search API for every keyword
# in `area_type`, then save them as (name, title) rows.
contents = []

# `start` is the 1-based position of the first result, not a page number, so
# it has to advance by `display`; otherwise consecutive requests overlap.
# 100 per request / last start 1000 follow Naver's documented limits
# (TODO confirm against the current API reference).
display = 100
max_start = 1000

for area in area_type:
    encText = urllib.parse.quote(area)
    print(area)
    for start in range(1, max_start + 1, display):
        url = f"https://openapi.naver.com/v1/search/blog?display={display}&start={start}&query=" + encText  # JSON result
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)

        try:
            # The context manager closes the connection after each request.
            with urllib.request.urlopen(request) as response:
                rescode = response.getcode()
                response_body = response.read().decode('utf-8')
        except urllib.error.URLError as err:
            # urlopen raises for HTTP error statuses (HTTPError is a URLError).
            # Skip the rest of this keyword instead of losing everything
            # collected so far.
            print(f"Request failed for {area} (start={start}): {err}")
            break

        if rescode != 200:
            print(f"Error Code:{rescode}")
            break

        items = json.loads(response_body).get('items', [])
        contents.extend([area, item['title']] for item in items)

        # A short page means there are no further results for this keyword.
        if len(items) < display:
            break

df = pd.DataFrame(contents, columns=['name', 'title'])
df.to_csv('data/area0_crawl.csv', index=False)

<br>
<br>

## 3. area1 상권 키워드 크롤링

In [None]:
# Load the cluster-1 rows from the working-directory-relative path.
# NOTE(review): the export cell writes under /content/gdrive/...; confirm
# this relative path points at the same file.
area1 = pd.read_csv(
    'data/dong_data_area1.csv'
)
area1.head(5)

Unnamed: 0.1,Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동
0,0,가로공원로58길,0.093507,0.002016,0.012739,0.007389,0.0,0.0,0.235266,1,1147058000,신월3동
1,1,가로공원로76가길,0.136968,0.00378,0.161359,0.019704,0.0,0.0,0.312747,1,1150054000,화곡제1동
2,2,가로공원로80길,0.11764,0.006117,0.180467,0.007389,0.0,0.0,0.281615,1,1150054000,화곡제1동
3,7,가산로5길,0.132014,0.011363,0.063694,0.029557,0.181818,0.0,0.272768,1,1154551000,가산동
4,8,가재울로6길,0.150733,0.00378,0.091295,0.036946,0.045455,0.0,0.241588,1,1141070000,남가좌제2동


In [None]:
# Distinct dong names to use as search keywords for cluster 1.
# unique() keeps first-appearance order, so the crawl order is the same on
# every run (set() ordering is not). dropna() keeps missing names out of the
# list, since the crawl cell passes each entry to urllib.parse.quote, which
# needs a string.
area_type = area1['행정동'].dropna().unique().tolist()
area_type

In [None]:
# Collect blog-post titles from the Naver blog search API for every keyword
# in `area_type`, then save them as (name, title) rows.
contents = []

# `start` is the 1-based position of the first result, not a page number, so
# it has to advance by `display`; otherwise consecutive requests overlap.
# 100 per request / last start 1000 follow Naver's documented limits
# (TODO confirm against the current API reference).
display = 100
max_start = 1000

for area in area_type:
    encText = urllib.parse.quote(area)
    print(area)
    for start in range(1, max_start + 1, display):
        url = f"https://openapi.naver.com/v1/search/blog?display={display}&start={start}&query=" + encText  # JSON result
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)

        try:
            # The context manager closes the connection after each request.
            with urllib.request.urlopen(request) as response:
                rescode = response.getcode()
                response_body = response.read().decode('utf-8')
        except urllib.error.URLError as err:
            # urlopen raises for HTTP error statuses (HTTPError is a URLError).
            # Skip the rest of this keyword instead of losing everything
            # collected so far.
            print(f"Request failed for {area} (start={start}): {err}")
            break

        if rescode != 200:
            print(f"Error Code:{rescode}")
            break

        items = json.loads(response_body).get('items', [])
        contents.extend([area, item['title']] for item in items)

        # A short page means there are no further results for this keyword.
        if len(items) < display:
            break

df = pd.DataFrame(contents, columns=['name', 'title'])
df.to_csv('data/area1_crawl.csv', index=False)

<br>
<br>

## 4. area2 상권 키워드 크롤링

In [None]:
# Load the cluster-2 rows from the working-directory-relative path.
# NOTE(review): the export cell writes under /content/gdrive/...; confirm
# this relative path points at the same file.
area2 = pd.read_csv(
    'data/dong_data_area2.csv'
)
area2.head(5)

Unnamed: 0.1,Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동
0,3,가마산로61길,0.148102,0.001008,0.016985,0.009852,0.022727,0.0,0.161983,2,1156065000,신길제3동
1,6,가산로3길,0.041782,0.022817,0.021231,0.014778,0.113636,0.0,0.098591,2,1154561000,독산제1동
2,14,강남골목시장,0.052822,0.00378,0.008493,0.0,0.0,0.0,0.053743,2,1162072500,조원동
3,17,강남구 논현역_3,0.045345,0.056424,0.0,0.03202,0.068182,0.0,0.004558,2,1168052100,논현1동
4,19,강남구 신사역_1,0.082954,0.025314,0.025478,0.078818,0.045455,0.0,0.087372,2,1165054000,잠원동


In [None]:
# Distinct dong names to use as search keywords for cluster 2.
# unique() keeps first-appearance order, so the crawl order is the same on
# every run (set() ordering is not). dropna() keeps missing names out of the
# list, since the crawl cell passes each entry to urllib.parse.quote, which
# needs a string.
area_type = area2['행정동'].dropna().unique().tolist()
area_type

In [None]:
# Collect blog-post titles from the Naver blog search API for every keyword
# in `area_type`, then save them as (name, title) rows.
contents = []

# `start` is the 1-based position of the first result, not a page number, so
# it has to advance by `display`; otherwise consecutive requests overlap.
# 100 per request / last start 1000 follow Naver's documented limits
# (TODO confirm against the current API reference).
display = 100
max_start = 1000

for area in area_type:
    encText = urllib.parse.quote(area)
    print(area)
    for start in range(1, max_start + 1, display):
        url = f"https://openapi.naver.com/v1/search/blog?display={display}&start={start}&query=" + encText  # JSON result
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)

        try:
            # The context manager closes the connection after each request.
            with urllib.request.urlopen(request) as response:
                rescode = response.getcode()
                response_body = response.read().decode('utf-8')
        except urllib.error.URLError as err:
            # urlopen raises for HTTP error statuses (HTTPError is a URLError).
            # Skip the rest of this keyword instead of losing everything
            # collected so far.
            print(f"Request failed for {area} (start={start}): {err}")
            break

        if rescode != 200:
            print(f"Error Code:{rescode}")
            break

        items = json.loads(response_body).get('items', [])
        contents.extend([area, item['title']] for item in items)

        # A short page means there are no further results for this keyword.
        if len(items) < display:
            break

df = pd.DataFrame(contents, columns=['name', 'title'])
df.to_csv('data/area2_crawl.csv', index=False)

<br>
<br>

## 5. area3 상권 키워드 크롤링

In [None]:
# Load the cluster-3 rows from the working-directory-relative path.
# NOTE(review): the export cell writes under /content/gdrive/..., and the
# recorded output of this cell shows no '행정동' column — confirm this file is
# the exported one before the next cell reads area3['행정동'].
area3 = pd.read_csv(
    'data/dong_data_area3.csv'
)
area3.head(5)

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Distinct dong names to use as search keywords for cluster 3.
# unique() keeps first-appearance order, so the crawl order is the same on
# every run (set() ordering is not). dropna() keeps missing names out of the
# list, since the crawl cell passes each entry to urllib.parse.quote, which
# needs a string.
area_type = area3['행정동'].dropna().unique().tolist()
area_type

In [None]:
# Collect blog-post titles from the Naver blog search API for every keyword
# in `area_type`, then save them as (name, title) rows.
contents = []

# `start` is the 1-based position of the first result, not a page number, so
# it has to advance by `display`; otherwise consecutive requests overlap.
# 100 per request / last start 1000 follow Naver's documented limits
# (TODO confirm against the current API reference).
display = 100
max_start = 1000

for area in area_type:
    encText = urllib.parse.quote(area)
    print(area)
    for start in range(1, max_start + 1, display):
        url = f"https://openapi.naver.com/v1/search/blog?display={display}&start={start}&query=" + encText  # JSON result
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)

        try:
            # The context manager closes the connection after each request.
            with urllib.request.urlopen(request) as response:
                rescode = response.getcode()
                response_body = response.read().decode('utf-8')
        except urllib.error.URLError as err:
            # urlopen raises for HTTP error statuses (HTTPError is a URLError).
            # Skip the rest of this keyword instead of losing everything
            # collected so far.
            print(f"Request failed for {area} (start={start}): {err}")
            break

        if rescode != 200:
            print(f"Error Code:{rescode}")
            break

        items = json.loads(response_body).get('items', [])
        contents.extend([area, item['title']] for item in items)

        # A short page means there are no further results for this keyword.
        if len(items) < display:
            break

df = pd.DataFrame(contents, columns=['name', 'title'])
df.to_csv('data/area3_crawl.csv', index=False)