# 6장 19대 대선 자료 분석하기

* 이 책의 집필을 완료한 후, 재보궐선거로 인해 선거관리위원회 홈페이지의 화면 구성이 변경되었습니다.
* 그러므로 일일이 진입하던 코드를 생략하고, 바로 19대 대통령 선거 결과를 확인할 수 있는 페이지 바로가기로 진입합니다.
* 이에 따라 6-1, 6-2절의 코드는 책의 내용과 달라집니다. 다만 6-3절부터 학습하시는 경우에는 GitHub에서 배포하는 데이터를 사용하시면 됩니다.
* 아래 6-1, 6-2절의 코드가 변경되는 것을 양해해 주시기 바랍니다.

## 6-1. Selenium과 Beautiful Soup을 이용한 데이터 획득 준비 작업

In [1]:
import pandas as pd
import numpy as np

import platform
import matplotlib.pyplot as plt

%matplotlib inline

# Pick a matplotlib font family according to the operating system.
from matplotlib import font_manager, rc

path = 'c://Windows/Fonts/malgun.ttf'  # font file consulted on Windows only

current_os = platform.system()
if current_os == 'Windows':
    # Resolve the family name from the font file, then register it.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif current_os == 'Darwin':
    rc('font', family='AppleGothic')
else:
    print('Unknown system... sorry~~~~')

# Turn off the Unicode minus glyph for axis text.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
from selenium import webdriver
import time

In [5]:
# Open a Chrome window through the Chrome driver executable at this local path.
driver = webdriver.Chrome('C:/Users/Slim3 15IIL/Desktop/0-2020데청캠/driver/chromedriver.exe')

# Menu path on the site that this URL corresponds to:
# Information by field -> Past election statistics -> Voting/Counting -> Count status (by eup/myeon/dong)
driver.get("http://info.nec.go.kr/main/showDocument.xhtml?electionId=0000000000&topMenuId=VC&secondMenuId=VCCP04")

In [6]:
# Click the 'presidential election' item (element id "electionType1").
driver.find_element_by_id("electionType1").click()

In [7]:
# Send the election-round text to the element with id "electionName".
driver.find_element_by_id("electionName").send_keys("제19대")

In [47]:
# Send the election-kind text to the element with id "electionCode".
driver.find_element_by_id("electionCode").send_keys("대통령선거")

In [48]:
# Gather the text of every <option> under the "cityCode" element (the province/city entries).
sido_list_raw = driver.find_element_by_xpath("""//*[@id="cityCode"]""")  # XPath copied from the page
sido_list = sido_list_raw.find_elements_by_tag_name("option")
# Read all option texts, then discard the first entry.
sido_names_values = [option.text for option in sido_list][1:]
sido_names_values

['서울특별시',
 '부산광역시',
 '대구광역시',
 '인천광역시',
 '광주광역시',
 '대전광역시',
 '울산광역시',
 '세종특별자치시',
 '경기도',
 '강원도',
 '충청북도',
 '충청남도',
 '전라북도',
 '전라남도',
 '경상북도',
 '경상남도',
 '제주특별자치도']

In [53]:
# Gather the text of every <option> under the "townCode" element (the districts of the selected province/city).
sigun_list_raw = driver.find_element_by_xpath("""//*[@id="townCode"]""")  # XPath copied from the page
sigun_list = sigun_list_raw.find_elements_by_tag_name("option")
# Read all option texts, then discard the first entry.
sigun_names_values = [option.text for option in sigun_list][1:]
sigun_names_values

['종로구',
 '중구',
 '용산구',
 '성동구',
 '광진구',
 '동대문구',
 '중랑구',
 '성북구',
 '강북구',
 '도봉구',
 '노원구',
 '은평구',
 '서대문구',
 '마포구',
 '양천구',
 '강서구',
 '구로구',
 '금천구',
 '영등포구',
 '동작구',
 '관악구',
 '서초구',
 '강남구',
 '송파구',
 '강동구']

In [54]:
driver.find_element_by_xpath("""//*[@id="searchBtn"]""").click()  # run the search after Jongno-gu is selected; XPath copied from the page

In [55]:
# Parse the page currently loaded in the browser with BeautifulSoup.
from bs4 import BeautifulSoup

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# soup  # uncomment to display the parsed document

In [88]:
# Find every <td> whose class is "alignR" and display the elements at indices 1 through 4.
tmp = soup.find_all('td', 'alignR')
tmp[1:5]

[<td class="alignR">102,566</td>,
 <td class="alignR">42,512</td>,
 <td class="alignR">22,325</td>,
 <td class="alignR">22,313</td>]

## 6-2. 19대 대선 개표 결과 데이터 획득하기

In [89]:
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`. Import the
# replacement under the old name so every cell that calls `tqdm_notebook` keeps working.
from tqdm.notebook import tqdm as tqdm_notebook

# Parallel lists: sido_name[i] is the province/city that sigun_name[i] belongs to.
sido_name = []
sigun_name = []

# Enter each province/city name into the "cityCode" element in turn.
for sido_value in tqdm_notebook(sido_names_values):
    element = driver.find_element_by_id("cityCode")
    element.send_keys(sido_value)

    # Fixed pause before reading the district options
    # (presumably lets the page refresh the "townCode" list -- confirm).
    time.sleep(1)

    # Read the district options now listed under "townCode".
    sigun_list_raw = driver.find_element_by_xpath("""//*[@id="townCode"]""")
    sigun_list = sigun_list_raw.find_elements_by_tag_name("option")

    # Option texts, without the first entry.
    sigun_names_values = [option.text for option in sigun_list]
    sigun_names_values = sigun_names_values[1:]

    # Record one (province/city, district) pair per district.
    sido_name.extend([sido_value] * len(sigun_names_values))
    sigun_name.extend(sigun_names_values)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




In [90]:
# Build a DataFrame with one row per (province/city, district) pair.
election_result = pd.DataFrame({'광역시도':sido_name, '시군':sigun_name})
election_result.head()
# len(election_result)

Unnamed: 0,광역시도,시군
0,서울특별시,종로구
1,서울특별시,중구
2,서울특별시,용산구
3,서울특별시,성동구
4,서울특별시,광진구


In [91]:
# Helper that reads the figures shown on the currently displayed results page.
def get_vote_info(n):
    """Store the figures from the current page at position ``n``.

    Parses ``driver.page_source`` with the lxml parser, takes the
    ``<td class="alignR">`` cells at indices 1-4, removes the commas,
    converts each to ``float`` and writes them, in order, into the
    module-level lists ``pop``, ``moon``, ``hong`` and ``ahn``.

    Raises:
        IndexError: fewer than four such cells were found, or ``n`` is
            out of range for a target list.
        ValueError: a cell's text is not a number after removing commas.
    """
    page = BeautifulSoup(driver.page_source, 'lxml')

    figures = []
    for cell in page.find_all('td', 'alignR')[1:5]:
        figures.append(float(cell.get_text().replace(',', '')))

    # Assign by explicit index, one list at a time, so that a short result
    # fills the leading lists and then raises IndexError.
    for position, target in enumerate((pop, moon, hong, ahn)):
        target[n] = figures[position]

In [92]:
import numpy as np

# Fallback used when the figures for a row could not be read.
def fail_procedure(n):
    """Set position ``n`` to NaN in the module-level lists ``pop``, ``moon``, ``hong`` and ``ahn``."""
    for series in (pop, moon, hong, ahn):
        series[n] = np.nan

In [93]:
# Create four independent NaN-filled placeholder lists, each with one slot per row of election_result.
pop, moon, hong, ahn = ([np.nan] * len(election_result) for _ in range(4))

# Display the four lengths as a sanity check.
len(pop), len(moon), len(hong), len(ahn)

(250, 250, 250, 250)

In [96]:
# Visit every (province/city, district) row and record the figures shown for it.
for n in tqdm_notebook(election_result.index):
    try:
        # Select the province/city, then the district, then run the search.
        element = driver.find_element_by_id("cityCode")
        element.send_keys(election_result['광역시도'][n])

        time.sleep(0.5)

        element = driver.find_element_by_id("townCode")
        element.send_keys(election_result['시군'][n])

        driver.find_element_by_xpath("""//*[@id="searchBtn"]""").click()  # search button

        time.sleep(0.5)

        get_vote_info(n)

    except Exception as err:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt/SystemExit still stop the loop, and report which
        # row failed and why instead of discarding the cause.
        print('--- Error ---', n, repr(err))
        fail_procedure(n)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Er

In [97]:
# Attach the collected lists to the DataFrame as columns.
election_result['pop'] = pop
election_result['moon'] = moon
election_result['hong'] = hong
election_result['ahn'] = ahn

election_result.head()

Unnamed: 0,광역시도,시군,pop,moon,hong,ahn
0,서울특별시,종로구,102566.0,42512.0,22325.0,22313.0
1,서울특별시,중구,82852.0,34062.0,17901.0,19372.0
2,서울특별시,용산구,148157.0,58081.0,35230.0,32109.0
3,서울특별시,성동구,203175.0,86686.0,40566.0,45674.0
4,서울특별시,광진구,240030.0,105512.0,46368.0,52824.0


In [98]:
# Collect the rows whose 'pop' value is still missing and try them again.
re_try_index = election_result[election_result['pop'].isnull()].index

for n in tqdm_notebook(re_try_index):
    try:
        # Select the province/city, then the district, then run the search.
        element = driver.find_element_by_id("cityCode")
        element.send_keys(election_result['광역시도'][n])

        time.sleep(0.5)

        element = driver.find_element_by_id("townCode")
        element.send_keys(election_result['시군'][n])

        driver.find_element_by_xpath("""//*[@id="searchBtn"]""").click()

        time.sleep(0.5)

        get_vote_info(n)

    except Exception as err:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt/SystemExit still stop the loop, and report which
        # row failed and why instead of discarding the cause.
        print('--- Error ---', n, repr(err))
        fail_procedure(n)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=96.0), HTML(value='')))

--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---
--- Error ---



In [99]:
# Assign the lists to the DataFrame columns again: the retry pass writes into
# the lists, not into the DataFrame.
election_result['pop'] = pop
election_result['moon'] = moon
election_result['hong'] = hong
election_result['ahn'] = ahn

election_result.head()

Unnamed: 0,광역시도,시군,pop,moon,hong,ahn
0,서울특별시,종로구,102566.0,42512.0,22325.0,22313.0
1,서울특별시,중구,82852.0,34062.0,17901.0,19372.0
2,서울특별시,용산구,148157.0,58081.0,35230.0,32109.0
3,서울특별시,성동구,203175.0,86686.0,40566.0,45674.0
4,서울특별시,광진구,240030.0,105512.0,46368.0,52824.0


In [100]:
# Write the results to a comma-separated, UTF-8 encoded CSV file at this local path.
election_result.to_csv('C:/Users/Slim3 15IIL/Desktop/0-2020데청캠/data/06. election_result.csv', encoding='utf-8', sep=',')

## 6-3. 각 후보의 득표율과 지역 ID 정리하기

In [102]:
# Read the CSV file back, using its first column as the index.
election_result = pd.read_csv('C:/Users/Slim3 15IIL/Desktop/0-2020데청캠/data/06. election_result.csv', encoding='utf-8', index_col=0)
election_result.head()

Unnamed: 0,광역시도,시군,pop,moon,hong,ahn
0,서울특별시,종로구,102566.0,42512.0,22325.0,22313.0
1,서울특별시,중구,82852.0,34062.0,17901.0,19372.0
2,서울특별시,용산구,148157.0,58081.0,35230.0,32109.0
3,서울특별시,성동구,203175.0,86686.0,40566.0,45674.0
4,서울특별시,광진구,240030.0,105512.0,46368.0,52824.0


In [103]:
# Tidy the province/city names: keep the two-character prefix when it is one of
# the seven listed names, and use an empty string for everything else.
sido_candi = [
    prefix if prefix in ('서울', '부산', '대구', '광주', '인천', '대전', '울산') else ''
    for prefix in (name[:2] for name in election_result['광역시도'])
]

In [104]:
# Helper for tidying district names.
def cut_char_sigu(name):
    """Return ``name`` as-is when it is exactly two characters long; otherwise drop its last character."""
    if len(name) == 2:
        return name
    return name[:-1]