# 웹 크롤링 및 워드클라우드

## 빅데이터

### 이영석, 문현수

#### munhyunsu@cs-cnu.org

## KoNLPy 설치
### 참고 웹 페이지:
- [KoNLPy](https://konlpy.org/en/latest/)
- [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

#### 자연어처리 KoNLPy 설치
#### 워드클라우드 wordcloud 설치 (HTML 파서 BeautifulSoup4는 아래에서 import하므로 미리 설치되어 있어야 함)

In [None]:
# install KoNLPy
# pip is run through the interpreter at ~/Jupyter/scikit-learn/bin/python3
# (presumably the course's virtual environment -- adjust the path to your setup).
# --force-reinstall --upgrade reinstalls the package even if it is already present.
!~/Jupyter/scikit-learn/bin/python3 -m pip install --force-reinstall --upgrade konlpy

In [None]:
# install wordcloud
# Same interpreter and pip flags as the KoNLPy install: the package is
# reinstalled even if it is already present.
!~/Jupyter/scikit-learn/bin/python3 -m pip install --force-reinstall --upgrade wordcloud

#### 라이브러리 버전 확인

In [None]:
import sys
import konlpy
import bs4
from bs4 import BeautifulSoup
import wordcloud
# Report the interpreter version and the version of each library used in this
# notebook, one per line (a single print call with a newline separator).
print(
    f'Python3 버전: {sys.version.split()[0]}',
    f'KoNLPy 버전: {konlpy.__version__}',
    f'BeautifulSoup 버전: {bs4.__version__}',
    f'wordcloud 버전: {wordcloud.__version__}',
    sep='\n',
)

#### 자연어 처리, 형태소 분석기 살펴보기

In [None]:
# Kkma (Kkokkoma) morphological analyzer: http://kkma.snu.ac.kr/
from konlpy.tag import Kkma

In [None]:
# Create the Kkma analyzer once; the cells below reuse this object.
kkma = Kkma()
# Bare expression so the notebook echoes the object.
kkma

In [None]:
# Sample Korean text that the next cells feed to the analyzer.
message = '안녕하세요. 2021년 봄학기 산업대학원 빅데이터 실습입니다. 오늘 7주차에는 웹 크롤링 및 워드클라우드 실습을 진행합니다.'
# Bare expression so the notebook echoes the string.
message

In [None]:
# Part-of-speech tag the sample message with Kkma; the notebook shows the result.
kkma.pos(message)

In [None]:
# Extract the nouns Kkma finds in the sample message; the notebook shows the result.
kkma.nouns(message)

#### 웹 페이지 데이터 수집

In [None]:
import urllib.request

In [None]:
# Page to crawl; every request below targets this URL.
url = 'https://news.naver.com/'

In [None]:
# First attempt: fetch the URL with urllib's default request headers and print
# the raw response body. The `with` block closes the response afterwards.
with urllib.request.urlopen(url) as f:
    print(f.read())

In [None]:
from urllib.parse import urlparse

In [None]:
# Build the request with its headers supplied up front. Request() passes each
# entry of `headers` through add_header(), so the stored headers are the same
# as when add_header() is called once per header.
host = urlparse(url).netloc
req = urllib.request.Request(
    url,
    headers={
        'Host': host,  # it is mandatory in HTTP/1.1
        # NOTE(review): the User-Agent is blank here -- presumably a placeholder
        # to be filled with a real browser string; confirm.
        'User-Agent': '',
    },
)
# Bare expression so the notebook echoes the headers that will be sent.
req.headers

In [None]:
# Fetch again, this time with the Request object that carries the custom
# headers, and keep the raw body in `data` for the parsing cells below.
with urllib.request.urlopen(req) as f:
    data = f.read()
# Bare expression so the notebook echoes the downloaded body.
data

In [None]:
# Parse the downloaded body with BeautifulSoup's 'html.parser' backend.
soup = BeautifulSoup(data, 'html.parser')
# Print the parsed document in indented form for inspection.
print(soup.prettify())

In [None]:
# Print only the text content of the parsed page.
print(soup.get_text())

In [None]:
# Extract nouns from the page text with Kkma; the notebook shows the result.
kkma.nouns(soup.get_text())

#### 워드 클라우드

In [None]:
import collections

In [None]:
# Tally how often each extracted noun occurs in the page text.
# NOTE(review): Kkma.nouns() may return each distinct noun only once; if so,
# every count here is 1 and the word cloud carries no frequency signal --
# confirm against the KoNLPy documentation / actual output.
counter = collections.Counter()
counter.update(kkma.nouns(soup.get_text()))
# Bare expression so the notebook echoes the ten most frequent nouns.
counter.most_common(10)

In [None]:
import matplotlib.font_manager as fm

In [None]:
# List every font in matplotlib's font manager (ttflist) with a 1-based index,
# showing each font's name and file path so one can be picked for the word cloud.
print(f'총 {len(fm.fontManager.ttflist)}개의 폰트 검색됨')
for index, font in enumerate(fm.fontManager.ttflist, start=1):
    print(f'[{index}] {font.name} ==> {font.fname}')

In [None]:
# Font file handed to WordCloud below. It is left empty here -- presumably to
# be filled with one of the file paths printed by the font listing above (a
# font that can render Korean text); confirm before running the next cells.
font_path = ''

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Colormap: https://matplotlib.org/stable/tutorials/colors/colormaps.html
wc = WordCloud(font_path=font_path, 
               width=400, height=400,
               background_color='white', colormap='rainbow')
wc_img = wc.generate_from_frequencies(counter)

In [None]:
# Draw the word cloud on a single axes in an 8x8 figure, hiding the axis
# lines, ticks and labels so that only the image is visible.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wc_img)
ax.set_axis_off()
fig.show()

In [None]:
# Minimum number of characters a noun must have to be kept.
length = 2
# Run the morphological analysis once and reuse the result for both the
# filtering and the before/after count, instead of analysing the page twice.
nouns = kkma.nouns(soup.get_text())
# Keep only the nouns that are at least `length` characters long.
words = [word for word in nouns if len(word) >= length]
print(f'{len(nouns)}개 단어 ==> {len(words)}개 단어')

In [None]:
# Tally the nouns that passed the minimum-length filter.
counter2 = collections.Counter(words)
# Bare expression so the notebook echoes the ten most frequent entries.
counter2.most_common(10)

In [None]:
# Configure a 400x400 word cloud on a white background using the chosen font,
# this time with the 'hsv' colormap.
wc = WordCloud(
    background_color='white',
    colormap='hsv',
    font_path=font_path,
    height=400,
    width=400,
)
# Lay the cloud out from the counts of the length-filtered nouns.
wc_img = wc.generate_from_frequencies(counter2)

In [None]:
# Draw the filtered word cloud on a single axes in an 8x8 figure, hiding the
# axis lines, ticks and labels so that only the image is visible.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wc_img)
ax.set_axis_off()
fig.show()