# 13주차 2강 실습

## < 텍스트 마이닝 >
- 비정형 데이터, 텍스트 데이터로부터 유의미한 정보를 추출하는 데이터 분석을 텍스트 마이닝(Text Mining)이라고 한다.

### 1. 웹 크롤링으로 기초 데이터 수집
- 대상 페이지의 구조 살펴보기
- www.namu.wiki
- 웹 크롤링 라이브러리 사용하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import re

In [2]:
source_url = "https://namu.wiki/RecentChanges"
req = requests.get(source_url)
html = req.content

In [3]:
html



In [4]:
soup = BeautifulSoup(html, 'lxml')
contents_table = soup.find(name = "table")
table_body = contents_table.find(name = "tbody")
table_rows = table_body.find_all(name = "tr")

AttributeError: 'NoneType' object has no attribute 'find'

In [None]:
page_url_base = "https://namu.wiki"
page_urls = []

for index in range(0, len(table_rows)):
    first_td = table_rows[index].find_all('td')[0]
    td_url = first_td.find_all('a')
    
    if len(id_url) > 0:
        page_url = page_url_base + td_url[0].get('href')
        
        if 'png' not in page_url:
            page_urls.append(page_url)
            
# 중복 url을 제거합니다.
page_urls = list(set(page_urls))
for page in page_urls[:5]:
    print(page)

In [None]:
req = requests.get(page_urls[0])
html = req.content
soup = BeautifulSoup(html, 'lxml')
contents_table = soup.find(name = "article")
title = contents_table.find_all('h1')[0]
category = contents_table.find_all('ul')[0]
content_paragraphs = contents_table.find_all(name = 'div', attrs = {"class":"wiki-paragraph"})
content_corpus_list = []

for paragraphs in content_paragraphs:
    content_corpus_list.append(paragraphs.text)

content_corpus = "".join(content_corpus.list)

print(title.text)
print("\n")
print(category.text)
print("\n")
print(content_corpus)

In [None]:
columns = ['title', 'category', 'content_text']
df = pd.DataFrame(columns = columns)

for page_url in page_urls:
    
    req = requests.get(page_url)
    html = req.content
    soup = BeautifulSoup(html, 'lxml')
    contents_table = soup.find(name = 'article')
    title = contents_table.find_all('h1')[0]
    category = contents_table.find_all('ul')[0]
    content_paragraphs = content_table.find_all(name = 'div', attrs = {"class":"wiki-paragraph"})
    content_corpus_list = []
    
    if title is not None:
        row_title = title.text.replace("\n", " ")
    else:
        row_title = ""
        
    if content_paragraphs in not None:
        for paragraphs in content_paragraphs:
            if paragraphs is not None:
                content_corpus_list.append(paragraphs.text.replace("\n", " "))
            else:
                content_corpus_list.append("")
    else:
        content_corpus_list.append("")
        
    if category is not None:
        row_category = category.text.replace("\n", " ")
    else:
        row_category = ""
        
    row = [row_title, row_category, "".join(content_corpus_list)]
    series = pd.Series(row, index = df.columns)
    df = df.append(series, ignore_index = True)

In [None]:
df.head(5)

### 2. 추출 : 키워드 추출
#### [ 텍스트 데이터 전처리 ]

In [None]:
def text_cleaning(text):
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+') # 한글의 정규표현식을 나타낸다
    result = hangul.sub('', text)
    return result

print(text_cleaning(df['content_text'][0]))

In [None]:
df['title'] = df['title'].apply(lambda x: text_cleaning(x))
df['category'] = df['category'].apply(lambda x: text_cleaning(x))
df['content_text'] = df['content_text'].apply(lambda x: text_cleaning(x))
df.head(5)

#### [ 말뭉치 만들기 ]

In [None]:
title_corpus = "".join(df['title'].tolist())
category_corpus = "".join(df['category'].tolist())
content_corpus = "".join(df['content_text'].tolist())
print(title_corpus)

In [None]:
test_list = ['aa', 'bb', 'cc']
','.join(test_list)

# 13주차 3강 실습

### 1. konlpy를 이용하여 키워드 추출하기
- 코드 실행을 위해 아나콘다 프롬프터에 설치한다
- 명령어 : pip install konlpy, pip install jpype1, pip install jpype1-py3

### [ 키워드 가다듬기 ]
- 한글자 키워드 제거

In [None]:
from konlpy.tag import Okt
from collections import Counter

In [None]:
nouns_tagger = Okt()
nouns = nouns_tagger.nouns(content_text)
count = Counter(nonus)
count

In [None]:
remove_char_counter = Counter({x : count[x] for x in count if len(x) > 1})
print(remove_char_counter)

### 2. 불용어 제거하기

In [None]:
# 한국어 약식 불용어사진 예시 파일 (https://www.ranks.nl/stopwords/korean)
korean_stopwords_path = "korean_stopwords.txt"

with open(korean_stopwords_path, encoding = 'utf8') as f:
    stopwords = f.readlines()

stopwords = [x.strip() for x in stopwords]
print(stopwords[:10])

In [None]:
namu_wiki_stopwords = ['상위', '문서', '내용', '누설', '아래', '해당',
                       '설명', '표기', '추가', '모든', '사용', '매우', 
                       '가장', '줄거리', '요소', '상황', '편집', '틀',
                       '경우', '때문', '모습', '정도', '이후', '사실',
                       '생각', '인물', '이름', '년월']

for stopword in namu_wiki_stopwords:
    stopwords.append(stopword)

In [None]:
remove_char_counter = Counter({x : remove_char_counter[x] for x in count if x not in stopwords})
print(remove_char_counter)

### 3. 시각화 : 워드 클라우드 시각화

### [ 나무위키 키워드 시각화 ]

##### 한글 폰트 다운로드받기
- http://hangeul.naver.com/webfont/NanumGothic/NanumGothic.ttf

- 제목 키워드

In [None]:
import random
import pytagcloud
import webbrowser

ranked_tags = remove_char_counter.most_common(40)
taglist = pytagcloud.make_tags(ranked_tags, maxsize = 80)
pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size = (900, 600),
                            fontname = 'NanumGothic', rectangular = False)

In [5]:
from IPython.display import Image
Image(filename = 'wordcloud.jpg')

FileNotFoundError: [Errno 2] No such file or directory: 'wordcloud.jpg'

In [10]:
nouns_tagger = Okt()
nouns = nouns_tagger.nouns(title_corpus)
count = Counter(nouns)

remove_char_counter = Counter({x : count[x] for x in count if len(x) > 1})
remove_char_counter = Counter({x : remove_char_counter[x] for x in count if x not in stopwords})

ranked_tags = remove_char_counter.most_common(40)
taglist = pytagcloud.make_tags(ranked_tags, maxsize=80)
pytagcloud.create_tag_image(taglist, 'title_wordcloud.jpg', size = (900, 600),
                            fontname = 'NanumGothic', rectangular = False)

Image(filename = 'title_wordcloud.jpg')

NameError: name 'Okt' is not defined

- 카테고리 키워드

In [9]:
nouns_tagger = Okt()
nouns = nouns_tagger.nouns(category_corpus)
count = Counter(nouns)

remove_char_counter = Counter({x : count[x] for x in count if len(x) > 1})
remove_char_counter = Counter({x : remove_char_counter[x] for x in count if x not in stopwords})

ranked_tags = remove_char_counter.most_common(40)
taglist = pytagcloud.make_tags(ranked_tags, maxsize=80)
pytagcloud.create_tag_image(taglist, 'category_wordcloud.jpg', size = (900, 600),
                            fontname = 'NanumGothic', rectangular = False)

Image(filename = 'category_wordcloud.jpg')

NameError: name 'Okt' is not defined