In [1]:
# konlpy
# mecab
# requests
# bs4

# Keyword extraction using TF-IDF
## 1. Introduction
## 2. Crawling

In [44]:
import requests
import re
import math
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from scipy.sparse import coo_matrix

In [11]:
def get_news_by_url(url):
    """Download a news article and return its cleaned body text.

    The article body is read from the ``#articleBodyContents`` element.
    Everything from the first e-mail-like token onwards is discarded, then
    every character that is not a Hangul syllable, a space or a digit is
    replaced by a space and whitespace runs are collapsed to one space.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The cleaned article text as a single string.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``#articleBodyContents`` element.
    """
    response = requests.get(
        url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    article = soup.select_one('#articleBodyContents')
    if article is None:
        raise ValueError(
            "no '#articleBodyContents' element found at {!r}".format(url))
    text = article.get_text().replace("\n", "")

    # text preprocessing
    # Cut the text just before the first e-mail-like token. If there is no
    # such token the whole text is kept instead of crashing on `None.start()`.
    email_match = re.search(r"\w+@\w+\.\w+(\.\w+)?", text)
    if email_match is None:
        content = text
    else:
        # The character right before the match is dropped as well; clamp at 0
        # so a match at the very start does not turn into the slice `[:-1]`.
        content = text[:max(email_match.start() - 1, 0)]

    doc = re.sub(r"[^가-힣 \d]", " ", content)
    doc = re.sub(r"\s+", " ", doc)

    return doc


# Fetch the corpus: one cleaned article per URL, in this order
# (the position in `docs` is the document index used later on).
docs = []
for article_url in (
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=105&oid=018&aid=0004430108',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=001&aid=0011614790',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=102&oid=014&aid=0004424362',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=119&aid=0002402191',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=030&aid=0002882728',
):
    docs.append(get_news_by_url(article_url))

## 3. Tokenization by space

In [15]:
# Whitespace tokenization: every document becomes a list of its
# space-separated tokens.
doc_ls = list(map(str.split, docs))
doc_ls[0][:10]

['과기정통부',
 '22일',
 '유영민',
 '장관',
 '등',
 '참석해',
 '기념행사2021년까지',
 '1516억원',
 '투입',
 '5100여종']

## 4. Stop word processing

In [28]:
from konlpy.tag import Mecab
mecab = Mecab()

# For each document keep only the morphemes whose POS tag is NNG, NNP or VV
# (presumably common nouns, proper nouns and verbs in the Mecab tag set --
# confirm against the tagger's documentation).
preprocessed_docs = [
    [pair[0] for pair in mecab.pos(doc) if pair[1] in ['NNG', 'NNP', 'VV']]
    for doc in docs
]

preprocessed_docs[1][:10]

['한국전력공사', '한국전력공사', '제공', '나주', '연합뉴스', '송', '형', '기자', '한국전력', '전력']

## 5. Give a unique ID to words

In [27]:
from collections import defaultdict

# Map every distinct token to an integer ID in first-seen order
# (0, 1, 2, ...). A plain dict is used so that a later lookup of an unknown
# token raises KeyError instead of silently growing the vocabulary past the
# number of columns of the document-term matrix.
word2id = {}
for doc in doc_ls:
    for token in doc:
        if token not in word2id:
            word2id[token] = len(word2id)




## 6. Make DTM

In [30]:
# Document-term matrix: one row per document, one column per word ID,
# each cell holding how often that word occurs in that document.
dtm_shape = (len(doc_ls), len(word2id))
DTM = np.zeros(dtm_shape, dtype=int)
for row, tokens in enumerate(doc_ls):
    row_counts = DTM[row]  # view into DTM, so updates land in the matrix
    for tok in tokens:
        row_counts[word2id[tok]] += 1

DTM

array([[4, 2, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 1]])

## 7. Compute TF

In [32]:
def computeTF(DTM):
    """Compute term frequencies from a document-term count matrix.

    Each count is divided by the total number of tokens in its document
    (the row sum), so every non-empty row of the result sums to 1.

    Args:
        DTM: 2-D array-like of shape (n_docs, n_words) holding raw counts.

    Returns:
        Float ndarray of the same shape with the per-document term
        frequencies. Rows of documents without any tokens are all zeros.
    """
    # Derive the shape from the argument itself rather than from notebook
    # globals, so the function works for any count matrix.
    counts = np.asarray(DTM, dtype=float)
    doc_totals = counts.sum(axis=1, keepdims=True)

    tf = np.zeros_like(counts)
    # Only divide where the document is non-empty; this avoids 0/0 -> NaN.
    np.divide(counts, doc_totals, out=tf, where=doc_totals > 0)
    return tf

## 8. Compute IDF

In [36]:
def computeIDF(DTM):
    """Compute inverse document frequencies from a document-term matrix.

    For a word that occurs in ``df`` of the ``N`` documents the value is
    ``-log10(df / N)``; a word present in every document therefore gets 0.

    Args:
        DTM: 2-D array-like of shape (n_docs, n_words) holding raw counts.

    Returns:
        Float ndarray of length n_words. Words that occur in no document
        get 0.0 instead of triggering a math domain error on ``log10(0)``.
    """
    counts = np.asarray(DTM)
    doc_len, word_len = counts.shape

    # Number of documents in which each word appears at least once
    # (uses np.count_nonzero along the document axis).
    doc_freq = np.count_nonzero(counts, axis=0)

    idf = np.zeros(word_len)
    seen = doc_freq > 0
    idf[seen] = -np.log10(doc_freq[seen] / doc_len)

    return idf

## 9. TF-IDF

In [40]:
def computeTFIDF(DTM):
    """Return the TF-IDF matrix for a document-term count matrix.

    Args:
        DTM: Document-term count matrix passed on to ``computeTF`` and
            ``computeIDF``.

    Returns:
        Array shaped like the TF matrix in which entry (d, w) is
        ``tf[d, w] * idf[w]``.
    """
    term_freq = computeTF(DTM)
    inv_doc_freq = computeIDF(DTM)

    # Broadcasting multiplies column w of term_freq by inv_doc_freq[w].
    return term_freq * inv_doc_freq

  
#computeTFIDF(DTM)

## 10. Keyword extraction

In [41]:
def sort_keywords(keywords):
    """Return (column, value) pairs ordered from highest to lowest value.

    Args:
        keywords: Object exposing parallel ``col`` and ``data`` sequences,
            such as a SciPy COO matrix.

    Returns:
        List of ``(col, data)`` tuples sorted descending by value; equal
        values are ordered by descending column index.
    """
    pairs = list(zip(keywords.col, keywords.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


def extract_keywords(feature_names, sorted_keywords, n=5):
    """Translate the first ``n`` (index, score) pairs into (word, score).

    Args:
        feature_names: Sequence mapping a column index to its word.
        sorted_keywords: Sequence of ``(index, score)`` pairs, best first.
        n: Maximum number of keywords to return.

    Returns:
        List of ``(word, score)`` tuples for the first ``n`` pairs.
    """
    top_keywords = []
    for idx, score in sorted_keywords[:n]:
        top_keywords.append((feature_names[idx], score))
    return top_keywords

In [60]:
# Index of the document to extract keywords from
doc_number = 4
doc_tfidf = computeTFIDF(DTM)[doc_number, :]
sorted_keywords = sort_keywords(coo_matrix(doc_tfidf))

# Position i in this list is the word whose ID is i (insertion order).
feature_names = list(word2id)
# Extract as many keywords as requested
keywords = extract_keywords(feature_names, sorted_keywords, n=5)

print("\n===== 원문 =====")
print(docs[doc_number][:100])
print("\n=== 핵심키워드 ===")
for k in keywords:
    print(k)


===== 원문 =====
금융결제원 신용정보원 등 지정비씨카드 추가 포함 여부도 논의시행령 발표한 후 빠르게 추진하반기 데이터 유통 판매 급물살정부가 금융결제원과 신용정보원 등 업권별 9곳의 마이데이터 중

=== 핵심키워드 ===
('마이데이터', 0.053766923410462984)
('신용정보원', 0.03584461560697532)
('이동근기자', 0.01792230780348766)
('진행하고', 0.01792230780348766)
('실무회의를', 0.01792230780348766)


# sklearn

In [107]:
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer




## join

In [108]:
# Rebuild the POS-filtered documents, this time as single space-joined
# strings, which is the input form handed to TfidfVectorizer below.
preprocessed_docs = [
    " ".join(
        pair[0] for pair in mecab.pos(doc) if pair[1] in ['NNG', 'NNP', 'VV']
    )
    for doc in docs
]

#preprocessed_docs

In [109]:
# Fit the vectorizer on the space-joined documents and get their TF-IDF
# matrix. max_df=0.85 drops terms found in more than 85% of the documents.
tfidf_vect = TfidfVectorizer(
    max_df=0.85,
    max_features=10000,
    smooth_idf=True,
    use_idf=True,
)
word_count = tfidf_vect.fit_transform(preprocessed_docs)
#print(type(word_count))

<class 'scipy.sparse.csr.csr_matrix'>


In [110]:
def sort_keywords(keywords):
    """Return (column, value) pairs ordered from highest to lowest value.

    Args:
        keywords: Object exposing parallel ``col`` and ``data`` sequences,
            such as a SciPy COO matrix.

    Returns:
        List of ``(col, data)`` tuples sorted descending by value; equal
        values are ordered by descending column index.
    """
    pairs = list(zip(keywords.col, keywords.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


def extract_keywords(feature_names, sorted_keywords, n=5):
    """Translate the first ``n`` (index, score) pairs into (word, score).

    Args:
        feature_names: Sequence mapping a column index to its word.
        sorted_keywords: Sequence of ``(index, score)`` pairs, best first.
        n: Maximum number of keywords to return.

    Returns:
        List of ``(word, score)`` tuples for the first ``n`` pairs.
    """
    top_keywords = []
    for idx, score in sorted_keywords[:n]:
        top_keywords.append((feature_names[idx], score))
    return top_keywords

In [111]:
doc_number = 1
doc = preprocessed_docs[doc_number]  # document to extract keywords from

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# Keep the fitted vectorizer bound to `tfidf_vect` and store the transformed
# row under its own name, so this cell can be re-run (or run for another
# doc_number) without refitting.
doc_tfidf = tfidf_vect.transform([doc])
sorted_keywords = sort_keywords(doc_tfidf.tocoo())


# Extract as many keywords as requested
keywords = extract_keywords(feature_names, sorted_keywords, 5)

print("\n===== 원문 =====")
print(docs[doc_number][:100])
print("\n=== 핵심키워드 ===")
for k in keywords:
    print(k)


===== 원문 =====
한국전력공사 한국전력공사 제공 나주 연합뉴스 송형일 기자 한국전력은 전력 데이터를 활용한 신 서비스 개발 경진대회가 8월 20일 한전 아트센터에서 개최된다고 18일 밝혔다 산업통상

=== 핵심키워드 ===
('한전', 0.3798224183465947)
('전력', 0.3798224183465947)
('대회', 0.2278934510079568)
('서비스', 0.20349720018162895)
('기업', 0.18386300133936764)
