<a href="https://colab.research.google.com/github/dscoool/datastructure/blob/main/NLP_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### 자연어처리(2) - 다량의 뉴스기사 분석

이번에는 여러 개의 뉴스기사를 파일로부터 가져와 \
분석을 실시해 보겠습니다!!


In [None]:
import sys
import pandas as pd
import numpy as np
from data_loader import get_news_corpus_as_list

# Fetch up to 1,000 news articles and discard any that are empty.
docs = list(filter(None, get_news_corpus_as_list(n_docs=1000)))
print("분석하려는 뉴스기사의 수:", len(docs))


분석하려는 뉴스기사의 수: 965


komoran 모듈을 로딩합니다.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Komoran

# Create one Komoran POS tagger (konlpy.tag) that the cells below reuse.
komoran = Komoran()

* min_count - 분석 대상에 포함시킬 바이그램(bigram, 서로 이웃한 두 형태소의 쌍)의 최소 등장 횟수.
예를 들어, ('북한/NNP', '이/JKS')라는 쌍이 전체 뉴스에서 10회 이상 등장하면
이 바이그램은 분석대상에 포함하도록 합니다.

* bigram_extractor - 전체 뉴스에서 min_count(기본값 10)번 이상 등장한 바이그램만 골라 {바이그램: 등장 횟수} 사전으로 돌려주는 기능

* to_bigram - 형태소 목록에서 서로 이웃한 두 형태소를 (앞, 뒤) 쌍으로 묶어 바이그램 목록을 만드는 기능.


In [None]:
from collections import Counter

def bigram_extractor(docs, min_count=10, tagger=None):
    """Count adjacent POS-tagged token pairs and keep the frequent ones.

    Args:
        docs: Iterable of document strings. Falsy entries (``""`` or
            ``None``) are skipped before tagging.
        min_count: Minimum number of occurrences, summed over all documents,
            for a bigram to be kept.
        tagger: Object exposing ``pos(text, join=True)`` that returns a
            sliceable sequence of tokens. Defaults to the module-level
            ``komoran`` instance.

    Returns:
        dict mapping ``(token, next_token)`` tuples to their total count,
        restricted to bigrams whose count is ``>= min_count``. Keys keep the
        order in which each bigram was first encountered.
    """
    if tagger is None:
        tagger = komoran

    def to_bigram(tokens):
        # Pair every token with its immediate successor.
        return zip(tokens, tokens[1:])

    bigram_counter = Counter()
    for doc in docs:
        # Check the document BEFORE tagging. The original comprehension put
        # ``if doc`` after the inner loop, so the tagger was still invoked
        # on empty documents.
        if not doc:
            continue
        bigram_counter.update(to_bigram(tagger.pos(doc, join=True)))

    return {
        bigram: count
        for bigram, count in bigram_counter.items()
        if count >= min_count
    }

# Extract bigrams with the default min_count and show how many qualified.
bigrams = bigram_extractor(docs)
len(bigrams)

2835

추출된 바이그램 중 사전에 가장 먼저 등록된 다섯 개를 살펴봅니다.

In [None]:
# First five bigrams in dictionary (insertion) order; this is not a frequency ranking.
list(bigrams)[:5]

[('서울/NNP', '연합뉴스/NNP'),
 ('연합뉴스/NNP', '경찰/NNG'),
 ('경찰/NNG', '관계자/NNG'),
 ('관계자/NNG', '들/XSN'),
 ('들/XSN', '이/JKS')]

사전에 101번째부터 200번째로 등록된 바이그램을 살펴보도록 할까요? (사전은 등장 빈도순이 아니라 처음 등장한 순서대로 저장되어 있습니다.)

In [None]:
# Bigrams at positions 100-199 in dictionary (insertion) order, not ordered by frequency.
list(bigrams)[100:200]

[('폭행/NNG', '용의자/NNP'),
 ('용의자/NNP', '가/JKS'),
 ('조사/NNG', '를/JKO'),
 ('를/JKO', '벌이/VV'),
 ('경찰관/NNP', '에게/JKB'),
 ('경찰관/NNP', '이/JKS'),
 ('이/JKS', '숨지/VV'),
 ('숨지/VV', '었/EP'),
 ('다/EC', '19/SN'),
 ('맞/VV', '았/EP'),
 ('되/XSV', '어/EC'),
 ('현장/NNG', '에서/JKB'),
 ('조사/NNG', '하/XSV'),
 ('하/XSV', '던/ETM'),
 ('김/NNP', '모/NNP'),
 ('45/SN', '씨/NNB'),
 ('쏘/VV', 'ㄴ/ETM'),
 ('ㄴ/ETM', '사제총/NNP'),
 ('기/ETN', '에/JKB'),
 ('에/JKB', '맞/VV'),
 ('ㄴ/ETM', '뒤/NNG'),
 ('병원/NNG', '에/JKB'),
 ('옮기/VV', '어/EC'),
 ('었/EP', '으나/EC'),
 ('다/EC', '사진/NNG'),
 ('사진/NNG', '은/JX'),
 ('신고/NNG', '를/JKO'),
 ('를/JKO', '받/VV'),
 ('받/VV', '고/EC'),
 ('출동/NNG', '하/XSV'),
 ('앞/NNG', '에/JKB'),
 ('었/EP', '던/ETM'),
 ('을/JKO', '챙기/VV'),
 ('다/EC', '김/NNP'),
 ('김/NNP', '경위/NNG'),
 ('경위/NNG', '는/JX'),
 ('접근/NNG', '하/XSV'),
 ('은/JX', '성씨/NNP'),
 ('성씨/NNP', '가/JKS'),
 ('ㄴ/ETM', '10/SN'),
 ('도착/NNG', '하/XSV'),
 ('았/EP', '을/ETM'),
 ('을/ETM', '때/NNG'),
 ('이/JKS', '없/VA'),
 ('없/VA', '었/EP'),
 ('었/EP', '고/EC'),
 ('을/JKO', '하/VV'),
 ('하/VV', '

BigramTokenizer 클래스를 정의합니다. 문장을 형태소(unigram)로 나눈 뒤, 위에서 추출한 바이그램에 해당하는 이웃 형태소 쌍을 '앞-뒤' 형태의 토큰으로 덧붙여 돌려줍니다.

In [None]:
class BigramTokenizer:
    """Callable tokenizer returning tagged unigrams followed by known bigrams.

    ``bigrams`` is any container supporting ``in`` for ``(token, token)``
    tuples; ``tagger`` must expose ``pos(text, join=True)``.
    """

    def __init__(self, bigrams, tagger):
        self.bigrams = bigrams
        self.tagger = tagger

    def __call__(self, sent):
        """Tokenize ``sent``; an empty or falsy input yields an empty list."""
        if not sent:
            return []

        tokens = self.tagger.pos(sent, join=True)

        # Keep only adjacent pairs present in the bigram vocabulary and render
        # each one as a single "left-right" token.
        joined_pairs = [
            f'{left}-{right}'
            for left, right in zip(tokens, tokens[1:])
            if (left, right) in self.bigrams
        ]

        return tokens + joined_pairs

# Tokenizer instance wired to the extracted bigrams and the shared Komoran tagger.
bigram_tokenizer = BigramTokenizer(bigrams, komoran)

docs에 있는 965개의 뉴스 중 하나를 골라, \
분석해 봅시다.\
아래 docs[220] 의 숫자를 바꾸면 됩니다.

In [None]:
# Take article 220 and keep only the text before its first double space.
sent = docs[220].split('  ')[0]
sent

'서울 연합뉴스 고웅석 기자 현대자동차는 20일부터 11월 7일까지 자사 채용 사이트 를 통해 마케팅 부문 경력사원 지원서류를 접수한다고 밝혔다'

bigram_tokenizer 기능을 사용하여 \
해당 문장을 분석해 봅시다.

In [None]:
# Tokenize the sample sentence into POS-tagged unigrams plus known bigrams.
bigram_tokenizer(sent)

['서울/NNP',
 '연합뉴스/NNP',
 '고웅석/NA',
 '기자/NNG',
 '현대자동차/NNP',
 '는/JX',
 '20/SN',
 '일/NNB',
 '부터/JX',
 '11월 7일/NNP',
 '까지/JX',
 '자사/NNP',
 '채용/NNG',
 '사이트/NNG',
 '를/JKO',
 '통하/VV',
 '아/EC',
 '마케팅/NNP',
 '부문/NNG',
 '경력/NNP',
 '사원/NNP',
 '지원/NNG',
 '서류/NNG',
 '를/JKO',
 '접수/NNG',
 '하/XSV',
 'ㄴ다고/EC',
 '밝히/VV',
 '었/EP',
 '다/EC',
 '서울/NNP-연합뉴스/NNP',
 '는/JX-20/SN',
 '20/SN-일/NNB',
 '일/NNB-부터/JX',
 '를/JKO-통하/VV',
 '통하/VV-아/EC',
 '하/XSV-ㄴ다고/EC',
 'ㄴ다고/EC-밝히/VV',
 '밝히/VV-었/EP',
 '었/EP-다/EC']

이를 지난 주에 학습했던 CountVectorizer를 사용하여 \
TF(단어 빈도) 행렬로 만들어 봅시다!! (TF-IDF 가중치는 아래에서 직접 계산합니다.)

In [None]:
# CountVectorizer produces raw term counts (TF); IDF weighting is applied in later cells.
# Every article is tokenized with bigram_tokenizer instead of the default tokenizer.
vectorizer = CountVectorizer(tokenizer=bigram_tokenizer)
x = vectorizer.fit_transform(docs)

뉴스를 TF(단어 빈도) 행렬로 변환한 것이 x 입니다.
x의 크기가 (행 = 뉴스 수, 열 = 어휘 수) 얼마나 되는지 살펴봅시다.

In [None]:
# (number of articles, vocabulary size)
x.shape

(965, 18835)

tf 벡터를 데이터프레임(dataframe)형태로 바꾸어 실제 출력해 봅시다.

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per article, one column per vocabulary term, raw counts as values.
tf = pd.DataFrame(x.toarray(), columns=feature_names)
# Display the converted table.
tf



Unnamed: 0,0/SN,00/SN,000/SN,0015/SN,003550/SN,01/SN,02/SN,0209/SN,021/SN,0279/SN,...,힐러리 클린턴/NNP,힐러리/NNP,힐링/NNP,힘/NNG,힘겹/VA,힘들/VA,힘쓰/VV,힘입/VV,힘입/VV-어/EC,힙합/NNP
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
961,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
962,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
963,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


이전 시간에 했던 TF-IDF 실습을 이어서 적용해 보겠습니다!!

In [None]:
# D: number of articles; df: number of articles containing each term;
# idf: smoothed inverse document frequency, log((D + 1) / (df + 1)) + 1.
D = tf.shape[0]
df = tf.ne(0).sum(axis=0)
idf = 1 + np.log((1 + D) / (1 + df))

In [None]:
# Document frequency: number of articles in which each term appears at least once.
df

0/SN          56
00/SN          4
000/SN         7
0015/SN        1
003550/SN      1
              ..
힘들/VA         11
힘쓰/VV          4
힘입/VV         14
힘입/VV-어/EC    13
힙합/NNP         1
Length: 18835, dtype: int64

In [None]:
# Smoothed inverse document frequency per term.
idf

0/SN          3.830113
00/SN         6.263726
000/SN        5.793722
0015/SN       7.180017
003550/SN     7.180017
                ...   
힘들/VA         5.388257
힘쓰/VV         6.263726
힘입/VV         5.165114
힘입/VV-어/EC    5.234107
힙합/NNP        7.180017
Length: 18835, dtype: float64

In [None]:
# TF-IDF (Term Frequency-Inverse Document Frequency)
# Weight each term count by its IDF, then scale every article (row) to unit L2 norm.
tfidf = tf * idf
row_norms = np.linalg.norm(tfidf, axis=1, keepdims=True)
# An article with no counted terms has norm 0; dividing by 1 keeps its row at
# zero instead of turning it into NaN through 0/0.
row_norms[row_norms == 0] = 1.0
tfidf = tfidf / row_norms  # normalisation

In [None]:
# Row-normalized TF-IDF weights, one row per article.
tfidf

Unnamed: 0,0/SN,00/SN,000/SN,0015/SN,003550/SN,01/SN,02/SN,0209/SN,021/SN,0279/SN,...,힐러리 클린턴/NNP,힐러리/NNP,힐링/NNP,힘/NNG,힘겹/VA,힘들/VA,힘쓰/VV,힘입/VV,힘입/VV-어/EC,힙합/NNP
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.021077,0.0,0.0


In [None]:
# TF-IDF weight of the term '연합뉴스/NNP' in every article.
tfidf['연합뉴스/NNP']

0      0.000000
1      0.020034
2      0.023786
3      0.027759
4      0.065555
         ...   
960    0.009293
961    0.017634
962    0.013285
963    0.004249
964    0.012639
Name: 연합뉴스/NNP, Length: 965, dtype: float64