In [4]:
!pip3 install -U gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 41kB/s  eta 0:00:01    59% |███████████████████             | 14.4MB 5.7MB/s eta 0:00:02
[?25hCollecting numpy>=1.11.3 (from gensim)
  Using cached https://files.pythonhosted.org/packages/45/b2/6c7545bb7a38754d63048c7696804a0d947328125d81bf12beaa692c3ae3/numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl
Collecting scipy>=0.18.1 (from gensim)
  Using cached https://files.pythonhosted.org/packages/c8/89/63171228d5ced148f5ced50305c89e8576ffc695a90b58fe5bb602b910c2/scipy-1.5.4-cp36-cp36m-manylinux1_x86_64.whl
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/ea/54/01525817b6f31533d308968b814999f7e666b2234f39a55cbe5de7c1ff99/smart_open-4.1.2-py3-none-any.whl (111kB)
[K    100% |█████████████████████████

In [5]:
# Import modules
import os
import re
import pickle
import itertools
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm_notebook
from gensim import corpora, models
import string

# NLP modules
from konlpy.tag import Mecab, Okt
from khaiii import khaiii

# Visualization modules
import seaborn as sns
import matplotlib.pyplot as plt

# Factor Analysis (PCA)
from sklearn.decomposition import PCA

# Chi square test
from scipy.stats import chi2_contingency

# Visualization setting
sns.set(style="ticks", palette="Set1")
plt.rcParams["font.family"] = 'NanumGothic'

In [20]:
def read_documents(input_file_name):
    corpus = []
#     with open(input_file_name, 'rb') as f:
#         temp_corpus=pickle.load()
        
#     for page in temp_corpus:
#         corpus += page
        
    return corpus
    
def text_cleaning(docs):
    # 한국어를 제외한 글자 제거 함수
    cleaned_docs = []
    
    for doc in docs:
        temp_doc = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', str(doc))
        cleaned_docs.append(temp_doc)
    return cleaned_docs

def define_stopwords(path):
    SW = set()
    # 불용어 추가
    for i in string.punctuation:
        SW.add(i)
        
    with open(path, 'rb') as f:
        for word in f:
            SW.add(word)
            
    return SW

def text_tokenizing(corpus, tokenizer):
    # 명사 추출/ 형태소 분석
    mecab = Mecab()
    token_corpus = []
    
    if tokenizer == "noun":
        for n in tqdm_notebook(range(len(corpus)), desc = "PreProcessing"):
            token_text = mecab.nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            
            token_corpus.append(token_text)
            
    elif tokenized == "morph":
        for n in tqdm_notebook(range(len(corpus)), desc = "PreProcessing"):
            token_text = mecab.morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
            
    elif tokenized == "word":
        for n in tqdm_notebook(range(len(corpus)), desc = "PreProcessing"):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
        
    return token_corpus

In [28]:
#Read Data
original_dat = pd.read_csv('./results_crawl.csv')
comp1_topic = original_dat[original_dat['company']=='더팩트']['text'].dropna().tolist()
SW = define_stopwords('stopwords-ko.txt')
cleaned_text = text_cleaning(comp1_topic)
tokenized_text = text_tokenizing(cleaned_text, tokenizer="noun")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


PreProcessing:   0%|          | 0/250 [00:00<?, ?it/s]

In [29]:
print(tokenized_text[0])

['대전', '동부', '경찰서', '아동', '청소년', '보호', '법률', '위반', '혐의', '고교', '기간제', '교사', '여성', '구속', '송치', '유필', '제공', '경찰', '교사', '지위', '이용', '성범죄', '판단', '윤용', '기자', '대전', '고등학교', '여교사', '남학생', '제자', '관계', '형사', '처벌', '처지', '사건', '남학생', '해당', '여교사', '관계', '폭로', '대전', '동부', '경찰서', '아동', '청소년', '보호', '법률', '위반', '혐의', '고교', '기간제', '교사', '여성', '구속', '송치', '경찰', '지난해', '자신', '근무', '고등학교', '제자', '대전', '모텔', '관계', '혐의', '학교', '상담', '사실', '다만', '진술', '실제', '처벌', '미지', '수다', '경찰', '조사', '성폭행', '상반', '주장', '경찰', '교사', '지위', '이용해', '관계', '판단', '사건', '검찰', '사건', '학교', '직후', '사직서', '제출', '경찰', '관계자', '사안', '구체', '내용', '확인', '공연', '비하인드', '사진', '방법', '가입', '아이돌', '순위', '스타', '투표', '저작', '특종', '무단', '전재', '배포', '금지']


In [30]:
dictionary = corpora.Dictionary(tokenized_text)
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

In [31]:
print(dictionary)

Dictionary(7477 unique tokens: ['가입', '검찰', '경찰', '경찰서', '고교']...)


In [32]:
corpus[0][:5]

[(0, 1), (1, 1), (2, 5), (3, 2), (4, 2)]

In [33]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[0]

[(1, 0.030687277126903743),
 (2, 0.25062180056891736),
 (3, 0.14069467761325147),
 (4, 0.1862601884155591),
 (5, 0.1550387798751212),
 (7, 0.18805203700063233),
 (8, 0.02711525621807379),
 (9, 0.35241827968887174),
 (10, 0.08605653369577423),
 (11, 0.05473663453640678),
 (12, 0.0665850326385445),
 (14, 0.24870300549643487),
 (15, 9.026659633116386e-05),
 (16, 0.24870300549643487),
 (17, 0.031041953522143298),
 (18, 0.1087407984779985),
 (19, 0.23702414688683673),
 (20, 0.12108665823589898),
 (21, 0.1087407984779985),
 (23, 0.09960912187007279),
 (26, 0.09402601850031617),
 (27, 0.08254491422312218),
 (29, 0.10108380740332731),
 (30, 0.035800948239804835),
 (31, 0.06336218904407329),
 (32, 0.09960912187007279),
 (34, 0.08399841759985383),
 (35, 0.12435150274821744),
 (36, 0.06336218904407329),
 (37, 0.07486674099192812),
 (38, 0.24870300549643487),
 (39, 0.1087407984779985),
 (42, 0.056883161381780054),
 (43, 0.12983202201642732),
 (45, 0.24870300549643487),
 (46, 0.1074710296083965),
 

In [36]:
model = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary)

In [38]:
model.show_topic(0, 10)

[('로또', 0.014351159),
 ('당첨', 0.011130583),
 ('번호', 0.010881089),
 ('복권', 0.008077438),
 ('기자', 0.007732418),
 ('서울', 0.007702239),
 ('판매점', 0.006598958),
 ('대통령', 0.00644859),
 ('의원', 0.005309927),
 ('검찰', 0.0049346923)]

In [40]:
NUM_TOPICS = 3
NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]
    
    return corpus, dictionary

def print_topic_words(model):
    print("\nPrinting topic words.")
    for topic_id in range(model.num_topics):
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID: {}".format(topic_id))
        
        for topic_word, prob in topic_word_probs:
            print("\t{}\t{}".format(topic_word, prob))
            
        print("\n")

In [42]:
corpus, dictionary = build_doc_term_mat(tokenized_text)
model = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha="auto", eta="auto")
print_topic_words(model)

Building document-term matrix.

Printing topic words.
Topic ID: 0
	대통령	0.006458444986492395
	기자	0.006270280573517084
	의원	0.005691975355148315
	검찰	0.004489465616643429
	금지	0.004321454558521509
	코로나	0.003944133874028921
	사진	0.003918851725757122
	방법	0.003896558890119195
	민주당	0.003819718724116683
	서울	0.0035866815596818924
	혐의	0.0033416009973734617
	아이돌	0.0033056919928640127
	주택	0.003245334839448333
	수사	0.0032406356185674667
	국민	0.0031968061812222004
	선고	0.003151848679408431
	가입	0.0030638815369457006
	무단	0.0030092899687588215
	관련	0.002996420953422785
	공연	0.00299412221647799
	배포	0.00298882438801229
	비하인드	0.002974020317196846
	투표	0.002950745401903987
	후보	0.002942788414657116
	저작	0.002906152280047536
	지난해	0.002830454846844077
	순위	0.0026695493143051863
	총장	0.0026424552779644728
	삼성전자	0.0026280907914042473
	사건	0.002596677979454398


Topic ID: 1
	의원	0.008067059330642223
	기자	0.007271035574376583
	서울	0.006608653347939253
	장관	0.005729539319872856
	사진	0.005277465097606182
	국민	0.005123837850987911
	대통

In [63]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.display(data)