## Topic Modeling NAVER article with gensim

> Process the text to apply topic modeling.

> Try gensim, a topic modeling library.

### 1. Load the gensim library for topic modeling

In [15]:
from tqdm.notebook import tqdm # progress bar
from konlpy.tag import Mecab # Load Stemmer like Mecab, Okt, etc.
import string # special characters
import warnings # library for removing warning notifications
from gensim import corpora # Import vectorizer module and LDA model used by gensim.
from gensim import models

import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning) # Ignore any warning notifications

### 2. Create the text preprocessing function

In [16]:
def read_documents(input_file_name):
    
    corpus = []
    
    # Reads a PK file, converts it to a list, and returns it
    with open(input_file_name,'rb') as f :
        temp_corpus = pickle.load(f)
    for page in temp_corpus:
        corpus += page
    
    return corpus

def text_cleaning(docs):
    # Let's modify the function to remove non-Korean characters for convenience
    cleaned_docs = []
    for doc in docs:
        temp_doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc)
        cleaned_docs.append(temp_doc)
        
    return cleaned_docs

def define_stopwords(path):
    
    SW = set()
    # How to add a stopword 1
    # Add special characters
    for i in string.punctuation:
        SW.add(i)
    
    # How to add a stopword 2
    # Add directly to stopwords-en.txt
    
    with open(path) as f:
        for word in f:
            SW.add(word)

    return SW


def text_tokenizing(corpus, tokenizer):

    # Create a function that allows you to choose between noun extraction and stemming
    mecab = Mecab()
    token_corpus = []
    
    #Let's make our progress visible using tqdm
    if tokenizer == "noun":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = mecab.nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
                
            token_corpus.append(token_text)
            
    elif tokenizer == "morph":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = mecab.morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)

    elif tokenizer == "word":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
        

    return token_corpus

# The (main) code to call the function
input_file_name = "/Users/shim/dl-python-SentimentAnalysis/Crawling /naver_news_content.pk"
documents = read_documents(input_file_name)
SW = define_stopwords("/Users/shim/dl-python-SentimentAnalysis/Konlpy/stopwords-ko.txt")
cleaned_text = text_cleaning(documents)
tokenized_text = text_tokenizing(cleaned_text, tokenizer="noun") #tokenizer= "noun" or "morph" or "word"

Preprocessing:   0%|          | 0/11 [00:00<?, ?it/s]

The process of reading a document is no different from the previous case of word embedding. The next step is to create the document-word matrix.

In [17]:
# Print the result
print(tokenized_text[0])

['자연어', '코딩', '시대', '대비', '대체', '가능', '개발자', '육성', '재양', '프로그램', '교육', '개월', '합숙', '훈련', '돌입', '개월', '합숙', '기반', '협업', '자기', '주도', '학습', '기본기', '개발자', '양성', '소프트웨어', '양성', '프로그램', '래프', '정글', '정글', '기획', '운영', '래프', '장병규', '사회', '장사진', '김정한', '원장', '서울대', '시흥', '캠퍼스', '소식', '파이낸셜', '뉴스', '인터뷰', '코딩', '능력', '코드', '인공지능', '기반', '개발', '도구', '활용', '성과', '전산학', '기본기', '프로그래밍', '종말론', '제기', '정도', '파이썬', '프로그래밍', '언어', '자연어', '입력', '코딩', '시대', '가운데', '대체', '성과', '기본기', '공통', '진단', '활용', '성과', '슈퍼', '개발자', '가능', '개발자', '양극', '현상', '주목', '지적', '엔지니어', '기본기', '무엇', '의장', '컴퓨터', '엔지니어', '컴퓨터', '이해', '성과', '엔지니어', '기본기', '핵심', '오퍼레이팅', '시스템', '강조', '정글', '이스트', '핀토스', '고난', '프로젝트', '정글', '커리큘럼', '마련', '개발자', '커리어', '전환', '희망', '지원자', '선발', '육성', '래프', '사회', '책임', '프로그램', '산업', '개발자', '구인난', '해소', '청년', '고용', '창출', '기여', '정글', '단계', '규모', '확대', '목표', '배출', '교육', '합격', '연령', '중반', '개월', '모집', '과정', '선발', '교육', '개월', '합숙', '교육', '돌입', '교육', '과정', '자료', '구조', '알고리즘', '서버', '운영', '체제', '교육', '프로젝트', '나

### 3. Identify functions to use for topic modeling

In [18]:
# Create a document-word matrix
# Learn a vocabulary
dictionary  = corpora.Dictionary(tokenized_text)
# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in tokenized_text] # is like countvectorizer

In [19]:
# Verify Dictionary
print(dictionary)

Dictionary<342 unique tokens: ['가능', '가운데', '가이드', '강의', '강조']...>


In [20]:
# Verify corpus 
corpus[0][:5]

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]

In [21]:
# Create a TFIDF document-word matrix
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[0][:5]

[(0, 0.1080277676265431),
 (1, 0.05401388381327155),
 (2, 0.05401388381327155),
 (3, 0.041166973856661404),
 (4, 0.041166973856661404)]

In [22]:
# Create LDA model
model = models.ldamodel.LdaModel(corpus,num_topics=3, id2word=dictionary)

In [23]:
# Verify the result of LDA 
model.show_topic(2,10)

[('교육', 0.030630322),
 ('캠프', 0.0240161),
 ('웍스', 0.023783728),
 ('수업', 0.020658387),
 ('네트', 0.019673763),
 ('사회', 0.019657675),
 ('올리브', 0.018687256),
 ('코딩', 0.018185265),
 ('학생', 0.017440354),
 ('프로그램', 0.012963635)]

### 4.Adding topic modeling to complete the code

In [24]:
# Add variables to determine the number of topics and keywords
NUM_TOPICS = 3
NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    # Create a document-word matrix function
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    return corpus, dictionary


def print_topic_words(model):

    # Functions that output the results of topic modeling
    print("\nPrinting topic words.\n")

    for topic_id in range(model.num_topics):
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID:{}".format(topic_id))

        for topic_word, prob in topic_word_probs:
            print("\t{}\t{}".format(topic_word,prob))

        print("\n")

# Create a document-term matrix,
corpus, dictionary = build_doc_term_mat(tokenized_text)

# Run LDA
model = models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS , id2word = dictionary , alpha = "auto", 
                                eta = "auto")
# output the result
print_topic_words(model)


Building document-term matrix.

Printing topic words.

Topic ID:0
	교육	0.03584854304790497
	네트	0.026227917522192
	웍스	0.026089496910572052
	올리브	0.025210974738001823
	사회	0.02299782633781433
	캠프	0.02280508354306221
	코딩	0.019223717972636223
	수업	0.016512738540768623
	학생	0.016118500381708145
	중학교	0.015838325023651123
	해결	0.015302609652280807
	로봇	0.015248022973537445
	문제	0.014668306335806847
	프로그램	0.013692420907318592
	커리큘럼	0.01259820256382227
	지역	0.01256298553198576
	레고	0.01138236466795206
	제주	0.011142410337924957
	활용	0.010378992184996605
	과정	0.009834756143391132
	지원	0.009772478602826595
	환경	0.00956029910594225
	올해	0.00937320664525032
	도입	0.009344318881630898
	대상	0.009093319065868855
	기술	0.008763272315263748
	파이썬	0.008414496667683125
	주제	0.008196600712835789
	참여	0.008177914656698704
	진행	0.008140284568071365


Topic ID:1
	교육	0.023565499112010002
	수업	0.023048026487231255
	웍스	0.020543357357382774
	사회	0.020526057109236717
	코딩	0.019396208226680756
	캠프	0.016711032018065453
	올리브	0.01662822999060154


### 5. Visualizing the results of topic modeling with pyLDAvis

In [25]:
!pip install pyldavis



In [26]:
# load pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# Enable pyLDAvis to run in jupyter notebook.
pyLDAvis.enable_notebook()

# Run pyLDAvis
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data

  default_term_info = default_term_info.sort_values(
