## Topic Modeling NAVER article with gensim

> Process the text to apply topic modeling.

> Try gensim, a topic modeling library.

### 1. Load the gensim library for topic modeling

In [1]:
from tqdm.notebook import tqdm # progress bar
from konlpy.tag import Mecab # Load Stemmer like Mecab, Okt, etc.
import string # special characters
import warnings # library for removing warning notifications
from gensim import corpora # Import vectorizer module and LDA model used by gensim.
from gensim import models

import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning) # Ignore any warning notifications

### 2. Create the text preprocessing function

In [2]:
def read_documents(input_file_name):
    
    corpus = []
    
    # Reads a PK file, converts it to a list, and returns it
    with open(input_file_name,'rb') as f :
        temp_corpus = pickle.load(f)
    for page in temp_corpus:
        corpus += page
    
    return corpus

def text_cleaning(docs):
    # Let's modify the function to remove non-Korean characters for convenience
    cleaned_docs = []
    for doc in docs:
        temp_doc = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", doc)
        cleaned_docs.append(temp_doc)
        
    return cleaned_docs

def define_stopwords(path):
    
    SW = set()
    # How to add a stopword 1
    # Add special characters
    for i in string.punctuation:
        SW.add(i)
    
    # How to add a stopword 2
    # Add directly to stopwords-en.txt
    
    with open(path) as f:
        for word in f:
            SW.add(word)

    return SW


def text_tokenizing(corpus, tokenizer):

    # Create a function that allows you to choose between noun extraction and stemming
    mecab = Mecab()
    token_corpus = []
    
    #Let's make our progress visible using tqdm
    if tokenizer == "noun":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = mecab.nouns(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
                
            token_corpus.append(token_text)
            
    elif tokenizer == "morph":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = mecab.morphs(corpus[n])
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)

    elif tokenizer == "word":
        for n in tqdm(range(len(corpus)), desc="Preprocessing"):
            token_text = corpus[n].split()
            token_text = [word for word in token_text if word not in SW and len(word) > 1]
            token_corpus.append(token_text)
        

    return token_corpus

# The (main) code to call the function
input_file_name = "/Users/shim/dl-python-SentimentAnalysis/Crawling /naver_news_content.pk"
documents = read_documents(input_file_name)
SW = define_stopwords("/Users/shim/dl-python-SentimentAnalysis/Konlpy/stopwords-ko.txt")
cleaned_text = text_cleaning(documents)
tokenized_text = text_tokenizing(cleaned_text, tokenizer="noun") #tokenizer= "noun" or "morph" or "word"

Preprocessing:   0%|          | 0/11 [00:00<?, ?it/s]

The process of reading a document is no different from the previous case of word embedding. The next step is to create the document-word matrix.

In [3]:
# Print the result
print(tokenized_text[0])

['자연어', '코딩', '시대', '대비', '대체', '가능', '개발자', '육성', '재양', '프로그램', '교육', '개월', '합숙', '훈련', '돌입', '개월', '합숙', '기반', '협업', '자기', '주도', '학습', '기본기', '개발자', '양성', '소프트웨어', '양성', '프로그램', '래프', '정글', '정글', '기획', '운영', '래프', '장병규', '사회', '장사진', '김정한', '원장', '서울대', '시흥', '캠퍼스', '소식', '파이낸셜', '뉴스', '인터뷰', '코딩', '능력', '코드', '인공지능', '기반', '개발', '도구', '활용', '성과', '전산학', '기본기', '프로그래밍', '종말론', '제기', '정도', '파이썬', '프로그래밍', '언어', '자연어', '입력', '코딩', '시대', '가운데', '대체', '성과', '기본기', '공통', '진단', '활용', '성과', '슈퍼', '개발자', '가능', '개발자', '양극', '현상', '주목', '지적', '엔지니어', '기본기', '무엇', '의장', '컴퓨터', '엔지니어', '컴퓨터', '이해', '성과', '엔지니어', '기본기', '핵심', '오퍼레이팅', '시스템', '강조', '정글', '이스트', '핀토스', '고난', '프로젝트', '정글', '커리큘럼', '마련', '개발자', '커리어', '전환', '희망', '지원자', '선발', '육성', '래프', '사회', '책임', '프로그램', '산업', '개발자', '구인난', '해소', '청년', '고용', '창출', '기여', '정글', '단계', '규모', '확대', '목표', '배출', '교육', '합격', '연령', '중반', '개월', '모집', '과정', '선발', '교육', '개월', '합숙', '교육', '돌입', '교육', '과정', '자료', '구조', '알고리즘', '서버', '운영', '체제', '교육', '프로젝트', '나

### 3. Identify functions to use for topic modeling

In [4]:
# Create a document-word matrix
# Learn a vocabulary
dictionary  = corpora.Dictionary(tokenized_text)
# Create a document-term matrix
corpus = [dictionary.doc2bow(text) for text in tokenized_text] # is like countvectorizer

In [9]:
# Verify Dictionary
print(dictionary)

Dictionary<342 unique tokens: ['가능', '가운데', '가이드', '강의', '강조']...>


In [11]:
# Verify corpus 
corpus[0][:5]

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1)]

In [12]:
# Create a TFIDF document-word matrix
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
corpus_tfidf[0][:5]

[(0, 0.1080277676265431),
 (1, 0.05401388381327155),
 (2, 0.05401388381327155),
 (3, 0.041166973856661404),
 (4, 0.041166973856661404)]

In [15]:
# Create LDA model
model = models.ldamodel.LdaModel(corpus,num_topics=3, id2word=dictionary)

In [17]:
# Verify the result of LDA 
model.show_topic(2,10)

[('교육', 0.032709572),
 ('사회', 0.026290283),
 ('수업', 0.02454356),
 ('올리브', 0.02402457),
 ('코딩', 0.022991514),
 ('웍스', 0.022338945),
 ('네트', 0.020883633),
 ('캠프', 0.020300325),
 ('중학교', 0.01706676),
 ('학생', 0.01604399)]

### 4.Adding topic modeling to complete the code

In [19]:
# Add variables to determine the number of topics and keywords
NUM_TOPICS = 3
NUM_TOPIC_WORDS = 30

def build_doc_term_mat(documents):
    # Create a document-word matrix function
    print("Building document-term matrix.")
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(document) for document in documents]

    return corpus, dictionary


def print_topic_words(model):

    # Functions that output the results of topic modeling
    print("\nPrinting topic words.\n")

    for topic_id in range(model.num_topics):
        topic_word_probs = model.show_topic(topic_id, NUM_TOPIC_WORDS)
        print("Topic ID:{}".format(topic_id))

        for topic_word, prob in topic_word_probs:
            print("\t{}\t{}".format(topic_word,prob))

        print("\n")

# Create a document-term matrix,
corpus, dictionary = build_doc_term_mat(tokenized_text)

# Run LDA
model = models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS , id2word = dictionary , alpha = "auto", 
                                eta = "auto")
# output the result
print_topic_words(model)


Building document-term matrix.

Printing topic words.

Topic ID:0
	교육	0.03205902874469757
	네트	0.024408437311649323
	코딩	0.022968394681811333
	올리브	0.02290395088493824
	웍스	0.02217511087656021
	사회	0.019853178411722183
	수업	0.017888279631733894
	중학교	0.01741364412009716
	캠프	0.017183668911457062
	학생	0.01649763621389866
	커리큘럼	0.015212050639092922
	프로그램	0.013360374607145786
	과정	0.012681773863732815
	로봇	0.012678198516368866
	지역	0.011723075062036514
	레고	0.011592663824558258
	올해	0.011523707769811153
	문제	0.011290381662547588
	해결	0.01119607500731945
	지원	0.010355943813920021
	제주	0.009912112727761269
	충북	0.009749880991876125
	기술	0.009683496318757534
	대상	0.0088921207934618
	환경	0.008875895291566849
	주제	0.008509043604135513
	활용	0.00834006816148758
	도입	0.008178863674402237
	제공	0.008163277059793472
	참여	0.008067055605351925


Topic ID:1
	캠프	0.02726460061967373
	웍스	0.025471234694123268
	교육	0.02478213608264923
	사회	0.024059046059846878
	올리브	0.02402391843497753
	네트	0.02270052395761013
	수업	0.022065849974751472
	코

### 5. Visualizing the results of topic modeling with pyLDAvis

In [20]:
!pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m221.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting joblib>=1.2.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting numexpr
  Downloading numexpr-2.8.4-cp39-cp39-macosx_10_9_x86_64.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m291.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: funcy, numexpr, joblib, pyldavis
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.1
    Uninstalling joblib-1.1.1:
      Successfully uninstalled joblib-1.1.1
Successfully installed funcy-2.0 joblib-1.2.0 numexpr-2.8.4 pyldavis-3.4.0


In [21]:
# load pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# Enable pyLDAvis to run in jupyter notebook.
pyLDAvis.enable_notebook()

# Run pyLDAvis
data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
data

  default_term_info = default_term_info.sort_values(
