<a href="https://colab.research.google.com/github/dudwn98/iipl_topic_modeling/blob/main/Khaiii%EC%99%80_KoNLPy_%EC%84%A4%EC%B9%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Nanum Korean font package and rebuild the system font cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Remove matplotlib's cache directory -- presumably so that its cached font
# list is rebuilt and picks up the newly installed fonts (TODO confirm).
!rm ~/.cache/matplotlib -rf

In [None]:
# Clone the Khaiii sources from GitHub into ./khaiii (relative to the
# notebook's current working directory).
!git clone https://github.com/kakao/khaiii.git

In [None]:
# Install CMake through pip; the Khaiii build cell invokes `cmake`.
!pip install cmake

In [None]:
!mkdir build
!cd build && cmake /content/khaiii
!cd /content/build/ && make all
!cd /content/build/ && make resource
!cd /content/build && make install
!cd /content/build && make package_python
!pip install /content/build/package_python

In [None]:
pip install konlpy

In [None]:
# Import modules
import os
import pickle
import itertools
import numpy as np
import pandas as pd
from collections import Counter

# NLP modules
from konlpy.tag import Mecab, Okt
from khaiii import khaiii

# Visualization modules
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Factor Analysis (PCA)
from sklearn.decomposition import PCA

# ANOVA modules
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Chi square test
from scipy.stats import chi2_contingency

# Visualization settings
# Apply seaborn's "ticks" style with the "Set1" colour palette.
sns.set(style="ticks", palette="Set1")
# Use the NanumGothic font family for figure text (the fonts-nanum package is
# installed in the first cell) -- presumably so Korean labels render correctly.
# NOTE(review): kept after sns.set(), which may reset font settings -- confirm.
plt.rcParams["font.family"] = 'NanumGothic'

In [None]:
# Read the crawled results into a DataFrame.
original_dat = pd.read_csv('/content/data/results_crawl.csv')

# Preview the first rows with their column names instead of dumping the raw
# ndarray, which hides the header and is hard to read.
original_dat.head()

In [None]:
# Clone the Mecab-ko Colab installer scripts into ./Mecab-ko-for-Google-Colab
# (relative to the notebook's current working directory).
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
!bash install_mecab-ko_on_colab190912.sh

In [None]:
# Titles to analyse, taken from the 'title' column as a plain Python list.
main_topic = original_dat['title'].tolist()

# One analyzer instance per backend, created once and reused by later cells.
khaiii_api = khaiii.KhaiiiApi()
mecab = Mecab()
okt = Okt()

In [None]:
def khaiii_nouns(api, sentence):
    """Return the noun-like morphemes of ``sentence`` as analysed by ``api``.

    Args:
        api: Object exposing ``analyze(sentence)``; each returned item must
            have a ``morphs`` iterable whose elements carry ``tag`` and ``lex``.
        sentence: Text to analyse.

    Returns:
        list: ``lex`` of every morpheme whose ``tag`` starts with ``'N'``
        (presumably the noun-family tags of Khaiii's tag set), in analysis
        order. An empty list is returned when the analysis fails, e.g. on
        input the analyzer rejects.
    """
    try:
        return [
            seg.lex
            for parsed_words in api.analyze(sentence)
            for seg in parsed_words.morphs
            # startswith() also tolerates an empty tag instead of raising.
            if seg.tag.startswith('N')
        ]
    except Exception:
        # Best effort: a sentence that cannot be analysed contributes nothing.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return []

def khaiii_morphs(api, sentence):
    """Return every morpheme of ``sentence`` as analysed by ``api``.

    Args:
        api: Object exposing ``analyze(sentence)``; each returned item must
            have a ``morphs`` iterable whose elements carry ``lex``.
        sentence: Text to analyse.

    Returns:
        list: ``lex`` of every morpheme, in analysis order. An empty list is
        returned when the analysis fails, e.g. on input the analyzer rejects.
    """
    try:
        return [
            seg.lex
            for parsed_words in api.analyze(sentence)
            for seg in parsed_words.morphs
        ]
    except Exception:
        # Best effort: a sentence that cannot be analysed contributes nothing.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return []

def top_k_words(counter, p):
    """Select the entries of ``counter`` whose count is in the top ``p`` fraction.

    The cut-off is the ``(1 - p)`` quantile of all counts; every entry whose
    count is greater than or equal to it is kept, so ties at the cut-off can
    make the result larger than a strict ``p`` share.

    Args:
        counter: Mapping of word -> count (e.g. ``collections.Counter``).
        p: Fraction of the distribution to keep, e.g. ``0.1`` for the top 10%.

    Returns:
        list: ``(word, count)`` tuples in the mapping's iteration order; empty
        when ``counter`` is empty.
    """
    if not counter:
        # Nothing to rank; also avoids taking a quantile of an empty sequence.
        return []
    # The threshold does not depend on the item, so compute it once instead of
    # once per entry.
    threshold = np.quantile(list(counter.values()), 1 - p)
    return [(word, count) for word, count in counter.items() if count >= threshold]

In [None]:
# Flatten the per-title token lists into one flat list per tokenizer.
# Morphemes of every title (Okt and Khaiii).
main_topic_morphs = list(itertools.chain.from_iterable(okt.morphs(x) for x in main_topic))
main_topic_morphs_khaiii = list(itertools.chain.from_iterable(khaiii_morphs(khaiii_api, x) for x in main_topic))
# Nouns of every title (Okt and Khaiii). The Okt variant must call okt.nouns();
# it previously called okt.morphs() and so merely duplicated main_topic_morphs.
main_topic_nouns = list(itertools.chain.from_iterable(okt.nouns(x) for x in main_topic))
main_topic_nouns_khaiii = list(itertools.chain.from_iterable(khaiii_nouns(khaiii_api, x) for x in main_topic))

In [None]:
# Tally how often each Khaiii-extracted noun occurs across all titles.
main_counter = Counter(main_topic_nouns_khaiii)

# Keep the words whose count reaches the 0.9-quantile cut-off (top 10%).
main_top_list = top_k_words(main_counter, p=0.1)

In [None]:
# Compare the POS tagging of both analyzers on the title at index 3.
print('품사 태깅 결과', 'KoNLPy POS Tagging:', sep='\n')
print(okt.pos(main_topic[3]))
# Separator line followed by one blank line.
print('-'*100, end='\n\n')

print('Khaiii POS Tagging:')
# Khaiii yields one analysed word at a time; print each on its own line.
for word in khaiii_api.analyze(main_topic[3]):
    print(word)

In [None]:
# Configure a square, white-background word cloud rendered with NanumGothic.
wordcloud = WordCloud(
    background_color="white",
    width=700,
    height=700,
    font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf',
)

# Size the words by the (word, count) pairs selected as top words.
wordcloud = wordcloud.generate_from_frequencies(dict(main_top_list))

# Draw the cloud as an image with a title and no axes.
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title('주요 단어 빈도 Wordcloud \n', size=30)
plt.axis("off")
plt.show()