## 라이브러리 로드

In [None]:
# !pip install koreanize-matplotlib

In [None]:
# 필요 라이브러리를 로드합니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import koreanize_matplotlib

## 데이터 로드
* [KLUE Benchmark](https://klue-benchmark.com/)

In [None]:
# Load the dataset from the URL with read_json.
url = "https://raw.githubusercontent.com/KLUE-benchmark/KLUE/main/klue_benchmark/ynat-v1.1/ynat-v1.1_train.json"
df = pd.read_json(url)
# Show (rows, columns) of the loaded frame as the cell output.
df.shape

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Keep only the rows whose label is the "생활문화" category.
df = df.loc[df["label"].eq("생활문화")]
df.shape

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer with default settings; it is fitted in a later cell.
tfidfvect = TfidfVectorizer()
# Display the (unfitted) vectorizer as the cell output.
tfidfvect

In [None]:
# Learn the vocabulary from the non-null titles and build the TF-IDF
# document-term matrix in one step.
dtm_tfidf = tfidfvect.fit_transform(df["title"].dropna())
dtm_tfidf

In [None]:
# Materialize the TF-IDF matrix as a dense DataFrame with one column per term.
df_dtm = pd.DataFrame(
    data=dtm_tfidf.toarray(),
    columns=tfidfvect.get_feature_names_out(),
)
# Bar chart of the 50 terms with the largest summed TF-IDF weight.
df_dtm.sum(axis=0).nlargest(n=50).plot(kind="bar", figsize=(15, 4))

## LDA 토픽모델링

In [None]:
# Visualize the top keywords of each topic as horizontal bar charts.
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the top-weighted words per topic.

    Args:
        model: Fitted model exposing ``components_``, iterated as one
            weight vector per topic.
        feature_names: Indexable collection mapping a position in a topic's
            weight vector to its term.
        n_top_words: Number of highest-weighted terms to show per topic.
        title: Figure-level title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    n_topics = len(model.components_)
    # One column per topic instead of a fixed 5, so any number of topics
    # fits. 6 inches per column keeps the original 30x15 figure for 5 topics.
    # squeeze=False always yields a 2-D axes array, even for a single topic.
    fig, axes = plt.subplots(
        1, n_topics, figsize=(6 * n_topics, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        # Put the highest-weighted word at the top of the chart.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right"):
            ax.spines[side].set_visible(False)

    # The title is the same for every topic, so set it once.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# LDA_model

from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract.
NUM_TOPICS = 5
LDA_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)
# Fit the LDA model on the TF-IDF document-term matrix.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer) rather
# than TF-IDF weights — confirm that using dtm_tfidf here is intentional.
LDA_model.fit(dtm_tfidf)

In [None]:
# Number of keywords shown for each topic.
n_top_words = 20

# Plot the top keywords of every LDA topic.
plot_top_words(
    model=LDA_model,
    feature_names=tfidfvect.get_feature_names_out(),
    n_top_words=n_top_words,
    title="Topics in LDA model (LatentDirichletAllocation)",
)

## NMF 토픽모델링

In [None]:
# nmf_model
from sklearn.decomposition import NMF

# Take the sizes from the matrix that is actually factorized, so the log
# message below is accurate (they used to be hard-coded to 2000 and 1000,
# values that were never tied to the data).
n_samples, n_features = df_dtm.shape
n_components = 5
n_top_words = 20

# Fit the NMF model
print(
    "Fitting the NMF model (Frobenius norm) with tf-idf features, "
    f"n_samples={n_samples} and n_features={n_features}..."
)

# NOTE(review): l1_ratio presumably has no effect while the regularization
# strength (alpha) is left at its default — confirm against the NMF docs.
nmf_model = NMF(n_components=n_components, random_state=42, l1_ratio=0.5).fit(df_dtm)

In [None]:
# Plot the top keywords of every NMF topic.
plot_top_words(
    model=nmf_model,
    feature_names=tfidfvect.get_feature_names_out(),
    n_top_words=n_top_words,
    title="Topics in NMF model (Frobenius norm)",
)