In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora, models

# 필요 시 아래 주석 제거하여 다운로드
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

def preprocess_text(text):
    if pd.isnull(text):
        return []
    if isinstance(text, list):
        text = ' '.join(text)
    if isinstance(text, str) and ('<' in text and '>' in text):
        text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"\$.*?\$", "", text)
    text = re.sub(r"\\\(.*?\\\)", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]

    pos_tags = pos_tag(tokens)
    allowed = {'NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ'}
    filtered = [w for w,pos in pos_tags if pos in allowed]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in filtered]

def run_lda_by_year_tfidf(data_path,
                          year_column='year',
                          text_columns=['title','abstract','keywords'],
                          num_topics=20,
                          passes=15,
                          random_state=42,
                          no_below=5,
                          no_above=0.9):
    df = pd.read_csv(data_path)
    # 연도 컬럼이 날짜인 경우 연도로 변환
    if year_column in df.columns and df[year_column].dtype == 'object' and pd.to_datetime(df[year_column], errors='coerce').notna().any():
        df['year'] = pd.to_datetime(df[year_column], errors='coerce').dt.year
        year_column = 'year'
    years = sorted(df[year_column].dropna().astype(int).unique())
    models_by_year = {}

    for year in years:
        year_df = df[df[year_column] == year].fillna('')
        docs = [
            preprocess_text(' '.join([str(year_df[col].iloc[i]) for col in text_columns]))
            for i in range(len(year_df))
        ]
        if not any(docs):
            continue

        dictionary = corpora.Dictionary(docs)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        bow = [dictionary.doc2bow(doc) for doc in docs]
        tfidf = models.TfidfModel(bow)
        tfidf_corpus = tfidf[bow]

        lda = models.LdaModel(corpus=tfidf_corpus,
                              id2word=dictionary,
                              num_topics=num_topics,
                              passes=passes,
                              random_state=random_state)
        models_by_year[year] = {'model': lda, 'dictionary': dictionary}

    return models_by_year

def print_topics_by_year(models_by_year, topn=10):
    for year, data in sorted(models_by_year.items()):
        lda = data['model']
        print(f"\n===== {year} 년도 토픽 모델링 ({lda.num_topics} topics) =====")
        for t in range(lda.num_topics):
            terms = lda.show_topic(t, topn)
            topic_terms = ', '.join([f"{w} ({prob:.3f})" for w, prob in terms])
            print(f"토픽 {t+1:02d}: {topic_terms}")

if __name__ == '__main__':
    path = 'data/02_journal_2020_2025.csv'
    lda_models = run_lda_by_year_tfidf(
        data_path=path,
        year_column='date',               # 원본에 'date' 컬럼이 있으면 연도로 변환합니다
        text_columns=['title','abstract','keywords'],
        num_topics=20,
        passes=10,
        no_below=5,
        no_above=0.9
    )
    print_topics_by_year(lda_models, topn=10)


===== 2020 년도 토픽 모델링 (20 topics) =====
토픽 01: news (0.015), market (0.010), firm (0.010), medium (0.007), crowdsourcing (0.007), consumption (0.006), crowd (0.006), standard (0.006), platform (0.006), norm (0.006)
토픽 02: training (0.008), speech (0.007), satisfaction (0.007), auction (0.007), maturity (0.006), sale (0.005), significance (0.005), automated (0.005), mitigation (0.005), quality (0.005)
토픽 03: sharing (0.008), database (0.008), game (0.007), option (0.007), safety (0.006), gaming (0.006), forum (0.006), attribute (0.006), security (0.006), device (0.006)
토픽 04: chatbots (0.011), worker (0.008), service (0.007), manufacturing (0.006), sensitivity (0.006), agency (0.006), right (0.005), convenience (0.005), exploration (0.004), period (0.004)
토픽 05: gamification (0.014), participation (0.009), rating (0.008), group (0.008), decision (0.007), reputation (0.007), game (0.007), activity (0.006), disaster (0.006), choice (0.006)
토픽 06: blockchain (0.020), trust (0.008), technol

In [2]:
if __name__ == '__main__':
    path = 'data/02_article_2020_2025.csv'
    lda_models = run_lda_by_year_tfidf(
        data_path=path,
        year_column='date',               # 원본에 'date' 컬럼이 있으면 연도로 변환합니다
        text_columns=['title','content','keywords'],
        num_topics=20,
        passes=10,
        no_below=5,
        no_above=0.9
    )
    print_topics_by_year(lda_models, topn=10)


===== 2020 년도 토픽 모델링 (20 topics) =====
토픽 01: reply (0.004), death (0.003), mac (0.003), chatbots (0.003), sensing (0.002), fashion (0.002), promising (0.002), labeled (0.002), finished (0.002), classification (0.002)
토픽 02: farm (0.006), crop (0.005), agriculture (0.004), commission (0.004), lawmaker (0.004), automate (0.004), rpa (0.003), india (0.003), habit (0.003), plant (0.003)
토픽 03: trump (0.003), sport (0.003), rekognition (0.003), shareholder (0.003), telecom (0.002), attack (0.002), rob (0.002), radiation (0.002), opening (0.002), pitching (0.002)
토픽 04: musk (0.015), neuralink (0.015), elon (0.010), vehicle (0.008), mobility (0.005), brain (0.004), telecom (0.004), rover (0.003), tesla (0.003), kit (0.003)
토픽 05: clearview (0.017), dynamic (0.013), boston (0.012), spot (0.012), police (0.011), study (0.007), patient (0.006), privacy (0.006), health (0.005), doctor (0.005)
토픽 06: actor (0.003), scraped (0.003), entirety (0.003), fascinating (0.003), wealth (0.002), manufact