In [4]:
import pandas as pd

journal=pd.read_csv('data/02_journal_2020_2025.csv')
articles=pd.read_csv('data/03_article_2020_2025.csv')

In [5]:
print(journal.value_counts(subset=['date']))

date
2024    1275
2021    1222
2022    1222
2023    1210
2020    1127
2025     724
Name: count, dtype: int64


In [6]:
print(articles.value_counts(subset=['date']))

date  
2024.0    3176
2023.0    2699
2025.0    2457
2022.0     704
2021.0     676
2020.0     641
Name: count, dtype: int64


# 토픽모델링 돌리기(년도별로) - 2025.07.15

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora, models
import nltk

# 필요 시 아래 주석 제거하여 다운로드
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# 🔸 전처리 함수 (토큰 리스트 반환)
def preprocess_text(text):
    if pd.isnull(text):
        return []
    if isinstance(text, list):
        text = ' '.join(text)
    if isinstance(text, str) and ('<' in text and '>' in text):
        text = BeautifulSoup(text, "html.parser").get_text()
    # 수식, 특수문자 제거
    text = re.sub(r"\$.*?\$", "", text)
    text = re.sub(r"\\\(.*?\\\)", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower()

    # 토큰화
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]

    # 품사 필터링 (명사, 동사 계열)
    pos_tags = pos_tag(tokens)
    allowed = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    filtered = [w for w, pos in pos_tags if pos in allowed]

    # 표제어
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(w) for w in filtered]
    return lemma_tokens

# 🔸 연도별 LDA TF-IDF 파이프라인 함수
def run_lda_by_year_tfidf(
    data_path,
    year_column='year',
    text_columns=['title', 'abstract', 'keywords'],
    num_topics=20,
    passes=15,
    random_state=42,
    no_below=5,
    no_above=0.9
):
    df = pd.read_csv(data_path)
    years = sorted(df[year_column].dropna().unique())
    print(f"🔎 Processing years: {years}")

    models_by_year = {}

    for year in years:
        print(f"\n📅 Year: {year}")
        year_df = df[df[year_column] == year].fillna('')
        merged = year_df[text_columns].agg(' '.join, axis=1).tolist()
        docs = [preprocess_text(txt) for txt in merged]
        if not any(docs):
            print(f"⚠️ No docs for {year}")
            continue

        # 사전 및 BoW
        dictionary = corpora.Dictionary(docs)
        # 빈도 기반 필터링: 최소 no_below, 최대 no_above 비율
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

        # TF-IDF
        tfidf = models.TfidfModel(bow_corpus)
        tfidf_corpus = tfidf[bow_corpus]

        # LDA
        lda = models.LdaModel(
            corpus=tfidf_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            random_state=random_state
        )

        print(f"🔹 LDA Topics for {year} (topics={num_topics}):")
        for idx, topic in lda.print_topics(-1):
            print(f"Topic {idx}: {topic}")

        models_by_year[year] = {
            'model': lda,
            'dictionary': dictionary,
            'tfidf': tfidf
        }

    return models_by_year

# ✨ 사용 예시
if __name__ == '__main__':
    path = 'data/02_journal_2020_2025.csv'
    lda_results = run_lda_by_year_tfidf(
        data_path=path,
        year_column='date',
        text_columns=['title', 'abstract', 'keywords'],
        num_topics=10,
        passes=10,
        no_below=5,
        no_above=0.9
    )
    # 결과 활용: lda_results[2021]['model'] 등


🔎 Processing years: [2020, 2021, 2022, 2023, 2024, 2025]

📅 Year: 2020
🔹 LDA Topics for 2020 (topics=10):
Topic 0: 0.006*"data" + 0.005*"platform" + 0.005*"system" + 0.005*"model" + 0.004*"learning" + 0.004*"design" + 0.004*"information" + 0.004*"research" + 0.004*"service" + 0.004*"firm"
Topic 1: 0.010*"gamification" + 0.009*"brand" + 0.006*"contest" + 0.005*"chatbot" + 0.005*"engagement" + 0.004*"dashboard" + 0.004*"flow" + 0.004*"medium" + 0.004*"identity" + 0.003*"weight"
Topic 2: 0.005*"product" + 0.005*"protection" + 0.005*"website" + 0.004*"workplace" + 0.004*"advice" + 0.004*"wearable" + 0.004*"explainability" + 0.004*"fake" + 0.004*"controller" + 0.003*"entry"
Topic 3: 0.009*"sharing" + 0.006*"mhealth" + 0.005*"senior" + 0.004*"policy" + 0.004*"ict" + 0.004*"targeting" + 0.003*"participation" + 0.003*"continuance" + 0.003*"disclosure" + 0.003*"offline"
Topic 4: 0.008*"fund" + 0.006*"covid" + 0.005*"auction" + 0.004*"investor" + 0.004*"asset" + 0.004*"sharing" + 0.004*"option" 

In [3]:
# ✨ 사용 예시
path = 'data/03_article_2020_2025.csv'
lda_results2 = run_lda_by_year_tfidf(
    data_path=path,
    year_column='date',
    text_columns=['title', 'content', 'keywords'],
    num_topics=10,
    passes=10,
    no_below=5,
    no_above=0.9)


🔎 Processing years: [2020.0, 2021.0, 2022.0, 2023.0, 2024.0, 2025.0]

📅 Year: 2020.0
🔹 LDA Topics for 2020.0 (topics=10):
Topic 0: 0.004*"music" + 0.004*"song" + 0.004*"eurovision" + 0.003*"balloon" + 0.003*"contest" + 0.003*"lyric" + 0.002*"navigation" + 0.002*"genre" + 0.002*"reinforcement" + 0.001*"semiconductor"
Topic 1: 0.005*"symptom" + 0.004*"azure" + 0.003*"quantum" + 0.002*"scam" + 0.002*"jackson" + 0.002*"reply" + 0.002*"agriculture" + 0.002*"prescription" + 0.002*"advice" + 0.001*"openai"
Topic 2: 0.005*"deepfakes" + 0.004*"deepfake" + 0.004*"gpt" + 0.003*"court" + 0.003*"minister" + 0.002*"attack" + 0.002*"journalist" + 0.002*"grade" + 0.002*"cat" + 0.002*"official"
Topic 3: 0.003*"robot" + 0.003*"google" + 0.003*"data" + 0.003*"facebook" + 0.002*"image" + 0.002*"technology" + 0.002*"robotics" + 0.002*"china" + 0.002*"recognition" + 0.002*"user"
Topic 4: 0.002*"clothes" + 0.002*"walmart" + 0.002*"continent" + 0.002*"inspection" + 0.002*"cough" + 0.002*"inventory" + 0.002*"l