In [1]:
import pandas as pd

journal=pd.read_csv('data/02_journal_2020_2025.csv')
articles=pd.read_csv('data/04_article_2020_2025.csv')

In [2]:
journal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6780 entries, 0 to 6779
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         6780 non-null   object
 1   date          6780 non-null   int64 
 2   abstract      6780 non-null   object
 3   keywords      6780 non-null   object
 4   authors       6774 non-null   object
 5   affiliations  6780 non-null   object
dtypes: int64(1), object(5)
memory usage: 317.9+ KB


In [3]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14032 entries, 0 to 14031
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         14032 non-null  object 
 1   content       14032 non-null  object 
 2   date          14032 non-null  float64
 3   affiliations  14032 non-null  object 
 4   keywords      7604 non-null   object 
dtypes: float64(1), object(4)
memory usage: 548.3+ KB


In [4]:
print(journal.value_counts(subset=['date']))

date
2024    1275
2021    1222
2022    1222
2023    1210
2020    1127
2025     724
Name: count, dtype: int64


In [5]:
print(articles.value_counts(subset=['affiliations']))

affiliations       
techcrunch             3407
verge                  2606
the guardian           2420
BBC                    1864
Wall Street Journal     962
Techcrunch              921
New York Times          769
the_verge               670
CNN                     413
Name: count, dtype: int64


# 토픽모델링 돌리기(년도별로) - 2025.07.15

In [12]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora, models
import nltk

# 필요 시 아래 주석 제거하여 다운로드
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# 🔸 전처리 함수 (토큰 리스트 반환)
def preprocess_text(text):
    if pd.isnull(text):
        return []
    if isinstance(text, list):
        text = ' '.join(text)
    if isinstance(text, str) and ('<' in text and '>' in text):
        text = BeautifulSoup(text, "html.parser").get_text()
    # 수식, 특수문자 제거
    text = re.sub(r"\$.*?\$", "", text)
    text = re.sub(r"\\\(.*?\\\)", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = text.lower()

    # 토큰화
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]

    # 품사 필터링 (명사, 동사 계열)
    pos_tags = pos_tag(tokens)
    allowed = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    filtered = [w for w, pos in pos_tags if pos in allowed]

    # 표제어
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(w) for w in filtered]
    return lemma_tokens

# 🔸 연도별 LDA TF-IDF 파이프라인 함수
def run_lda_by_year_tfidf(
    data_path,
    year_column='year',
    text_columns=['title', 'abstract', 'keywords'],
    num_topics=20,
    passes=15,
    random_state=42,
    no_below=5,
    no_above=0.9
):
    df = pd.read_csv(data_path)
    years = sorted(df[year_column].dropna().unique())
    print(f"🔎 Processing years: {years}")

    models_by_year = {}

    for year in years:
        print(f"\n📅 Year: {year}")
        year_df = df[df[year_column] == year].fillna('')
        merged = year_df[text_columns].agg(' '.join, axis=1).tolist()
        docs = [preprocess_text(txt) for txt in merged]
        if not any(docs):
            print(f"⚠️ No docs for {year}")
            continue

        # 사전 및 BoW
        dictionary = corpora.Dictionary(docs)
        # 빈도 기반 필터링: 최소 no_below, 최대 no_above 비율
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
        bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

        # TF-IDF
        tfidf = models.TfidfModel(bow_corpus)
        tfidf_corpus = tfidf[bow_corpus]

        # LDA
        lda = models.LdaModel(
            corpus=tfidf_corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            random_state=random_state
        )

        print(f"🔹 LDA Topics for {year} (topics={num_topics}):")
        for idx, topic in lda.print_topics(-1):
            print(f"Topic {idx}: {topic}")

        models_by_year[year] = {
            'model': lda,
            'dictionary': dictionary,
            'tfidf': tfidf
        }

    return models_by_year

# ✨ 사용 예시
if __name__ == '__main__':
    path = 'data/02_journal_2020_2025.csv'
    lda_results = run_lda_by_year_tfidf(
        data_path=path,
        year_column='date',
        text_columns=['title', 'abstract'],
        num_topics=10,
        passes=10,
        no_below=15,
        no_above=0.8
    )
    # 결과 활용: lda_results[2021]['model'] 등


🔎 Processing years: [2020, 2021, 2022, 2023, 2024, 2025]

📅 Year: 2020
🔹 LDA Topics for 2020 (topics=10):
Topic 0: 0.018*"investor" + 0.017*"market" + 0.016*"firm" + 0.012*"price" + 0.012*"student" + 0.009*"option" + 0.009*"trading" + 0.009*"mining" + 0.008*"gender" + 0.008*"stock"
Topic 1: 0.018*"movement" + 0.016*"blockchain" + 0.010*"job" + 0.009*"feature" + 0.009*"culture" + 0.008*"affordances" + 0.007*"message" + 0.007*"object" + 0.007*"creation" + 0.007*"value"
Topic 2: 0.022*"trust" + 0.015*"blockchain" + 0.010*"error" + 0.009*"consumption" + 0.008*"model" + 0.008*"machine" + 0.007*"governance" + 0.007*"collection" + 0.006*"traffic" + 0.006*"system"
Topic 3: 0.058*"privacy" + 0.017*"concern" + 0.012*"party" + 0.010*"user" + 0.010*"twitter" + 0.009*"sharing" + 0.009*"panel" + 0.008*"tracking" + 0.008*"policy" + 0.008*"data"
Topic 4: 0.018*"medium" + 0.013*"regulation" + 0.010*"price" + 0.010*"twitter" + 0.010*"stock" + 0.009*"effect" + 0.008*"agency" + 0.008*"message" + 0.008*"po

In [13]:
# ✨ 사용 예시
path = 'data/04_article_2020_2025.csv'
lda_results2 = run_lda_by_year_tfidf(
    data_path=path,
    year_column='date',
    text_columns=['title', 'content'],
    num_topics=10,
    passes=10,
    no_below=15,
    no_above=0.8)


🔎 Processing years: [2020.0, 2021.0, 2022.0, 2023.0, 2024.0, 2025.0]

📅 Year: 2020.0
🔹 LDA Topics for 2020.0 (topics=10):
Topic 0: 0.002*"switch" + 0.001*"patent" + 0.001*"hate" + 0.001*"facebook" + 0.001*"advertising" + 0.001*"robot" + 0.001*"marketing" + 0.001*"aid" + 0.001*"data" + 0.001*"said"
Topic 1: 0.019*"tiktok" + 0.015*"bytedance" + 0.003*"huawei" + 0.002*"export" + 0.002*"carbon" + 0.002*"china" + 0.001*"trump" + 0.001*"deal" + 0.001*"telecom" + 0.001*"corp"
Topic 2: 0.016*"chip" + 0.008*"intel" + 0.007*"robotics" + 0.007*"venture" + 0.007*"quantum" + 0.006*"capital" + 0.006*"startup" + 0.006*"investment" + 0.006*"huawei" + 0.006*"china"
Topic 3: 0.003*"forecast" + 0.001*"weather" + 0.001*"retailer" + 0.001*"gpt" + 0.001*"brain" + 0.001*"deepfake" + 0.001*"ad" + 0.001*"voice" + 0.001*"musk" + 0.001*"data"
Topic 4: 0.001*"voice" + 0.001*"alexa" + 0.001*"satellite" + 0.001*"council" + 0.001*"skill" + 0.001*"style" + 0.001*"facebook" + 0.001*"weather" + 0.001*"music" + 0.001*"s