# Topic Modeling - academia
- title, abstract만 갖고 토픽 모델링 진행
## 전처리
- NER로 사람 이름 체크 및 제거
- 불용어로 Information System, Research, Study 제거
- 각 키워드를 소문자 변환 후에 띄어쓰기 제거 및 복수형태 통일
## 모델
- coherence score 체크해서 최적 토픽 개수 찾기 1-20까지


In [14]:
import pandas as pd

indu=pd.read_csv('data/05_article_2023_2025.csv')
indu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11672 entries, 0 to 11671
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         11672 non-null  object 
 1   content       11672 non-null  object 
 2   date          11672 non-null  float64
 3   affiliations  11672 non-null  object 
 4   keywords      6252 non-null   object 
dtypes: float64(1), object(4)
memory usage: 456.1+ KB


In [15]:
indu.head()

Unnamed: 0,title,content,date,affiliations,keywords
0,ArtificialIntelligence(A Special Report) --- H...,The current generation of college students is ...,2024.0,Wall Street Journal,
1,ArtificialIntelligence(A Special Report) --- T...,ChatGPT is barely two years old. And yet it's ...,2024.0,Wall Street Journal,
2,ArtificialIntelligence(A Special Report) --- F...,The race for AI dominance launched a stampede ...,2025.0,Wall Street Journal,
3,Crunchbase UsesArtificialIntelligenceTo Predic...,"Crunchbase, the firm best known for its startu...",2025.0,Wall Street Journal,
4,On the Clock: Bosses' Mental Fitness Set for A...,Bosses already live in fear that a verbal miss...,2024.0,Wall Street Journal,


In [21]:
# !pip install spacy nltk gensim
# !python -m spacy download en_core_web_sm

import re
import pandas as pd
import spacy
import nltk

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# ——— NLTK 리소스 다운로드 ———
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# ——— spaCy 로드 (NER 용) ———
nlp = spacy.load("en_core_web_sm")

# ——— 사용자 정의 불용어 ———
CUSTOM_STOPWORDS = {'informationsystem', 'research', 'study'}


# ——— NLTK 리소스 다운로드 ———
nltk.download('punkt')
nltk.download('wordnet')

# ——— spaCy 모델 로드 ———
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

# ——— 사용자 정의 불용어 (소문자/공백제거 후 기준) ———
CUSTOM_STOPWORDS = {'informationsystem', 'research', 'study','musk','altman','trump','elon','grok'}


def preprocess_text(text: str) -> list[str]:
    """
    1) NaN 또는 리스트 처리
    2) HTML 태그 및 수식 제거
    3) 알파벳·공백 외 문자 제거 → 소문자화
    4) 토큰화 → NLTK 불용어 & 길이 >2 필터
    5) POS 태그 필터링 (명사·동사 계열)
    6) Lemmatize
    7) spaCy NER로 PERSON 엔터티 제거
    8) 사용자 정의 불용어 제외
    """
    # 1) Null / 리스트 처리
    if pd.isnull(text):
        return []
    if isinstance(text, list):
        text = ' '.join(text)

    # 2) HTML, LaTeX 수식 제거
    if '<' in text and '>' in text:
        text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"\$.*?\$", "", text)
    text = re.sub(r"\\\(.*?\\\)", "", text)

    # 3) 알파벳·공백 외 제거 → 소문자
    text = re.sub(r"[^a-zA-Z\s]", " ", text).lower()

    # 4) 토큰화 → 불용어·길이 필터
    stop_words = set(stopwords.words('english'))
    tokens = [
        w for w in word_tokenize(text)
        if w not in stop_words and len(w) > 2
    ]

    # 5) POS 태깅 → 명사·동사 계열만
    allowed = {'NN','NNS','NNP','NNPS'}
    tokens = [w for w, pos in pos_tag(tokens) if pos in allowed]

    # 6) 표제어 추출
    lemm = WordNetLemmatizer()
    tokens = [lemm.lemmatize(w) for w in tokens]

    # 7) spaCy NER 적용 → PERSON 제거
    doc = nlp(" ".join(tokens))
    tokens = [tok.text for tok in doc if tok.ent_type_ != 'PERSON']

    # 8) 사용자 정의 불용어 제거
    return [w for w in tokens if w not in CUSTOM_STOPWORDS]


# ——— 데이터 불러오기 ———
# df = pd.read_csv('your_data.csv')  # title, abstract 컬럼 포함
# 예시: title + abstract 합치기
indu['text'] = indu['title'].fillna('') + ' ' + indu['content'].fillna('')

# ——— 전처리 적용 ———
texts = [preprocess_text(doc) for doc in indu['text']]

# ——— Gensim Dictionary & Corpus 생성 ———
dictionary = corpora.Dictionary(texts)
# (선택) 너무 드문 단어/전체의 절반 이상 등장 단어 필터링
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(txt) for txt in texts]

# ——— 최적 토픽 수 탐색 (1~20) ———
coherence_scores = []
for k in range(1, 21):
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=k,
                   random_state=42,
                   passes=10)
    cm = CoherenceModel(model=lda,
                        texts=texts,
                        dictionary=dictionary,
                        coherence='c_v')
    score = cm.get_coherence()
    coherence_scores.append((k, score))
    print(f"Num Topics = {k:2d} → Coherence = {score:.4f}")

# ——— 최적 토픽 수 출력 ———
best_k, best_score = max(coherence_scores, key=lambda x: x[1])
print(f"\nOptimal #Topics = {best_k} with coherence {best_score:.4f}")

[nltk_data] Downloading package punkt to /Users/choihj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/choihj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/choihj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/choihj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/choihj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/choihj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Num Topics =  1 → Coherence = 0.2888
Num Topics =  2 → Coherence = 0.3500
Num Topics =  3 → Coherence = 0.4138
Num Topics =  4 → Coherence = 0.4109
Num Topics =  5 → Coherence = 0.4481
Num Topics =  6 → Coherence = 0.4465
Num Topics =  7 → Coherence = 0.4726
Num Topics =  8 → Coherence = 0.4724
Num Topics =  9 → Coherence = 0.4691
Num Topics = 10 → Coherence = 0.4568
Num Topics = 11 → Coherence = 0.4690
Num Topics = 12 → Coherence = 0.4609
Num Topics = 13 → Coherence = 0.4790
Num Topics = 14 → Coherence = 0.4802
Num Topics = 15 → Coherence = 0.4912
Num Topics = 16 → Coherence = 0.4967
Num Topics = 17 → Coherence = 0.4852
Num Topics = 18 → Coherence = 0.5094
Num Topics = 19 → Coherence = 0.5183
Num Topics = 20 → Coherence = 0.5140

Optimal #Topics = 19 with coherence 0.5183


In [22]:
# pip install spacy nltk gensim beautifulsoup4
# python -m spacy download en_core_web_sm

import re
import pandas as pd
import spacy
import nltk

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# ——— NLTK 리소스 다운로드 ———
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# ——— spaCy 로드 (NER 용) ———
nlp = spacy.load("en_core_web_sm")

# ——— 사용자 정의 불용어 ———
CUSTOM_STOPWORDS = {'informationsystem', 'research', 'study','musk','altman','trump','elon','grok'}

def preprocess_text(text: str) -> list[str]:
    """
    1) NaN 또는 리스트 처리
    2) HTML 태그 및 수식 제거
    3) 알파벳·공백 외 문자 제거 → 소문자화
    4) 토큰화 → NLTK 불용어 & 길이 >2 필터
    5) POS 태그 필터링 (명사·동사 계열)
    6) Lemmatize
    7) spaCy NER로 PERSON 엔터티 제거
    8) 사용자 정의 불용어 제외
    """
    # 1) Null / 리스트 처리
    if pd.isnull(text):
        return []
    if isinstance(text, list):
        text = ' '.join(text)

    # 2) HTML, LaTeX 수식 제거
    if '<' in text and '>' in text:
        text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"\$.*?\$", "", text)
    text = re.sub(r"\\\(.*?\\\)", "", text)

    # 3) 알파벳·공백 외 제거 → 소문자
    text = re.sub(r"[^a-zA-Z\s]", " ", text).lower()

    # 4) 토큰화 → 불용어·길이 필터
    stop_words = set(stopwords.words('english'))
    tokens = [
        w for w in word_tokenize(text)
        if w not in stop_words and len(w) > 2
    ]

    # 5) POS 태깅 → 명사·동사 계열만
    allowed = {'NN','NNS','NNP','NNPS'}
    tokens = [w for w, pos in pos_tag(tokens) if pos in allowed]

    # 6) 표제어 추출
    lemm = WordNetLemmatizer()
    tokens = [lemm.lemmatize(w) for w in tokens]

    # 7) spaCy NER 적용 → PERSON 제거
    doc = nlp(" ".join(tokens))
    tokens = [tok.text for tok in doc if tok.ent_type_ != 'PERSON']

    # 8) 사용자 정의 불용어 제거
    return [w for w in tokens if w not in CUSTOM_STOPWORDS]


def run_lda_pipeline(
    df: pd.DataFrame,
    text_cols: list[str] = ['title', 'content'],
    num_topics: int = 19,
    no_below: int = 20,
    no_above: float = 0.5,
    passes: int = 10
) -> tuple[LdaModel, corpora.Dictionary, list, float]:
    """
    1) df[text_cols] 합치고 preprocess_text 적용
    2) Dictionary & Corpus 생성 (filter_extremes)
    3) LDA 학습(num_topics)
    4) 토픽 키워드 출력 & coherence 계산
    Returns: (lda_model, dictionary, corpus, coherence_score)
    """
    df = df.copy()
    df['__text'] = df[text_cols].fillna('').agg(' '.join, axis=1)
    texts = [preprocess_text(doc) for doc in df['__text']]

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(txt) for txt in texts]

    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=passes
    )

    print(f"\n=== Top {num_topics} Topics ===")
    for tid, terms in lda.show_topics(num_topics=num_topics, num_words=10, formatted=False):
        print(f"Topic {tid:2d}: {', '.join([t for t,_ in terms])}")

    cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
    coh = cm.get_coherence()
    print(f"\nCoherence (c_v) = {coh:.4f}")

    return lda, dictionary, corpus, coh

[nltk_data] Downloading package punkt to /Users/choihj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/choihj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/choihj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/choihj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [23]:
lda_model, dictionary, corpus, coherence = run_lda_pipeline(indu)


=== Top 19 Topics ===
Topic  0: customer, data, business, startup, product, platform, service, team, founder, venture
Topic  1: robot, system, car, vehicle, technology, security, food, city, drone, robotics
Topic  2: meta, medium, election, platform, account, campaign, post, user, facebook, content
Topic  3: job, student, university, technology, work, school, worker, science, researcher, tool
Topic  4: data, system, risk, technology, law, safety, use, information, tool, privacy
Topic  5: data, energy, power, center, water, project, electricity, centre, plant, demand
Topic  6: game, character, player, world, fan, video, week, sport, star, event
Topic  7: openai, copilot, board, ceo, team, microsoft, employee, week, safety, month
Topic  8: news, story, medium, publisher, article, perplexity, site, content, search, product
Topic  9: image, tool, video, google, user, photo, feature, content, creator, generator
Topic 10: thing, way, world, something, lot, work, kind, idea, question, day
To

In [26]:
indu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11672 entries, 0 to 11671
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         11672 non-null  object 
 1   content       11672 non-null  object 
 2   date          11672 non-null  float64
 3   affiliations  11672 non-null  object 
 4   keywords      6252 non-null   object 
 5   text          11672 non-null  object 
dtypes: float64(1), object(5)
memory usage: 547.3+ KB


# 모델 저장 및 distribution 저장

In [42]:
import pandas as pd
from gensim import corpora, models

# 1. LDA 모델 저장
# 학습이 끝난 lda_model 객체가 있다고 가정합니다.
lda_model.save('indu/lda_model.model')

# 2. 문서-토픽 분포 추출
# corpus: LDA에 입력했던 Bow/BOW-TFIDF 형태의 말뭉치 리스트
# min_probability=0 으로 설정하면, 확률이 0인 토픽도 반환해줍니다.
doc_topics = [
    lda_model.get_document_topics(doc, minimum_probability=0)
    for doc in corpus
]

# 3. 분포를 DataFrame으로 변환
# 각 문서별로 토픽 확률만 추출 → 리스트 of lists
distribution = [
    [prob for _, prob in topic_dist]
    for topic_dist in doc_topics
]

# 컬럼명 생성 (예: 'Topic_0', 'Topic_1', ...)
num_topics = lda_model.num_topics
columns = [f'Topic_{i}' for i in range(num_topics)]

df_dist = pd.DataFrame(distribution, columns=columns)

# 4. CSV로 저장
df_dist.to_csv('data/05_industry_lda_document_topic_distribution.csv', index=False)

print("LDA 모델과 문서-토픽 분포 CSV 저장이 완료되었습니다.")

LDA 모델과 문서-토픽 분포 CSV 저장이 완료되었습니다.


In [43]:
import pandas as pd
from gensim import corpora, models

# —————————————————————————————————————
# (이전 코드로 얻은) document–topic distribution DataFrame: df_dist
# 컬럼명은 'Topic_0', 'Topic_1', … 'Topic_{num_topics-1}'
# —————————————————————————————————————

# 1) 제외할 토픽 번호 리스트 지정
# 예시: 토픽 1, 9, 17을 제외하고 싶으면
topics_to_exclude = [2, 7, 8,10,11,17,18,6,12]

# 2) 제외할 토픽을 뺀 컬럼 리스트 생성
remaining_cols = [
    col for col in df_dist.columns
    if int(col.split('_')[1]) not in topics_to_exclude
]

# 3) 제외된 컬럼들 중 가장 확률이 높은 토픽 이름(예: 'Topic_5')을 찾아서,
#    숫자 부분만 정수로 추출해 'dominant_topic_excl' 컬럼에 저장
df_dist['dominant_topic_excl'] = (
    df_dist[remaining_cols]
      .idxmax(axis=1)
      .apply(lambda x: int(x.split('_')[1]))
)

# 4) 결과 확인
print(df_dist[['dominant_topic_excl']].head())

   dominant_topic_excl
0                    3
1                   14
2                    5
3                    0
4                   16


In [44]:
print(df_dist.value_counts(subset=['dominant_topic_excl'],sort=False))

dominant_topic_excl
0                      1650
1                       592
3                      1025
4                      1651
5                       663
9                      1052
13                      647
14                     2360
15                     1139
16                      893
Name: count, dtype: int64


In [45]:
df_dist.to_csv('data/05_industry_lda_document_topic_distribution.csv', index=False)

In [46]:
df_dist.head()

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11,Topic_12,Topic_13,Topic_14,Topic_15,Topic_16,Topic_17,Topic_18,dominant_topic_excl
0,0.050902,0.012399,0.000184,0.776838,0.000184,0.000184,0.024984,0.000184,0.000184,0.000184,0.111138,0.000184,0.000184,0.021345,0.000184,0.000184,0.000184,0.000184,0.000184,3
1,0.000164,0.061899,0.000164,0.000164,0.000164,0.026284,0.112314,0.000164,0.078295,0.02932,0.335656,0.000164,0.040525,0.020994,0.250363,0.000164,0.042878,0.000164,0.000164,14
2,0.000258,0.000258,0.000258,0.000258,0.000258,0.76843,0.000258,0.00874,0.020093,0.041604,0.02901,0.000258,0.000258,0.000258,0.000258,0.000258,0.000258,0.000258,0.128764,5
3,0.708301,0.000253,0.000253,0.000253,0.000253,0.000253,0.000253,0.000253,0.032466,0.000253,0.000253,0.000253,0.083215,0.000253,0.156662,0.006727,0.000253,0.000253,0.009335,0
4,0.000246,0.026555,0.036876,0.134522,0.000246,0.000246,0.000246,0.173942,0.000246,0.050926,0.194634,0.000246,0.055463,0.076475,0.073283,0.000246,0.175108,0.000246,0.000246,16


In [58]:
import pandas as pd
import numpy as np
import re

# === 파일 경로 ===
path_orig = "data/05_industry_2023_2025.csv"                      # 원본
path_dist = "data/05_industry_lda_document_topic_distribution.csv"  # LDA 분포
out_path  = "data/topic_industry/industry_with_dominant_topic.csv"

# === 로드 ===
orig = pd.read_csv(path_orig, low_memory=False)
dist = pd.read_csv(path_dist, low_memory=False)

# (옵션) content가 content_text로 되어 있으면 맞춰주기
if "content" not in orig.columns and "content_text" in orig.columns:
    orig = orig.rename(columns={"content_text": "content"})

# 1) 원본에 dominant_topic이 이미 있으면 그대로 사용
if "dominant_topic" not in orig.columns:
    # 2) 없으면 분포파일에서 가져오거나 계산
    if "dominant_topic_excl" in dist.columns:
        dom_series = dist["dominant_topic_excl"]
    else:
        # Topic_0, Topic_1 ... 형태에서 argmax로 dominant topic 계산
        topic_cols = [c for c in dist.columns if re.match(r"^Topic_\d+$", c)]
        if not topic_cols:
            raise ValueError("분포 파일에 Topic_0, Topic_1 ... 형식의 컬럼이 없습니다.")
        topic_arr = dist[topic_cols].to_numpy()
        best_idx = topic_arr.argmax(axis=1)  # 각 행에서 최고 확률의 컬럼 인덱스
        topic_nums = np.array([int(re.search(r"\d+", c).group()) for c in topic_cols])
        dom_series = pd.Series(topic_nums[best_idx], index=dist.index, name="dominant_topic")

    # 행 순서 기준으로 dominant_topic만 원본에 붙이기
    orig["__rowid__"] = range(len(orig))
    dist["__rowid__"] = range(len(dist))
    tmp = pd.DataFrame({"__rowid__": dist["__rowid__"], "dominant_topic": dom_series})
    orig = orig.merge(tmp, on="__rowid__", how="left").drop(columns="__rowid__")

# 3) 최종 필요한 컬럼만 남기기
wanted = ["title", "content", "affiliations", "date","keywords", "dominant_topic"]
existing = [c for c in wanted if c in orig.columns]
missing  = [c for c in wanted if c not in orig.columns]

final = orig[existing].copy()
final.to_csv(out_path, index=False, encoding="utf-8-sig")

print(f"저장 완료: {out_path}  (rows={len(final)}, cols={len(final.columns)})")
if missing:
    print("다음 컬럼은 데이터에 없어 제외되었습니다:", missing)

저장 완료: data/topic_industry/industry_with_dominant_topic.csv  (rows=11672, cols=6)


In [59]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11672 entries, 0 to 11671
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           11672 non-null  object 
 1   content         11672 non-null  object 
 2   affiliations    11672 non-null  object 
 3   date            11672 non-null  float64
 4   keywords        6252 non-null   object 
 5   dominant_topic  11672 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 547.3+ KB


In [62]:
import os
import re
import pandas as pd

# === 입력: 바로 'final'이 메모리에 없으면 직전 결과 파일에서 읽음 ===
if 'final' not in globals():
    final = pd.read_csv("data/topic_industry/merged_min_columns.csv", low_memory=False)

# 사용 컬럼 확인 (필수: dominant_topic)
required = {"title", "content", "affiliations","date", "keywords", "dominant_topic"}
missing = [c for c in required if c not in final.columns]
if missing:
    raise ValueError(f"다음 컬럼이 없습니다: {missing}")

# 저장 경로 (이미 존재한다고 하셨으므로 생성 생략해도 되지만, 안전하게 유지)
out_dir = "data/topic_industry"  # ← 요청 경로
os.makedirs(out_dir, exist_ok=True)

# dominant_topic이 있는 행만
df_topics = final.dropna(subset=['dominant_topic']).copy()

# 토픽 값 정리(숫자면 int로, 아니면 문자열 유지)
def normalize_topic(x):
    try:
        return int(float(x))
    except:
        return str(x)

df_topics['__topic_norm__'] = df_topics['dominant_topic'].apply(normalize_topic)

# 파일명 안전하게 만드는 함수
def safe_name(s):
    return re.sub(r'[^0-9A-Za-z._-]+', '_', str(s))

# 토픽별 저장
for t, g in df_topics.groupby('__topic_norm__', sort=True):
    fname = f"dominant_topic_{safe_name(t)}.csv"
    out_path = os.path.join(out_dir, fname)
    g.drop(columns=['__topic_norm__']).to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"Saved: {out_path} (rows={len(g)})")

Saved: data/topic_industry/dominant_topic_0.csv (rows=1650)
Saved: data/topic_industry/dominant_topic_1.csv (rows=592)
Saved: data/topic_industry/dominant_topic_3.csv (rows=1025)
Saved: data/topic_industry/dominant_topic_4.csv (rows=1651)
Saved: data/topic_industry/dominant_topic_5.csv (rows=663)
Saved: data/topic_industry/dominant_topic_9.csv (rows=1052)
Saved: data/topic_industry/dominant_topic_13.csv (rows=647)
Saved: data/topic_industry/dominant_topic_14.csv (rows=2360)
Saved: data/topic_industry/dominant_topic_15.csv (rows=1139)
Saved: data/topic_industry/dominant_topic_16.csv (rows=893)
