In [15]:
import re
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")


In [16]:
DATA_DIR = Path("../data")
OUTPUT_DIR = Path("../output")
OUTPUT_DIR.mkdir(exist_ok=True)

posts_df = pd.read_csv(DATA_DIR / "the-reddit-dataset-dataset-posts.csv")
comments_df = pd.read_csv(DATA_DIR / "the-reddit-dataset-dataset-comments.csv")

posts_df = posts_df[posts_df["selftext"].notna()].copy()
comments_df = comments_df[comments_df["body"].notna()].copy()

posts_df.shape, comments_df.shape


((15316, 13), (54846, 11))

In [18]:
# 텍스트 정제
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+", "", t)
    t = re.sub(r"[^a-zA-Z\s]", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

posts_df["clean_text"] = posts_df["selftext"].apply(clean_text)
comments_df["clean_text"] = comments_df["body"].apply(clean_text)


In [19]:
#SentenceTransformer 임베딩
embedder = SentenceTransformer("all-MiniLM-L6-v2")

post_embeddings = embedder.encode(
    posts_df["clean_text"].tolist(),
    show_progress_bar=True
)

comment_embeddings = embedder.encode(
    comments_df["clean_text"].tolist(),
    show_progress_bar=True
)


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 7a766c3f-0f7b-45e2-a691-3cf4ac11d89b)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Batches:   0%|          | 0/479 [00:00<?, ?it/s]

Batches:   0%|          | 0/1714 [00:00<?, ?it/s]

In [None]:
# # print(posts_df.columns)
# # print(comments_df.columns)

# Index(['index', 'type', 'id', 'subreddit.id', 'subreddit.name',
#        'subreddit.nsfw', 'created_utc', 'permalink', 'domain', 'url',
#        'selftext', 'title', 'score', 'clean_text', 'topic'],
#       dtype='object')
# Index(['index', 'type', 'id', 'subreddit.id', 'subreddit.name',
#        'subreddit.nsfw', 'created_utc', 'permalink', 'body', 'sentiment',
#        'score', 'clean_text'],
#       dtype='object')

In [20]:
# BERTopic 학습 (Posts)
topic_model_posts = BERTopic(verbose=True)

post_topics, _ = topic_model_posts.fit_transform(
    posts_df["clean_text"],
    post_embeddings
)

posts_df["topic"] = post_topics
topic_model_posts.get_topic_info().head()


2025-12-15 10:13:20,740 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-15 10:13:24,638 - BERTopic - Dimensionality - Completed ✓
2025-12-15 10:13:24,639 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-15 10:13:25,203 - BERTopic - Cluster - Completed ✓
2025-12-15 10:13:25,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-15 10:13:25,652 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4108,-1_to_of_the_and,"[to, of, the, and, data, for, that, is, in, on]",[hey i know this question is really basic but ...
1,0,2411,0_deleted_lawdog_scrubbed_cleaned,"[deleted, lawdog, scrubbed, cleaned, platform,...","[deleted, deleted, deleted]"
2,1,1931,1_removed___,"[removed, , , , , , , , , ]","[removed, removed, removed]"
3,2,340,2_companies_stock_company_loan,"[companies, stock, company, loan, credit, cryp...",[hi amp x b a couple of months ago i found tha...
4,3,205,3_comments_reddit_submissions_comment,"[comments, reddit, submissions, comment, subre...",[the full reddit submission corpus is now avai...


In [21]:
# BERTopic 학습 (Comments)
topic_model_comments = BERTopic(verbose=True)

comment_topics, _ = topic_model_comments.fit_transform(
    comments_df["clean_text"],
    comment_embeddings
)

comments_df["topic"] = comment_topics
topic_model_comments.get_topic_info().head()


2025-12-15 10:13:28,855 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-15 10:13:49,090 - BERTopic - Dimensionality - Completed ✓
2025-12-15 10:13:49,093 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-15 10:13:54,162 - BERTopic - Cluster - Completed ✓
2025-12-15 10:13:54,175 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-15 10:13:55,269 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,19984,-1_amp_that_to_the,"[amp, that, to, the, of, in, data, it, is, and]",[i was going to paste some of the links i used...
1,0,2629,0_deleted_gridmet_meta_saved,"[deleted, gridmet, meta, saved, , , , , , ]","[deleted, deleted, deleted]"
2,1,982,1_removed_comment_aw_sticky,"[removed, comment, aw, sticky, visible, loaded...","[removed, removed, removed]"
3,2,906,2_dataset_datasets_link_found,"[dataset, datasets, link, found, thanks, here,...","[what s in the dataset, what dataset, what dat..."
4,3,802,3_lmk_approx_um_mm,"[lmk, approx, um, mm, answer, since, time, get...","[, mm, if you get an answer can you lmk]"


In [23]:
# Noise(-1) 제거
posts_df_valid = posts_df[posts_df["topic"] != -1].copy()
comments_df_valid = comments_df[comments_df["topic"] != -1].copy()

posts_df_valid.shape, comments_df_valid.shape


((11208, 15), (34862, 13))

In [25]:
TOP_N = 10

top_topics = (
    posts_df_valid["topic"]
    .value_counts()
    .head(TOP_N)
    .index
)

posts_top = posts_df_valid[posts_df_valid["topic"].isin(top_topics)]
comments_top = comments_df_valid[comments_df_valid["topic"].isin(top_topics)]


In [26]:
# Post vs Comment 토픽 분포 비교
post_dist = posts_top["topic"].value_counts(normalize=True)
comment_dist = comments_top["topic"].value_counts(normalize=True)

topic_compare = pd.concat(
    [
        post_dist.rename("post_ratio"),
        comment_dist.rename("comment_ratio")
    ],
    axis=1
).fillna(0)

topic_compare


Unnamed: 0_level_0,post_ratio,comment_ratio
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.408436,0.306196
1,0.327122,0.114372
2,0.057598,0.105521
3,0.034728,0.093408
4,0.033373,0.080363
5,0.032187,0.07058
6,0.028291,0.067901
7,0.027274,0.065805
8,0.025919,0.050664
9,0.025072,0.04519


In [28]:
# Subreddit별 대표 토픽
posts_sub = posts_df_valid.rename(columns={"subreddit.name": "subreddit"})
posts_sub = posts_sub[posts_sub["subreddit"].notna()]

subreddit_top_topic = (
    posts_sub.groupby("subreddit")["topic"]
    .value_counts()
    .groupby(level=0)
    .head(1)
    .reset_index(name="count")
)

subreddit_top_topic.head()


Unnamed: 0,subreddit,topic,count
0,datasets,0,2411


In [29]:
def get_topic_keywords(model, topic_id, n=10):
    return ", ".join([w for w, _ in model.get_topic(topic_id)[:n]])

summary_rows = []

for topic_id in top_topics:
    summary_rows.append({
        "topic": topic_id,
        "keywords": get_topic_keywords(topic_model_posts, topic_id),
        "examples": (
            posts_df_valid[posts_df_valid["topic"] == topic_id]
            .head(3)["clean_text"]
            .tolist()
        )
    })

topic_summary_df = pd.DataFrame(summary_rows)
topic_summary_df


Unnamed: 0,topic,keywords,examples
0,0,"deleted, lawdog, scrubbed, cleaned, platform, ...","[deleted, deleted, deleted]"
1,1,"removed, , , , , , , , ,","[removed, removed, removed]"
2,2,"companies, stock, company, loan, credit, crypt...",[i am interested to have a huge list of compan...
3,3,"comments, reddit, submissions, comment, subred...",[is there a data set or data base out there th...
4,4,"election, political, elections, votes, vote, v...",[want to see detail info of how political peop...
5,5,"covid, cases, gt, coronavirus, csv, country, v...",[a quick google search of party breakdown of c...
6,6,"images, image, object, dataset, detection, mod...",[i want to classify if an image contains a con...
7,7,"music, songs, lt, gt, song, artist, spotify, k...",[wondering if there is any dataset out there w...
8,8,"crime, police, shootings, crimes, gun, mass, v...",[hey guys checkout this new dataset on various...
9,9,"weather, temperature, hourly, climate, precipi...",[i m working on a project and am looking for a...


In [30]:
posts_df_valid.to_csv(
    OUTPUT_DIR / "posts_with_topics.csv",
    index=False
)

comments_df_valid.to_csv(
    OUTPUT_DIR / "comments_with_topics.csv",
    index=False
)

topic_compare.to_csv(
    OUTPUT_DIR / "post_comment_topic_distribution.csv"
)

subreddit_top_topic.to_csv(
    OUTPUT_DIR / "subreddit_top_topics.csv",
    index=False
)

topic_summary_df.to_csv(
    OUTPUT_DIR / "topic_summary_table.csv",
    index=False
)
