In [51]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory, relative to the working directory, that this notebook both reads
# its input CSV from and writes its feature table to; created if missing.
OUTPUT_DIR = Path("..") / "output"
OUTPUT_DIR.mkdir(exist_ok=True)


In [54]:
# Load the posts table (which already carries a `topic` column) and list
# the columns that are available for feature engineering.
posts_df = pd.read_csv(OUTPUT_DIR.joinpath("posts_with_topics.csv"))
print(posts_df.columns)


Index(['index', 'type', 'id', 'subreddit.id', 'subreddit.name',
       'subreddit.nsfw', 'created_utc', 'permalink', 'domain', 'url',
       'selftext', 'title', 'score', 'clean_text', 'topic'],
      dtype='object')


In [55]:
# Keep one row per post id (pandas keeps the first occurrence by default),
# then parse `created_utc` as seconds since the Unix epoch. Values that
# cannot be parsed are coerced to NaT instead of raising.
posts_df = posts_df.drop_duplicates(subset="id").assign(
    created_utc=lambda frame: pd.to_datetime(
        frame["created_utc"], unit="s", errors="coerce"
    )
)

In [59]:
# Calendar features derived from the parsed `created_utc` timestamp.
posts_df = posts_df.assign(
    hour=lambda frame: frame["created_utc"].dt.hour,
    dayofweek=lambda frame: frame["created_utc"].dt.dayofweek,
    # pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    # Rows with a missing day of week fall through to 0 here.
    is_weekend=lambda frame: frame["dayofweek"].isin([5, 6]).astype(int),
)


In [56]:
# Text length: character counts of the title and the body, plus their sum.
# Missing text is replaced by an empty string first, so it counts as length 0.
posts_df = posts_df.assign(
    title_length=lambda frame: frame["title"].fillna("").str.len(),
    selftext_length=lambda frame: frame["selftext"].fillna("").str.len(),
    total_text_length=lambda frame: frame["title_length"].add(
        frame["selftext_length"]
    ),
)

In [None]:
# Topic features: one-hot encode the `topic` label into `topic_<label>`
# indicator columns and attach them to the posts table.
assert "topic" in posts_df.columns

# Make the cell safe to re-run: discard indicator columns left over from a
# previous execution before attaching the fresh ones. Without this, every
# re-run appends another duplicate set of `topic_*` columns, and any later
# selection by the "topic_" prefix picks the duplicates up repeatedly.
stale_topic_cols = [
    col for col in posts_df.columns if str(col).startswith("topic_")
]

topic_dummies = pd.get_dummies(posts_df["topic"], prefix="topic")
posts_df = pd.concat(
    [posts_df.drop(columns=stale_topic_cols), topic_dummies], axis=1
)

In [57]:
# log(1 + x) transforms of the score and of the combined text length;
# a value of 0 maps to 0.
posts_df["log_score"] = posts_df["score"].pipe(np.log1p)
posts_df["log_text_length"] = posts_df["total_text_length"].pipe(np.log1p)

In [60]:
# Fixed columns kept in the exported feature table.
final_cols = [
    "id",
    "subreddit.name",
    "topic",
    "score",
    "log_score",
    "title_length",
    "selftext_length",
    "total_text_length",
    "log_text_length",
    "hour",
    "dayofweek",
    "is_weekend",
]
# Append every column whose name starts with "topic_" (the one-hot topic
# indicators); nothing is added when no such columns exist.
final_cols.extend(
    name for name in posts_df.columns if name.startswith("topic_")
)

posts_final = posts_df[final_cols]

In [62]:
# Write the feature table to the output directory without the index column,
# then preview the first rows.
posts_final.to_csv(OUTPUT_DIR.joinpath("posts_feature_engineered.csv"), index=False)

posts_final.head()

Unnamed: 0,id,subreddit.name,topic,score,log_score,title_length,selftext_length,total_text_length,log_text_length,hour,dayofweek,is_weekend
0,t4f9bf,datasets,1,7,2.079442,21,9,30,3.433987,18,1,0
1,t4euxw,datasets,32,1,0.693147,89,522,611,6.416732,18,1,0
2,t4e0bb,datasets,1,1,0.693147,45,9,54,4.007333,18,1,0
3,t49fq0,datasets,153,1,0.693147,42,177,219,5.393628,14,1,0
4,t47wiw,datasets,6,2,1.098612,79,123,202,5.313206,13,1,0
