In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [2]:
# Input and output folders, given relative to the notebook's working directory.
DATA_DIR, OUTPUT_DIR = Path("../data"), Path("../output")

# Make sure the output folder exists (missing parents are created too);
# nothing happens if it is already there.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
# Locations of the three input CSV files inside DATA_DIR.
posts_file = DATA_DIR.joinpath("the-reddit-dataset-dataset-posts.csv")
comments_file = DATA_DIR.joinpath("the-reddit-dataset-dataset-comments.csv")
decor_file = DATA_DIR.joinpath("cleaned_decor.csv")

In [4]:
# Stop before loading anything if an input CSV is absent. Paths are checked in
# the order listed and only the first missing one is reported.
first_missing = next(
    (
        candidate
        for candidate in (posts_file, comments_file, decor_file)
        if not candidate.exists()
    ),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"{first_missing} 파일 존재 하지 않음.")

In [5]:
# Read the three raw tables, in order, with pandas' default CSV options.
posts_df, comments_df, decor_df = (
    pd.read_csv(csv_path)
    for csv_path in (posts_file, comments_file, decor_file)
)

In [6]:
def clean_columns(df):
    """Normalise the column labels of ``df`` in place and return the same frame.

    Each label is lower-cased, stripped of surrounding whitespace, and then
    has every remaining space replaced by an underscore. The frame passed in
    is modified (its ``columns`` attribute is reassigned), not copied.
    """
    lowered = df.columns.str.lower()
    trimmed = lowered.str.strip()
    df.columns = trimmed.str.replace(" ", "_")
    return df

In [7]:
# Normalise the column labels of all three tables (posts, comments, decor).
posts_df, comments_df, decor_df = map(
    clean_columns, (posts_df, comments_df, decor_df)
)

In [8]:
# Posts and comments are de-duplicated on their 'id' column when it is
# present; if the column is absent the table is left as it is.
posts_df = (
    posts_df.drop_duplicates(subset=['id'])
    if 'id' in posts_df.columns
    else posts_df
)
comments_df = (
    comments_df.drop_duplicates(subset=['id'])
    if 'id' in comments_df.columns
    else comments_df
)

# The decor table is de-duplicated on entire rows.
decor_df = decor_df.drop_duplicates()

In [9]:
# Missing text in the free-text columns becomes an empty string.
for frame, column in (
    (posts_df, 'title'),
    (posts_df, 'selftext'),
    (comments_df, 'body'),
):
    frame[column] = frame[column].fillna('')

# For the decor table, any row containing a missing value is dropped instead.
decor_df = decor_df.dropna()

In [10]:
# Posts: 'created_utc' is parsed as Unix epoch seconds; values that cannot be
# converted become NaT. Year and month are then derived from the parsed column.
if 'created_utc' in posts_df.columns:
    posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'], unit='s', errors='coerce')
    posts_df['year'] = posts_df['created_utc'].dt.year
    posts_df['month'] = posts_df['created_utc'].dt.month

# Comments: same treatment as posts.
if 'created_utc' in comments_df.columns:
    comment_times = comments_df['created_utc']
    # Without an explicit unit, pd.to_datetime interprets plain numbers as
    # nanoseconds since the epoch, so epoch-second values would all land in
    # January 1970. Numeric columns are therefore parsed with unit='s', as the
    # posts column is. Non-numeric columns (date strings, or a column that is
    # already datetime because this cell was re-run) keep the unit-less parse.
    if pd.api.types.is_numeric_dtype(comment_times):
        parsed_times = pd.to_datetime(comment_times, unit='s', errors='coerce')
    else:
        parsed_times = pd.to_datetime(comment_times, errors='coerce')
    comments_df['created_utc'] = parsed_times
    comments_df['year'] = comments_df['created_utc'].dt.year
    comments_df['month'] = comments_df['created_utc'].dt.month

In [11]:
def clean_text(text):
    """Return a normalised, ASCII-alphanumeric version of ``text``.

    The value is converted to ``str`` and lower-cased. Every character that is
    not ``a-z``, ``0-9`` or whitespace is removed. Runs of whitespace are then
    collapsed to a single space and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : any
        Value to clean. ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing or empty input.
    """
    # Missing values would otherwise be stringified to the words "none"/"nan".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Remove punctuation/symbols BEFORE normalising whitespace: deleting a
    # character that sits between two spaces ("a - b") or at the end of the
    # string ("wow !") would otherwise leave double or trailing spaces behind.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise the free-text columns of posts and comments with clean_text.
for frame, column in (
    (posts_df, 'title'),
    (posts_df, 'selftext'),
    (comments_df, 'body'),
):
    frame[column] = frame[column].apply(clean_text)

# The decor table is only cleaned when it actually has a 'text' column.
if 'text' in decor_df.columns:
    decor_df['text'] = decor_df['text'].apply(clean_text)


In [12]:
# Add a character-count column next to each text column.
for frame, text_column, length_column in (
    (posts_df, 'title', 'title_len'),
    (posts_df, 'selftext', 'selftext_len'),
    (comments_df, 'body', 'body_len'),
):
    frame[length_column] = frame[text_column].str.len()


In [13]:
# Write each cleaned table to OUTPUT_DIR as CSV, without the index column.
for frame, filename in (
    (posts_df, "posts_cleaned.csv"),
    (comments_df, "comments_cleaned.csv"),
    (decor_df, "decor_cleaned.csv"),
):
    frame.to_csv(OUTPUT_DIR / filename, index=False)
