## Data Download

In [6]:
import os
import json
import argparse
import random
import re
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path
from praw import Reddit
from praw.models import MoreComments
from bs4 import BeautifulSoup

# Set Environment Vars
load_dotenv()

True

In [7]:
from typing import Union


def init_reddit():
    return Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )

def clean_text(txt: str) -> str:
    # strip HTML/Markdown
    txt = BeautifulSoup(txt, "html.parser").get_text()
    # remove code fences
    txt = re.sub(r"```[\s\S]*?```", "", txt)
    # collapse whitespace
    return re.sub(r"\s+", " ", txt).strip()

def scrape(subreddits, target):
    reddit = init_reddit()
    qa = []
    per_sub = (target // len(subreddits)) + 1

    for sub in subreddits:
        for post in reddit.subreddit(sub).hot(limit=per_sub * 2):
            if len(qa) >= target:
                break

            q = post.title.strip()
            post.comments.replace_more(limit=0)
            comments = [c for c in post.comments if not isinstance(c, MoreComments)]
            if not comments:
                continue

            # pick highest-scoring
            a = max(comments, key=lambda c: c.score).body.strip()

            # quick length check
            if len(q.split()) < 3 or len(a.split()) < 5:
                continue

            qa.append({
                "id": post.id,
                "subreddit": sub,
                "question": q,
                "answer": a,
                "url": f"https://reddit.com{post.permalink}"
            })
        if len(qa) >= target:
            break

    return qa[:target]

def preprocess(qa_raw):
    cleaned = []
    for item in qa_raw:
        q = clean_text(item["question"])
        a = clean_text(item["answer"])
        # enforce word‐count bounds
        if not (20 <= len(q.split()) <= 128):
            continue
        if not (20 <= len(a.split()) <= 256):
            continue

        cleaned.append({
            "question": q,
            "answer": a,
            "subreddit": item["subreddit"],
            "url": item["url"],
        })
    return cleaned

def split_and_save(df, out_dir: Union[str, Path]):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(df)
    train_end = int(n * 0.8)
    val_end   = train_end + int(n * 0.1)

    splits = {
        "train": df.iloc[:train_end],
        "val":   df.iloc[train_end:val_end],
        "test":  df.iloc[val_end:]
    }
    # os.makedirs(out_dir, exist_ok=True)
    for name, split_df in splits.items():
        path = os.path.join(out_dir, f"{name}.csv")
        split_df.to_csv(path, index=False)
        print(f"→ {name}: {len(split_df)} examples → {path}")

In [8]:


# assume notebook lives in project_root/notebooks/
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

RAW_DIR       = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"


from types import SimpleNamespace

args = SimpleNamespace(
    total=50,
    subs=["explainlikeimfive", "AskScience"],
    out=PROCESSED_DIR
)

print(f"Scraping {args.total} posts from {len(args.subs)} subreddits…")
raw = scrape(args.subs, args.total)
print(f"Scraped {len(raw)} raw Q&A; cleaning…")
cleaned = preprocess(raw)
print(f"Kept {len(cleaned)} after cleaning; splitting…")
df = pd.DataFrame(cleaned)
split_and_save(df, args.out)
# Optionally: save raw JSON
# os.makedirs("data/raw", exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)
with open(RAW_DIR / "qa_raw.json","w",encoding="utf-8") as f:
    json.dump(raw, f, ensure_ascii=False, indent=2)
print("Done.")

Scraping 50 posts from 2 subreddits…
Scraped 50 raw Q&A; cleaning…
Kept 8 after cleaning; splitting…
→ train: 6 examples → /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/train.csv
→ val: 0 examples → /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/val.csv
→ test: 2 examples → /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/test.csv
Done.


## Data Preprocessing

## Sample Dataset for Smoke Tests