## Data Download

In [1]:
import os
import json
import argparse
import random
import re
import time
import prawcore
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path
from praw import Reddit
from praw.models import MoreComments
from typing import Union

# Set Environment Vars
load_dotenv()

True

In [33]:
# Smoke test of the Reddit credentials: fetch the single top post of
# r/koreatravel and print its title, body, best top-level comment and permalink.
# NOTE: init_reddit() is defined in a later cell of this notebook; that cell
# must have been executed before this one.
reddit = init_reddit()
post = next(reddit.subreddit("koreatravel").top(limit=1))
print(post.title)
print("------------------------------------")
print(post.selftext.strip())
print("---------------------------")
# replace_more(limit=0) avoids fetching comments that are not loaded yet;
# only the best already-loaded comment is needed here.
post.comments.replace_more(limit=0)
comments = [c for c in post.comments if not isinstance(c, MoreComments)]
# default=None keeps max() from raising ValueError on a post without comments.
top_comment = max(comments, key=lambda c: c.score, default=None)
highest_score_comment = top_comment.body.strip() if top_comment is not None else ""
print(highest_score_comment)
print(post.permalink)


First time in South Korea
------------------------------------
It was my first time in South Korea and I’ve been to a couple of places.


My favorites would be Yangjae citizen forest (almost no crowd on a weekday), Eunpyeong Hanok Village (a bit far but, nice temperatures, quiet and beautiful sceneries)

And thanks to a friend, I was able to go inside Hanam UN Village (Hanamdong UN Village hill yeah 🎵) the most(?) expensive place in Gangnam

I like Suwon as well.


Things I noticed was: it is incredibly quiet (sometimes I hear nothing I thought I got deaf)

There are mirrors everywhere.

There are couples EVERYWHERE.

Food is great, but eating out means shelling out, usually a minimum of 10,000 won (and that is not cheap coming from a third world country)

Subway can be confusing, don’t worry, even 
the locals get lost 😂, plus there are helpful people wearing red vest to help you. 

It can get overwhelming, I felt like I was bombarded by ads wherever I go, it is quiet with regards to n

In [8]:
def init_reddit() -> "Reddit":
    """Build a PRAW ``Reddit`` client from environment variables.

    Reads ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET`` and
    ``REDDIT_USER_AGENT`` from ``os.environ``.

    Returns:
        Reddit: the client constructed from those three values.

    Raises:
        KeyError: if any of the three environment variables is not set.
    """
    return Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )

def clean_text(txt: str) -> str:
    """Reduce a raw post or comment string to plain, single-spaced text.

    The input is parsed with BeautifulSoup's ``html.parser`` and only its text
    is kept; spans wrapped in triple backticks are then deleted; finally every
    run of whitespace is collapsed to one space and the ends are trimmed.
    """
    # keep only the text content of any markup
    plain = BeautifulSoup(txt, "html.parser").get_text()
    # drop fenced code blocks (non-greedy, may span several lines)
    without_fences = re.sub(r"```[\s\S]*?```", "", plain)
    # squeeze whitespace runs and trim
    single_spaced = re.sub(r"\s+", " ", without_fences)
    return single_spaced.strip()

def _qa_from_post(post, sub):
    """Turn one submission into a Q/A record, or return None if it is filtered out.

    The question is the post title joined with its self-text; the answer is
    the body of the highest-scoring comment in the flattened comment list.
    """
    # Skip link/image posts, removed posts, posts scoring below 1, stickied
    # (pinned/mod) posts, over-18 posts, locked threads and crossposts.
    if (not post.is_self or post.removed_by_category
            or post.score < 1 or post.stickied or post.over_18
            or post.locked or hasattr(post, "crosspost_parent")):
        return None

    # Question = title + body; an empty body is left out of the join.
    title = post.title.strip()
    body = post.selftext.strip()
    question = "\n\n".join(filter(None, [title, body]))

    # replace_more(limit=0) avoids fetching comments that are not loaded yet;
    # only the best already-loaded comment is needed.
    post.comments.replace_more(limit=0)
    comments = post.comments.list()
    if not comments:  # no comments at all -> no Q/A pair can be built
        return None

    top_comment = max(comments, key=lambda c: c.score)
    if top_comment.score < 1:  # best comment must have a score of at least 1
        return None
    answer = top_comment.body.strip()

    # Length sanitation: at least 3 words of question and 6 words of answer.
    if len(question.split()) < 3 or len(answer.split()) < 6:
        return None

    return {
        "id": post.id,
        "subreddit": sub,
        "question": question,
        "answer": answer,
        "url": f"https://reddit.com{post.permalink}"
    }


def _retry_after_seconds(exc, default=60):
    """Read the wait time from a rate-limit error's ``Retry-After`` header.

    Falls back to ``default`` when the header is missing or is not a number.
    """
    response = getattr(exc, "response", None)
    headers = getattr(response, "headers", None) or {}
    try:
        return max(0, int(float(headers.get("Retry-After", default))))
    except (TypeError, ValueError):
        return default


def scrape(sub_size_map, max_retries=3):
    """Collect Q/A pairs from the "hot" listing of several subreddits.

    Args:
        sub_size_map: mapping of subreddit name -> number of Q/A pairs to
            collect from that subreddit.
        max_retries: how many times a single post is retried after a
            rate-limit error before it is skipped.

    Returns:
        list[dict]: one dict per accepted post with the keys ``id``,
        ``subreddit``, ``question``, ``answer`` and ``url``.

    Any other exception raised while processing a post is printed and that
    post is skipped (best effort). Errors raised by the listing iterator
    itself are not caught here.
    """
    # Nothing requested -> no need to build a client or touch the network.
    if not sub_size_map:
        return []

    reddit = init_reddit()
    qa = []  # the Q/A posts to train the model

    for sub, sub_size in sub_size_map.items():
        got_size = 0
        for post in reddit.subreddit(sub).hot(limit=None):
            # stop as soon as this subreddit's quota is filled
            if got_size >= sub_size:
                break

            record = None
            retries_left = max_retries
            while True:
                try:
                    record = _qa_from_post(post, sub)
                    break
                except prawcore.exceptions.TooManyRequests as e:
                    if retries_left <= 0:
                        print(f"Skipping post {post.id}: still rate limited after {max_retries} retries")
                        break
                    retries_left -= 1
                    # wait as long as Reddit asks, then retry the SAME post
                    retry_after = _retry_after_seconds(e)
                    print(f"Rate limited—sleeping {retry_after}s...")
                    time.sleep(retry_after)
                except Exception as e:
                    print(f"Skipping post {post.id} due to {type(e).__name__}: {e}")
                    break

            if record is not None:
                qa.append(record)
                got_size += 1

        print(f"- collected {got_size}/{sub_size} samples from r/{sub}")

    return qa

def preprocess(qa_raw):
    """Clean the text fields of every raw Q/A record.

    Each output dict carries the cleaned ``question`` and ``answer`` (via
    ``clean_text``) plus the untouched ``subreddit`` and ``url``; the ``id``
    field of the raw record is not copied. One output record is produced per
    input record, in the same order.
    """
    return [
        {
            "question": clean_text(record["question"]),
            "answer": clean_text(record["answer"]),
            "subreddit": record["subreddit"],
            "url": record["url"],
        }
        for record in qa_raw
    ]

def split_and_save(df, out_dir: Union[str, Path]):
    """Shuffle ``df`` and write train/val/test CSV files into ``out_dir``.

    The rows are shuffled with a fixed seed (42), then cut into the first 75%
    (train), the next 15% (val) and the remainder (test). Each part is written
    as ``<name>.csv`` without the index column, and a summary line is printed
    per file. ``out_dir`` is created if it does not exist.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # deterministic shuffle, with a fresh 0..n-1 index replacing the old one
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    total = len(shuffled)
    train_stop = int(total * 0.75)
    val_stop = train_stop + int(total * 0.15)

    boundaries = (
        ("train", 0, train_stop),
        ("val", train_stop, val_stop),
        ("test", val_stop, total),
    )
    for name, start, stop in boundaries:
        part = shuffled.iloc[start:stop]
        path = os.path.join(out_dir, f"{name}.csv")
        part.to_csv(path, index=False)
        print(f"Saved {name} set: {len(part)} examples -> {path}")

In [9]:


# Project directories, resolved relative to the current working directory
# (assumed to be the folder that contains this notebook).
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

RAW_DIR       = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Number of Q/A pairs to collect from each subreddit.
sub_size_map = {
    "koreatravel": 400,
    "korea": 50,
    "southkorea": 50,
    "seoul": 150,
    "Living_in_Korea": 100,
    "solotravel": 100,
    "digitalnomad": 50,
    "askscience": 50,
    "AskHistorians": 50,
}

# Destination of the train/val/test CSV files.
out = PROCESSED_DIR

print(f"Scraping {sum(sub_size_map.values())} posts from {len(sub_size_map)} subreddits...")
raw = scrape(sub_size_map)

# Persist the raw scrape BEFORE any processing, so a failure in cleaning or
# splitting does not throw away a long, rate-limited download.
RAW_DIR.mkdir(parents=True, exist_ok=True)
with open(RAW_DIR / "qa_raw.json", "w", encoding="utf-8") as f:
    json.dump(raw, f, ensure_ascii=False, indent=2)

print(f"Scraped {len(raw)} raw Q&A; cleaning...")
cleaned = preprocess(raw)
print(f"Kept {len(cleaned)} after cleaning; splitting...")
df = pd.DataFrame(cleaned)
split_and_save(df, out)
print("Done.")

Scraping 1000 posts from 9 subreddits...
- collected 400/400 samples from r/koreatravel
- collected 50/50 samples from r/korea
- collected 50/50 samples from r/southkorea
- collected 150/150 samples from r/seoul
- collected 100/100 samples from r/Living_in_Korea
- collected 100/100 samples from r/solotravel
- collected 50/50 samples from r/digitalnomad
- collected 50/50 samples from r/askscience
- collected 50/50 samples from r/AskHistorians
Scraped 1000 raw Q&A; cleaning...
Kept 1000 after cleaning; splitting...
Saved train set: 750 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/train.csv
Saved val set: 150 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/val.csv
Saved test set: 100 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/test.csv
Done.


In [88]:
# Show the client's current API rate-limit counters (the output below lists
# the keys remaining, reset_timestamp and used).
reddit.auth.limits

{'remaining': 996.0, 'reset_timestamp': 1750972201.9752533, 'used': 4}

In [34]:
def init_reddit() -> "Reddit":
    """Build a PRAW ``Reddit`` client from environment variables.

    Reads ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET`` and
    ``REDDIT_USER_AGENT`` from ``os.environ``.

    Returns:
        Reddit: the client constructed from those three values.

    Raises:
        KeyError: if any of the three environment variables is not set.
    """
    return Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )

def clean_text(txt: str) -> str:
    """Reduce a raw post or comment string to plain, single-spaced text.

    The input is parsed with BeautifulSoup's ``html.parser`` and only its text
    is kept; spans wrapped in triple backticks are then deleted; finally every
    run of whitespace is collapsed to one space and the ends are trimmed.
    """
    # keep only the text content of any markup
    plain = BeautifulSoup(txt, "html.parser").get_text()
    # drop fenced code blocks (non-greedy, may span several lines)
    without_fences = re.sub(r"```[\s\S]*?```", "", plain)
    # squeeze whitespace runs and trim
    single_spaced = re.sub(r"\s+", " ", without_fences)
    return single_spaced.strip()

def scrape(subreddits, sample_size):
    """Collect up to ``sample_size`` Q/A pairs from the "hot" listing of ``subreddits``.

    Args:
        subreddits: sized collection of subreddit names.
        sample_size: total number of Q/A pairs wanted.

    Returns:
        list[dict]: at most ``sample_size`` dicts with the keys ``id``,
        ``subreddit``, ``question``, ``answer`` and ``url``. Each subreddit
        contributes at most ``sample_size // len(subreddits) + 1`` records, so
        the first subreddit cannot fill the whole sample on its own.
    """
    # Nothing requested -> no division by zero and no client construction.
    if not subreddits or sample_size <= 0:
        return []

    reddit = init_reddit()
    qa = []  # our Q/A posts to train the model
    per_sub = (sample_size // len(subreddits)) + 1  # per-subreddit quota with a safety buffer of 1

    for sub in subreddits:
        if len(qa) >= sample_size:
            break

        taken = 0  # records accepted from this subreddit so far
        # fetch twice the quota so filtered-out posts can still be replaced
        for post in reddit.subreddit(sub).hot(limit=per_sub * 2):
            # stop once this subreddit's quota or the overall target is reached
            if taken >= per_sub or len(qa) >= sample_size:
                break

            # question = title + body; an empty body is left out of the join
            title = post.title.strip()
            body = post.selftext.strip()
            q = "\n\n".join(filter(None, [title, body]))

            # answer = highest-scoring top-level comment.
            # replace_more(limit=0) avoids fetching comments that are not
            # loaded yet; only the best already-loaded comment is needed.
            post.comments.replace_more(limit=0)
            comments = [c for c in post.comments if not isinstance(c, MoreComments)]
            if not comments:  # no comments at all -> no Q/A pair can be built
                continue
            highest_score_a = max(comments, key=lambda c: c.score).body.strip()

            # quick length sanitation
            if len(q.split()) < 5 or len(highest_score_a.split()) < 10:
                continue

            qa.append({
                "id": post.id,
                "subreddit": sub,
                "question": q,
                "answer": highest_score_a,
                "url": f"https://reddit.com{post.permalink}"
            })
            taken += 1

    return qa[:sample_size]

def preprocess(qa_raw):
    """Clean the text fields of every raw Q/A record.

    Each output dict carries the cleaned ``question`` and ``answer`` (via
    ``clean_text``) plus the untouched ``subreddit`` and ``url``; the ``id``
    field of the raw record is not copied. One output record is produced per
    input record, in the same order.
    """
    return [
        {
            "question": clean_text(record["question"]),
            "answer": clean_text(record["answer"]),
            "subreddit": record["subreddit"],
            "url": record["url"],
        }
        for record in qa_raw
    ]

def split_and_save(df, out_dir: Union[str, Path]):
    """Shuffle ``df`` and write train/val/test CSV files into ``out_dir``.

    The rows are shuffled with a fixed seed (42), then cut into the first 75%
    (train), the next 15% (val) and the remainder (test). Each part is written
    as ``<name>.csv`` without the index column, and a summary line is printed
    per file. ``out_dir`` is created if it does not exist.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # deterministic shuffle, with a fresh 0..n-1 index replacing the old one
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    total = len(shuffled)
    train_stop = int(total * 0.75)
    val_stop = train_stop + int(total * 0.15)

    boundaries = (
        ("train", 0, train_stop),
        ("val", train_stop, val_stop),
        ("test", val_stop, total),
    )
    for name, start, stop in boundaries:
        part = shuffled.iloc[start:stop]
        path = os.path.join(out_dir, f"{name}.csv")
        part.to_csv(path, index=False)
        print(f"Saved {name} set: {len(part)} examples -> {path}")

In [None]:

from types import SimpleNamespace

# Project directories, resolved relative to the current working directory
# (assumed to be the folder that contains this notebook).
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

RAW_DIR       = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Run parameters: total number of Q/A pairs, source subreddits, and the
# destination of the train/val/test CSV files.
args = SimpleNamespace(
    total=50,
    subs=["explainlikeimfive", "AskScience"],
    out=PROCESSED_DIR
)

print(f"Scraping {args.total} posts from {len(args.subs)} subreddits...")
raw = scrape(args.subs, args.total)

# Persist the raw scrape BEFORE any processing, so a failure in cleaning or
# splitting does not throw away the downloaded data.
RAW_DIR.mkdir(parents=True, exist_ok=True)
with open(RAW_DIR / "qa_raw.json", "w", encoding="utf-8") as f:
    json.dump(raw, f, ensure_ascii=False, indent=2)

print(f"Scraped {len(raw)} raw Q&A; cleaning...")
cleaned = preprocess(raw)
print(f"Kept {len(cleaned)} after cleaning; splitting...")
df = pd.DataFrame(cleaned)
split_and_save(df, args.out)
print("Done.")

Scraping 50 posts from 2 subreddits...
Scraped 50 raw Q&A; cleaning...
Kept 50 after cleaning; splitting...
Saved train set: 37 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/train.csv
Saved val set: 7 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/val.csv
Saved test set: 6 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/test.csv
Done.


## Data Preprocessing

## Sample Dataset for Smoke Tests