## Data Preprocessing

In [3]:
import os
import json
import argparse
import random
import re
import time
import prawcore
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import pandas as pd
from pathlib import Path
from praw import Reddit
from praw.models import MoreComments
from typing import Union

# Set Environment Vars
load_dotenv()

True

In [None]:
scripts_dir = Path().resolve()
project_root = scripts_dir.parent

example_file_dir = scripts_dir.parent / "example/example_file"
example_file_dir

example_file = Path(example_file_dir)
example_file

example_file.mkdir(parents=True, exist_ok=True)
example_file

NameError: name '__file__' is not defined

In [8]:
def init_reddit() -> None:
    return Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent=os.environ["REDDIT_USER_AGENT"],
    )

def clean_text(txt: str) -> str:
    # strip HTML/Markdown
    txt = BeautifulSoup(txt, "html.parser").get_text()
    # remove code fences
    txt = re.sub(r"```[\s\S]*?```", "", txt)
    # collapse whitespace
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def scrape(sub_size_map):
    reddit = init_reddit()
    qa = [] # the Q/A posts to train the model

    for sub, sub_size in sub_size_map.items():
        got_size = 0
        for post in reddit.subreddit(sub).hot(limit=None): # , time_filter="all"
            # don't need to scrape more if got_size already matches sub_size
            if got_size >= sub_size:
                break
            
            try:
                # skip any link or image post, no posts with score lower than 1, no pinned/mod posts, ban any “over 18” content, no locked thread, no crossposts
                if (not post.is_self or post.removed_by_category
                    or post.score < 1 or post.stickied or post.over_18
                    or post.locked or hasattr(post, "crosspost_parent")):
                    continue
                
                # if (post.removed_by_category or post.score < 1 
                #     or post.stickied or post.over_18
                #     or post.locked or hasattr(post, "crosspost_parent")):
                #     continue
                
                # get the question as a merge of the title and body of the post
                title = post.title.strip()
                body = post.selftext.strip()
                q = "\n\n".join(filter(None, [title, body]))
                
                # get the answer as the highest sore comment
                post.comments.replace_more(limit=0) # replace_more(limit=0) prevents getting more comments that are yet to be fetched. We just need the best comments.
                comments = post.comments.list()
                if not comments: # if no comments at all, we can't create a Q/A pair dataset
                    continue
                top_comment = max(comments, key=lambda c: c.score)            
                if top_comment.score < 1: # exclude posts with comments under 2 upvotes 
                    continue
                a = top_comment.body.strip()

                # length sanitation
                if len(q.split()) < 3 or len(a.split()) < 6:
                    continue
                
                qa.append({
                    "id": post.id,
                    "subreddit": sub,
                    "question": q,
                    "answer": a,
                    "url": f"https://reddit.com{post.permalink}"
                })
                got_size += 1
                
            except prawcore.exceptions.TooManyRequests as e:
                # reddit tells you how many seconds to wait
                retry_after = int(e.response.headers.get("Retry-After", 60))
                print(f"Rate limited—sleeping {retry_after}s...")
                time.sleep(retry_after)
                # and then retry the same post
                continue

            except Exception as e:
                print(f"Skipping post {post.id} due to {type(e).__name__}: {e}")
                continue
                
        print(f"- collected {got_size}/{sub_size} samples from r/{sub}")
    
    return qa

def preprocess(qa_raw):
    cleaned = []
    for item in qa_raw:
        q = clean_text(item["question"])
        a = clean_text(item["answer"])

        cleaned.append({
            "question": q,
            "answer": a,
            "subreddit": item["subreddit"],
            "url": item["url"],
        })
    return cleaned

def split_and_save(df, out_dir: Union[str, Path]):
    # create the dir path if not existing
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    
    # randomize the df rows, and reset to a fresh index(and droping the old one)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(df)
    train_end = int(n * 0.75)
    val_end   = train_end + int(n * 0.15)

    splits = {
        "train": df.iloc[:train_end],
        "val":   df.iloc[train_end:val_end],
        "test":  df.iloc[val_end:]
    }
    
    for name, split_df in splits.items():
        path = os.path.join(out_dir, f"{name}.csv")
        split_df.to_csv(path, index=False)
        print(f"Saved {name} set: {len(split_df)} examples -> {path}")

In [9]:


NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

RAW_DIR       = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# sub_size_map = {
#     "AskScience": 16,
#     "explainlikeimfive": 4,
# }

# sub_size_map = {
#     "koreatravel": 20,
#     "travel": 20,
#     "seoul": 30,
#     "koreanfood": 20,
#     "korea": 20,
#     "solotravel": 20,
#     "digitalnomad": 20,
# }

sub_size_map = {
    "koreatravel": 400,
    "korea": 50,
    "southkorea": 50,
    "seoul": 150,
    "Living_in_Korea": 100,
    "solotravel": 100,
    "digitalnomad": 50,
    "askscience": 50,
    "AskHistorians": 50,
}

out=PROCESSED_DIR

print(f"Scraping {sum(sub_size_map.values())} posts from {len(sub_size_map)} subreddits...")
raw = scrape(sub_size_map)
print(f"Scraped {len(raw)} raw Q&A; cleaning...")
cleaned = preprocess(raw)
print(f"Kept {len(cleaned)} after cleaning; splitting...")
df = pd.DataFrame(cleaned)
split_and_save(df, PROCESSED_DIR)

# Save raw data JSON
RAW_DIR.mkdir(parents=True, exist_ok=True)
with open(RAW_DIR / "qa_raw.json","w",encoding="utf-8") as f:
    json.dump(raw, f, ensure_ascii=False, indent=2)
print("Done.")

Scraping 1000 posts from 9 subreddits...
- collected 400/400 samples from r/koreatravel
- collected 50/50 samples from r/korea
- collected 50/50 samples from r/southkorea
- collected 150/150 samples from r/seoul
- collected 100/100 samples from r/Living_in_Korea
- collected 100/100 samples from r/solotravel
- collected 50/50 samples from r/digitalnomad
- collected 50/50 samples from r/askscience
- collected 50/50 samples from r/AskHistorians
Scraped 1000 raw Q&A; cleaning...
Kept 1000 after cleaning; splitting...
Saved train set: 750 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/train.csv
Saved val set: 150 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/val.csv
Saved test set: 100 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/data/processed/test.csv
Done.


In [13]:
from korea_travel_guide.data import split_and_save, preprocess

scripts_dir = Path().resolve()
project_root = scripts_dir.parent

raw_path = project_root / "test-data/raw/qa_raw.json"

out_dir = project_root / "test-data/processed"
out_dir.mkdir(parents=True, exist_ok=True)

# 2) Load it into memory
with raw_path.open("r", encoding="utf-8") as f:
    raw = json.load(f)        # raw is now your list of {id, question, answer, …}

print(f"Scraped {len(raw)} raw Q&A; cleaning...")
cleaned = preprocess(raw)

print(f"Kept {len(cleaned)} after cleaning; splitting...")
df = pd.DataFrame(cleaned)
split_and_save(df, out_dir)
print("Done.")


Scraped 1000 raw Q&A; cleaning...
Kept 1000 after cleaning; splitting...
Saved train set: 800 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/test-data/processed/train.csv
Saved val set: 100 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/test-data/processed/val.csv
Saved test set: 100 examples -> /mnt/wsl/Dt5vhdx/projects/korea-travel-guide/test-data/processed/test.csv
Done.


In [88]:
reddit.auth.limits

{'remaining': 996.0, 'reset_timestamp': 1750972201.9752533, 'used': 4}

In [None]:
reddit = init_reddit()
post = next(reddit.subreddit("koreatravel").top(limit=1))
print(post.title)
print("------------------------------------")
print(post.selftext.strip())
print("---------------------------")
post.comments.replace_more(limit=0) # replace_more(limit=0) prevents getting more comments that are yet to be fetched. We just need the best comments.
comments = [c for c in post.comments if not isinstance(c, MoreComments)]
highest_score_comment = max(comments, key=lambda c: c.score).body.strip()
print(highest_score_comment)
print(post.permalink)
# post.comments.replace_more(limit=0)


First time in South Korea
------------------------------------
It was my first time in South Korea and I’ve been to a couple of places.


My favorites would be Yangjae citizen forest (almost no crowd on a weekday), Eunpyeong Hanok Village (a bit far but, nice temperatures, quiet and beautiful sceneries)

And thanks to a friend, I was able to go inside Hanam UN Village (Hanamdong UN Village hill yeah 🎵) the most(?) expensive place in Gangnam

I like Suwon as well.


Things I noticed was: it is incredibly quiet (sometimes I hear nothing I thought I got deaf)

There are mirrors everywhere.

There are couples EVERYWHERE.

Food is great, but eating out means shelling out, usually a minimum of 10,000 won (and that is not cheap coming from a third world country)

Subway can be confusing, don’t worry, even 
the locals get lost 😂, plus there are helpful people wearing red vest to help you. 

It can get overwhelming, I felt like I was bombarded by ads wherever I go, it is quiet with regards to n

## Sample Dataset for Smoke Tests

## Tokenize and Reformat

In [1]:
from korea_travel_guide.utils import load_environ_vars
from korea_travel_guide.data import tokenize_and_format
from datasets import load_dataset

load_environ_vars()

  from .autonotebook import tqdm as notebook_tqdm


Logged into Hugging Face Hub


In [2]:
scripts_dir = Path().resolve()
project_root = scripts_dir.parent

ds = load_dataset(
    "csv",
    data_files={
      "train": str(project_root / "data/processed/train.csv"),
      "validation": str(project_root / "data/processed/val.csv"),
      "test": str(project_root / "data/processed/test.csv"),
    }
)
ds

NameError: name 'Path' is not defined

In [38]:
ds_tok, tok = tokenize_and_format(ds)
ds_tok

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [None]:
from transformers import AutoTokenizer

ds_test = ds["train"].select(range(3))

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
model_inputs = tok(
    ds_test["question"],
    max_length=1024,
    truncation=True
)
print(model_inputs) # returns input_ids and attention_mask

with tok.as_target_tokenizer():
    labels = tok(
        ds_test["answer"],
        max_length=256,
        truncation=True
    )
print(labels) # returns input_ids and attention_mask

model_inputs["labels"] = labels["input_ids"]
print(model_inputs) # returns model_inputs with a new target column called "labels" with the answer input_ids




{'input_ids': [[0, 33601, 328, 20280, 328, 38, 524, 883, 597, 36, 330, 24452, 1046, 43, 8, 4959, 11, 5188, 8697, 454, 1084, 1236, 21347, 4, 4356, 95, 259, 19, 127, 1041, 98, 38, 33, 10, 319, 9, 481, 86, 53, 38, 218, 17, 27, 90, 2489, 705, 605, 143, 964, 259, 25, 38, 2307, 62, 11, 5, 382, 328, 38, 64, 1994, 103, 449, 24452, 4, 546, 13, 4412, 1816, 964, 198, 127, 1046, 7, 213, 66, 3482, 50, 4835, 19, 328, 2], [0, 17312, 16752, 191, 10380, 16, 7789, 419, 8061, 10975, 17312, 16752, 191, 7, 1642, 296, 6, 2016, 10198, 1914, 149, 983, 47620, 13082, 640, 330, 1688, 13745, 1657, 1097, 16624, 4, 3548, 1344, 4, 175, 73, 2926, 73, 844, 1244, 12, 4124, 12, 1646, 73, 11535, 73, 37555, 73, 17312, 16752, 12, 4162, 12, 560, 12, 43230, 12, 23023, 12, 18888, 12, 9946, 7349, 12, 13364, 5182, 12, 11672, 12, 3583, 1397, 73, 1922, 32620, 3272, 322, 20, 1101, 16582, 9779, 4237, 36, 530, 5273, 43, 6126, 14, 5, 12034, 74, 9097, 1733, 2779, 5807, 6, 19, 1895, 1786, 11, 5, 4669, 3806, 148, 5, 1390, 137, 9592, 420



{'input_ids': [[0, 6179, 59, 825, 24674, 2628, 443, 116, 2], [0, 1121, 1101, 6, 47, 218, 75, 486, 110, 247, 22, 530, 33594, 41667, 653, 18, 110, 477, 2230, 116, 280, 2370, 8, 2238, 32, 80, 430, 11991, 116, 3394, 74, 348, 802, 7586, 2], [0, 417, 8693, 40, 28, 18815, 8, 21084, 1973, 4, 2333, 5, 2737, 40, 889, 5, 1492, 15, 49, 998, 4, 14620, 5, 1842, 31, 2238, 114, 47, 240, 7, 4, 17668, 35, 259, 18, 5, 2178, 1842, 6, 24, 16, 11, 2370, 350, 35, 1205, 640, 417, 8693, 4, 642, 687, 260, 4, 1043, 4, 32059, 73, 417, 8693, 73, 428, 4311, 73, 8458, 2546, 73, 2619, 612, 33871, 114, 47, 236, 7, 697, 6152, 1493, 6, 356, 23, 1437, 46873, 15264, 21402, 45209, 7487, 15375, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [15]:
ds_tok["train"][0]

{'input_ids': tensor([    0, 33601,   328, 20280,   328,    38,   524,   883,   597,    36,
           330, 24452,  1046,    43,     8,  4959,    11,  5188,  8697,   454,
          1084,  1236, 21347,     4,  4356,    95,   259,    19,   127,  1041,
            98,    38,    33,    10,   319,     9,   481,    86,    53,    38,
           218,    17,    27,    90,  2489,   705,   605,   143,   964,   259,
            25,    38,  2307,    62,    11,     5,   382,   328,    38,    64,
          1994,   103,   449, 24452,     4,   546,    13,  4412,  1816,   964,
           198,   127,  1046,     7,   213,    66,  3482,    50,  4835,    19,
           328,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([    0,  6