# PTT Multi-Board Crawler

A Python script to crawl multiple PTT boards and organize content based on recommendations.

## Structure:
1. Configuration and Setup
2. Utility Functions
3. Web Scraping Functions
4. Data Processing Functions
5. Main Execution

In [12]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import random
import re
import os
from tqdm.auto import tqdm

# Constants and configurations
BOARDS = ["Tech_Job"]  # List of boards to crawl
NUM_PAGES = 15  # Number of pages to crawl for each board

DEFAULT_CONFIG = {
    'BASE_URL': "https://www.ptt.cc",
    'HEADERS': {"User-Agent": "Mozilla/5.0"},
    'ENCODING': 'utf-8',
    'THRESHOLDS': [20, 30, 40],  # List of recommendation thresholds for filtering
    'DEFAULT_THRESHOLD': 20,
    'SLEEP_TIME_RANGE': (0.1, 0.3),  # General operations sleep time
    'PAGE_SLEEP_TIME_RANGE': (0.2, 0.5),  # Between pages sleep time
}

def update_config_for_board(board_name):
    """Create board-specific configuration
    
    Args:
        board_name (str): Name of the PTT board
        
    Returns:
        dict: Configuration dictionary for the specified board
    """
    return {
        **DEFAULT_CONFIG,  # Include all default settings
        'BOARD_NAME': board_name,
        'BOARD_URL': f"{DEFAULT_CONFIG['BASE_URL']}/bbs/{board_name}/index.html",
        'DATA_DIR': os.path.join(os.getcwd(), board_name)
    }

In [13]:
# Utility Functions
def safe_request(url, headers=None):
    """Make a safe HTTP request with error handling"""
    try:
        headers = headers or DEFAULT_CONFIG['HEADERS']
        res = requests.get(url, headers=headers)
        res.encoding = DEFAULT_CONFIG['ENCODING']
        return res
    except Exception as e:
        print(f"⚠️ Request failed for {url}: {e}")
        return None

def random_sleep(range_tuple=None):
    """Sleep for a random duration within the specified range"""
    sleep_range = range_tuple or DEFAULT_CONFIG['SLEEP_TIME_RANGE']
    time.sleep(round(random.uniform(*sleep_range), 2))

def get_data_path(filename, config):
    """Get the full path for a data file in the board's directory
    
    Args:
        filename (str): Name of the file
        config (dict): Current board configuration
        
    Returns:
        str: Absolute path to the file
    """
    path = os.path.join(config['DATA_DIR'], filename)
    dir_path = os.path.dirname(path)
    
    # Create directory if it doesn't exist
    try:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path, mode=0o777, exist_ok=True)
        # Ensure the directory has write permissions
        os.chmod(dir_path, 0o777)
    except Exception as e:
        print(f"⚠️ Error creating directory {dir_path}: {e}")
        return None
    
    return path

def safe_write_csv(data, filename, mode='a', fieldnames=None):
    """Safely write data to CSV with proper encoding and quoting"""
    try:
        with open(filename, mode=mode, encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(
                f, 
                fieldnames=fieldnames,
                quoting=csv.QUOTE_ALL,  # Quote all fields
                escapechar='\\',  # Use backslash as escape character
                doublequote=True  # Double quotes within fields
            )
            if f.tell() == 0:
                writer.writeheader()
            if isinstance(data, list):
                writer.writerows(data)
            else:
                writer.writerow(data)
        return True
    except Exception as e:
        print(f"❌ Error writing to {filename}: {e}")
        return False

def clean_content(text):
    """Clean text content by removing extra whitespace and newlines"""
    if not isinstance(text, str):
        return text
    cleaned = re.sub(r'\n\s*\n', '\n', text)
    return cleaned.strip()

In [14]:
# Web Scraping Functions
def get_latest_index(board_url):
    """Get the latest page index from PTT board"""
    res = safe_request(board_url)
    if not res:
        return None
    
    soup = BeautifulSoup(res.text, "html.parser")
    prev_btn = soup.select_one("div.btn-group-paging a.btn.wide:nth-child(2)")
    
    if prev_btn and 'href' in prev_btn.attrs:
        match = re.search(r"index(\d+)\.html", prev_btn["href"])
        if match:
            return int(match.group(1)) + 1
    return None

def crawl_page(index, config):
    """Crawl a single page of PTT posts"""
    url = f"{config['BASE_URL']}/bbs/{config['BOARD_NAME']}/index{index}.html"
    print(f"\n📄 抓取頁面: {url}")
    
    res = safe_request(url)
    if not res:
        return []
        
    soup = BeautifulSoup(res.text, "html.parser")
    articles = []
    
    for div in soup.select("div.r-ent"):
        title_tag = div.select_one("div.title > a")
        date_tag = div.select_one("div.meta > div.date")
        nrec_tag = div.select_one("div.nrec")
        
        if title_tag and date_tag:
            record = {
                "title": title_tag.text.strip(),
                "date": date_tag.text.strip(),
                "link": f"{config['BASE_URL']}{title_tag['href']}",
                "nrec": nrec_tag.text.strip() if nrec_tag else "0"
            }
            articles.append(record)
            
    return articles

def get_structured_content(url):
    """Extract structured content from a PTT post"""
    res = safe_request(url)
    if not res:
        return {}

    soup = BeautifulSoup(res.text, "html.parser")
    main_content = soup.select_one("div#main-content")
    if not main_content:
        return {}

    # Remove comments
    for tag in main_content.find_all(['div', 'span'], recursive=False):
        tag.extract()

    text = main_content.get_text(separator="\n").strip()
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    urls = [a['href'] for a in main_content.find_all('a', href=True)]

    # Parse content structure
    title = source = ""
    content_lines = []
    content_started = False
    
    for line in lines:
        if "標題" in line or "標題:" in line:
            title = re.sub(r'^.*?標題[:,：]\s*', '', line).strip()
        elif "作者" in line or "作者:" in line:
            source = re.sub(r'^.*?作者[:,：]\s*', '', line).strip()
        else:
            if not content_started:
                if "看板" in line or "時間" in line:
                    continue
                content_started = True
            content_lines.append(line)

    return {
        "title": title,
        "source": source,
        "urls": urls,
        "content": "\n".join(content_lines).strip()
    }

In [18]:
# Data Processing Functions
import datetime

def process_recommendations(df, threshold=None):
    """Process and filter recommendations from the dataframe"""
    threshold = threshold or DEFAULT_CONFIG['DEFAULT_THRESHOLD']
    
    # Convert recommendations to numeric values
    df['nrec'] = pd.to_numeric(
        df['nrec'].replace('爆', '100').replace('X', '-1'), 
        errors='coerce'
    ).fillna(0)
    
    # Filter posts with recommendations >= threshold
    return df[df['nrec'] >= threshold]

def process_and_save_data(df, config, threshold=None, filename=None):
    """Process and save filtered data"""
    threshold = threshold or config['DEFAULT_THRESHOLD']
    if filename is None:
        filename = f'{config["BOARD_NAME"]}_above_{threshold}_rec.csv'
    
    # Filter and save data
    filtered_df = process_recommendations(df, threshold)
    output_path = get_data_path(filename, config)
    filtered_df.to_csv(output_path, encoding='utf-8-sig', index=False, quoting=csv.QUOTE_ALL)
    
    print(f"✅ Successfully saved {len(filtered_df)} records with {threshold}+ recommendations to {output_path}")
    print(f"📁 File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    
    return filtered_df

def crawl_posts(config, num_pages=None):
    """Main function to crawl posts from PTT"""
    num_pages = num_pages or NUM_PAGES
    latest_index = get_latest_index(config['BOARD_URL'])
    
    if not latest_index:
        print("❌ 無法取得最新頁碼")
        return None
        
    all_articles = []
    for page_index in tqdm(range(latest_index, latest_index - num_pages, -1)):
        try:
            articles = crawl_page(page_index, config)
            all_articles.extend(articles)
            random_sleep(config['PAGE_SLEEP_TIME_RANGE'])
        except Exception as e:
            print(f"⚠️ 發生錯誤於 index{page_index}: {e}")
    
    return pd.DataFrame(all_articles)

def crawl_post_content(filtered_data: pd.DataFrame, config: dict):
    """Crawl content for filtered posts"""
    output_file = get_data_path(f'{config["BOARD_NAME"]}_content.csv', config)
    
    # Check existing data
    existing_links = set()
    if os.path.exists(output_file):
        try:
            existing_df = pd.read_csv(output_file, quoting=csv.QUOTE_ALL, encoding='utf-8-sig')
            existing_links = set(existing_df["link"].dropna().tolist())
            print(f"📁 已存在資料筆數：{len(existing_links)}")
        except Exception as e:
            print(f"⚠️ 無法讀取 {output_file}，錯誤：{e}")
            # If there's an error reading the file, it might be corrupted
            # Rename it and start fresh
            if os.path.exists(output_file):
                backup_file = output_file + '.backup'
                os.rename(output_file, backup_file)
                print(f"ℹ️ Moved problematic file to {backup_file}")

    # Filter new links
    to_crawl = filtered_data[~filtered_data["link"].isin(existing_links)]
    print(f"🚀 準備抓取新連結筆數：{len(to_crawl)}")

    # Process each post
    fieldnames = ["title", "date", "link", "nrec", "source", "content", "urls"]
    for _, row in tqdm(to_crawl.iterrows(), total=len(to_crawl)):
        structured_content = get_structured_content(row["link"])
        
        record = {
            "title": row["title"],
            "date": row["date"],
            "link": row["link"],
            "nrec": row["nrec"],
            "source": structured_content.get("source", ""),
            "content": structured_content.get("content", ""),
            "urls": "|".join(structured_content.get("urls", []))
        }
        
        safe_write_csv(record, output_file, fieldnames=fieldnames)
        random_sleep()

    print("✅ 全部內文抓取完成！")

def process_content_data(config):
    """Process and save different versions of content data"""
    content_file = get_data_path(f'{config["BOARD_NAME"]}_content.csv', config)
    if not os.path.exists(content_file):
        print(f"⚠️ Content file not found: {content_file}")
        return

    # Read and clean content
    try:
        result = pd.read_csv(content_file, quoting=csv.QUOTE_ALL, encoding='utf-8-sig')
    except Exception as e:
        print(f"⚠️ Error reading content file: {e}")
        return

    result['content'] = result['content'].apply(clean_content)
    
    # Convert recommendations to numeric
    result['nrec'] = pd.to_numeric(
        result['nrec'].replace('爆', '100').replace('X', '-1'), 
        errors='coerce'
    ).fillna(0)
    
    # Filter for posts above threshold and save cleaned content
    result = result[result['nrec'] >= config['DEFAULT_THRESHOLD']]
    result.to_csv(content_file, encoding='utf-8-sig', index=False, quoting=csv.QUOTE_ALL)
    print(f"✅ Saved {len(result)} cleaned records with {config['DEFAULT_THRESHOLD']}+ recommendations to content file")
    
def is_date_within_this_week(date_str):
    """Check if a date string is within the current week
    
    Args:
        date_str (str): Date string in format like "12/25" or " 1/01"
        
    Returns:
        bool: True if date is within current week, False otherwise
    """
    try:
        # Clean the date string
        date_str = date_str.strip()
        
        # Get current date and year
        now = datetime.datetime.now()
        current_year = now.year
        
        # Parse the date (assuming format "MM/DD" or "M/DD")
        if '/' in date_str:
            month, day = date_str.split('/')
            month, day = int(month), int(day)
        else:
            return False
            
        # Create datetime object for this year
        post_date = datetime.datetime(current_year, month, day)
        
        # If the date is in the future (next year scenario), use previous year
        if post_date > now:
            post_date = datetime.datetime(current_year - 1, month, day)
        
        # Get start of current week (Monday)
        today = now.date()
        days_since_monday = today.weekday()
        week_start = today - datetime.timedelta(days=days_since_monday)
        week_end = week_start + datetime.timedelta(days=6)
        
        # Check if post date is within this week
        post_date_only = post_date.date()
        return week_start <= post_date_only <= week_end
        
    except Exception as e:
        print(f"⚠️ Error parsing date '{date_str}': {e}")
        return False

def is_date_within_two_weeks(date_str):
    """Check if a date string is within the last two weeks
    
    Args:
        date_str (str): Date string in format like "12/25" or " 1/01"
        
    Returns:
        bool: True if date is within last two weeks, False otherwise
    """
    try:
        # Clean the date string
        date_str = date_str.strip()
        
        # Get current date and year
        now = datetime.datetime.now()
        current_year = now.year
        
        # Parse the date (assuming format "MM/DD" or "M/DD")
        if '/' in date_str:
            month, day = date_str.split('/')
            month, day = int(month), int(day)
        else:
            return False
            
        # Create datetime object for this year
        post_date = datetime.datetime(current_year, month, day)
        
        # If the date is in the future (next year scenario), use previous year
        if post_date > now:
            post_date = datetime.datetime(current_year - 1, month, day)
        
        # Get date range for last two weeks
        today = now.date()
        two_weeks_ago = today - datetime.timedelta(days=14)
        
        # Check if post date is within last two weeks
        post_date_only = post_date.date()
        return two_weeks_ago <= post_date_only <= today
        
    except Exception as e:
        print(f"⚠️ Error parsing date '{date_str}': {e}")
        return False

def is_date_within_n_days(date_str, n_days):
    """Check if a date string is within the last N days
    
    Args:
        date_str (str): Date string in format like "12/25" or " 1/01"
        n_days (int): Number of days to look back from today
        
    Returns:
        bool: True if date is within last N days, False otherwise
    """
    try:
        # Clean the date string
        date_str = date_str.strip()
        
        # Get current date and year
        now = datetime.datetime.now()
        current_year = now.year
        
        # Parse the date (assuming format "MM/DD" or "M/DD")
        if '/' in date_str:
            month, day = date_str.split('/')
            month, day = int(month), int(day)
        else:
            return False
            
        # Create datetime object for this year
        post_date = datetime.datetime(current_year, month, day)
        
        # If the date is in the future (next year scenario), use previous year
        if post_date > now:
            post_date = datetime.datetime(current_year - 1, month, day)
        
        # Get date range for last N days
        today = now.date()
        n_days_ago = today - datetime.timedelta(days=n_days)
        
        # Check if post date is within last N days
        post_date_only = post_date.date()
        return n_days_ago <= post_date_only <= today
        
    except Exception as e:
        print(f"⚠️ Error parsing date '{date_str}': {e}")
        return False

def filter_data_by_this_week(df):
    """Filter dataframe to only include posts from this week
    
    Args:
        df (pd.DataFrame): DataFrame containing posts with 'date' column
        
    Returns:
        pd.DataFrame: Filtered DataFrame with only this week's posts
    """
    if df.empty or 'date' not in df.columns:
        print("⚠️ DataFrame is empty or missing 'date' column")
        return df
    
    # Apply date filter
    mask = df['date'].apply(is_date_within_this_week)
    filtered_df = df[mask].copy()
    
    print(f"📅 Original posts: {len(df)}")
    print(f"📅 Posts from this week: {len(filtered_df)}")
    
    return filtered_df

def filter_data_by_two_weeks(df):
    """Filter dataframe to only include posts from the last two weeks
    
    Args:
        df (pd.DataFrame): DataFrame containing posts with 'date' column
        
    Returns:
        pd.DataFrame: Filtered DataFrame with only last two weeks' posts
    """
    if df.empty or 'date' not in df.columns:
        print("⚠️ DataFrame is empty or missing 'date' column")
        return df
    
    # Apply date filter
    mask = df['date'].apply(is_date_within_two_weeks)
    filtered_df = df[mask].copy()
    
    print(f"📅 Original posts: {len(df)}")
    print(f"📅 Posts from last two weeks: {len(filtered_df)}")
    
    return filtered_df

def filter_data_by_n_days(df, n_days):
    """Filter dataframe to only include posts from the last N days
    
    Args:
        df (pd.DataFrame): DataFrame containing posts with 'date' column
        n_days (int): Number of days to look back from today
        
    Returns:
        pd.DataFrame: Filtered DataFrame with only last N days' posts
    """
    if df.empty or 'date' not in df.columns:
        print("⚠️ DataFrame is empty or missing 'date' column")
        return df
    
    # Apply date filter
    mask = df['date'].apply(lambda x: is_date_within_n_days(x, n_days))
    filtered_df = df[mask].copy()
    
    print(f"📅 Original posts: {len(df)}")
    print(f"📅 Posts from last {n_days} days: {len(filtered_df)}")
    
    return filtered_df

def save_weekly_data(df, config, suffix="_this_week"):
    """Save filtered weekly data to CSV file
    
    Args:
        df (pd.DataFrame): DataFrame to save
        config (dict): Board configuration
        suffix (str): Suffix to add to filename
        
    Returns:
        str: Path to saved file
    """
    if df.empty:
        print("⚠️ No data to save")
        return None
    
    # Create filename with weekly suffix
    filename = f"{config['BOARD_NAME']}{suffix}.csv"
    output_path = get_data_path(filename, config)
    
    if output_path is None:
        return None
    
    try:
        # Save the data
        df.to_csv(output_path, encoding='utf-8-sig', index=False, quoting=csv.QUOTE_ALL)
        
        print(f"✅ Successfully saved {len(df)} weekly records to {output_path}")
        print(f"📁 File size: {os.path.getsize(output_path) / 1024:.1f} KB")
        
        return output_path
        
    except Exception as e:
        print(f"❌ Error saving weekly data to {output_path}: {e}")
        return None

def save_n_days_data(df, config, n_days, suffix=None):
    """Save filtered N days data to CSV file
    
    Args:
        df (pd.DataFrame): DataFrame to save
        config (dict): Board configuration
        n_days (int): Number of days used for filtering
        suffix (str): Optional suffix to add to filename
        
    Returns:
        str: Path to saved file
    """
    if df.empty:
        print("⚠️ No data to save")
        return None
    
    # Create filename with N days suffix
    if suffix:
        filename = f"{config['BOARD_NAME']}_{n_days}_days_{suffix}.csv"
    else:
        filename = f"{config['BOARD_NAME']}_last_{n_days}_days.csv"
    
    output_path = get_data_path(filename, config)
    
    if output_path is None:
        return None
    
    try:
        # Save the data
        df.to_csv(output_path, encoding='utf-8-sig', index=False, quoting=csv.QUOTE_ALL)
        
        print(f"✅ Successfully saved {len(df)} records from last {n_days} days to {output_path}")
        print(f"📁 File size: {os.path.getsize(output_path) / 1024:.1f} KB")
        
        return output_path
        
    except Exception as e:
        print(f"❌ Error saving {n_days} days data to {output_path}: {e}")
        return None

def process_weekly_content(config):
    """Process and save weekly content data
    
    Args:
        config (dict): Board configuration
    """
    # Read existing content data
    content_file = get_data_path(f'{config["BOARD_NAME"]}_content.csv', config)
    
    if not os.path.exists(content_file):
        print(f"⚠️ Content file not found: {content_file}")
        return
    
    try:
        # Load content data
        df = pd.read_csv(content_file, quoting=csv.QUOTE_ALL, encoding='utf-8-sig')
        
        # Filter by this week
        weekly_df = filter_data_by_this_week(df)
        
        if not weekly_df.empty:
            # Save weekly content
            save_weekly_data(weekly_df, config, suffix="_content_this_week")
        else:
            print("📅 No posts found from this week")
            
    except Exception as e:
        print(f"⚠️ Error processing weekly content: {e}")

def process_two_weeks_content(config):
    """Process and save two weeks content data
    
    Args:
        config (dict): Board configuration
    """
    # Read existing content data
    content_file = get_data_path(f'{config["BOARD_NAME"]}_content.csv', config)
    
    if not os.path.exists(content_file):
        print(f"⚠️ Content file not found: {content_file}")
        return
    
    try:
        # Load content data
        df = pd.read_csv(content_file, quoting=csv.QUOTE_ALL, encoding='utf-8-sig')
        
        # Filter by last two weeks
        two_weeks_df = filter_data_by_two_weeks(df)
        
        if not two_weeks_df.empty:
            # Save two weeks content
            save_weekly_data(two_weeks_df, config, suffix="_content_two_weeks")
        else:
            print("📅 No posts found from the last two weeks")
            
    except Exception as e:
        print(f"⚠️ Error processing two weeks content: {e}")

In [None]:
# Main Execution
for board in BOARDS:
    print(f"\n🔄 Processing board: {board}")
    
    # Update config for current board
    config = update_config_for_board(board)
    
    # Step 1: Crawl posts
    print("\n📥 Crawling posts...")
    df = crawl_posts(config, NUM_PAGES)
    if df is None or df.empty:
        print("❌ No data retrieved, skipping board")
        continue
        
    # Save raw data
    raw_filename = f"{board}_raw.csv"
    df.to_csv(
        get_data_path(raw_filename, config),
        encoding='utf-8-sig',
        index=False,
        quoting=csv.QUOTE_ALL
    )
    print(f"✅ Saved {len(df)} raw records to {raw_filename}")
    
    # Step 2: Filter and process data
    print("\n🔍 Processing recommendations...")
    filtered_df = process_and_save_data(df, config, threshold = 20)
    
    # Step 3: Crawl content for filtered posts
    print("\n📚 Crawling content for filtered posts...")
    crawl_post_content(filtered_df, config)
    
    # Step 4: Process content data
    print("\n📊 Processing content data...")
    process_content_data(config)
    
print("\n✨ All boards processed successfully!")


🔄 Processing board: Tech_Job

📥 Crawling posts...


  0%|          | 0/15 [00:00<?, ?it/s]


📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4002.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4001.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4000.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3999.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3998.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3997.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3996.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3995.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3994.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3993.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3992.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3991.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3990.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3989.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3988.html
✅ Saved 285 raw records to Tech_Job_raw.csv

🔍 Processing recommendations...
✅ Successfully saved 74 records with 20+ recommendations to c:\Users\benin\RAG_lear

  0%|          | 0/10 [00:00<?, ?it/s]

✅ 全部內文抓取完成！

📊 Processing content data...
✅ Saved 80 cleaned records with 20+ recommendations to content file

✨ All boards processed successfully!


In [16]:
# Execute main function for board 'Gossiping' with threshold 20
board = "Gossiping"
filtered_threshold = 10
num_pages = 15
config = update_config_for_board(board)

print(f"\n🔄 Processing board: {board}")

# Step 1: Crawl posts
print("\n📥 Crawling posts...")
df = crawl_posts(config, NUM_PAGES)
if df is None or df.empty:
    print("❌ No data retrieved, skipping board")
else:
    # Save raw data
    raw_filename = f"{board}_raw.csv"
    df.to_csv(
        get_data_path(raw_filename, config),
        encoding='utf-8-sig',
        index=False,
        quoting=csv.QUOTE_ALL
    )
    print(f"✅ Saved {len(df)} raw records to {raw_filename}")

    # Step 2: Filter and process data
    print("\n🔍 Processing recommendations...")
    filtered_df = process_and_save_data(df, config, threshold=filtered_threshold)

    # Step 3: Crawl content for filtered posts
    print("\n📚 Crawling content for filtered posts...")
    crawl_post_content(filtered_df, config)

    # Step 4: Process content data
    print("\n📊 Processing content data...")
    process_content_data(config)

print("\n✨ Board 'Gossiping' processed successfully!")


🔄 Processing board: Gossiping

📥 Crawling posts...


  0%|          | 0/15 [00:00<?, ?it/s]


📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39402.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39401.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39401.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39400.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39400.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39399.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39399.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39398.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39398.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39397.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39397.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39396.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39396.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39395.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39395.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39394.html

📄 抓取頁面: https://www.ptt.cc/bbs/Gossiping/index39394.html

📄 抓取頁面: https

  0%|          | 0/34 [00:00<?, ?it/s]

✅ 全部內文抓取完成！

📊 Processing content data...
✅ Saved 81 cleaned records with 20+ recommendations to content file

✨ Board 'Gossiping' processed successfully!


In [19]:
# Execute main function for board 'EuropeTravel' with N days filter (date only)
board = "EuropeTravel"
num_pages = 10
n_days = 14  # Variable N - number of days to filter from today
config = update_config_for_board(board)

print(f"\n🔄 Processing board: {board}")
print(f"📅 Filtering posts from last {n_days} days")

# Step 1: Crawl posts
print("\n📥 Crawling posts...")
df = crawl_posts(config, num_pages)
if df is None or df.empty:
    print("❌ No data retrieved, skipping board")
else:
    # Save raw data
    raw_filename = f"{board}_raw.csv"
    df.to_csv(
        get_data_path(raw_filename, config),
        encoding='utf-8-sig',
        index=False,
        quoting=csv.QUOTE_ALL
    )
    print(f"✅ Saved {len(df)} raw records to {raw_filename}")

    # Step 2: Filter by N days only (no recommendation threshold)
    print(f"\n📅 Filtering by last {n_days} days...")
    filtered_df = filter_data_by_n_days(df, n_days)
    
    if not filtered_df.empty:
        # Save date-filtered data
        save_n_days_data(filtered_df, config, n_days)
        
        # Step 3: Crawl content for date-filtered posts
        print("\n📚 Crawling content for date-filtered posts...")
        crawl_post_content(filtered_df, config)
        
        # Step 4: Save final content with date filter
        print(f"\n📊 Processing content data for last {n_days} days...")
        content_file = get_data_path(f'{config["BOARD_NAME"]}_content.csv', config)
        
        if os.path.exists(content_file):
            try:
                # Load content data and apply date filter
                content_df = pd.read_csv(content_file, quoting=csv.QUOTE_ALL, encoding='utf-8-sig')
                content_filtered = filter_data_by_n_days(content_df, n_days)
                
                if not content_filtered.empty:
                    # Clean content
                    content_filtered['content'] = content_filtered['content'].apply(clean_content)
                    # Save date-filtered content
                    save_n_days_data(content_filtered, config, n_days, suffix="content")
                else:
                    print(f"📅 No content found from the last {n_days} days")
                    
            except Exception as e:
                print(f"⚠️ Error processing content data: {e}")
    else:
        print(f"📅 No posts found from the last {n_days} days")

print(f"\n✨ Board '{board}' processed successfully with {n_days} days date filter!")


🔄 Processing board: EuropeTravel
📅 Filtering posts from last 14 days

📥 Crawling posts...


  0%|          | 0/10 [00:00<?, ?it/s]


📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2362.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2361.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2361.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2360.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2360.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2359.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2359.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2358.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2358.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2357.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2357.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2356.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2356.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2355.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2355.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTravel/index2354.html

📄 抓取頁面: https://www.ptt.cc/bbs/EuropeTr

  0%|          | 0/19 [00:00<?, ?it/s]

✅ 全部內文抓取完成！

📊 Processing content data for last 14 days...
📅 Original posts: 25
📅 Posts from last 14 days: 19
✅ Successfully saved 19 records from last 14 days to c:\Users\benin\RAG_learning-project\EuropeTravel\EuropeTravel_14_days_content.csv
📁 File size: 31.3 KB

✨ Board 'EuropeTravel' processed successfully with 14 days date filter!
