In [115]:
# Main Execution Functions
def crawl_posts(num_pages=5):
    """Main function to crawl posts from PTT"""
    latest_index = get_latest_index()
    if not latest_index:
        print("❌ 無法取得最新頁碼")
        return None
        
    all_articles = []
    for page_index in tqdm(range(latest_index, latest_index - num_pages, -1)):
        try:
            articles = crawl_page(page_index)
            all_articles.extend(articles)
            random_sleep(CONFIG['PAGE_SLEEP_TIME_RANGE'])
        except Exception as e:
            print(f"⚠️ 發生錯誤於 index{page_index}: {e}")
    
    return pd.DataFrame(all_articles)

def process_and_save_data(df, threshold=None, filename=None):
    """Process and save filtered data"""
    threshold = threshold or CONFIG['DEFAULT_THRESHOLD']
    if filename is None:
        filename = f'{CONFIG["BOARD_NAME"]}_above_{threshold}_rec.csv'  # Changed from 'below' to 'above'
    
    # Filter and save data
    filtered_df = process_recommendations(df, threshold)
    output_path = get_data_path(filename)
    filtered_df.to_csv(output_path, encoding='utf-8-sig', index=False)
    
    # Print status
    print(f"✅ Successfully saved {len(filtered_df)} records with {threshold}+ recommendations to {output_path}")
    print(f"📁 File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    
    return filtered_df

In [116]:
# Visualization Functions
def visualize_recommendations(filtered_df, threshold=None):
    """Visualize the distribution of recommendations"""
    threshold = threshold or CONFIG['DEFAULT_THRESHOLD']
    
    # Display statistics
    print(f"Posts with {threshold} or more recommendations:")
    print(f"Total count: {len(filtered_df)}")
    print("\nFiltered data preview:")
    print(filtered_df[['title', 'nrec', 'date']].head())
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_df['nrec'], bins=30, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of Posts with {threshold}+ Recommendations')
    plt.xlabel('Number of Recommendations')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Print statistics
    print("\nBasic statistics of filtered recommendations:")
    print(filtered_df['nrec'].describe())

In [117]:
# Data Processing Functions
def process_recommendations(df, threshold=None):
    """Process and filter recommendations from the dataframe
    
    Args:
        df (pandas.DataFrame): DataFrame containing posts data
        threshold (int): minimum number of recommendations to include
    
    Returns:
        pandas.DataFrame: Filtered DataFrame with nrec >= threshold
    """
    threshold = threshold or CONFIG['DEFAULT_THRESHOLD']
    
    # Convert recommendations to numeric values
    df['nrec'] = pd.to_numeric(
        df['nrec'].replace('爆', '100').replace('X', '-1'), 
        errors='coerce'
    ).fillna(0)
    
    # Filter posts with recommendations >= threshold
    return df[df['nrec'] >= threshold]

In [118]:
# Web Scraping Functions
def get_latest_index(board_url=None):
    """Get the latest page index from PTT board"""
    board_url = board_url or CONFIG['BOARD_URL']
    res = safe_request(board_url)
    if not res:
        return None
    
    soup = BeautifulSoup(res.text, "html.parser")
    prev_btn = soup.select_one("div.btn-group-paging a.btn.wide:nth-child(2)")
    
    if prev_btn and 'href' in prev_btn.attrs:
        match = re.search(r"index(\d+)\.html", prev_btn["href"])
        if match:
            return int(match.group(1)) + 1
    return None

def get_post_content(url):
    """Extract content from a single PTT post"""
    res = safe_request(url)
    if not res:
        return ""
    
    soup = BeautifulSoup(res.text, "html.parser")
    content_tag = soup.select_one("div#main-content")
    
    if not content_tag:
        return ""
        
    # Remove comments and tags
    for tag in content_tag.select("div, span"):
        tag.extract()
    
    return content_tag.get_text(separator="\n").strip()

def crawl_page(index):
    """Crawl a single page of PTT posts"""
    url = f"{CONFIG['BASE_URL']}/bbs/Tech_Job/index{index}.html"
    print(f"\n📄 抓取頁面: {url}")
    
    res = safe_request(url)
    if not res:
        return []
        
    soup = BeautifulSoup(res.text, "html.parser")
    articles = []
    
    for div in soup.select("div.r-ent"):
        title_tag = div.select_one("div.title > a")
        date_tag = div.select_one("div.meta > div.date")
        nrec_tag = div.select_one("div.nrec")
        
        if title_tag and date_tag:
            record = {
                "title": title_tag.text.strip(),
                "date": date_tag.text.strip(),
                "link": f"{CONFIG['BASE_URL']}{title_tag['href']}",
                "nrec": nrec_tag.text.strip() if nrec_tag else "0"
            }
            articles.append(record)
            
    return articles

In [119]:
# Utility Functions
def safe_request(url, headers=None):
    """Make a safe HTTP request with error handling"""
    try:
        headers = headers or CONFIG['HEADERS']
        res = requests.get(url, headers=headers)
        res.encoding = CONFIG['ENCODING']
        return res
    except Exception as e:
        print(f"⚠️ Request failed for {url}: {e}")
        return None

def random_sleep(range_tuple=None):
    """Sleep for a random duration within the specified range"""
    sleep_range = range_tuple or CONFIG['SLEEP_TIME_RANGE']
    time.sleep(round(random.uniform(*sleep_range), 2))

def safe_write_csv(data, filename, mode='a', fieldnames=None):
    """Safely write data to CSV with proper encoding"""
    try:
        with open(filename, mode=mode, encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if f.tell() == 0:
                writer.writeheader()
            if isinstance(data, list):
                writer.writerows(data)
            else:
                writer.writerow(data)
        return True
    except Exception as e:
        print(f"❌ Error writing to {filename}: {e}")
        return False

In [120]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import random
import re
import os
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Constants and configurations
BOARD_NAME = "stock"  # Board name for file organization

CONFIG = {
    'BOARD_NAME': BOARD_NAME,
    'BOARD_URL': f"https://www.ptt.cc/bbs/{BOARD_NAME}/index.html",
    'BASE_URL': "https://www.ptt.cc",
    'HEADERS': {"User-Agent": "Mozilla/5.0"},
    'ENCODING': 'utf-8',
    'DEFAULT_THRESHOLD': 40,
    'SLEEP_TIME_RANGE': (0.1, 0.3),  # Shorter sleep time for general operations
    'PAGE_SLEEP_TIME_RANGE': (0.2, 0.5),  # Shorter sleep time between pages
    'DATA_DIR': os.path.join(os.getcwd(), BOARD_NAME)  # Create a directory for the board
}

# Create board directory if it doesn't exist
try:
    if not os.path.exists(CONFIG['DATA_DIR']):
        os.makedirs(CONFIG['DATA_DIR'])
        print(f"✅ Created new directory for {BOARD_NAME} at: {CONFIG['DATA_DIR']}")
    else:
        print(f"📂 Using existing directory for {BOARD_NAME} at: {CONFIG['DATA_DIR']}")
except Exception as e:
    print(f"❌ Error creating directory {CONFIG['DATA_DIR']}: {e}")
    raise  # Re-raise the exception since we need the directory to proceed

def get_data_path(filename):
    """Get the full path for a data file in the board's directory"""
    path = os.path.join(CONFIG['DATA_DIR'], filename)
    # Ensure the directory exists (in case it was deleted after initial creation)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

📂 Using existing directory for stock at: c:\Users\USER\RAG_learning-project\stock


In [121]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import re


def get_latest_index(board_url=None):
    """Get the latest page index from PTT board"""
    board_url = board_url or "https://www.ptt.cc/bbs/Tech_Job/index.html"
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(board_url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    # 取得上一頁按鈕連結
    prev_btn = soup.select_one("div.btn-group-paging a.btn.wide:nth-child(2)")
    if prev_btn:
        href = prev_btn["href"]  # e.g., /bbs/Tech_Job/index6772.html
        match = re.search(r"index(\d+)\.html", href)
        if match:
            return int(match.group(1)) + 1  # 最新頁 = 上一頁 + 1
    return None

def crawl_page(index):
    url = f"https://www.ptt.cc/bbs/Tech_Job/index{index}.html"
    print(f"\n📄 抓取頁面: {url}")

    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")

    articles = []
    for div in soup.select("div.r-ent"):
        title_tag = div.select_one("div.title > a")
        date_tag = div.select_one("div.meta > div.date")
        nrec_tag = div.select_one("div.nrec")

        if title_tag and date_tag:
            record = {
                "title": title_tag.text.strip(),
                "date": date_tag.text.strip(),
                "link": "https://www.ptt.cc" + title_tag["href"],
                "nrec": nrec_tag.text.strip() if nrec_tag else "0"
            }
            articles.append(record)
    return articles

In [122]:
import pandas as pd
df = pd.read_csv("ptt_tech_job.csv")

In [123]:
def filter_low_recommendations(df, threshold=20):
    """
    Filter posts with recommendations below a threshold
    
    Args:
        df (pandas.DataFrame): DataFrame containing posts data
        threshold (int): minimum number of recommendations to include
    
    Returns:
        pandas.DataFrame: Filtered DataFrame with nrec < threshold
    """
    # Convert nrec to numeric first
    df['nrec'] = pd.to_numeric(df['nrec'].replace('爆', '100').replace('X', '-1'), errors='coerce').fillna(0)
    
    # Filter and return
    return df[df['nrec'] > threshold]

def visualize_recommendation_distribution(filtered_df, threshold=20):
    """
    Visualize the distribution of recommendations for filtered data
    
    Args:
        filtered_df (pandas.DataFrame): DataFrame containing filtered posts
        threshold (int): Maximum value for visualization bins
    """
    # Display statistics
    print(f"Posts with less than {threshold} recommendations:")
    print(f"Total count: {len(filtered_df)}")
    print("\nFiltered data preview:")
    print(filtered_df[['title', 'nrec', 'date']].head())
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_df['nrec'], bins=threshold, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of Posts with < {threshold} Recommendations')
    plt.xlabel('Number of Recommendations')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Print basic statistics
    print("\nBasic statistics of filtered recommendations:")
    print(filtered_df['nrec'].describe())

# Example usage:
threshold = 20
filtered_data = filter_low_recommendations(df, threshold)
#visualize_recommendation_distribution(filtered_data, threshold)

In [124]:
def save_filtered_data(filtered_df, threshold=20, filename=None):
    """
    Save filtered recommendations data to CSV
    
    Args:
        filtered_df (pandas.DataFrame): DataFrame containing filtered posts
        threshold (int): Threshold used for filtering (default: 20)
        filename (str): Optional custom filename (default: None)
    """
    if filename is None:
        filename = f'ptt_tech_job_below_{threshold}_rec.csv'
    
    try:
        # Save to CSV with UTF-8-SIG encoding for Chinese characters
        filtered_df.to_csv(filename, encoding='utf-8-sig', index=False)
        print(f"✅ Successfully saved {len(filtered_df)} records to {filename}")
        
        # Show file size
        import os
        file_size = os.path.getsize(filename) / 1024  # Convert to KB
        print(f"📁 File size: {file_size:.1f} KB")
        
    except Exception as e:
        print(f"❌ Error saving file: {e}")

save_filtered_data(filtered_data, threshold)

✅ Successfully saved 122 records to ptt_tech_job_below_20_rec.csv
📁 File size: 15.1 KB


In [125]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import random
import os
import re

def get_structured_content(url):
    """Extract structured content from a PTT post
    
    Returns:
        dict: {
            "title": str,
            "source": str,
            "urls": list[str],
            "content": str
        }
    """
    res = safe_request(url)
    if not res:
        return {}

    soup = BeautifulSoup(res.text, "html.parser")
    main_content = soup.select_one("div#main-content")
    if not main_content:
        return {}

    # 移除推文區
    for tag in main_content.find_all(['div', 'span'], recursive=False):
        tag.extract()

    text = main_content.get_text(separator="\n").strip()
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    
    # 擷取所有網址
    urls = [a['href'] for a in main_content.find_all('a', href=True)]

    # 解析內文結構
    title = ""
    source = ""
    content_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if "標題" in line or "標題:" in line:
            title = re.sub(r'^.*?標題[:,：]\s*', '', line).strip()
        elif "作者" in line or "作者:" in line:
            source = re.sub(r'^.*?作者[:,：]\s*', '', line).strip()
        else:
            content_lines.append(line)
        i += 1

    # 清理內文，移除額外的 header 資訊
    clean_content = []
    content_started = False
    for line in content_lines:
        if not content_started:
            if "看板" in line or "時間" in line:
                continue
            content_started = True
        clean_content.append(line)

    return {
        "title": title,
        "source": source,
        "urls": urls,
        "content": "\n".join(clean_content).strip()
    }

def crawl_ptt_post_content(filtered_data: pd.DataFrame, output_file: str = None):
    """
    從 PTT 已篩選的文章中抓取每篇結構化內文，並儲存至 CSV。

    Args:
        filtered_data (pd.DataFrame): 含有 title/date/link/nrec 的資料
        output_file (str): 輸出檔案名稱
    """
    if output_file is None:
        output_file = get_data_path(f'{CONFIG["BOARD_NAME"]}_content.csv')
    
    # Check existing data
    existing_links = set()
    if os.path.exists(output_file):
        try:
            existing_df = pd.read_csv(output_file)
            existing_links = set(existing_df["link"].dropna().tolist())
            print(f"📁 已存在資料筆數：{len(existing_links)}")
        except Exception as e:
            print(f"⚠️ 無法讀取 {output_file}，錯誤：{e}")

    # Filter new links
    to_crawl = filtered_data[~filtered_data["link"].isin(existing_links)]
    print(f"🚀 準備抓取新連結筆數：{len(to_crawl)}")

    # Process each post
    fieldnames = ["title", "date", "link", "nrec", "source", "content", "urls"]
    for _, row in tqdm(to_crawl.iterrows(), total=len(to_crawl)):
        structured_content = get_structured_content(row["link"])
        
        record = {
            "title": row["title"],
            "date": row["date"],
            "link": row["link"],
            "nrec": row["nrec"],
            "source": structured_content.get("source", ""),
            "content": structured_content.get("content", ""),
            "urls": "|".join(structured_content.get("urls", []))  # Join URLs with pipe separator
        }
        
        safe_write_csv(record, output_file, fieldnames=fieldnames)
        random_sleep()

    print("✅ 全部內文抓取完成！")

In [None]:
# Example Usage
if __name__ == "__main__":
    print(f"📂 Data will be saved in: {CONFIG['DATA_DIR']}")
    
    # 1. Crawl posts
    df = crawl_posts(num_pages=10)
    
    # Save raw data
    raw_filename = f'{CONFIG["BOARD_NAME"]}_raw.csv'
    df.to_csv(get_data_path(raw_filename), encoding='utf-8-sig', index=False)
    print(f"✅ Raw data saved to {raw_filename}")
    
    # 2. Process and save filtered data
    filtered_df = process_and_save_data(df)
    
    # 3. Visualize results
    #visualize_recommendations(filtered_df)
    
    # 4. Crawl post contents
    crawl_posts_content(filtered_df)

📂 Data will be saved in: c:\Users\USER\RAG_learning-project\stock


  0%|          | 0/5 [00:00<?, ?it/s]


📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4002.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4001.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4001.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4000.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4000.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3999.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3999.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3998.html

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index3998.html
✅ Raw data saved to stock_raw.csv
✅ Successfully saved 4 records with 40+ recommendations to c:\Users\USER\RAG_learning-project\stock\stock_above_40_rec.csv
📁 File size: 0.5 KB
📁 已存在資料筆數：13
🚀 準備抓取新連結筆數：0
✅ Raw data saved to stock_raw.csv
✅ Successfully saved 4 records with 40+ recommendations to c:\Users\USER\RAG_learning-project\stock\stock_above_40_rec.csv
📁 File size: 0.5 KB
📁 已存在資料筆數：13
🚀 準備抓取新連結筆數：0


0it [00:00, ?it/s]

✅ 全部內文抓取完成！


In [None]:
# Clean and filter content based on recommendations
def clean_content(text):
    if not isinstance(text, str):
        return text
    # Replace multiple newlines with a single newline
    cleaned = re.sub(r'\n\s*\n', '\n', text)
    # Remove leading/trailing whitespace and extra newlines
    cleaned = cleaned.strip()
    return cleaned

# Read the content data
result = pd.read_csv(get_data_path(f'{CONFIG["BOARD_NAME"]}_content.csv'))

# Clean the content column
result['content'] = result['content'].apply(clean_content)

# Convert recommendations to numeric values
result['nrec'] = pd.to_numeric(
    result['nrec'].replace('爆', '100').replace('X', '-1'), 
    errors='coerce'
).fillna(0)

# Filter and save data for different thresholds
thresholds = [20, 40]  # You can modify these thresholds as needed
for threshold in thresholds:
    # Filter data below threshold
    filtered_df = result[result['nrec'] < threshold]
    # Save filtered data
    output_filename = f'{CONFIG["BOARD_NAME"]}_below_{threshold}_rec.csv'
    filtered_df.to_csv(get_data_path(output_filename), encoding='utf-8-sig', index=False)
    print(f"✅ Saved {len(filtered_df)} records below {threshold} recommendations to {output_filename}")

# Display a sample of cleaned content
print("\n=== Sample of cleaned content (4th row) ===")
print(result.iloc[3]['content'])

=== Sample of cleaned content (4th row) ===
ETtoday新聞雲 2025年07月18日 16:23
快訊／台積電嘉義廠「2月4起工安意外」　女子遭百公斤冰水管砸死
記者翁伊森、黃資真／嘉義報導
，
細數這兩個月來的工安意外，
；
，
；
，2個月內的4起工安事故，已造成2死2傷。
https://www.ettoday.net/news/20250718/2998843.htm
怕
2個月4起工安事故  雖然這是外包廠商但工安事故也發生太頻繁了
上次好像停工一個月  這次感覺應該也是停一個月?
這樣下去嘉義廠有辦法時間內完工?
--


In [130]:
# Save the cleaned full data back to the CSV
result.to_csv(get_data_path(f'{CONFIG["BOARD_NAME"]}_content.csv'), encoding='utf-8-sig', index=False)
print(f"✅ Saved {len(result)} records to {CONFIG['BOARD_NAME']}_content.csv")