In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import re


def get_latest_index(board_url):
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(board_url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    # 取得上一頁按鈕連結
    prev_btn = soup.select_one("div.btn-group-paging a.btn.wide:nth-child(2)")
    if prev_btn:
        href = prev_btn["href"]  # e.g., /bbs/Tech_Job/index6772.html
        match = re.search(r"index(\d+)\.html", href)
        if match:
            return int(match.group(1)) + 1  # 最新頁 = 上一頁 + 1
    return None

def crawl_page(index):
    url = f"https://www.ptt.cc/bbs/Tech_Job/index{index}.html"
    print(f"\n📄 抓取頁面: {url}")

    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")

    articles = []
    for div in soup.select("div.r-ent"):
        title_tag = div.select_one("div.title > a")
        date_tag = div.select_one("div.meta > div.date")
        nrec_tag = div.select_one("div.nrec")

        if title_tag and date_tag:
            record = {
                "title": title_tag.text.strip(),
                "date": date_tag.text.strip(),
                "link": "https://www.ptt.cc" + title_tag["href"],
                "nrec": nrec_tag.text.strip() if nrec_tag else "0"
            }
            articles.append(record)
    return articles

def main():
    num_pages = 5  # 要往前抓幾頁
    board_url = "https://www.ptt.cc/bbs/Tech_Job/index.html"
    latest_index = get_latest_index(board_url)

    if not latest_index:
        print("❌ 無法取得最新頁碼")
        return

    # 開啟 CSV（追加寫入）
    with open("ptt_tech_job.csv", mode='a', encoding='utf-8-sig', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["title", "date", "link", "nrec"])
        if csv_file.tell() == 0:  # 若為新檔案，寫欄位名
            writer.writeheader()

        # 從最新頁往前 N 頁
        for page_index in range(latest_index, latest_index - num_pages, -1):
            try:
                articles = crawl_page(page_index)
                for record in articles:
                    writer.writerow(record)
                    print(f"✅ 寫入: {record['title']}")
            except Exception as e:
                print(f"⚠️ 發生錯誤於 index{page_index}: {e}")
            
            # 翻頁後等待
            time.sleep(random.uniform(1.0, 2.5))

    print(f"\n🎉 共處理 {num_pages} 頁完成！")

if __name__ == "__main__":
    main()



📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4001.html
✅ 寫入: [新聞] 黃仁勳大讚：中國AI崛起 沒有輝達也行
✅ 寫入: [公告] op1984 livehouse emarl5566 水桶一個月
✅ 寫入: [公告] eokuan 水桶一年
✅ 寫入: [討論] 有人知道鈺太科技這家公司嗎
✅ 寫入: [新聞] 經長︰爭取談到有競爭力的關稅
✅ 寫入: [討論] 高中生科展做1.5nm GAA 可以去台積RD嗎?
✅ 寫入: [公告] hinesmile romeie06 水桶一個月
✅ 寫入: [公告] teamax cityhunter04 水桶
✅ 寫入: [情報] 免費晶片安全實戰課程- 2 天
✅ 寫入: Re: [討論] 高中生科展做1.5nm GAA 可以去台積RD嗎?
✅ 寫入: [新聞] AI取代人力？人力銀行：企業估31%工作恐
✅ 寫入: [新聞] AI新十大建設 打造新護國群山
✅ 寫入: [新聞]三星困境都怪自己！2018 年黃仁勳曾上門尋
✅ 寫入: [公告] godman362 xabcxabc0123 Ferscism 水桶
✅ 寫入: Re: [討論] 高中生科展做1.5nm GAA 可以去台積RD嗎?
✅ 寫入: [新聞] AMD採購台積電美製晶片成本高　蘇姿丰：
✅ 寫入: Re: [討論] 高中生科展做1.5nm GAA 可以去台積RD
✅ 寫入: [公告] zrc888 水桶一年
✅ 寫入: [情報] 薪資查詢平台
✅ 寫入: [公告] 多重帳號違規處理說明與適用原則
✅ 寫入: [公告] 科技板（Tech_Job）板規2025版
✅ 寫入: [公告] 跑步哥列本板不受歡迎人物
✅ 寫入: [公告] 本版徵新版主

📄 抓取頁面: https://www.ptt.cc/bbs/Tech_Job/index4000.html
✅ 寫入: [請益] 有人覺得台積電薪水給太低嗎
✅ 寫入: [請益] 元鈦科技（散熱）文化好嗎？
✅ 寫入: Re: [請益] 有人覺得台積電薪水給太低嗎
✅ 寫入: [新聞] 輝達解禁 盧特尼克：讓中對美技術上癮
✅ 寫入: Re: [新聞] 徵才鎖定AI技能 6成企業願加薪最高3千
✅ 寫入: [新聞] 快訊／台積電嘉義廠「2月4起工

In [6]:
import pandas as pd
df = pd.read_csv("ptt_tech_job.csv")

In [7]:
def filter_low_recommendations(df, threshold=20):
    """
    Filter posts with recommendations below a threshold
    
    Args:
        df (pandas.DataFrame): DataFrame containing posts data
        threshold (int): minimum number of recommendations to include
    
    Returns:
        pandas.DataFrame: Filtered DataFrame with nrec < threshold
    """
    # Convert nrec to numeric first
    df['nrec'] = pd.to_numeric(df['nrec'].replace('爆', '100').replace('X', '-1'), errors='coerce').fillna(0)
    
    # Filter and return
    return df[df['nrec'] > threshold]

def visualize_recommendation_distribution(filtered_df, threshold=20):
    """
    Visualize the distribution of recommendations for filtered data
    
    Args:
        filtered_df (pandas.DataFrame): DataFrame containing filtered posts
        threshold (int): Maximum value for visualization bins
    """
    # Display statistics
    print(f"Posts with less than {threshold} recommendations:")
    print(f"Total count: {len(filtered_df)}")
    print("\nFiltered data preview:")
    print(filtered_df[['title', 'nrec', 'date']].head())
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    plt.hist(filtered_df['nrec'], bins=threshold, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of Posts with < {threshold} Recommendations')
    plt.xlabel('Number of Recommendations')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    # Print basic statistics
    print("\nBasic statistics of filtered recommendations:")
    print(filtered_df['nrec'].describe())

# Example usage:
threshold = 20
filtered_data = filter_low_recommendations(df, threshold)
#visualize_recommendation_distribution(filtered_data, threshold)

In [8]:
def save_filtered_data(filtered_df, threshold=20, filename=None):
    """
    Save filtered recommendations data to CSV
    
    Args:
        filtered_df (pandas.DataFrame): DataFrame containing filtered posts
        threshold (int): Threshold used for filtering (default: 20)
        filename (str): Optional custom filename (default: None)
    """
    if filename is None:
        filename = f'ptt_tech_job_below_{threshold}_rec.csv'
    
    try:
        # Save to CSV with UTF-8-SIG encoding for Chinese characters
        filtered_df.to_csv(filename, encoding='utf-8-sig', index=False)
        print(f"✅ Successfully saved {len(filtered_df)} records to {filename}")
        
        # Show file size
        import os
        file_size = os.path.getsize(filename) / 1024  # Convert to KB
        print(f"📁 File size: {file_size:.1f} KB")
        
    except Exception as e:
        print(f"❌ Error saving file: {e}")

save_filtered_data(filtered_data, threshold)

✅ Successfully saved 65 records to ptt_tech_job_below_20_rec.csv
📁 File size: 7.8 KB


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
import random
import os

def get_post_content(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, "html.parser")

        content_tag = soup.select_one("div#main-content")
        if not content_tag:
            return ""

        # 移除推文與標籤
        for tag in content_tag.select("div, span"):
            tag.extract()

        text = content_tag.get_text(separator="\n").strip()
        return text
    except Exception as e:
        print(f"⚠️ 抓取失敗: {url} | 錯誤: {e}")
        return ""

def crawl_ptt_post_content(filtered_data: pd.DataFrame, output_file: str = "ptt_tech_job_content.csv"):
    """
    從 PTT 已篩選的文章中抓取每篇內文，並儲存至 CSV，具備避免重複功能。

    Args:
        filtered_data (pd.DataFrame): 含有 title/date/link/nrec 的資料
        output_file (str): 輸出檔案名稱
    """
    # === 讀取已存在的 link 避免重複 ===
    existing_links = set()
    if os.path.exists(output_file):
        try:
            existing_df = pd.read_csv(output_file)
            existing_links = set(existing_df["link"].dropna().tolist())
            print(f"📁 已存在資料筆數：{len(existing_links)}")
        except Exception as e:
            print(f"⚠️ 無法讀取 {output_file}，錯誤：{e}")

    # === 篩選尚未處理的連結 ===
    to_crawl = filtered_data[~filtered_data["link"].isin(existing_links)]

    print(f"🚀 準備抓取新連結筆數：{len(to_crawl)}")

    # === 寫入 CSV ===
    with open(output_file, mode='a', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["title", "date", "link", "nrec", "content"])
        if f.tell() == 0:
            writer.writeheader()

        for _, row in to_crawl.iterrows():
            url = row["link"]
            print(f"🔍 抓取內容: {url}")
            content = get_post_content(url)

            writer.writerow({
                "title": row["title"],
                "date": row["date"],
                "link": row["link"],
                "nrec": row["nrec"],
                "content": content
            })

            sleep_time = round(random.uniform(0.2, 1.0), 2)
            print(f"⏳ 等待 {sleep_time} 秒...\n")
            time.sleep(sleep_time)

    print("✅ 全部內文抓取完成！")


In [None]:
crawl_ptt_post_content(filtered_data, output_file="ptt_tech_job_content.csv")

📁 已存在資料筆數：31
🚀 準備抓取新連結筆數：0
✅ 全部內文抓取完成！
