# Phase 1: 批次清洗 10-K 報告

此筆記本會批次處理 `data/10k_raw` 中的所有 10-K 報告檔案，
提取相關章節（Item 1, 1A, 1C, 7, 7A, 9A + ESG/Cybersecurity）並儲存到 `data/10k_cleaned`。

In [1]:
import sys
import json
import os
from pathlib import Path
import re

# 切換到專案根目錄
os.chdir('..')
print(f"工作目錄: {Path.cwd()}")

# 導入預處理模組
sys.path.insert(0, 'src')
from preprocess import process_html_file, ensure_outdir, IN_ROOT, OUT_DIR

# 確認路徑並建立輸出目錄
print(f"\n輸入目錄: {IN_ROOT}")
print(f"輸出目錄: {OUT_DIR}")
print(f"輸入目錄存在: {IN_ROOT.exists()}")

ensure_outdir()
print(f"輸出目錄已建立: {OUT_DIR.exists()}")

工作目錄: /home/wa1ter/RAG-ESG-DRI

輸入目錄: /home/wa1ter/RAG-ESG-DRI/data/10k_raw
輸出目錄: /home/wa1ter/RAG-ESG-DRI/data/10k_cleaned
輸入目錄存在: True
輸出目錄已建立: True


In [2]:
def extract_year_from_path(filing_path: Path) -> int:
    """
    從檔案路徑中提取年份
    例如: 0001047469-15-001377 -> 2015
    """
    filing_num = filing_path.parent.name
    match = re.search(r'-(\d{2})-', filing_num)
    if match:
        year_suffix = int(match.group(1))
        year = 2000 + year_suffix if year_suffix <= 24 else 1900 + year_suffix
        return year
    return None

def extract_company_from_path(filing_path: Path) -> str:
    """
    從路徑提取公司代號
    例如: data/10k_raw/AAPL/10-K/... -> AAPL
    """
    parts = filing_path.parts
    for i, part in enumerate(parts):
        if part == '10k_raw' and i + 1 < len(parts):
            return parts[i + 1]
    return "UNKNOWN"

In [3]:
# 找出所有 HTML 檔案
def find_all_html_files(root: Path):
    """
    找出所有 primary-document.html 檔案
    回傳 list of (company, year, file_path)
    """
    files_info = []
    
    for html_file in root.rglob("primary-document.html"):
        company = extract_company_from_path(html_file)
        year = extract_year_from_path(html_file)
        if year:
            files_info.append((company, year, html_file))
    
    return sorted(files_info, key=lambda x: (x[0], x[1]))

# 掃描所有檔案
all_files = find_all_html_files(IN_ROOT)
print(f"共找到 {len(all_files)} 個 10-K 報告檔案")
print(f"\n前 5 個檔案:")
for company, year, path in all_files[:5]:
    print(f"  {company} {year}")

共找到 100 個 10-K 報告檔案

前 5 個檔案:
  A 2015
  A 2016
  A 2017
  A 2018
  A 2019


In [4]:
# 批次處理所有檔案
processed_count = 0
error_count = 0
errors = []
results_summary = []

for company, year, html_path in all_files:
    try:
        print(f"處理中: {company} {year}...", end=" ")
        
        # 提取章節
        sections = process_html_file(html_path)
        
        # 統計非空章節
        non_empty_sections = {k: v for k, v in sections.items() if v and len(v.strip()) > 0}
        
        if not non_empty_sections:
            print("⚠ 未找到任何章節內容")
            continue
        
        # 建立輸出資料
        rel_path = html_path.relative_to(IN_ROOT)
        output_data = {
            "company": company,
            "year": year,
            "source_path": str(rel_path).replace("\\", "/"),
            **sections
        }
        
        # 儲存為 JSON 檔案
        output_filename = f"{company.lower()}_{year}.json"
        output_path = OUT_DIR / output_filename
        output_path.write_text(
            json.dumps(output_data, ensure_ascii=False, indent=2),
            encoding="utf-8"
        )
        
        # 記錄統計
        section_stats = {k: len(v) for k, v in non_empty_sections.items()}
        results_summary.append({
            "company": company,
            "year": year,
            "sections": list(non_empty_sections.keys()),
            "stats": section_stats
        })
        
        print(f"✓ 成功 ({len(non_empty_sections)} 個章節)")
        processed_count += 1
        
    except Exception as e:
        error_count += 1
        error_msg = f"{company} {year}: {str(e)}"
        errors.append(error_msg)
        print(f"✗ 錯誤: {str(e)[:80]}")

print(f"\n{'='*60}")
print(f"處理完成!")
print(f"成功: {processed_count} 個檔案")
print(f"錯誤: {error_count} 個檔案")

if errors:
    print(f"\n錯誤詳情 (前 10 個):")
    for err in errors[:10]:
        print(f"  - {err}")

處理中: A 2015... ✓ 成功 (5 個章節)
處理中: A 2016... ✓ 成功 (5 個章節)
處理中: A 2017... ✓ 成功 (5 個章節)
處理中: A 2018... ✓ 成功 (5 個章節)
處理中: A 2019... 


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "lxml")


✓ 成功 (5 個章節)
處理中: A 2020... ✓ 成功 (5 個章節)
處理中: A 2021... ✓ 成功 (5 個章節)
處理中: A 2022... ✓ 成功 (5 個章節)
處理中: A 2023... ✓ 成功 (5 個章節)
處理中: A 2024... ✓ 成功 (7 個章節)
處理中: AAPL 2015... ✓ 成功 (5 個章節)
處理中: AAPL 2016... ✓ 成功 (5 個章節)
處理中: AAPL 2017... ✓ 成功 (5 個章節)
處理中: AAPL 2018... ✓ 成功 (5 個章節)
處理中: AAPL 2019... ✓ 成功 (5 個章節)
處理中: AAPL 2020... ✓ 成功 (5 個章節)
處理中: AAPL 2021... ✓ 成功 (5 個章節)
處理中: AAPL 2022... ✓ 成功 (5 個章節)
處理中: AAPL 2023... ✓ 成功 (5 個章節)
處理中: AAPL 2024... ✓ 成功 (6 個章節)
處理中: ABBV 2015... ✓ 成功 (5 個章節)
處理中: ABBV 2016... ✓ 成功 (5 個章節)
處理中: ABBV 2017... ✓ 成功 (5 個章節)
處理中: ABBV 2018... ✓ 成功 (5 個章節)
處理中: ABBV 2019... ✓ 成功 (5 個章節)
處理中: ABBV 2020... ✓ 成功 (5 個章節)
處理中: ABBV 2021... ✓ 成功 (5 個章節)
處理中: ABBV 2022... ✓ 成功 (5 個章節)
處理中: ABBV 2023... ✓ 成功 (5 個章節)
處理中: ABBV 2024... ✓ 成功 (6 個章節)
處理中: ABT 2015... ✓ 成功 (5 個章節)
處理中: ABT 2016... ✓ 成功 (5 個章節)
處理中: ABT 2017... ✓ 成功 (5 個章節)
處理中: ABT 2018... ✓ 成功 (6 個章節)
處理中: ABT 2019... ✓ 成功 (5 個章節)
處理中: ABT 2020... ✓ 成功 (5 個章節)
處理中: ABT 2021... ✓ 成功 (5 個章節)
處理中: ABT 2022... 

In [5]:
# 統計分析
print("\n=== 處理結果統計 ===")
print(f"總處理檔案數: {processed_count}")

# 統計各章節的出現次數
section_counts = {}
for result in results_summary:
    for section in result["sections"]:
        section_counts[section] = section_counts.get(section, 0) + 1

print(f"\n各章節出現次數:")
for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
    percentage = (count / processed_count * 100) if processed_count > 0 else 0
    print(f"  {section:25s}: {count:4d} ({percentage:5.1f}%)")

# 統計各公司的報告數量
company_counts = {}
for result in results_summary:
    company = result["company"]
    company_counts[company] = company_counts.get(company, 0) + 1

print(f"\n各公司報告數量 (前 10 個):")
for company, count in sorted(company_counts.items(), key=lambda x: -x[1])[:10]:
    print(f"  {company:10s}: {count} 份報告")


=== 處理結果統計 ===
總處理檔案數: 100

各章節出現次數:
  item_1                   :  100 (100.0%)
  item_1a                  :  100 (100.0%)
  item_7                   :  100 (100.0%)
  item_9a                  :  100 (100.0%)
  item_7a                  :   80 ( 80.0%)
  cybersecurity            :   16 ( 16.0%)
  item_1c                  :    8 (  8.0%)
  esg_sustainability       :    4 (  4.0%)

各公司報告數量 (前 10 個):
  A         : 10 份報告
  AAPL      : 10 份報告
  ABBV      : 10 份報告
  ABT       : 10 份報告
  ACGL      : 10 份報告
  ACN       : 10 份報告
  ADBE      : 10 份報告
  ADI       : 10 份報告
  ADM       : 10 份報告
  ADP       : 10 份報告


In [6]:
# 檢視範例輸出
output_files = list(OUT_DIR.glob("*.json"))
if output_files:
    sample_file = output_files[0]
    print(f"\n檢視範例檔案: {sample_file.name}")
    print("="*60)
    
    with open(sample_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    print(f"公司: {data.get('company', 'N/A')}")
    print(f"年份: {data.get('year', 'N/A')}")
    print(f"來源: {data.get('source_path', 'N/A')}")
    print(f"\n包含的章節:")
    
    for key in ['item_1', 'item_1a', 'item_1c', 'item_7', 'item_7a', 'item_9a', 
                'cybersecurity', 'information_security', 'esg_sustainability']:
        content = data.get(key, "")
        if content and len(content.strip()) > 0:
            preview = content[:200].replace('\n', ' ')
            print(f"\n  [{key}]: {len(content)} 字元")
            print(f"    預覽: {preview}...")
else:
    print("\n沒有找到輸出檔案")


檢視範例檔案: adm_2015.json
公司: ADM
年份: 2015
來源: ADM/10-K/0000007084-15-000005/primary-document.html

包含的章節:

  [item_1]: 4467 字元
    預覽: Item 1. BUSINESS (Continued) The Company is working with the U.S. Department of Energy’s National Energy Technology Laboratory and other key academic and corporate partners on projects to demonstrate ...

  [item_1a]: 5469 字元
    預覽: Item 1A. RISK FACTORS (Continued) The Company’s business is capital-intensive in nature and the Company relies on cash generated from its operations and external financing to fund its growth and ongoi...

  [item_7]: 6066 字元
    預覽: Item 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS (Continued) Oilseeds Processing operating profit increased  $132 million  to  $1.6 billion . Included in t...

  [item_7a]: 4099 字元
    預覽: Item 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK The market risk inherent in the Company’s market risk sensitive instruments and positions is the p

In [7]:
# 儲存處理摘要
summary_file = OUT_DIR / "_processing_summary.json"
summary_data = {
    "total_processed": processed_count,
    "total_errors": error_count,
    "section_counts": section_counts,
    "company_counts": company_counts,
    "results": results_summary
}

summary_file.write_text(
    json.dumps(summary_data, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print(f"\n處理摘要已儲存至: {summary_file}")


處理摘要已儲存至: /home/wa1ter/RAG-ESG-DRI/data/10k_cleaned/_processing_summary.json
