In [5]:
# 檢測PDF字體屬性
import fitz

def detect_font_properties(pdf_path):
    """Collect the sizes and text snippets used by each font in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict mapping each font name to ``{"sizes": set, "texts": set}``,
        where the sets hold every span size and span text seen in that font.
    """
    font_properties = {}

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry (image blocks in PyMuPDF's
                # dict output) carry no spans to inspect.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        # Create the per-font entry on first sight.
                        entry = font_properties.setdefault(
                            span["font"], {"sizes": set(), "texts": set()}
                        )
                        entry["sizes"].add(span["size"])
                        entry["texts"].add(span["text"])

    return font_properties

# Inspect the font properties of one sample PDF.
pdf_path = r"/Users/tangjiahong/Dropbox/textmining/PDF/Files (35).PDF"
font_properties = detect_font_properties(pdf_path)

# Report every font with the sizes and texts recorded for it,
# leaving one blank line between fonts.
for font, properties in font_properties.items():
    print(f"Font: {font}")
    print(f"Sizes: {properties['sizes']}")
    print(f"Texts: {properties['texts']}")
    print()


Font: Helvetica
Sizes: {8.0, 9.0, 10.0, 6.0}

Font: Helvetica-Bold
Sizes: {8.0, 9.0, 10.0, 12.0, 14.0}
Texts: {' [**1329] ', ' [***845] ', ' [**1287] ', ' [*467] ', ' [**996] ', ' [****33] ', ' [*2289] ', ' [*580] ', ' [**230] ', ' [*641] ', ' [**1629] ', ' [**1652] ', ' [**2722] ', ' [****32] ', ' [1946] ', ' [**820] ', ' [*241] ', ' [*923] ', 'Andrew J. Pincus', ' [**1605] ', ' Trade Practices & Unfair Competition, Federal Trade Commission Act', ' [**1988] ', ' [1971] ', ' [*444] ', ' [**1758] ', ' [*578] ', ' [***234] ', ' [*45] ', ' [*2293] ', ' [*2295] ', ' [**2730] ', ' [***501] ', ' [***73] ', ' [****112] ', ' [*631] ', ' [****129] ', ' [*634] ', ' [***846] ', ' [**292] ', ' [*448] ', ' Federal Government, Claims By & Against', ' [*533] ', ' Sherman Act, Claims', ' [*374] ', ' [*2278] ', ' [*560] ', ' [***163] ', ' [**1001] ', ' [*262] ', ' [1972] ', ' [***49] ', ' [***165] ', ' [**693] ', ' [**805] ', ' Legislation, Statutory Remedies & Rights', ' Abuse of Discretion, Evidence'

**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> 

**target_font="Helvetica", size1=6.0, size2=9.0**<br>

於是寫一個Function抓取目標文本字體 **"Helvetica"** 且開頭為 **6.0** 接著內容為 **9.0** 就可以抓取所有Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" <br>

+ 9.0：footnotes內容


In [31]:
import fitz  # PyMuPDF
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes from a PDF by font/size pattern and write them to a CSV.

    A footnote starts at a span in ``target_font`` with size ``size1`` (the
    footnote marker) that is directly followed, on the same line, by a span of
    size ``size2`` (the footnote body). Collection then continues span by span
    until either:

    * a span ends with "." and the next span on the same line starts with an
      uppercase ASCII letter, or
    * the last line of the current text block has been processed.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write. It gets a ``Page``/``Footnote``
            header followed by one row per footnote (1-based page number).
        target_font: Font name the footnote marker span must have.
        size1: Font size of the footnote marker span.
        size2: Font size the span right after the marker must have.

    Returns:
        None. The result is written to ``csv_path``.
    """
    csv_rows = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue

                # A footnote never outlives its block (it is flushed after the
                # block's last line), so the state can be local to the block.
                collecting = False
                parts = []

                for line in block["lines"]:
                    spans = line["spans"]
                    # Index of the body span that was already stored when the
                    # footnote started on this line; it must not be added twice.
                    prefilled = None

                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        next_span = spans[i + 1] if i + 1 < len(spans) else None

                        if collecting:
                            if i != prefilled:
                                parts.append(text)
                            # The sentence-ending span belongs to the footnote,
                            # so it is stored before the footnote is closed.
                            if (
                                next_span is not None
                                and text.endswith(".")
                                and re.match(r'^[A-Z]', next_span.get("text", ""))
                            ):
                                csv_rows.append([page_no, " ".join(parts)])
                                collecting = False
                                parts = []
                            continue

                        # Start condition: marker span followed by a body span.
                        if (
                            span.get("font", "") == target_font
                            and span.get("size", 0) == size1
                            and next_span is not None
                            and next_span.get("size", 0) == size2
                        ):
                            collecting = True
                            parts = [text, next_span.get("text", "").strip()]
                            prefilled = i + 1

                # End of block: save whatever footnote is still open.
                if collecting:
                    csv_rows.append([page_no, " ".join(parts)])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and the CSV file that receives the extracted footnotes.
pdf_file_path = r"/Users/tangjiahong/Dropbox/textmining/PDF/Files (35).PDF"
csv_file_path = r"/Users/tangjiahong/Dropbox/textmining/FN/footnotes_text.csv"

# Run the extraction; the footnote rows are written to csv_file_path.
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


**目前問題**<br>

+ 有些註腳沒有數字標號也沒有符號開頭，無法偵測抓取<br>

+ 有註腳包含了不符合預設註腳字體大小、字型的設定：HN\d+[] 導致其後文本未被抓取直接跳過

**註解**

+ csv_rows = [] 存取提取的FN文本及頁碼
+ collecting = False 標記是否正在收集FN
+ footnote_text = "" 臨時儲存正在收集的FN
+ for page_num in range(len(doc)) scan每一頁
+ page = doc.load_page(page_num)加載當前頁碼頁面
+ blocks = page.get_text("dict")["blocks"] 獲取當頁文本塊，組織成字典存在blocks中
+ for b in blocks scan當頁所有文本塊
+ if "lines" in b 檢查文本塊中有無包含"lines"，有則表示該區塊為文字區塊（圖片區塊沒有"lines"），才可能含有FN
+ for line_num, line in enumerate(b["lines"]) scan當前文本塊中的每一行，line_num為行的索引
+ spans = line["spans"] 獲取文本所有片段
+ for i, span in enumerate(spans) scan所有片段
+ font_name = span.get('font','')獲取片段字體名稱
+ size = span.get("size",0)獲取片段字體大小
+ text = span.get("text","") 獲取片段文本<br>

+ if collecting 如果當前正在收集FN

    + 檢查當前片段是否不是該行的最後一個片段（即後面還有片段），若是則取得下一個片段的文本<br>
    
    + 若片段以句號結尾，且下一段以大寫字母開頭，表示結束。
    + 將收集好的FN存入csv_rows停止收集並重置
    + 若FN未結束，將當前片段添加至FN文本
+ if not collecting ... 若尚未開始收集，且當前片段字體為target_font、大小為size1，同時同一行的下一個片段大小為size2，則開始收集FN
+ if collecting and line_num == len(b["lines"]) - 1 若正在收集且已到達當前文本塊的最後一行，將FN加到csv_rows並停止收集。

In [1]:
import fitz 
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes from a PDF by font/size pattern and write them to a CSV.

    A footnote starts at a span in ``target_font`` with size ``size1`` (the
    footnote marker) that is directly followed, on the same line, by a span of
    size ``size2`` (the footnote body). Collection then continues span by span
    until either:

    * a span ends with "." and the next span on the same line starts with an
      uppercase ASCII letter, or
    * the last line of the current text block has been processed.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write. It gets a ``Page``/``Footnote``
            header followed by one row per footnote (1-based page number).
        target_font: Font name the footnote marker span must have.
        size1: Font size of the footnote marker span.
        size2: Font size the span right after the marker must have.

    Returns:
        None. The result is written to ``csv_path``.
    """
    csv_rows = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue

                # A footnote never outlives its block (it is flushed after the
                # block's last line), so the state can be local to the block.
                collecting = False
                parts = []

                for line in block["lines"]:
                    spans = line["spans"]
                    # Index of the body span that was already stored when the
                    # footnote started on this line; it must not be added twice.
                    prefilled = None

                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        next_span = spans[i + 1] if i + 1 < len(spans) else None

                        if collecting:
                            if i != prefilled:
                                parts.append(text)
                            # The sentence-ending span belongs to the footnote,
                            # so it is stored before the footnote is closed.
                            if (
                                next_span is not None
                                and text.endswith(".")
                                and re.match(r'^[A-Z]', next_span.get("text", ""))
                            ):
                                csv_rows.append([page_no, " ".join(parts)])
                                collecting = False
                                parts = []
                            continue

                        # Start condition: marker span followed by a body span.
                        if (
                            span.get("font", "") == target_font
                            and span.get("size", 0) == size1
                            and next_span is not None
                            and next_span.get("size", 0) == size2
                        ):
                            collecting = True
                            parts = [text, next_span.get("text", "").strip()]
                            prefilled = i + 1

                # End of block: save whatever footnote is still open.
                if collecting:
                    csv_rows.append([page_no, " ".join(parts)])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and the CSV file that receives the extracted footnotes.
pdf_file_path = r"/Users/tangjiahong/Dropbox/textmining/PDF/Files (100).PDF"
csv_file_path = r"/Users/tangjiahong/Dropbox/textmining/FN/footnotes_text1.csv"

# Run the extraction; the footnote rows are written to csv_file_path.
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)
