**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> **target_font="Helvetica", size1=6.0, size2=9.0**<br>
於是寫一個Function抓取目標文本字體 **"Helvetica"** 且開頭為 **6.0** 接著內容為 **9.0** 就可以抓取所有Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" 
+ 9.0：footnotes內容

In [15]:
import fitz  # PyMuPDF
import csv
import re  

def _footnotes_in_block(block, target_font, size1, size2):
    """Return the footnote strings found in one text block.

    ``block`` is one entry of ``page.get_text("dict")["blocks"]`` that has a
    ``"lines"`` key.  A footnote starts at a span set in ``target_font`` at
    size ``size1`` (the marker) that is directly followed, on the same line,
    by a span of size ``size2`` (the body).  It ends at a span whose text
    ends with "." while the next span on the line starts with a capital
    letter, or at the end of the block, whichever comes first.
    """
    footnotes = []
    parts = []  # text pieces of the footnote currently being collected
    collecting = False

    for line in block["lines"]:
        spans = line["spans"]
        # Index of the body span that was already stored when the footnote
        # started on this line; it must not be appended a second time.
        already_stored = -1

        for i, span in enumerate(spans):
            text = span.get("text", "")
            has_next = i < len(spans) - 1

            if collecting:
                if i != already_stored:
                    parts.append(text)
                # The closing span is kept (appended above) before the
                # footnote is emitted, so its last sentence is not lost.
                if (
                    has_next
                    and text.endswith(".")
                    and re.match(r'^[A-Z]', spans[i + 1].get("text", ""))
                ):
                    footnotes.append(" ".join(parts))
                    parts = []
                    collecting = False
                continue

            # Start condition: marker span followed by a body-sized span.
            # Sizes are compared exactly, as reported by PyMuPDF.
            if (
                has_next
                and span.get("font", "") == target_font
                and span.get("size", 0) == size1
                and spans[i + 1].get("size", 0) == size2
            ):
                collecting = True
                parts = [text, spans[i + 1].get("text", "").strip()]
                already_stored = i + 1

    # A footnote never continues past the end of its block.
    if collecting:
        footnotes.append(" ".join(parts))
    return footnotes


def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes from a PDF by font and size and write them to a CSV file.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV file to write; it gets a ``Page, Footnote``
            header followed by one row per footnote (pages are 1-based).
        target_font: Font name of the footnote marker span.
        size1: Font size of the footnote marker span.
        size2: Font size of the span that follows the marker.
    """
    csv_rows = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            blocks = doc.load_page(page_num).get_text("dict")["blocks"]
            for block in blocks:
                if "lines" not in block:
                    continue  # nothing to scan (e.g. image blocks)
                for footnote in _footnotes_in_block(block, target_font, size1, size2):
                    csv_rows.append([page_num + 1, footnote])
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV locations
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\footnotes.csv'

# Extract the footnotes and write them to the CSV file
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


**可以抓到全部 footnotes，但同時也抓到很多不必要的內容。也許可以先用原本的 code 抓到標號和星號，再和這裡抓到的所有文本做對照，最後進行清理。**

In [3]:
# FULL footnotes
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path):
    """Split the text of a PDF into marker-delimited chunks and write them to a CSV file.

    Spans are visited in the order PyMuPDF reports them.  A span whose text
    starts (after optional whitespace) with a double-quoted string,
    ``HN<digits>[`` or digits opens a new chunk; every following span is
    appended to that chunk until the next such span or the end of the page.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV file to write; it gets a ``Page, Footnote``
            header followed by one row per chunk (pages are 1-based).
    """
    # Compiled once instead of being re-evaluated twice for every span.
    marker = re.compile(r'^\s*("[^"]+"|HN\d+\[|\d+)')
    csv_rows = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            blocks = doc.load_page(page_num).get_text("dict")["blocks"]
            chunk = None  # span texts of the open chunk; None while idle

            for block in blocks:
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text = span.get("text", "")
                        if marker.match(text):
                            # A new marker closes the chunk that was open.
                            if chunk is not None:
                                csv_rows.append([page_num + 1, " ".join(chunk).strip()])
                            chunk = [text]
                        elif chunk is not None:
                            chunk.append(text)

            # Chunks never continue onto the next page.
            if chunk is not None:
                csv_rows.append([page_num + 1, " ".join(chunk).strip()])
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV locations.
# NOTE(review): the CSV is written under "textmining" while the PDF is read
# from "textmining1" - confirm that this output directory is intended.
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining\FN\footnotes.csv'

# Extract the chunks and write them to the CSV file
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


## 最終版（沒發現問題）

**發現 Black Lines**
+ 針對黑線和文本塊進行歸類

    + 能辨識Footnotes上面的黑色線條

    + 文本塊進行排列

+ 若無符號開頭的文本視為上一個符號的內容

    + 設定若文本塊前面沒符號歸類在上一個符號

    + 注意：files35 第 20 頁中以「3,」開頭的文本塊會被視為一個新的標號
    

In [1]:
import fitz
import csv
import re

def _is_separator_line(item, x_range, color, width_range):
    """Return True if a ``page.get_drawings()`` entry is a matching stroked line."""
    if item['type'] != 's' or item['color'] != color:
        return False
    if not (width_range[0] <= item['width'] <= width_range[1]):
        return False
    segments = item['items']
    # Only a straight line ("l", start, end) has point endpoints; rectangles,
    # quads and curves carry other payloads and are never separator lines.
    if not segments or segments[0][0] != 'l':
        return False
    start, end = segments[0][1], segments[0][2]
    return (
        x_range[0] <= start.x <= x_range[1]
        and x_range[0] <= end.x <= x_range[1]
    )


def extract_text_below_lines(pdf_path, csv_path, x_range, color, width_range):
    """Write the text found below each page's lowest separator line to a CSV file.

    On every page the stroked lines matching ``color``, ``width_range`` and
    ``x_range`` are located.  Text blocks whose top edge lies below the lowest
    such line are read top to bottom.  A block starting with digits, ``*`` or
    ``+`` opens a new labelled entry; a block without such a prefix is
    appended to the entry before it.  Pages without a matching line produce
    no rows.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV file to write; it gets a
            ``Page, Label, Text`` header (pages are 1-based).
        x_range: ``(min_x, max_x)`` that both endpoints of a line must fall in.
        color: Stroke color a line must have, compared for equality.
        width_range: ``(min_width, max_width)`` for the stroke width of a line.
    """
    label_pattern = re.compile(r'^[\d\*\+]+')

    doc = fitz.open(pdf_path)
    try:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Page', 'Label', 'Text'])

            for page_num in range(len(doc)):
                page = doc[page_num]

                line_ys = [
                    item['items'][0][1].y
                    for item in page.get_drawings()
                    if _is_separator_line(item, x_range, color, width_range)
                ]
                if not line_ys:
                    continue
                lowest_line_y = max(line_ys)

                last_label = None
                accumulated_text = ''

                for block in sorted(page.get_text('blocks'), key=lambda b: b[1]):
                    # Skip image blocks (their "text" is only a placeholder)
                    # and everything that is not below the separator line.
                    if block[6] != 0 or block[1] <= lowest_line_y:
                        continue

                    text = block[4].strip()
                    found = label_pattern.match(text)
                    if found:
                        # A new label closes the entry collected so far.
                        if accumulated_text:
                            writer.writerow([page_num + 1, last_label, accumulated_text])
                            accumulated_text = ''
                        last_label = found.group(0)
                        text = text[found.end():].strip()
                    accumulated_text += ' ' + text

                if accumulated_text:
                    writer.writerow([page_num + 1, last_label, accumulated_text])
    finally:
        # Release the document even when extraction or writing fails.
        doc.close()


# Input PDF and output CSV locations
pdf_file_path = r'/Users/tangjiahong/Dropbox/textmining1/PDF/Files (100).PDF'
csv_file_path = r'/Users/tangjiahong/Dropbox/textmining1/FN/FN_final_100.csv'

# Filters that select the separator line: allowed x extent of both endpoints,
# stroke color, and allowed stroke width
x_range = (50, 563)
color = (0.0, 0.0, 0.0)
width_range = (0.72, 0.73)

extract_text_below_lines(
    pdf_path=pdf_file_path,
    csv_path=csv_file_path,
    x_range=x_range,
    color=color,
    width_range=width_range,
)
