In [None]:
# 檢測PDF字體屬性
import fitz

def detect_font_properties(pdf_path):
    """Collect the font sizes and span texts used by every font in a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        A dict keyed by font name. Each value is a dict holding two sets:
        ``"sizes"`` (every size seen for that font) and ``"texts"`` (the text
        of every span rendered in that font).
    """
    font_properties = {}

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks that have no "lines" entry are skipped.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        # Create the per-font entry the first time a font is seen.
                        entry = font_properties.setdefault(
                            span["font"], {"sizes": set(), "texts": set()}
                        )
                        entry["sizes"].add(span["size"])
                        entry["texts"].add(span["text"])

    return font_properties

# Inspect the font properties of one PDF file
pdf_path = r"C:\Users\User\Dropbox\textmining\PDF\Files (100).PDF"
font_properties = detect_font_properties(pdf_path)

# Print the sizes and texts recorded for each font, one blank line between fonts
for font in font_properties:
    properties = font_properties[font]
    print("Font:", font)
    print("Sizes:", properties["sizes"])
    print("Texts:", properties["texts"])
    print()


**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> **target_font="Helvetica", size1=6.0, size2=9.0**<br>
因此撰寫一個 Function：當文字的字體為 **"Helvetica"**，且以大小 **6.0** 的標號開頭、後面緊接大小 **9.0** 的內容時，即視為一則 footnote，藉此抓取所有 Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" 
+ 9.0：footnotes內容

In [15]:
import fitz  # PyMuPDF
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes from a PDF by font/size pattern and write them to a CSV.

    A footnote starts at a span whose font is ``target_font`` and whose size is
    ``size1`` (the marker) when the next span on the same line has size
    ``size2`` (the body). Collection ends when a span ends with "." and the
    next span on the line starts with an uppercase ASCII letter, or when the
    enclosing text block ends.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write (overwritten, UTF-8).
        target_font: Font name the marker span must use.
        size1: Font size of the marker span (compared with ``==``).
        size2: Font size the span following the marker must have.

    Returns:
        None. The CSV gets a ``Page, Footnote`` header followed by one row per
        footnote, where ``Page`` is the 1-based page number.
    """
    starts_with_upper = re.compile(r"^[A-Z]").match
    csv_rows = []
    collecting = False  # True while the spans of a footnote are being gathered
    footnote_text = ""  # Text accumulated for the footnote in progress

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks that have no "lines" entry are skipped.
                for line in block.get("lines", ()):
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        next_span = spans[i + 1] if i + 1 < len(spans) else None

                        if collecting:
                            # Append first so the span that closes the footnote
                            # is part of the saved text.
                            footnote_text += " " + text
                            if (
                                next_span is not None
                                and text.endswith(".")
                                and starts_with_upper(next_span.get("text", ""))
                            ):
                                csv_rows.append([page_number, footnote_text])
                                collecting = False
                                footnote_text = ""
                            # A span handled while collecting is never also
                            # tested as the start of a new footnote.
                            continue

                        if (
                            next_span is not None
                            and span.get("font", "") == target_font
                            and span.get("size", 0) == size1
                            and next_span.get("size", 0) == size2
                        ):
                            collecting = True
                            # Seed with the marker only: the body span is added
                            # on its own iteration, so it is stored exactly once.
                            footnote_text = text

                # A footnote never continues past the end of its text block.
                if collecting:
                    csv_rows.append([page_number, footnote_text])
                    collecting = False
                    footnote_text = ""

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV locations
pdf_file_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF"
csv_file_path = r"C:\Users\User\Dropbox\textmining1\FN\footnotes.csv"

# Extract the footnotes and write them to the CSV
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


In [4]:
# TEST
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnote-like text from a PDF, with special handling after a colon.

    Collection starts at a span in ``target_font`` with size ``size1``. While
    collecting, a span is appended when it uses ``target_font`` at size
    ``size1`` or ``size2``, or unconditionally while in "post-colon" mode.
    Post-colon mode begins at a span containing ":" and ends at a span that
    contains a double quote, contains "HN", or is itself a ``size1`` span in
    ``target_font``. Outside post-colon mode the footnote is saved at a span
    ending with ".", or at the end of the line.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write (overwritten, UTF-8).
        target_font: Font name used by footnote spans.
        size1: Font size of the span that opens a footnote.
        size2: Second font size accepted for footnote text.

    Returns:
        None. The CSV gets a ``Page, Footnote`` header followed by one row per
        saved footnote, where ``Page`` is the 1-based page number.
    """
    csv_rows = []
    collecting = False
    post_colon = False
    footnote_text = ""
    page_number = 0  # Stays 0 only when the document has no pages

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks that have no "lines" entry are skipped.
                for line in block.get("lines", ()):
                    spans = line["spans"]
                    last_index = len(spans) - 1
                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        size = span.get("size", 0)
                        in_target_font = span.get("font", "") == target_font
                        is_marker = in_target_font and size == size1

                        if not collecting:
                            # Only a marker span can open a footnote.
                            if is_marker:
                                collecting = True
                                footnote_text = text
                            continue

                        # A colon switches to unconditional collection.
                        if ":" in text:
                            post_colon = True

                        if post_colon or (in_target_font and size in (size1, size2)):
                            footnote_text += " " + text

                        # "HN" also covers any text containing "LEdHN".
                        if post_colon and ("\"" in text or "HN" in text or is_marker):
                            post_colon = False

                        if not post_colon and (i == last_index or text.endswith(".")):
                            csv_rows.append([page_number, footnote_text.strip()])
                            collecting = False
                            footnote_text = ""

                    # End of the line: save a footnote that is still open unless
                    # post-colon mode is keeping it open across lines.
                    if collecting and not post_colon:
                        csv_rows.append([page_number, footnote_text.strip()])
                        collecting = False
                        footnote_text = ""

    # A footnote still open at the end of the document is attributed to the last page.
    if collecting:
        csv_rows.append([page_number, footnote_text.strip()])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV locations
pdf_file_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF"
csv_file_path = r"C:\Users\User\Dropbox\textmining1\FN\footnotes.csv"

# Run the extraction; the result is written to the CSV file
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


NameError: name 'extract_footnotes' is not defined

**這個版本可以抓到全部的 footnotes，但也夾雜了很多不必要的內容。也許可以先用原本的 Code 抓出標號和星號，再與這裡抓到的全部文本做對照，最後進行清理。**

In [3]:
# FULL footnotes
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path):
    """Split each page's text into segments and write them to a CSV.

    A new segment starts at every span whose text begins (after optional
    whitespace) with a double-quoted string, an ``HN<digits>[`` tag, or
    digits. Following spans are appended to the open segment until the next
    such span or the end of the page. Text before the first matching span of
    a page is ignored.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write (overwritten, UTF-8).

    Returns:
        None. The CSV gets a ``Page, Footnote`` header followed by one row per
        segment, where ``Page`` is the 1-based page number.
    """
    # Compiled once; the same pattern decides both where a segment ends and
    # where the next one begins.
    starts_segment = re.compile(r'^\s*("[^"]+"|HN\d+\[|\d+)').search
    csv_rows = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            parts = None  # Span texts of the open segment; None when none is open
            for block in page.get_text("dict")["blocks"]:
                # Blocks that have no "lines" entry are skipped.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        text = span.get("text", "")
                        if starts_segment(text):
                            # A matching span closes the open segment and opens a new one.
                            if parts is not None:
                                csv_rows.append([page_number, " ".join(parts).strip()])
                            parts = [text]
                        elif parts is not None:
                            parts.append(text)

            # Segments never continue onto the next page.
            if parts is not None:
                csv_rows.append([page_number, " ".join(parts).strip()])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV locations
# NOTE(review): the PDF is read from "textmining1" but the CSV is written under
# "textmining" — confirm that both folders are intended.
pdf_file_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF"
csv_file_path = r"C:\Users\User\Dropbox\textmining\FN\footnotes.csv"
extract_footnotes(pdf_path=pdf_file_path, csv_path=csv_file_path)


## TEST

In [12]:
import fitz  # PyMuPDF

def extract_footnotes(pdf_path):
    """Split a PDF's text into cases and look for footnotes in each one.

    The text of all pages is joined (one newline after each page) and split
    at every occurrence of "Core Terms" or "End of Document"; each resulting
    piece is treated as one case.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(case_number, footnotes)`` tuples, where ``case_number``
        starts at 1 and ``footnotes`` is a list of stripped strings.
    """
    # Imported locally: the import line preceding this function only brings in fitz.
    import re

    # Merge the text of the whole document into one string. The context
    # manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text() + "\n" for page in doc)

    # Use "Core Terms" and "End of Document" to cut the text into cases.
    case_texts = re.split(r"Core Terms|End of Document", full_text)

    cases_footnotes = []  # One (case_number, footnotes) tuple per case
    for case_number, case_text in enumerate(case_texts, start=1):
        # NOTE(review): placeholder lookup — it assumes each footnote follows a
        # literal "Footnote:" marker and runs to the next blank line or the end
        # of the case; adjust it to the real footnote format.
        footnotes_texts = re.findall(r'(?<=Footnote:).*?(?=\n\n|\Z)', case_text, re.DOTALL)
        cases_footnotes.append((case_number, [found.strip() for found in footnotes_texts]))

    return cases_footnotes

# Path of the PDF file to process
pdf_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (35).PDF"

# Run the extraction, then print every case followed by its numbered footnotes
cases_footnotes = extract_footnotes(pdf_path)
for case_entry in cases_footnotes:
    case_number, footnotes = case_entry
    print(f"Case {case_number}, Footnotes:")
    for i, footnote in enumerate(footnotes, start=1):
        print(f"  Footnote {i}: {footnote}")
    print("\n")


Case 1, Footnotes:


Case 2, Footnotes:


Case 3, Footnotes:


Case 4, Footnotes:


Case 5, Footnotes:


Case 6, Footnotes:


Case 7, Footnotes:


Case 8, Footnotes:


Case 9, Footnotes:


Case 10, Footnotes:


Case 11, Footnotes:


Case 12, Footnotes:


Case 13, Footnotes:


Case 14, Footnotes:


Case 15, Footnotes:


Case 16, Footnotes:


Case 17, Footnotes:


Case 18, Footnotes:


Case 19, Footnotes:


Case 20, Footnotes:


Case 21, Footnotes:


Case 22, Footnotes:


Case 23, Footnotes:


Case 24, Footnotes:


Case 25, Footnotes:


Case 26, Footnotes:


Case 27, Footnotes:


Case 28, Footnotes:


Case 29, Footnotes:


Case 30, Footnotes:


Case 31, Footnotes:


Case 32, Footnotes:


Case 33, Footnotes:


Case 34, Footnotes:


Case 35, Footnotes:


Case 36, Footnotes:


Case 37, Footnotes:


Case 38, Footnotes:


Case 39, Footnotes:


Case 40, Footnotes:


Case 41, Footnotes:


Case 42, Footnotes:


Case 43, Footnotes:


Case 44, Footnotes:


Case 45, Footnotes:


Case 46, Footnotes: