In [6]:
# 檢測PDF字體屬性
import fitz

def detect_font_properties(pdf_path):
    doc = fitz.open(pdf_path)
    font_properties = {}

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if "lines" in b:
                for line in b["lines"]:
                    for span in line["spans"]:
                        font = span["font"]
                        size = span["size"]
                        text = span["text"]

                        # 在字典中建立字體屬性的鍵值對
                        if font not in font_properties:
                            font_properties[font] = {"sizes": set(), "texts": set()}
                        
                        font_properties[font]["sizes"].add(size)
                        font_properties[font]["texts"].add(text)

    doc.close()
    return font_properties

# 檢測 PDF 文件的字體屬性
pdf_path = r'C:\Users\User\Dropbox\textmining1\test_PDF\case6.pdf'
font_properties = detect_font_properties(pdf_path)

# 輸出字體屬性
for font, properties in font_properties.items():
    print("Font:", font)
    print("Sizes:", properties["sizes"])
    print("Texts:", properties["texts"])
    print()


Font: Helvetica
Sizes: {8.0, 9.0, 10.0, 6.0}
Texts: {'Page 2 of 14', 'shall be deemed guilty of a misdemeanor, and, on conviction thereof, shall be punished by fine not exceeding fifty thousand ', ', set forth in note 11, ', 'MONOPOLIES, AND UNFAIR TRADE PRACTICES §20  > exemptions -- agricultural associations -- purpose. --  > Headnote:', 'consent of the parties, considered and decided this ', ' charge. ', 'conspired with Embassy Dairy and others to eliminate and foreclose competition in the same milk market area, in ', 'unduly enhanced by reason thereof [after a "show cause" hearing he may direct] such association to cease and desist from ', '* ', ').', 'entities. As the House Report on the Capper-Volstead Act said:', "] [4]In the early 1900's, when agricultural cooperatives were growing in effectiveness, there was ", ' a monopoly in violation of § 7 of the Clayton Act. ', ' the cause for future orders, including the right of visitation if deemed appropriate.  See  ', "defendant's co

**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> **target_font="Helvetica", size1=6.0, size2=9.0**<br>
於是寫一個Function抓取目標文本字體 **"Helvetica"** 且開頭為 **6.0** 接著內容為 **9.0** 就可以抓取所有Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" 
+ 9.0：footnotes內容

In [15]:
import fitz  # PyMuPDF
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    doc = fitz.open(pdf_path)
    csv_rows = []
    collecting = False  # 標記是否開始收集footnotes文本
    footnote_text = ""  # 累積footnotes文本

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if "lines" in b:
                for line_num, line in enumerate(b["lines"]):
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        font_name = span.get('font', '')
                        size = span.get("size", 0)
                        text = span.get("text", "")

                        if collecting:
                            # 檢查是否停止收集footnotes文本
                            if i < len(spans) - 1:
                                next_text = spans[i + 1]["text"]
                                if text.endswith(".") and re.match(r'^[A-Z]', next_text):
                                    # 結束的條件
                                    csv_rows.append([page_num + 1, footnote_text])
                                    collecting = False
                                    footnote_text = ""
                                    continue
                            footnote_text += " " + text

                        # 檢查footnotes開始抓取的條件
                        if not collecting and font_name == target_font and size == size1 and (i < len(spans) - 1 and spans[i + 1].get("size", 0) == size2):
                            collecting = True  # 開始收集footnotes文本
                            footnote_text = text + " " + spans[i + 1]['text'].strip()

                    # 處理每行結束時的情況
                    if collecting and line_num == len(b["lines"]) - 1:
                        # 行结束且正在收集footnotes，保存footnotes
                        csv_rows.append([page_num + 1, footnote_text])
                        collecting = False
                        footnote_text = ""

    if collecting:
        # 確保最後一個footnotes文本被保存
        csv_rows.append([page_num + 1, footnote_text])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

    doc.close()

# 指定 PDF 路徑和 CSV 路徑
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\footnotes.csv'

# 提取footnotes寫入 CSV
extract_footnotes(pdf_file_path, csv_file_path)


In [4]:
# TEST
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    doc = fitz.open(pdf_path)
    csv_rows = []
    collecting = False
    post_colon = False
    footnote_text = ""

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if "lines" in b:
                for line in b["lines"]:
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        font_name = span.get('font', '')
                        size = span.get("size", 0)
                        text = span.get("text", "")

                        # Check if current span matches the initial footnote criteria
                        if not collecting and font_name == target_font and size == size1:
                            collecting = True
                            footnote_text = text
                        elif collecting:
                            # Special handling after encountering a colon
                            if ":" in text:
                                post_colon = True
                            
                            # Append text if post-colon or matching footnote format
                            if post_colon or (font_name == target_font and size in [size1, size2]):
                                footnote_text += " " + text
                            
                            # Check conditions to end post-colon collection
                            if post_colon and ("\"" in text or "HN" in text or "LEdHN" in text or (font_name == target_font and size == size1)):
                                post_colon = False
                            
                            # Check for end of footnote collection
                            if not post_colon and (i == len(spans) - 1 or text.endswith(".")):
                                csv_rows.append([page_num + 1, footnote_text.strip()])
                                collecting = False
                                footnote_text = ""
                    
                    # Finalize collecting if at the end of a block and still collecting
                    if collecting and not post_colon:
                        csv_rows.append([page_num + 1, footnote_text.strip()])
                        collecting = False
                        footnote_text = ""

    # Handle any ongoing collection at the document's end
    if collecting:
        csv_rows.append([page_num + 1, footnote_text.strip()])

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

    doc.close()

pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\footnotes.csv'

# Extract footnotes and write to CSV
extract_footnotes(pdf_file_path, csv_file_path)


NameError: name 'extract_footnotes' is not defined

**可以抓到全部, 但是很多不必要的東西, 也許可以用原本的Code 抓到標號和星號 然後去和抓到所有文本的 做對照 再做清理**

In [3]:
# FULL footnotes
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path):
    doc = fitz.open(pdf_path)
    csv_rows = []
    collecting = False
    footnote_text = ""

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" in b:
                for line in b["lines"]:
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        if collecting and re.search(r'^\s*("[^"]+"|HN\d+\[|\d+)', text):
                            csv_rows.append([page_num + 1, footnote_text.strip()])
                            collecting = False
                            footnote_text = ""
                        if not collecting and re.search(r'^\s*("[^"]+"|HN\d+\[|\d+)', text):
                            collecting = True
                            footnote_text += " " + text
                        elif collecting:
                            footnote_text += " " + text
        if collecting:
            csv_rows.append([page_num + 1, footnote_text.strip()])
            collecting = False
            footnote_text = ""

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

    doc.close()

pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining\FN\footnotes.csv'
extract_footnotes(pdf_file_path, csv_file_path)


## TEST

In [13]:
import fitz  # PyMuPDF
import re

def extract_and_print_cases(pdf_path):
    doc = fitz.open(pdf_path)
    collecting = False  # 标记是否开始收集案例文本
    case_text = ""  # 累积案例文本

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" in b:
                for line in b["lines"]:
                    spans = line["spans"]
                    for span in spans:
                        text = span.get("text", "")
                        if collecting and "End of Document" in text:
                            # 当遇到“End of Document”，结束当前案例的收集并打印
                            print(case_text.strip())
                            print("---- Case End ----\n")  # 案例结束的标记
                            collecting = False
                            case_text = ""
                        elif "Core Terms" in text and not collecting:
                            # 当遇到“Core Terms”，开始新的案例收集
                            collecting = True
                            case_text += text
                        elif collecting:
                            case_text += " " + text
        if collecting:
            # 确保最后一个案例文本被打印
            print(case_text.strip())
            print("---- Case End ----\n")
            collecting = False
            case_text = ""

    doc.close()

# 指定PDF路径
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'

# 提取案例并打印
extract_and_print_cases(pdf_file_path)


Core Terms contests, Baseball, boxing, promoters, championship, commerce, rights, exhibition, television, motion picture,  exemption, sport, interstate, negotiate, boxers, radio, heavyweight, broadcast, bout, anti trust law, Sherman Act,  Sherman Law, allegations, enterprises, arrange, fight, radio and television, exclusive right, sale of tickets, stare  decisis LexisNexis® Headnotes Antitrust & Trade Law > Sherman Act > General Overview HN1 [ ]   Antitrust & Trade Law, Sherman Act See  15 U.S.C.S. § 1 . Antitrust & Trade Law > Sherman Act > General Overview HN2 [ ]   Antitrust & Trade Law, Sherman Act See  15 U.S.C.S. § 2 . Antitrust & Trade Law > Regulated Industries > Sports > Baseball Antitrust & Trade Law > Regulated Industries > Sports > General Overview Antitrust & Trade Law > Sherman Act > General Overview
---- Case End ----

Core Terms championship, boxing, contests, decree, promotion, Square, divestiture, conspiracy, stock, antitrust, Stadium,  arenas, fights, staged, dissolu