**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> **target_font="Helvetica", size1=6.0, size2=9.0**<br>
於是寫一個 Function：當文本的字體為 **"Helvetica"**、大小為 **6.0**（標號），且同一行緊接著的文本大小為 **9.0**（內容）時，視為 footnote 的開頭並開始抓取，藉此抓取所有 Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" 
+ 9.0：footnotes內容

In [15]:
import fitz  # PyMuPDF
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes recognised by font and size and write them to a CSV.

    A footnote starts at a span whose font is ``target_font`` and whose size
    is ``size1`` (the marker), provided the next span on the same line has
    size ``size2`` (the body). Every following span is appended until either

    * a span ending in "." is followed on the same line by a span starting
      with an upper-case ASCII letter, or
    * the enclosing text block ends.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based)
            and ``Footnote``.
        target_font: Font name the marker span must have.
        size1: Font size of the marker span.
        size2: Font size of the span that follows the marker.
    """
    csv_rows = []
    collecting = False  # True while spans are being appended to a footnote
    footnote_text = ""  # text of the footnote currently being collected

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        text = span.get("text", "")
                        next_span = spans[i + 1] if i < len(spans) - 1 else None

                        if collecting:
                            # Append first so the span that ends the footnote
                            # is kept instead of being dropped.
                            footnote_text += " " + text
                            if (next_span is not None
                                    and text.endswith(".")
                                    and re.match(r'^[A-Z]', next_span["text"])):
                                csv_rows.append([page_num + 1, footnote_text])
                                collecting = False
                                footnote_text = ""
                            continue

                        if (next_span is not None
                                and span.get("font", "") == target_font
                                and span.get("size", 0) == size1
                                and next_span.get("size", 0) == size2):
                            collecting = True
                            # Only the marker is stored here; the body span is
                            # appended on the next iteration, so it is no
                            # longer duplicated.
                            footnote_text = text

                # A footnote never continues past the end of its text block.
                if collecting:
                    csv_rows.append([page_num + 1, footnote_text])
                    collecting = False
                    footnote_text = ""
    finally:
        # Release the document even if extraction fails part-way.
        doc.close()

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV paths (machine-specific absolute paths).
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\footnotes.csv'

# Extract the footnotes and write them to the CSV, using the default
# font/size arguments of extract_footnotes.
extract_footnotes(pdf_file_path, csv_file_path)


**可以抓到全部 footnotes，但也夾雜很多不必要的內容；也許可以先用原本的 Code 抓出標號和星號，再與這份抓到所有文本的結果做對照，然後進行清理。**

In [3]:
# FULL footnotes
import fitz  # PyMuPDF
import csv
import re

def extract_footnotes(pdf_path, csv_path):
    """Split the PDF text into segments at marker-like spans and write a CSV.

    A span starts a new segment when its text begins (after optional
    whitespace) with a double-quoted string, ``HN<digits>[`` or digits.
    Following spans are appended to the open segment until the next such
    span or the end of the page.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based)
            and ``Footnote``.
    """
    # Compiled once; the same pattern both closes the open segment and
    # opens the next one.
    start_pattern = re.compile(r'^\s*("[^"]+"|HN\d+\[|\d+)')

    csv_rows = []
    collecting = False
    footnote_text = ""

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span.get("text", "")
                        if start_pattern.search(text):
                            if collecting:
                                # A new marker closes the segment in progress.
                                csv_rows.append([page_num + 1, footnote_text.strip()])
                            collecting = True
                            footnote_text = " " + text
                        elif collecting:
                            footnote_text += " " + text

            # Segments never continue onto the next page.
            if collecting:
                csv_rows.append([page_num + 1, footnote_text.strip()])
                collecting = False
                footnote_text = ""
    finally:
        # Release the document even if extraction fails part-way.
        doc.close()

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Input PDF and output CSV paths (machine-specific absolute paths).
# NOTE(review): the CSV path uses the folder "textmining" while every other
# path in this notebook uses "textmining1" — confirm this is intentional.
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining\FN\footnotes.csv'
# Run the regex-based extraction defined in this cell.
extract_footnotes(pdf_file_path, csv_file_path)


## 最終版（沒發現問題）

**發現 Black Lines**
+ 針對黑線和文本塊進行歸類

    + 能辨識Footnotes上面的黑色線條

    + 文本塊進行排列

+ 若無符號開頭的文本視為上一個符號的內容

    + 設定若文本塊前面沒符號歸類在上一個符號

    + files35_page20  3, 會被視為是一個新的標號
    

+ 1. 繪製邊界框 -> 找關鍵字的座標 -> 確認座標範圍文本 -> 看某個頁面的黑線座標 <br>

    + -> 設Function定義黑線座標X = [50, 563] , color = 黑色(0,0,0) , width = [0.72, 0.73] 和文本塊範圍 **(黑線以下)** <br> 

    + -> 設一個For迴圈遍歷資料夾及子資料夾的PDF檔案 

+ 2. Supreme法案文本結構不同 -> 發現文本塊為左右並排 **(一個頁面可能出現兩條線)** -> 分析每條線下方的文本塊 <br> 

    + -> 定義兩條黑線的範圍 -> 檢查兩條線下面的文本塊是否正確識別 -> 設定For迴圈遍歷特定子資料夾的PDF檔案 <br>

In [4]:
import fitz
import csv
import re

def extract_text_below_lines(pdf_path, csv_path, x_range, color, width_range):
    """Write the labelled text below each page's lowest separator line to a CSV.

    On every page, stroked straight lines are selected whose colour equals
    ``color``, whose stroke width lies in ``width_range`` and whose two end
    points both have an x coordinate inside ``x_range``. Text blocks whose
    top edge is below the lowest such line are read top to bottom. A block
    starting with digits, "*" or "+" opens a new entry labelled with those
    leading characters; any other block is appended to the current entry.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based),
            ``Label`` and ``Text``.
        x_range: ``(min_x, max_x)`` both line end points must fall in.
        color: Stroke colour the line must have, e.g. ``(0.0, 0.0, 0.0)``.
        width_range: ``(min_width, max_width)`` for the stroke width.
    """
    label_pattern = re.compile(r'^[\d\*\+]+')

    doc = fitz.open(pdf_path)
    try:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Page', 'Label', 'Text'])

            for page_num in range(len(doc)):
                page = doc[page_num]
                text_blocks = page.get_text('blocks')

                # y coordinate of the start point of every matching line.
                line_ys = []
                for item in page.get_drawings():
                    if item['type'] != 's' or item['color'] != color:
                        continue
                    if not width_range[0] <= item['width'] <= width_range[1]:
                        continue
                    commands = item['items']
                    # Only a straight-line command ("l", p1, p2) carries two
                    # points; a rectangle or quad command has no ``.x`` and
                    # would raise AttributeError below.
                    if not commands or commands[0][0] != 'l':
                        continue
                    line_start, line_end = commands[0][1], commands[0][2]
                    if (x_range[0] <= line_start.x <= x_range[1]
                            and x_range[0] <= line_end.x <= x_range[1]):
                        line_ys.append(line_start.y)

                last_label = None
                accumulated_text = ''

                if line_ys:
                    # The lowest line on the page is the one that counts.
                    last_line_y = max(line_ys)
                    for block in sorted(text_blocks, key=lambda b: b[1]):
                        if block[1] <= last_line_y:
                            continue
                        text = block[4].strip()
                        label_match = label_pattern.match(text)
                        if label_match:
                            # A new label closes the entry in progress.
                            if accumulated_text:
                                writer.writerow([page_num + 1, last_label, accumulated_text])
                                accumulated_text = ''
                            last_label = label_match.group()
                            text = text[label_match.end():].strip()
                        accumulated_text += ' ' + text

                if accumulated_text:
                    writer.writerow([page_num + 1, last_label, accumulated_text])
    finally:
        # Release the document even if extraction or writing fails.
        doc.close()

# Input PDF and output CSV paths (machine-specific absolute paths).
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (19).PDF'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\FN_Sup19.csv'

x_range = (50, 563)  # x-coordinate range both end points of a line must fall in
color = (0.0, 0.0, 0.0)  # stroke colour of the line (black)
width_range = (0.72, 0.73) # accepted stroke-width range of the line

# Extract the text below the separator line of every page into the CSV.
extract_text_below_lines(pdf_file_path, csv_file_path, x_range, color, width_range)


## 特殊處理：Supreme 並排文本
+ 成功比對左右並排文本塊 , 能正確識別線的正下方文本 , 不會誤抓線Y座標以下的左邊文本塊

In [12]:
def extract_text_strictly_below_lines(pdf_path, csv_path, x_range, color, width_range):
    """Write the labelled text directly below every separator line to a CSV.

    On every page, stroked straight lines are selected whose colour equals
    ``color``, whose stroke width lies in ``width_range`` and whose two end
    points both have an x coordinate inside ``x_range``. Each line is then
    handled on its own, top to bottom: a text block belongs to a line when
    its left edge lies within the line's horizontal extent and its top edge
    is below the line. A block starting with digits, "*" or "+" opens a new
    entry labelled with those leading characters; any other block is
    appended to the current entry.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based),
            ``Label`` and ``Text``.
        x_range: ``(min_x, max_x)`` both line end points must fall in.
        color: Stroke colour the line must have, e.g. ``(0.0, 0.0, 0.0)``.
        width_range: ``(min_width, max_width)`` for the stroke width.
    """
    label_pattern = re.compile(r'^[\d\*\+]+')

    doc = fitz.open(pdf_path)
    try:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Page', 'Label', 'Text'])

            for page_num in range(len(doc)):
                page = doc[page_num]
                text_blocks = page.get_text('blocks')

                # (y, left x, right x) of every matching line.
                segments = []
                for item in page.get_drawings():
                    if item['type'] != 's' or item['color'] != color:
                        continue
                    if not width_range[0] <= item['width'] <= width_range[1]:
                        continue
                    commands = item['items']
                    # Only a straight-line command ("l", p1, p2) carries two
                    # points; a rectangle or quad command has no ``.x`` and
                    # would raise AttributeError below.
                    if not commands or commands[0][0] != 'l':
                        continue
                    line_start, line_end = commands[0][1], commands[0][2]
                    if (x_range[0] <= line_start.x <= x_range[1]
                            and x_range[0] <= line_end.x <= x_range[1]):
                        segments.append((
                            line_start.y,
                            min(line_start.x, line_end.x),
                            max(line_start.x, line_end.x),
                        ))

                segments.sort(key=lambda segment: segment[0])

                for line_y, line_x0, line_x1 in segments:
                    last_label = None
                    accumulated_text = ''

                    for block in text_blocks:
                        block_x0, block_y0 = block[0], block[1]
                        if not (line_x0 <= block_x0 <= line_x1 and block_y0 > line_y):
                            continue
                        # Strip before matching so leading whitespace cannot
                        # hide a label and continuation text carries no
                        # trailing newline (as in extract_text_below_lines).
                        text = block[4].strip()
                        label_match = label_pattern.match(text)
                        if label_match:
                            # A new label closes the entry in progress.
                            if accumulated_text:
                                writer.writerow([page_num + 1, last_label, accumulated_text])
                                accumulated_text = ''
                            last_label = label_match.group()
                            text = text[label_match.end():].strip()
                        accumulated_text += ' ' + text

                    if accumulated_text:
                        writer.writerow([page_num + 1, last_label, accumulated_text])
    finally:
        # Release the document even if extraction or writing fails.
        doc.close()

# Input PDF and output CSV paths (machine-specific absolute paths).
pdf_file_path = r'C:\Users\User\Dropbox\textmining1\Data 2\9_14_1998-10_26_2004\914 (1).pdf'
csv_file_path = r'C:\Users\User\Dropbox\textmining1\FN\FN_914.csv'
x_range = (50, 563)  # x-coordinate range both line end points must fall in; adjust per PDF layout
color = (0.0, 0.0, 0.0)  # stroke colour of the line (black)
width_range = (0.72, 0.73)  # accepted stroke-width range of the line

# Extract the text directly below each separator line into the CSV.
extract_text_strictly_below_lines(pdf_file_path, csv_file_path, x_range, color, width_range)


## 遍歷資料夾及子資料夾的 PDF 檔案

In [4]:
import fitz
import csv
import re
import os
from pathlib import Path

def extract_text_below_lines(pdf_path, csv_path, x_range, color, width_range):
    """Write the labelled text below each page's lowest separator line to a CSV.

    On every page, stroked straight lines are selected whose colour equals
    ``color``, whose stroke width lies in ``width_range`` and whose two end
    points both have an x coordinate inside ``x_range``. Text blocks whose
    top edge is below the lowest such line are read top to bottom. A block
    starting with digits, "*" or "+" opens a new entry labelled with those
    leading characters; any other block is appended to the current entry.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based),
            ``Label`` and ``Text``.
        x_range: ``(min_x, max_x)`` both line end points must fall in.
        color: Stroke colour the line must have, e.g. ``(0.0, 0.0, 0.0)``.
        width_range: ``(min_width, max_width)`` for the stroke width.
    """
    label_pattern = re.compile(r'^[\d\*\+]+')

    doc = fitz.open(pdf_path)
    try:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Page', 'Label', 'Text'])

            for page_num in range(len(doc)):
                page = doc[page_num]
                text_blocks = page.get_text('blocks')

                # y coordinate of the start point of every matching line.
                line_ys = []
                for item in page.get_drawings():
                    if item['type'] != 's' or item['color'] != color:
                        continue
                    if not width_range[0] <= item['width'] <= width_range[1]:
                        continue
                    commands = item['items']
                    # Only a straight-line command ("l", p1, p2) carries two
                    # points; a rectangle or quad command has no ``.x`` and
                    # would raise AttributeError below.
                    if not commands or commands[0][0] != 'l':
                        continue
                    line_start, line_end = commands[0][1], commands[0][2]
                    if (x_range[0] <= line_start.x <= x_range[1]
                            and x_range[0] <= line_end.x <= x_range[1]):
                        line_ys.append(line_start.y)

                last_label = None
                accumulated_text = ''

                if line_ys:
                    # The lowest line on the page is the one that counts.
                    last_line_y = max(line_ys)
                    for block in sorted(text_blocks, key=lambda b: b[1]):
                        if block[1] <= last_line_y:
                            continue
                        text = block[4].strip()
                        label_match = label_pattern.match(text)
                        if label_match:
                            # A new label closes the entry in progress.
                            if accumulated_text:
                                writer.writerow([page_num + 1, last_label, accumulated_text])
                                accumulated_text = ''
                            last_label = label_match.group()
                            text = text[label_match.end():].strip()
                        accumulated_text += ' ' + text

                if accumulated_text:
                    writer.writerow([page_num + 1, last_label, accumulated_text])
    finally:
        # Release the document even if extraction or writing fails.
        doc.close()

def process_pdfs_with_text_below_lines(folder_path, output_csv_path, x_range, color, width_range):
    """Run ``extract_text_below_lines`` on every PDF under ``folder_path``.

    The tree is walked recursively. Each folder that contains at least one
    PDF gets a letter prefix (a, b, ..., z, then aa, ab, ...), and its PDFs
    are numbered from 1 in sorted name order. The result of each PDF is
    written to ``<prefix><number>_FN.csv`` inside ``output_csv_path``.

    Args:
        folder_path: Root folder to search for PDFs.
        output_csv_path: Folder that receives the CSV files; created when
            missing.
        x_range: Passed through to ``extract_text_below_lines``.
        color: Passed through to ``extract_text_below_lines``.
        width_range: Passed through to ``extract_text_below_lines``.
    """
    Path(output_csv_path).mkdir(parents=True, exist_ok=True)

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    subfolder_counter = 0

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk visit subfolders in a reproducible
        # order, so the same folder gets the same prefix on every run.
        dirs.sort()

        pdf_files = sorted(file for file in files if file.lower().endswith('.pdf'))
        if not pdf_files:
            # Folders without PDFs do not consume a prefix.
            continue

        # Spreadsheet-style prefix: a..z, aa, ab, ... A plain modulo would
        # reuse "a" for the 27th folder and overwrite earlier CSV files.
        prefix = ''
        remaining = subfolder_counter
        while True:
            remaining, letter_index = divmod(remaining, len(alphabet))
            prefix = alphabet[letter_index] + prefix
            if remaining == 0:
                break
            remaining -= 1

        for file_counter, file in enumerate(pdf_files, start=1):
            pdf_file_path = os.path.join(root, file)
            csv_file_name = f"{prefix}{file_counter}_FN.csv"
            csv_file_path = os.path.join(output_csv_path, csv_file_name)
            extract_text_below_lines(pdf_file_path, csv_file_path, x_range, color, width_range)

        subfolder_counter += 1

# Example usage: root folder to search and folder that receives the CSVs
# (machine-specific absolute paths).
data_folder_path = r'C:\Users\User\Dropbox\textmining1'
output_csv_folder_path = r'C:\Users\User\Dropbox\textmining1\FN'

x_range = (50, 563)  # x-coordinate range both line end points must fall in
color = (0.0, 0.0, 0.0)  # stroke colour of the line (black)
width_range = (0.72, 0.73) # accepted stroke-width range of the line

# Process every PDF found under the root folder and its subfolders.
process_pdfs_with_text_below_lines(data_folder_path, output_csv_folder_path, x_range, color, width_range)

MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF error: library error: zlib error: unknown compression method
MuPDF error: library error: FT_New_Memory_Face(Arial): unknown file format
MuPDF 

## 遍歷特定資料夾

In [6]:
import os 
import glob
import fitz
import csv
import re

def extract_text_strictly_below_lines(pdf_path, csv_path, x_range, color, width_range):
    """Write the labelled text directly below every separator line to a CSV.

    On every page, stroked straight lines are selected whose colour equals
    ``color``, whose stroke width lies in ``width_range`` and whose two end
    points both have an x coordinate inside ``x_range``. Each line is then
    handled on its own, top to bottom: a text block belongs to a line when
    its left edge lies within the line's horizontal extent and its top edge
    is below the line. A block starting with digits, "*" or "+" opens a new
    entry labelled with those leading characters; any other block is
    appended to the current entry.

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to write; columns are ``Page`` (1-based),
            ``Label`` and ``Text``.
        x_range: ``(min_x, max_x)`` both line end points must fall in.
        color: Stroke colour the line must have, e.g. ``(0.0, 0.0, 0.0)``.
        width_range: ``(min_width, max_width)`` for the stroke width.
    """
    label_pattern = re.compile(r'^[\d\*\+]+')

    doc = fitz.open(pdf_path)
    try:
        with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Page', 'Label', 'Text'])

            for page_num in range(len(doc)):
                page = doc[page_num]
                text_blocks = page.get_text('blocks')

                # (y, left x, right x) of every matching line.
                segments = []
                for item in page.get_drawings():
                    if item['type'] != 's' or item['color'] != color:
                        continue
                    if not width_range[0] <= item['width'] <= width_range[1]:
                        continue
                    commands = item['items']
                    # Only a straight-line command ("l", p1, p2) carries two
                    # points; a rectangle or quad command has no ``.x`` and
                    # would raise AttributeError below.
                    if not commands or commands[0][0] != 'l':
                        continue
                    line_start, line_end = commands[0][1], commands[0][2]
                    if (x_range[0] <= line_start.x <= x_range[1]
                            and x_range[0] <= line_end.x <= x_range[1]):
                        segments.append((
                            line_start.y,
                            min(line_start.x, line_end.x),
                            max(line_start.x, line_end.x),
                        ))

                segments.sort(key=lambda segment: segment[0])

                for line_y, line_x0, line_x1 in segments:
                    last_label = None
                    accumulated_text = ''

                    for block in text_blocks:
                        block_x0, block_y0 = block[0], block[1]
                        if not (line_x0 <= block_x0 <= line_x1 and block_y0 > line_y):
                            continue
                        # Strip before matching so leading whitespace cannot
                        # hide a label and continuation text carries no
                        # trailing newline (as in extract_text_below_lines).
                        text = block[4].strip()
                        label_match = label_pattern.match(text)
                        if label_match:
                            # A new label closes the entry in progress.
                            if accumulated_text:
                                writer.writerow([page_num + 1, last_label, accumulated_text])
                                accumulated_text = ''
                            last_label = label_match.group()
                            text = text[label_match.end():].strip()
                        accumulated_text += ' ' + text

                    if accumulated_text:
                        writer.writerow([page_num + 1, last_label, accumulated_text])
    finally:
        # Release the document even if extraction or writing fails.
        doc.close()

def process_pdfs_in_folder(folder_path, output_folder, x_range, color, width_range):
    """Run ``extract_text_strictly_below_lines`` on every PDF in one folder.

    Only ``folder_path`` itself is searched, not its subfolders. For each
    PDF, ``<pdf name without extension>_output.csv`` is written into
    ``output_folder``, and a "Processed ..." line is printed.

    Args:
        folder_path: Folder containing the PDFs.
        output_folder: Folder that receives the CSV files; created when
            missing.
        x_range: Passed through to ``extract_text_strictly_below_lines``.
        color: Passed through to ``extract_text_strictly_below_lines``.
        width_range: Passed through to ``extract_text_strictly_below_lines``.
    """
    # Without this, the first CSV write fails when the folder does not exist.
    os.makedirs(output_folder, exist_ok=True)

    # glob.escape keeps "[", "*" or "?" in the folder name from being read as
    # wildcards; the bracketed extension also matches ".PDF" on file systems
    # where matching is case-sensitive.
    pattern = os.path.join(glob.escape(folder_path), '*.[pP][dD][fF]')

    # Sorted so the processing order is the same on every run.
    for pdf_file in sorted(glob.glob(pattern)):
        base_name = os.path.basename(pdf_file)
        csv_file_name = f"{os.path.splitext(base_name)[0]}_output.csv"
        csv_file_path = os.path.join(output_folder, csv_file_name)

        extract_text_strictly_below_lines(pdf_file, csv_file_path, x_range, color, width_range)
        print(f"Processed {pdf_file}")

# Folder to process and folder that receives the CSVs (machine-specific
# absolute paths).
folder_to_process = r'C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court'
output_folder = r'C:\Users\User\Dropbox\textmining1\Data 1'
x_range = (50, 563)  # x-coordinate range both line end points must fall in; adjust as needed
color = (0.0, 0.0, 0.0)  # stroke colour of the line (black)
width_range = (0.72, 0.73)  # accepted stroke-width range of the line

# Process every PDF directly inside folder_to_process.
process_pdfs_in_folder(folder_to_process, output_folder, x_range, color, width_range)

Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (1).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (10).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (11).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (12).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (13).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (14).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (15).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (16).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme (17).PDF
Processed C:\Users\User\Dropbox\textmining1\Data 1\Context Samples_US Supreme Court\Supreme 

## merge csv

In [5]:
import csv
import os
from pathlib import Path
import re

def natural_sort_key(s):
    """Return a sort key that orders runs of ASCII digits by numeric value.

    ``s`` is split into alternating text and ``[0-9]+`` chunks. Digit chunks
    become ``int`` and text chunks are lower-cased, so "a2" sorts before
    "a10" and comparison ignores case.
    """
    chunks = re.split(r'([0-9]+)', s)
    # With one capturing group, re.split puts the captured digit runs at the
    # odd positions. Converting by position rather than with str.isdigit()
    # keeps non-ASCII digits as text: "²" would make int() raise, and "٣"
    # would put an int where other keys hold a str.
    return [
        int(chunk) if index % 2 else chunk.lower()
        for index, chunk in enumerate(chunks)
    ]

def merge_csv_files(input_folder, output_csv_path):
    """Merge every ``.csv`` file in ``input_folder`` into one CSV.

    Files are read in natural-sort order of their names. Each data row is
    prefixed with a ``PDF`` column holding the part of the file name before
    the first underscore (e.g. "d1" for "d1_FN.csv"). The header of the
    first non-empty file, prefixed with ``PDF``, is written once.

    Args:
        input_folder: Folder containing the CSV files to merge.
        output_csv_path: Path of the merged CSV; its parent folder is
            created when missing.
    """
    output_path = Path(output_csv_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    resolved_output = output_path.resolve()

    with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
        output_writer = csv.writer(output_file)
        headers_written = False

        for input_file in sorted(os.listdir(input_folder), key=natural_sort_key):
            if not input_file.endswith('.csv'):
                continue

            input_path = Path(input_folder) / input_file
            # The output file is already open for writing; never read it back
            # as input when it sits in the input folder.
            if input_path.resolve() == resolved_output:
                continue

            pdf_label = input_file.split('_')[0]  # file names look like 'd1_FN.csv'
            with open(input_path, newline='', encoding='utf-8') as csv_file:
                csv_reader = csv.reader(csv_file)
                # An empty file has no header row; skip it instead of letting
                # next() raise StopIteration.
                headers = next(csv_reader, None)
                if headers is None:
                    continue

                if not headers_written:
                    output_writer.writerow(['PDF'] + headers)
                    headers_written = True

                output_writer.writerows([pdf_label] + row for row in csv_reader)

# Folder holding the per-PDF CSV files and path of the merged output
# (machine-specific absolute paths).
input_folder = r'C:\Users\User\Dropbox\textmining1\FN'
output_csv_path = r'C:\Users\User\Dropbox\textmining1\merge.csv'

# Merge all CSV files from input_folder into a single CSV.
merge_csv_files(input_folder, output_csv_path)
