In [15]:
# HN CSV 乾淨版
# !pip install PyMuPDF

import fitz
import re
import csv


def extract_and_process_segments_with_page_number(pdf_file_path, csv_file_path):
    all_text = ""
    page_breaks = []

    with fitz.open(pdf_file_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()
            all_text += text
            page_breaks.append(len(all_text))

    all_text = re.sub(r'Page \d+ of \d+', '', all_text)

    page_content = []

    def process_section(regex, section_name):
        matches = re.finditer(regex, all_text, re.S)
        for match in matches:
            start_pos = match.start()
            page_number = next(pn for pn, pb in enumerate(page_breaks, start=1) if start_pos < pb)
            content = match.group(1).strip()
            page_content.append([page_number, section_name, content])

    process_section(r'Core Terms\s+([\s\S]+?)(?=\n(?:[A-Z][^\s]*(?:\s[A-Z][^\s]*)*[:\n]|Opinion by:))', "Core Terms")
    process_section(r'LexisNexis® Headnotes\s+(.+?)(?=HN\d+|$)', "Headnotes")

    hn_matches = re.findall(r'HN\d+\[.*?\].+?(?=HN\d+|LexisNexis® Headnotes|\Z)', all_text, re.DOTALL)
    for match in hn_matches:
        match_start = all_text.find(match)
        page_number = next(pn for pn, pb in enumerate(page_breaks, start=1) if match_start < pb)
        lines = match.split('\n')
        ongoing_hn_label = ""
        ongoing_hn_content = ""
        for line in lines:
            if 'HN' in line:
                if ongoing_hn_label and ongoing_hn_content:
                    page_content.append([page_number, ongoing_hn_label, ongoing_hn_content.strip()])
                ongoing_hn_label = line
                ongoing_hn_content = ""
            elif ">" in line:
                ongoing_hn_content += line.strip() + " "
        if ongoing_hn_label and ongoing_hn_content:
            page_content.append([page_number, ongoing_hn_label, ongoing_hn_content.strip()])

    core_terms_count = sum(1 for row in page_content if row[1] == "Core Terms")
    headnotes_count = sum(1 for row in page_content if row[1] == "Headnotes")
    hn1_count = sum(1 for row in page_content if row[1].startswith("HN1["))

    with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Section', 'Content'])
        csv_writer.writerows(page_content)

    print(f'Core Terms 數量: {core_terms_count}')
    print(f'Headnotes 數量: {headnotes_count}')
    print(f'HN1[ 數量: {hn1_count}')

# Input PDF to read (raw string so the Windows backslashes are kept literally).
pdf_file_path = r'C:\Users\User\Dropbox\textmining\PDF\Files (35).PDF'
# Output CSV that receives the extracted Page / Section / Content rows.
csv_file_path = r'C:\Users\User\Dropbox\textmining\HN\HN_35.csv'

# Run the extraction; the function writes the CSV and prints the row counts.
extract_and_process_segments_with_page_number(pdf_file_path, csv_file_path)


Core Terms 數量: 35
Headnotes 數量: 34
HN1[ 數量: 35


In [1]:
# HN CSV (CoreTerms,Headnotes,HN) 註解版

# !pip install PyMuPDF
# !pip install --upgrade PyMuPDF

import fitz
import re
import csv


def extract_and_process_segments_with_page_number(pdf_file_path, csv_file_path):
    all_text = ""   #儲存PDF文本內容
    page_breaks = []  #儲存每個頁面的文本結尾位置

    with fitz.open(pdf_file_path) as doc: #打開PDF 作為doc處理 , with 表退出區塊後自動關閉PDF文件
        for page_num in range(len(doc)): #尋找所有頁面
            page = doc.load_page(page_num)  #載入當前頁面輸入Page變數
            text = page.get_text()    #獲取當前頁面文本
            all_text += text      #將文本附加到all_text變數中以供後續處理
            page_breaks.append(len(all_text)) #計算all_text長度,以計算頁數

    all_text = re.sub(r'Page \d+ of \d+', '', all_text) #去除Page of 頁碼

    page_content = []   #儲存處理後的文本,將每一行存為列表中的一個單位

    def process_section(regex, section_name): # regex(正則表達式),section_name(區段名稱) #處理文本並提取目標內容
        matches = re.finditer(regex, all_text, re.S) #re.S 匹配包含換行符的所有字元
        for match in matches:
            start_pos = match.start() #獲取匹配文本的起始位置
            page_number = next(pn for pn, pb in enumerate(page_breaks, start=1) if start_pos < pb) #透過比對'start_pos','page_breaks'的位置找到匹配文本頁數
            content = match.group(1).strip() #獲取文本內容,使用strip去除首尾空白字符
            page_content.append([page_number, section_name, content]) #將頁數,區段名稱,內容作為一個列表添加到page_content

    process_section(r'Core Terms\s+([\s\S]+?)(?=\n(?:[A-Z][^\s]*(?:\s[A-Z][^\s]*)*[:\n]|Opinion by:))', "Core Terms")
# '以Core Terms開頭' , '\s+ 匹配多個空格符,允許區段名稱前有多個空格', '([\s\S]+?) 使用括號將匹配內容捕獲到Group中,匹配任意字符,+? 表示非貪婪匹配,匹配最短內容'
# '(?=\n ...) 要求接下來的文本必須符合括號內條件,但不包含在匹配的結果中', '(?: ....) 非捕獲分組,用於組合多個匹配條件 '
# '[A-Z] 匹配大寫字母 ' , '[^\s]匹配0個或多個非空格的字符'  , '(?:\s[A-Z][^\s]*)* 匹配一個或多個以空格分隔的大寫字母單詞'
# '[:\n] 匹配冒號或換行符' , '|Opinion by: 表示或以 Opinion by:做為結束標記

    process_section(r'LexisNexis® Headnotes\s+(.+?)(?=HN\d+|$)', "Headnotes")
# '(.+?)匹配任何字符,用括號將匹配內容捕獲到分組中' , (?=HN\d+|$)使用正向查找來限制匹配的結尾,' HN\d+：匹配以"HN"開頭並後面跟著一個或多個數字的字串，這表示區段的結束標記 '
#'$：匹配文本的結尾，用於處理區段出現在文本結尾的情況'

    hn_matches = re.findall(r'HN\d+\[.*?\].+?(?=HN\d+|LexisNexis® Headnotes|\Z)', all_text, re.DOTALL) # 匹配類似"HN1[]"格式的區段
# ' HN\d+ 匹配以"HN"開頭並後面跟著一個或多個數字的字串，表示區段的開始標記' , '\[.*?\] 匹配方括號，用於包圍區段標籤 ' , '(?=HN\d+|\Z)：使用正向查找來限制匹配的結尾 '
# |LexisNexis® Headnotes| or ... or ...當結尾 , 加這個很重要, 解決Headnotes被誤認為HN的一部份
    for match in hn_matches: #尋找所有HN格式的區段
        match_start = all_text.find(match) #獲取匹配區段的起始位置

        page_number = next(pn for pn, pb in enumerate(page_breaks, start=1) if match_start < pb)
#比對'match_start'和'page_breaks'中的位置,找到匹配區段所在的頁數

        lines = match.split('\n') #將匹配區段按照換行符號分割成多行,逐行處理(解決換行後無法匹配的情況)
        ongoing_hn_label = "" #遇到包含">"的行,將行內容添加到ongoing_hn_content
        ongoing_hn_content = ""
        for line in lines:
            if 'HN' in line:
                if ongoing_hn_label and ongoing_hn_content: #若Label & content都包含內容,將他們及頁數添加到page_content列表
                    page_content.append([page_number, ongoing_hn_label, ongoing_hn_content.strip()])
                ongoing_hn_label = line
                ongoing_hn_content = ""
            elif ">" in line:
                ongoing_hn_content += line.strip() + " "
        if ongoing_hn_label and ongoing_hn_content:
            page_content.append([page_number, ongoing_hn_label, ongoing_hn_content.strip()])

    # 計算 Core Terms、Headnotes 和 HN1[ 的數量
    core_terms_count = sum(1 for row in page_content if row[1] == "Core Terms")
    headnotes_count = sum(1 for row in page_content if row[1] == "Headnotes")
    hn1_count = sum(1 for row in page_content if row[1].startswith("HN1["))

    with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file: #寫入模式'w'編碼為UTF-8 且不添加額外空行
        csv_writer = csv.writer(csv_file) #將數據輸入CSV
        csv_writer.writerow(['Page', 'Section', 'Content']) #寫入標題
        csv_writer.writerows(page_content)   #寫入內容

    # 打印統計結果
    print(f'Core Terms 數量: {core_terms_count}')
    print(f'Headnotes 數量: {headnotes_count}')
    print(f'HN1[ 數量: {hn1_count}')

# Input PDF to read.
pdf_file_path = r'/workspaces/textmining/textmining/PDF/Files (100).PDF'
# Output CSV that receives the extracted Page / Section / Content rows.
csv_file_path = r'/workspaces/textmining/textmining/HN/HN_100_.csv'

# Run the extraction; the function writes the CSV and prints the row counts.
extract_and_process_segments_with_page_number(pdf_file_path, csv_file_path)


Core Terms 數量: 100
Headnotes 數量: 95
HN1[ 數量: 95
