## 字體屬性

In [7]:
import fitz

def detect_font_properties(pdf_path):
    """Collect the text of every span in a PDF, grouped by font and size.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A nested dict ``{font: {size: set_of_texts}}`` built from the
        ``"font"``, ``"size"`` and ``"text"`` entries of every span found in
        ``page.get_text("dict")["blocks"]`` for each page of the document.
    """
    font_properties = {}

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry have no spans to inspect.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        # Create the font / size buckets on first sight.
                        sizes = font_properties.setdefault(span["font"], {})
                        sizes.setdefault(span["size"], set()).add(span["text"])
    finally:
        # Release the document even when a page fails to parse.
        doc.close()

    return font_properties

# Detect the font properties of the PDF file
pdf_path = r"/Users/tangjiahong/Dropbox/textmining1/PDF/Files (35).PDF"
font_properties = detect_font_properties(pdf_path)

# Print each font, and for every size of that font the set of span texts
for font, sizes in font_properties.items():
    print("Font:", font)
    for size, texts in sizes.items():
        print("Size:", size)
        print("Texts:", texts)
        print()


Font: Helvetica
Size: 10.0
Texts: {'the radio and television industries.  That is a separate consideration.  What others do with pictures they are allowed ', 'three House bills were conducted by the Subcommittee on Study of Monopoly Power of the Committee on the ', 'that Congress has not attempted to do so.  If there is a conspiracy, it is not one to control commerce between the ', 'Supreme Court of the United States', '(and sometimes five) years, take part only in title contests promoted by the defendants.  As a consequence of these ', ' of its hearings, the House ', ' case consisted of bills which sought exemption for ', 'the performance of a legitimate stage attraction (', '.  But they would be content with a more ', 'other segments of the entertainment business, athletic or otherwise.  Surely there is ', 'As in the ', ' restrictive interpretation of ', 'constitutes "trade or commerce among the several States" within the meaning of the Sherman Act. Pp. 240-243.  ', 'and ', 'the ', '

## 繪製邊界框

In [73]:
import fitz  # PyMuPDF
from PIL import Image, ImageDraw  # Pillow

pdf_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF"
img_path = r"C:\Users\User\Dropbox\textmining1\PDF\page_image4.png"

# Open the PDF and load the page at index 1826.
doc = fitz.open(pdf_path)
page = doc.load_page(1826)

# Render that page to a pixmap and write it to img_path.
pix = page.get_pixmap()
pix.save(img_path)

# Re-open the rendered image so that rectangles can be drawn onto it.
image = Image.open(img_path)
draw = ImageDraw.Draw(image)

# Outline every block of the page that reports a bounding box.
blocks = page.get_text("dict")["blocks"]
for block in blocks:
    if "bbox" not in block:
        continue
    bbox = block["bbox"]
    # The four bbox coordinates are converted to integers before drawing.
    corners = [int(bbox[i]) for i in range(4)]
    draw.rectangle(corners, outline="red", width=2)

# Overwrite the rendered image with the annotated version.
image.save(img_path)

# Clean up.
doc.close()


## 找關鍵字的座標

In [33]:

import fitz

# Raw string: in a normal literal "\U" starts a unicode escape (a SyntaxError
# for this path) and "\t" would silently become a tab character.
pdf_path = r"C:\Users\User\Dropbox\textmining1\PDF\Files (100).PDF"  # replace with the path of your PDF
doc = fitz.open(pdf_path)

try:
    # Starting from the first page, look for spans whose text contains "Footnotes".
    for page_num, page in enumerate(doc):
        text_blocks = page.get_text("dict")["blocks"]

        for block in text_blocks:
            if "lines" not in block:  # blocks without lines have no spans to search
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if "Footnotes" in span["text"]:
                        print("找到含有'Footnotes'的文本塊:")
                        print(f"頁面: {page_num}, 座標: {span['bbox']}")
                        # The coordinates can be used to extract or highlight
                        # the span, or for any further processing.
finally:
    # Close the document when done, even if a page raised.
    doc.close()


找到含有'Footnotes'的文本塊:
頁面: 622, 座標: (50.0, 658.6190185546875, 277.5109558105469, 670.9849853515625)
找到含有'Footnotes'的文本塊:
頁面: 793, 座標: (50.0, 736.6190185546875, 202.75698852539062, 748.9849853515625)
找到含有'Footnotes'的文本塊:
頁面: 963, 座標: (50.0, 325.2279968261719, 459.7499084472656, 338.9679870605469)
找到含有'Footnotes'的文本塊:
頁面: 996, 座標: (195.29498291015625, 437.22802734375, 377.6048889160156, 450.968017578125)
找到含有'Footnotes'的文本塊:
頁面: 1570, 座標: (50.0, 71.42900085449219, 139.47999572753906, 85.16899871826172)


### 確認座標範圍文本

In [127]:
import fitz

pdf_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (35).PDF'  # replace with the path of your PDF
doc = fitz.open(pdf_path)
page = doc[176]  # page at index 176 (the page assumed to contain the horizontal rule)

# Page rectangle (not used below) and the fixed region whose text is extracted.
page_rect = page.rect
bottom_half_rect = fitz.Rect(50,160,600,750)

# Text blocks restricted to the clip rectangle.
text_instances = page.get_text("dict", clip=bottom_half_rect)["blocks"]

# Print the concatenated span text of each block, one print per block.
for block in text_instances:
    if 'lines' not in block:  # only blocks that contain lines are printed
        continue
    text = "".join(
        span["text"]
        for line in block["lines"]
        for span in line["spans"]
    )
    print(text)

doc.close()


local-competition rules were invalid, most notably the one requiring that prices for interconnection and unbundled access be based on "Total Element Long Run Incremental Cost" (TELRIC) -- a forward-looking rather than historic measure. 3 See 47 CFR §§ 51.503, 51.505 (1997). The Court of Appeals agreed, and vacated the pricing rules, and several other aspects of the order, as reaching beyond the Commission's jurisdiction.  Iowa Utilities Board v. FCC, 120 F.3d 753, 800, 804, 805-806 [****13]  (1997). It held that the general rulemaking authority conferred upon the Commission by the Communications Act of 1934 extended only to interstate matters, and that the Commission therefore needed specific congressional authorization before implementing provisions of the 1996 Act addressing intrastate telecommunications. Id. at 795. It found no such authorization for the Commission's rules regarding pricing, dialing parity, 4 exemptions  [*375]  for rural LECs, the proper procedure for resolving loc

### 看某個頁面的黑線座標

In [141]:
import fitz
from PIL import Image, ImageDraw

def is_black(rgb_color, threshold=0.1):
    """Tell whether a colour is dark enough to be treated as black.

    The colour counts as black when its largest component is strictly
    smaller than ``threshold``.
    """
    brightest_component = max(rgb_color)
    return brightest_component < threshold

def visualize_pdf_lines(pdf_path, page_number, output_image_path):
    """Redraw the black stroked line segments of one PDF page on a blank image.

    Every drawing path of the page whose type is ``'s'`` and whose colour
    passes ``is_black`` is inspected; each line item (``'l'``) of such a path
    is collected, printed, and drawn in black on a white canvas of the page's
    size. The image is then saved and shown.

    Args:
        pdf_path: Path of the PDF file; passed to ``fitz.open``.
        page_number: Index of the page, used as ``doc[page_number]``.
        output_image_path: Where the resulting image is saved.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc[page_number]
        lines_info = []

        # Extract line information
        for shape in page.get_drawings():
            # Keep only stroke paths drawn in black.
            if shape['type'] != 's' or not is_black(shape['color']):
                continue
            for item in shape['items']:
                # A path can hold several items and not all are line
                # segments; only 'l' items have the (tag, start, end) layout.
                if item[0] != 'l':
                    continue
                lines_info.append({
                    'start': item[1],          # Start point
                    'end': item[2],            # End point
                    'width': shape['width'],   # Line width
                    'color': shape['color'],   # Line color
                })

        # The output image has the (truncated) size of the page
        page_rect = page.rect
        img = Image.new('RGB', (int(page_rect.width), int(page_rect.height)), color='white')
        draw = ImageDraw.Draw(img)

        # Draw lines on the image
        for line in lines_info:
            start_x, start_y = map(int, line['start'])
            end_x, end_y = map(int, line['end'])
            line_color = 'black'  # Always draw in black for visualization
            draw.line((start_x, start_y, end_x, end_y), fill=line_color, width=int(line['width']))

            # Report the untruncated coordinates and width of the segment
            print(f"Black line from {line['start']} to {line['end']}, Width: {line['width']}")

        # Save and display the image
        img.save(output_image_path)
        img.show()
    finally:
        # Close the document even if extraction or drawing failed.
        doc.close()

# Usage: render the black lines of one page of "Files (35).PDF"
pdf_path = r'C:\Users\User\Dropbox\textmining1\PDF\Files (35).PDF'
page_number = 176  # passed to visualize_pdf_lines, which uses it as doc[page_number]
output_image_path = 'black_lines_image.png'  # relative path for the saved image
visualize_pdf_lines(pdf_path, page_number, output_image_path)


Black line from Point(50.0, 616.4729614257812) to Point(193.5, 616.4729614257812), Width: 0.7200000286102295


+ files 35 有一條較長的線 p.176

+ 設定兩條線的 1.X座標範圍 2.黑色線 3. 寬度

    + Black line from Point(50.0, 138.0659942626953) to Point(562.0, 138.0659942626953), Width: 0.7200000286102295      
     
    + Black line from Point(50.0, 616.4729614257812) to Point(193.5, 616.4729614257812), Width: 0.7200000286102295   

**以Files_100為例, 以下為所有文本的字體及大小** <br>
+ **Font**: Helvetica
+ **Sizes**: {8.0, 9.0, 10.0, 6.0}<br>
<br>
+ **Font**: Helvetica-Bold
+ **Sizes**: {8.0, 9.0, 10.0, 12.0, 14.0}<br>
<br>
+ **Font**: Helvetica-Oblique
+ **Sizes**: {9.0, 10.0}<br>
<br>
+ **Font**: Helvetica-BoldOblique
+ **Sizes**: {9.0, 10.0, 14.0}<br>
<br>
+ **Font**: Arial
+ **Sizes**: {10.0}

**可以發現包含footnotes的字體及其大小為:**<br> 

**target_font="Helvetica", size1=6.0, size2=9.0**<br>

於是寫一個Function抓取目標文本字體 **"Helvetica"** 且開頭為 **6.0** 接著內容為 **9.0** 就可以抓取所有Footnotes <br>
+ 6.0： "*" , "+" , "footnotes開頭的數字標號" <br>

+ 9.0：footnotes內容


In [10]:
import fitz  # PyMuPDF
import csv
import re  

def extract_footnotes(pdf_path, csv_path, target_font="Helvetica", size1=6.0, size2=9.0):
    """Extract footnotes from a PDF by font/size pattern and write them to CSV.

    A footnote starts at a span whose font is ``target_font`` and whose size
    is ``size1`` (the footnote marker) when the next span on the same line has
    size ``size2`` (the footnote body). Spans are then accumulated, separated
    by single spaces, until either

    * a span ends with "." and the next span on the same line starts with an
      upper-case ASCII letter, or
    * the last line of the current text block has been processed.

    Args:
        pdf_path: Path of the PDF file; passed to ``fitz.open``.
        csv_path: Path of the CSV file to write (UTF-8, overwritten).
        target_font: Font name a marker span must have.
        size1: Font size a marker span must have.
        size2: Font size the span following the marker must have.

    The CSV has a ``Page,Footnote`` header followed by one row per footnote;
    ``Page`` is the 1-based page number on which the footnote was saved.
    """
    csv_rows = []
    collecting = False  # whether footnote text is currently being collected
    footnote_text = ""  # text accumulated for the footnote in progress

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]

            for b in blocks:
                if "lines" not in b:
                    continue
                for line_num, line in enumerate(b["lines"]):
                    spans = line["spans"]
                    for i, span in enumerate(spans):
                        font_name = span.get('font', '')
                        size = span.get("size", 0)
                        text = span.get("text", "")
                        has_next = i < len(spans) - 1

                        if collecting:
                            # The current span always belongs to the footnote,
                            # including the one that turns out to end it.
                            footnote_text += " " + text
                            # Stop condition: sentence end followed by a span
                            # that starts with a capital letter.
                            if (has_next and text.endswith(".")
                                    and re.match(r'^[A-Z]', spans[i + 1]["text"])):
                                csv_rows.append([page_num + 1, footnote_text])
                                collecting = False
                                footnote_text = ""
                            continue

                        # Start condition: marker span followed by a body span.
                        if (font_name == target_font and size == size1
                                and has_next
                                and spans[i + 1].get("size", 0) == size2):
                            collecting = True
                            # Only the marker is stored here; the body span is
                            # appended on the next iteration, so it is not
                            # added twice.
                            footnote_text = text

                    # A footnote never continues past the last line of its
                    # block: flush it here. This also means nothing can still
                    # be pending once all pages have been processed.
                    if collecting and line_num == len(b["lines"]) - 1:
                        csv_rows.append([page_num + 1, footnote_text])
                        collecting = False
                        footnote_text = ""
    finally:
        # Release the document even if a page fails to parse.
        doc.close()

    with open(csv_path, "w", encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Page', 'Footnote'])
        csv_writer.writerows(csv_rows)

# Path of the PDF to read and of the CSV file to write
pdf_file_path = r"/Users/tangjiahong/Dropbox/textmining1/PDF/Files (100).PDF"
csv_file_path = r"/Users/tangjiahong/Dropbox/textmining1/FN/footnotes_final_100.csv"

# Extract the footnotes and write them to the CSV file
extract_footnotes(pdf_file_path, csv_file_path)



**目前問題**<br>

+ 有些註腳沒有數字標號也沒有符號開頭，無法偵測抓取<br>   
    + ex files_100 page 24,1827 其FN似乎是連結上一個標號的內容 <br>
    
        + 冒號後面內容跳頁帶著 "" 的文本 , 直接跳下個標號內容 <br>
        
        + 冒號接跨行 HN 內容 , 跳下個標號內容  

+ 有註腳包含了不符合預設註腳字體大小、字型的設定：HN\d+[] 導致其後文本未被抓取直接跳過

    + ex files_35 page 29,175,176,213,213-219,268,1822
    
    + 冒號後面帶著 "" 的文本沒抓到       
    + 冒號接著 "" HN 開頭文本沒抓到
    + 冒號接著 HN 開頭文本沒抓到
    + 標號 + LEdHN 開頭 後面文本沒抓到
    + 標號 + LEdHN 開頭 有抓到後面文本
        



**註解**

+ csv_rows = [] 存取提取的FN文本及頁碼
+ collecting = False 標記是否正在收集FN
+ footnote_text = "" 臨時儲存正在收集的FN
+ for page_num in range(len(doc)) scan每一頁
+ page = doc.load_page(page_num)加載當前頁碼頁面
+ blocks = page.get_text("dict")["blocks"] 獲取當頁文本塊，組織成字典存在blocks中
+ for b in blocks scan當頁所有文本塊
+ if "lines" in b 檢查文本塊中有無包含"lines"，有才是文字區塊（含有可掃描的行與片段）
+ for line_num, line in enumerate(b["lines"]) scan當前文本塊的每一行
+ spans = line["spans"] 獲取文本所有片段
+ for i, span in enumerate(spans) scan所有片段
+ font_name = span.get('font','')獲取片段字體名稱
+ size = span.get("size",0)獲取片段字體大小
+ text = span.get("text","") 獲取片段文本<br>

+ if collecting 如果當前正在收集FN

    + 檢查片段是否不是該行的最後一個片段（i < len(spans) - 1），若不是則取得下一個片段的文本<br>
    
    + 若片段以句號結尾，且下一段以大寫字母開頭，表示結束。
    + 將收集好的FN存入csv_rows停止收集並重置
    + 若FN未結束，將當前片段添加至FN文本
+ if not collecting ... 若尚未開始收集，且當前片段字體為 target_font、大小為 size1，同時同一行的下一個片段大小為 size2，則開始收集FN
+ if collecting and line_num == len(b["lines"]) - 1 若正在收集且已達當前文本塊最後一行，將FN加到csv_rows並停止收集。