# 提取文字、表格和图片

# In [1]:
import pandas as pd
import openpyxl
import pdfplumber
import fitz
import os
from win32com import client
import win32com.client as win32
from docx import Document

"""
filepath: 需要提取的文件的路径
objectpath: 将提取后的文字写到指定文件的路径
"""

# 提取文字
def get_text(filepath, objectpath):
    """Append the text of every page of a PDF to a UTF-8 text file.

    Args:
        filepath: Path of the PDF file to read.
        objectpath: Path of the text file to write. The file is opened in
            append mode, so content that is already there is kept.
    """
    with pdfplumber.open(filepath) as pdf:
        # Open the output once for the whole document and let the context
        # manager close it, instead of leaking one handle per page.
        with open(objectpath, mode='a', encoding='utf-8') as out_file:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Depending on the pdfplumber version, a page without a text
                # layer yields None or ""; writing None would raise TypeError.
                if page_text:
                    out_file.write(page_text)

# 提取表格
def get_table(filepath, objectpath):
    """Write every table found in a PDF to its own sheet of an Excel workbook.

    Sheets are named ``sheet1``, ``sheet2``, ... in the order the tables are
    found. The first row of each table is used as the column header. When the
    PDF contains no table, no workbook is written.

    Args:
        filepath: Path of the PDF file to read.
        objectpath: Path of the Excel workbook to create.
    """
    with pdfplumber.open(filepath) as pdf:
        tables = [
            table
            for page in pdf.pages
            for table in page.extract_tables()
            if table  # an empty table has no header row to index
        ]

    if not tables:
        # Closing an ExcelWriter that never received a sheet fails on save
        # (at least with the openpyxl engine), so do not create one at all.
        return

    with pd.ExcelWriter(objectpath) as writer:
        for count, table in enumerate(tables, start=1):
            data = pd.DataFrame(table[1:], columns=table[0])
            data.to_excel(writer, sheet_name=f'sheet{count}')

# 提取图片
def get_images(filepath, objectpath):
    """Save every image referenced by the pages of a PDF as a PNG file.

    Files are named ``page<page index>-<xref>.png`` (the page index is
    zero-based) and written into ``objectpath``.

    Args:
        filepath: Path of the PDF file to read.
        objectpath: Directory that receives the PNG files; this function does
            not create it.
    """
    pdf_document = fitz.open(filepath)
    try:
        for current_page in range(len(pdf_document)):
            for image in pdf_document.get_page_images(current_page):
                xref = image[0]
                pix = fitz.Pixmap(pdf_document, xref)
                # pix.n counts the alpha channel too, so subtract it to get
                # the number of colour components. More than 3 means CMYK,
                # which must be converted to RGB before it is saved as PNG.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                img_name = "page%s-%s.png" % (current_page, xref)
                pix.save(os.path.join(objectpath, img_name))
                pix = None  # drop the pixmap before loading the next image
    finally:
        pdf_document.close()

# 转PDF
def convert_to_pdf(input_path, output_path):
    """Convert a Word document to PDF through Word COM automation.

    The source document is deleted after a successful conversion. When the
    conversion fails the error is printed and the source document is kept.

    Args:
        input_path: Path of the Word document to convert.
        output_path: Path of the PDF file to create.

    Returns:
        True when the PDF was written, False when opening, saving or closing
        the document raised an exception.
    """
    # Start Word without showing its window.
    word_app = win32.gencache.EnsureDispatch('Word.Application')
    word_app.Visible = False

    try:
        doc = word_app.Documents.Open(input_path)
        try:
            # 17 is Word's wdFormatPDF save format.
            doc.SaveAs(output_path, FileFormat=17)
        finally:
            # Close the document even when saving failed so it is not left
            # open in the Word instance.
            doc.Close()
    except Exception as e:
        print("转换PDF失败:" + str(e))
        return False
    finally:
        word_app.Quit()

    # Only reached on success, after Word has quit: never delete a document
    # that could not be converted.
    os.remove(input_path)
    return True
    
# Input directory (documents to process) and output directory (results).
in_path = "E:/桌面/文档治理"
out_path = "E:/桌面/文档治理结果"

# Step 1: convert every Word document in the input directory to PDF.
filelist = os.listdir(in_path)

for name in filelist:
    # splitext keeps dots that belong to the name ("a.b.docx" -> "a.b"),
    # whereas split('.')[0] would truncate it to "a".
    filename, extension = os.path.splitext(name)
    if extension.lower() in ('.doc', '.docx'):
        convert_to_pdf(f'{in_path}/{name}', f'{in_path}/{filename}.pdf')

# Step 2: list the directory again so the PDFs created above are included.
filelist = os.listdir(in_path)

for name in filelist:
    filename, extension = os.path.splitext(name)
    # The extraction functions open their input as a PDF; skip anything else.
    if extension.lower() != '.pdf':
        continue

    # Path of the PDF to extract from.
    filepath = f"{in_path}/{name}"

    # Text file, workbook and image directory all live in a folder named
    # after the PDF.
    text_objectpath = f"{out_path}/{filename}/{filename}.txt"
    table_objectpath = f"{out_path}/{filename}/{filename}.xlsx"
    images_objectpath = f"{out_path}/{filename}/图片"

    # Creating the image directory also creates the per-document folder;
    # exist_ok lets the script be run again over the same input.
    os.makedirs(images_objectpath, exist_ok=True)

    # get_text appends to its output file, so remove text left by an earlier
    # run to avoid duplicating it.
    if os.path.exists(text_objectpath):
        os.remove(text_objectpath)

    # Run the three extractions.
    get_text(filepath, text_objectpath)
    get_table(filepath, table_objectpath)
    get_images(filepath, images_objectpath)