In [3]:
import zipfile
from xml.etree import ElementTree as ET

def extract_document_xml(docx_path):
    """
    Функция для извлечения document.xml из docx-файла.
    """
    with zipfile.ZipFile(docx_path, 'r') as archive:
        try:
            xml_content = archive.read('word/document.xml')
            return xml_content
        except KeyError:
            raise ValueError("Файл не содержит 'word/document.xml'")

def parse_xml(xml_bytes):
    """
    Функция для разбора XML и поиска текстовых узлов.
    """
    xml_str = xml_bytes.decode('utf-8')
    root = ET.fromstring(xml_str)
    namespace = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    text_elements = root.findall('.//w:t', namespace)
    return text_elements

def get_text_from_word(docx_path):
    """
    Основная функция для получения текста из docx-файла.
    """
    xml_bytes = extract_document_xml(docx_path)
    text_elements = parse_xml(xml_bytes)
    text = ''.join([element.text or '' for element in text_elements])
    return text

if __name__ == "__main__":
    docx_path = 'llm модель 04.09.2024.docx'
    #text = get_text_from_word(docx_path)
    print(extract_document_xml(docx_path))

b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex" xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex" xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex" xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex" xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex" xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex" xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex" xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex" xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink" xmlns:am3d="http://schemas.microsoft.com/office/draw

In [13]:
def read_binary_file(filename):
    with open(filename, 'rb') as f:
        return f.read()

def find_document_xml(data):
    start_marker = b'\x50\x4B\x03\x04'  # Начало записи ZIP
    end_marker = b'\x50\x4B\x05\x06'    # Конец центрального каталога ZIP
    pos = data.find(b'document.xml')
    if pos == -1:
        return None
    while True:
        pos = data.rfind(start_marker, 0, pos)
        if pos == -1 or pos + len(start_marker) >= len(data):
            break
        header = data[pos:pos+30]
        filename_length = int.from_bytes(header[28:30], byteorder='little', signed=False)
        extra_field_length = int.from_bytes(header[30:32], byteorder='little', signed=False)
        content_start = pos + 46 + filename_length + extra_field_length
        content_end = content_start + int.from_bytes(header[24:26], byteorder='little', signed=False)
        if content_start <= pos < content_end:
            return data[content_start:content_end].decode('utf-8')
    return None

# Пример использования
data = read_binary_file('llm модель 04.09.2024.docx')
xml_data = find_document_xml(data)
if xml_data:
    print(xml_data)
else:
    print("XML-данные не найдены.")

XML-данные не найдены.


In [21]:
f = open(r'D:\Учеба\УПД\Отчеты\llm модель 04.09.2024.docx', 'rb').read()
a = f.find(b'document.xml')
f[a:a + 30]

b"document.xml\xec=io\x1c\xc7\x95\xdf\x17\xd8\xff\xd0\xe0'/`\x92}"