In [58]:
!pip install python-docx

In [2]:
import docx

from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table

from docx.oxml.text.paragraph import CT_P as omxl_paragraph
from docx.oxml.table import CT_Tbl as omxl_table
from docx.oxml.section import CT_SectPr as omxl_section

import regex

In [3]:
def open_docx(path: str) -> Document:
    """
    Load a .docx file.

    :param path: path to the .docx file
    :return: the python-docx document object created from that file
    """
    document = Document(path)
    return document

In [4]:
def get_body(document: Document) -> list[omxl_paragraph | omxl_table | omxl_section]:
    """
    Collect the direct children of the document body into a list.

    :param document: python-docx document object
    :return: the oxml elements yielded by iterating ``document.element.body``,
        in iteration order
    """
    return list(document.element.body)

In [5]:
def convert_oxml_paragraph_to_text(paragraph: omxl_paragraph, document: Document) -> str:
    """
    Return the lower-cased text of an oxml paragraph.

    :param paragraph: oxml paragraph element
    :param document: document the paragraph belongs to (passed as the parent
        of the wrapping ``Paragraph`` object)
    :return: paragraph text converted to lower case
    """
    wrapped = Paragraph(paragraph, document)
    return wrapped.text.lower()

In [6]:
def join_title(paragraphs: list[omxl_paragraph], document: Document) -> str:
    """
    Build a table title from the paragraphs that precede the table.

    The text of every paragraph is extracted (lower-cased), a full stop is
    appended to the first one, and the pieces are joined with single spaces.

    :param paragraphs: oxml paragraphs forming the title; must not be empty
    :param document: document the paragraphs belong to
    :return: the assembled table title
    """
    parts = [convert_oxml_paragraph_to_text(item, document) for item in paragraphs]
    parts[0] += '.'
    return ' '.join(parts)

In [7]:
def regex_title(text: str) -> bool:
    """
    Tell whether the text contains a table caption marker.

    A marker is the lower-case word "таблица", exactly one whitespace
    character, and at least one ASCII digit. The pattern is case-sensitive,
    so the text is expected to be lower-cased already.

    :param text: text to inspect
    :return: True when the marker occurs anywhere in the text
    """
    return regex.search(r'таблица\s[0-9]{1,}', text) is not None

In [8]:
def convert_text_to_oxml_paragraph(text: str) -> omxl_paragraph:
    """
    Create an oxml paragraph element holding the given text.

    A fresh temporary document is created only to build the paragraph; the
    paragraph's underlying ``_element`` is what gets returned.

    :param text: paragraph text
    :return: oxml paragraph element
    """
    return Document().add_paragraph(text)._element

In [9]:
def convert_oxml_table_to_table(element: omxl_table, document: Document) -> Table:
    """
    Wrap an oxml table element in a python-docx ``Table`` object.

    :param element: oxml table element
    :param document: document passed to ``Table`` as the parent
    :return: the ``Table`` wrapper around the element
    """
    wrapped = Table(element, document)
    return wrapped

In [10]:
def correct_space(data: list[str]) -> None:
    """
    Normalise whitespace in every string of the list, in place.

    Leading and trailing whitespace is removed and every inner run of
    whitespace is collapsed into a single space.

    :param data: list of strings; modified in place
    :return: None
    """
    for position, cell in enumerate(data):
        words = cell.split()
        data[position] = ' '.join(words)

In [11]:
def find_title_repeat(title: list[str]) -> int | None:
    for i in range(len(title)):
        if title.count(title[i]) > 1:
            return i

In [12]:
def get_table(table: Table) -> list[list[str]]:
    """
    Extract the cell texts of a table as a list of rows.

    Every cell text is whitespace-normalised with ``correct_space``. If the
    first row contains a repeated text (see ``find_title_repeat``), the column
    at the index of the first repeated cell is removed from every row.
    NOTE(review): presumably this drops the duplicates python-docx reports for
    merged cells — confirm against real documents.

    :param table: python-docx table object
    :return: rows of normalised cell texts; an empty list for a table
        without rows
    """
    rows: list[list[str]] = [[cell.text for cell in row.cells] for row in table.rows]
    if not rows:
        # No header row to inspect; indexing rows[0] would raise IndexError.
        return rows

    duplicate_index: int | None = find_title_repeat(rows[0])
    for row in rows:
        correct_space(row)
        # Compare with None explicitly: index 0 is a valid column and must
        # not be treated as "no duplicate found".
        if duplicate_index is not None:
            row.pop(duplicate_index)

    return rows

In [13]:
def get_tables(body: list[omxl_paragraph | omxl_table | omxl_section], document: Document) -> dict[str, Table]:
    """
    Pair every table caption in the body with the table that follows it.

    A caption is a paragraph whose lower-cased text matches ``regex_title``.
    The title is built by ``join_title`` from the caption paragraph and all
    elements between it and the next table. Scanning resumes after that table.
    When two tables produce the same title, the later one replaces the
    earlier one in the result.

    :param body: body elements as returned by ``get_body``
    :param document: document the elements belong to
    :return: mapping from table title to ``Table`` object
    """
    tables: dict[str, Table] = {}
    total: int = len(body)
    i: int = 0

    while i < total:
        element = body[i]
        is_caption = isinstance(element, omxl_paragraph) and regex_title(
            convert_oxml_paragraph_to_text(element, document)
        )
        if not is_caption:
            i += 1
            continue

        # Position of the first table after the caption, if there is one.
        table_pos = next(
            (k for k in range(i + 1, total) if isinstance(body[k], omxl_table)),
            None,
        )
        if table_pos is None:
            # No table follows this caption, so none follows any later
            # caption either; stop instead of re-checking the same element.
            break

        title: str = join_title(body[i:table_pos], document)
        tables[title] = convert_oxml_table_to_table(body[table_pos], document)

        # Always move past the table. The previous `i += j` did not advance
        # at all when the table directly followed the caption (j == 0).
        i = table_pos + 1

    return tables

In [14]:
def name_tables(tables: dict[str, Table]) -> list[str]:
    """
    Return the titles of the collected tables.

    :param tables: mapping from table title to table object
    :return: the titles, in the mapping's insertion order
    """
    return list(tables)

In [15]:
def gauss(table: list[list[str]]):
    for i, row in enumerate(table):
        row_isdigit: list[bool] = list(map(lambda x: x.isdigit(), row))
        if all(row_isdigit):
            length_row: int = len(row)
            if sum(map(int, row)) == (length_row * (length_row + 1)) / 2 and length_row == len(set(row)):
                return i

In [16]:
def count_columns(table: list[list[str]]) -> int:
    """
    Return the number of columns, taken from the column-numbering row.

    :param table: table as a list of rows of cell texts
    :return: number of cells in the row located by ``gauss``
    :raises ValueError: when the table has no column-numbering row
    """
    numbering_row = gauss(table)
    if numbering_row is None:
        # Indexing with None would raise an obscure TypeError instead.
        raise ValueError(
            "table has no column-numbering row; cannot determine the column count"
        )
    return len(table[numbering_row])

In [17]:
def count_rows(table: list[list[str]]) -> int:
    """
    Return the number of rows in the table.

    :param table: table as a list of rows
    :return: length of the outer list
    """
    total = len(table)
    return total

In [18]:
def get_maxim_length_columns(table: list[list[str]], rows: int, columns: int) -> dict[str, int]:
    """
    Compute the length of the longest cell text in each column.

    Only the first ``rows`` rows and the first ``columns`` columns are
    inspected. Every requested column gets an entry; with ``rows == 0`` the
    width is 0.

    :param table: table as a list of rows of cell texts
    :param rows: number of rows to inspect
    :param columns: number of columns to inspect
    :return: mapping ``"столбец_<n>"`` (1-based column number) to the maximum
        cell length in that column
    """
    return {
        f"столбец_{col + 1}": max(
            (len(table[row][col]) for row in range(rows)),
            default=0,
        )
        for col in range(columns)
    }

In [19]:
def format_values_table(table: list, rows: int, columns: int) -> tuple:
    """
    Centre every cell inside a fixed, per-column width (in place).

    The width of a column is the length of its longest cell plus 8, plus one
    more when that length is odd, so the resulting width is always even.

    :param table: table as a list of rows of cell texts; modified in place
    :param rows: number of rows to format
    :param columns: number of columns to format
    :return: tuple ``(first row, list of the remaining rows)``
    """
    widths = get_maxim_length_columns(table, rows, columns)

    for col in range(columns):
        longest = widths[f"столбец_{col + 1}"]
        cell_width = longest + (9 if longest % 2 else 8)
        for row_no in range(rows):
            table[row_no][col] = format(table[row_no][col], f"^{cell_width}")

    header = table[0]
    data_rows = table[1:]
    return header, data_rows

In [20]:
def format_title(title: list) -> str:
    """
    Render one table row as text.

    :param title: cell texts of the row
    :return: the cells separated by ``|`` and enclosed in ``|`` on both sides
    """
    inner = "|".join(title)
    return "|" + inner + "|"


def format_sep_title(title: list) -> str:
    """
    Build the horizontal border line matching a rendered row.

    :param title: cell texts of the row the border has to fit
    :return: ``+``, then dashes, then ``+``; the total length equals the
        length of the row rendered by ``format_title``
    """
    inner_width = len(format_title(title)) - 2
    return "+" + "-" * inner_width + "+"


def format_values(title: list, values: list) -> str:
    """
    Render the data rows, separated by full-width dashed lines.

    :param title: header row; only used to size the separator line
    :param values: data rows, each a list of cell texts
    :return: rendered rows joined by a line of dashes as wide as the border
        (the border's ``+`` corners are replaced with ``-``)
    """
    row_separator = format_sep_title(title).replace('+', '-')
    rendered_rows = [format_title(row) for row in values]
    return f"\n{row_separator}\n".join(rendered_rows)


def format_table(title: list, values: list, format_sep: str) -> str:
    """
    Assemble the complete text table.

    Layout: border, header row, border, data rows, border — one per line.

    :param title: header row
    :param values: data rows
    :param format_sep: border line placed above the header, below it and at
        the end of the table
    :return: the table as a multi-line string
    """
    header_line = format_title(title)
    body_lines = format_values(title, values)
    return f"{format_sep}\n{header_line}\n{format_sep}\n{body_lines}\n{format_sep}"

In [21]:
def to_string(table: Table) -> str:
    """
    Placeholder that is not implemented yet.

    The body is empty, so the function currently returns ``None`` despite the
    declared ``str`` return type.
    NOTE(review): presumably meant to render ``table`` as text — confirm the
    intended output before implementing.

    :param table: table object (currently unused)
    """
    pass

In [57]:
# Load the source document. The path below looks like a placeholder; replace it with the real .docx path.
document = open_docx('путь_к_файлу.docx')
# Bare expression: shows the object as the notebook cell output.
document

In [34]:
# Collect the elements of the document body.
body = get_body(document)
# Uncomment the next line to inspect the elements in the cell output.
# body

In [35]:
# Map every table title found in the body to its table object.
tables = get_tables(body, document)

In [52]:
# Lists the table titles. Copy the required title from this output into the
# next cell, or select the table by index (see the example in that cell).
name_tables(tables)

In [53]:
# Alternative: select the table by position instead of by title, e.g.
# table = get_table(tables[name_tables(tables)[0]])
# The key below is a placeholder; replace it with one of the titles listed above.
table = get_table(tables['вставить название файл из списка выше'])
table

In [45]:
# Number of columns, taken from the table's column-numbering row.
columns = count_columns(table)

In [46]:
# Number of rows in the extracted table.
rows = count_rows(table)

In [54]:
# Show the longest cell length per column (inspection only; the result is not stored).
get_maxim_length_columns(table, rows, columns)

In [48]:
# Pad the cells in place, then split off the header row.
# Note: `table` is rebound to the rows after the header, so `rows` no longer matches len(table).
title, table = format_values_table(table, rows, columns)

In [49]:
# Border line sized to the rendered header row.
format_string = format_sep_title(title)

In [55]:
# Print the complete text table: border, header, border, data rows, border.
print(format_table(title, table, format_string))