In [1]:
!pip install python-docx regex



In [2]:
import docx

from docx import Document
from docx.text.paragraph import Paragraph
from docx.table import Table

from docx.oxml.text.paragraph import CT_P as omxl_paragraph
from docx.oxml.table import CT_Tbl as omxl_table
from docx.oxml.section import CT_SectPr as omxl_section

import regex

In [3]:
def open_docx(path):
	"""Open the document at ``path`` by passing it to ``Document``.

	Returns whatever ``Document(path)`` produces.
	"""
	document = Document(path)
	return document

In [4]:
def convert_Table_to_list(table):
	"""Turn a table object into a list of rows of whitespace-normalised cell texts.

	Iterates ``table.rows`` and, for each row, ``row.cells``. Every cell's
	``text`` is split on whitespace and re-joined with single spaces, so
	leading/trailing whitespace and line breaks inside a cell disappear.
	"""
	result = []
	for row in table.rows:
		cleaned_row = []
		for cell in row.cells:
			words = cell.text.split()
			cleaned_row.append(' '.join(words))
		result.append(cleaned_row)
	return result

In [5]:
def get_rows(table):
	"""Return the number of rows, i.e. ``len(table)``."""
	row_count = len(table)
	return row_count

In [6]:
def get_columns(table):
	"""Return the number of columns, taken from the length of the first row.

	An empty table has no first row to measure, so 0 is returned instead of
	raising ``IndexError``.
	"""
	if len(table) == 0:
		return 0
	return len(table[0])

In [7]:
def lenght_every_column(table, rows, columns):
	"""Compute the padded width of every column.

	For each column index below ``columns`` the longest cell text among the
	first ``rows`` rows is found. The stored width is that length plus 8,
	plus one more when the length is odd, so the stored value is always even.

	Returns a dict whose keys have the form ``'столбец_<index>'``.
	"""
	widths = {}
	for col in range(columns):
		longest = max((len(table[line][col]) for line in range(rows)), default=0)
		padding = 9 if longest % 2 else 8
		widths[f'столбец_{col}'] = longest + padding
	return widths

In [8]:
def add_space(dict_lenght_columns, title, columns):
	"""Return the extra width to add to every column so a long title fits.

	The summed column widths plus ``columns`` are compared with the length
	of ``title``. When the title is at least that long, the title length
	integer-divided by ``columns`` is returned; otherwise 0.
	"""
	title_length = len(title)
	occupied = sum(dict_lenght_columns.values()) + columns

	if occupied > title_length:
		return 0

	return title_length // columns

In [9]:
def format_columns(title, table, rows, columns):
	"""Centre every cell of ``table`` in place within its column width.

	The width of a column is its value from ``lenght_every_column`` plus the
	extra padding returned by ``add_space``. Each cell is overwritten with
	its centred text; nothing is returned.
	"""
	widths = lenght_every_column(table, rows, columns)
	extra = add_space(widths, title, columns)

	for col in range(columns):
		cell_width = widths[f'столбец_{col}'] + extra
		spec = f'^{cell_width}'
		for line in range(rows):
			table[line][col] = format(table[line][col], spec)

In [10]:
def sep_values_rows(row):
	"""Join the cells of ``row`` with ``│`` and wrap the result in ``│`` on both sides."""
	bar = '│'
	return bar + bar.join(row) + bar

In [11]:
def index_sep_rows(row, columns):
	"""Measure the gaps between consecutive ``│`` characters in ``row``.

	Scanning starts at index 0 and looks ``columns`` times for the next
	``│`` strictly after the current position. Each entry of the returned
	list is the distance from the current position to that separator.

	Raises ``ValueError`` (from ``index``) when no further separator exists.
	"""
	offsets = []
	cursor = 0

	while len(offsets) != columns:
		following = row.index('│', cursor + 1)
		offsets.append(following - cursor)
		cursor = following

	return offsets

In [12]:
def sep_rows(index_sep):
	"""Build the horizontal rule placed between two table rows.

	Each entry of ``index_sep`` contributes a run of ``─`` one shorter than
	the entry. Runs are joined with ``┼``, framed by ``├`` and ``┤``, and
	the whole rule is surrounded by newlines.
	"""
	segments = ['─' * (offset - 1) for offset in index_sep]
	return '\n├' + '┼'.join(segments) + '┤\n'

In [13]:
def sep_begin_row(index_sep):
	"""Build the rule drawn above the first table row.

	Each entry of ``index_sep`` contributes a run of ``─`` one shorter than
	the entry. Runs are joined with ``┬`` and framed by ``├`` and ``┤``.
	"""
	segments = ['─' * (offset - 1) for offset in index_sep]
	return '├' + '┬'.join(segments) + '┤'

In [14]:
def sep_last_row(index_sep):
	"""Build the closing rule drawn below the last table row.

	Each entry of ``index_sep`` contributes a run of ``─`` one shorter than
	the entry. Runs are joined with ``┴`` and framed by ``└`` and ``┘``.
	"""
	segments = ['─' * (offset - 1) for offset in index_sep]
	return '└' + '┴'.join(segments) + '┘'

In [15]:
def sep_title(title, lenght):
	"""Build the two-line title box: a top border and the centred title.

	The border is ``lenght`` characters of ``─`` between ``╭`` and ``╮``.
	The second line holds ``title`` centred in a field of ``lenght``
	characters between two ``│``.
	"""
	top_border = '╭' + lenght * '─' + '╮'
	title_line = '│' + format(title, f'^{lenght}') + '│'
	return top_border + '\n' + title_line

In [16]:
def format_rows(title, table, columns):
	"""Assemble the complete text of the table, title box included.

	Every row of ``table`` is replaced in place by its ``│``-separated
	string. The separator offsets are measured on the first row and reused
	for the rule under the title, the rules between rows and the closing
	rule. The title box width is the first row's length minus 2.
	"""
	for position in range(len(table)):
		table[position] = sep_values_rows(table[position])

	first_line = table[0]
	offsets = index_sep_rows(first_line, columns)

	header = sep_title(title, len(first_line) - 2)
	top_rule = sep_begin_row(offsets)
	joined_rows = sep_rows(offsets).join(table)
	bottom_rule = sep_last_row(offsets)

	return f"{header}\n{top_rule}\n{joined_rows}\n{bottom_rule}"

In [17]:
def to_string(title, element):
	"""Render a table object as box-drawing text headed by ``title``.

	The table is converted to a list of cell texts, the cells are centred
	to their column widths, and the rows are assembled into the final string.
	"""
	cells = convert_Table_to_list(element)
	row_count, column_count = get_rows(cells), get_columns(cells)

	format_columns(title, cells, row_count, column_count)
	rendered = format_rows(title, cells, column_count)
	return rendered

In [18]:
def convert_omxl_paragraph_to_text(paragraph, document):
	"""Return the lower-cased, whitespace-normalised text of a paragraph element.

	The element is wrapped in ``Paragraph`` (with ``document`` as second
	argument) to read its ``text``, which is lower-cased, split on
	whitespace and re-joined with single spaces.
	"""
	wrapped = Paragraph(paragraph, document)
	words = wrapped.text.lower().split()
	return ' '.join(words)

In [19]:
def convert_list_omxl_paragraph_to_text(list_paragraph, document):
	"""Convert each paragraph element to text and join the texts with single spaces."""
	texts = [convert_omxl_paragraph_to_text(item, document) for item in list_paragraph]
	return ' '.join(texts)

In [20]:
def convert_omxl_table_to_table(element: omxl_table, document: Document) -> Table:
	"""Wrap a table element in a ``Table``, passing ``document`` as the second argument."""
	wrapped_table = Table(element, document)
	return wrapped_table

In [21]:
def convert_text_to_oxml_paragraph(text: str) -> omxl_paragraph:
	"""Create a paragraph element holding ``text``.

	A throwaway ``Document()`` is created only to build the paragraph; the
	paragraph's underlying ``_element`` is returned.
	"""
	scratch_document = Document()
	return scratch_document.add_paragraph(text)._element

In [22]:
def regex_title(text):
	"""Tell whether ``text`` contains ``таблица``, one whitespace character and at least one digit.

	The pattern may match anywhere in ``text``; returns a bool.
	"""
	found = regex.search(r'таблица\s[0-9]{1,}', text)
	return found is not None

In [23]:
def get_body(document):
	"""Collect the paragraph and table elements of ``document.element.body``, in order.

	Children of any other type are left out.
	"""
	wanted_types = (omxl_paragraph, omxl_table)
	return [child for child in document.element.body if isinstance(child, wanted_types)]

In [24]:
def correct_body(body, document):
    """Rebuild the body so every kept table directly follows one title paragraph.

    Walks ``body`` in order:

    * a non-empty paragraph matching ``regex_title`` starts a pending title
      (a later matching paragraph replaces a pending one);
    * any other non-empty paragraph is kept only while no title is pending;
      paragraphs between a title and its table become part of the title text;
    * a non-paragraph element (a table) is kept only when a title is
      pending: everything from the title up to the table is joined into a
      single new paragraph, which is placed right before the table;
    * empty paragraphs and tables without a pending title are dropped.

    Returns the new list of elements; ``body`` itself is not modified.
    """
    _body = []
    index_title = None

    for i, element in enumerate(body):
        if isinstance(element, omxl_paragraph):
            text = convert_omxl_paragraph_to_text(element, document)
            if not text:
                continue
            if regex_title(text):
                index_title = i
            # Compare with None explicitly: index 0 is a valid title position
            # and must not be mistaken for "no pending title".
            elif index_title is None:
                _body.append(element)
        elif index_title is not None:
            title = convert_list_omxl_paragraph_to_text(body[index_title:i], document)
            _body += [convert_text_to_oxml_paragraph(title), element]
            index_title = None

    return _body

In [25]:
def merge(body, document):
	"""Render the body elements as one plain-text string.

	Paragraphs are emitted as their normalised text followed by a blank
	line, unless a title is pending. A paragraph matching ``regex_title``
	starts a pending title and is not emitted on its own. Any non-paragraph
	element is treated as a table: it is rendered with ``to_string`` under
	the text joined from the pending title up to the table, followed by a
	newline, and the pending title is cleared. A table with no pending
	title is rendered with an empty title.
	"""
	chunks = []
	index_title = None

	for i, element in enumerate(body):
		if isinstance(element, omxl_paragraph):
			text = convert_omxl_paragraph_to_text(element, document)
			if regex_title(text):
				index_title = i

			# Compare with None explicitly: a title at index 0 is still a
			# pending title and must not also be emitted as plain text.
			if index_title is None:
				chunks.append(f"{text}\n\n")

		else:
			if index_title is None:
				# Slicing with body[None:i] would take every preceding
				# element as the title; use an empty title instead.
				title = ''
			else:
				title = convert_list_omxl_paragraph_to_text(body[index_title:i], document)
			table = convert_omxl_table_to_table(element, document)
			chunks.append(f"{to_string(title, table)}\n")
			index_title = None

	return ''.join(chunks)

In [26]:
# Load the source document; the path literal reads "path_to_document.docx" in Russian, so point it at a real file.
document = open_docx('путь_к_документу.docx')

In [27]:
# Extract the paragraph/table elements and normalise them with correct_body.
body = correct_body(get_body(document), document)

In [28]:
# Render the normalised body as plain text; print() is used so the newlines in the result are rendered.
print(merge(body, document))