In [None]:
!pip install python-docx pylatex


In [2]:
from docx import Document
from pylatex import Document as LatexDocument, Section, Subsection, Command, Figure, Table, Tabular, NoEscape
import os
import subprocess
import re

def read_word_file(file_path):
    """Open the Word file at *file_path* and return the resulting ``Document``."""
    return Document(file_path)

# Common typographic punctuation mapped to its plain-ASCII equivalent.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark / apostrophe
    "\u201c": '"',    # left double quotation mark
    "\u201d": '"',    # right double quotation mark
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2026": "...",  # horizontal ellipsis
})


def clean_text(text):
    """Return *text* with unwanted Unicode characters removed.

    ASCII and Latin-1 characters (U+00A0-U+00FF, which covers accented
    letters such as ``ç`` or ``ã``) are preserved. Curly quotes, en/em
    dashes and the ellipsis character are converted to ASCII. Every
    remaining run of other characters is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    import unicodedata

    # Compose "letter + combining accent" pairs first so they end up as a
    # single Latin-1 code point instead of having the accent stripped.
    composed = unicodedata.normalize("NFC", text)
    ascii_punct = composed.translate(_TYPOGRAPHIC_TO_ASCII)
    # NOTE(review): keeping Latin-1 assumes the generated .tex is compiled
    # with inputenc/fontenc loaded, as the captured pdflatex log suggests.
    return re.sub(r'[^\x00-\x7F\xA0-\xFF]+', ' ', ascii_punct)

def convert_paragraphs(doc, latex_doc):
    """Append the cleaned text of every non-empty paragraph to *latex_doc*.

    Each paragraph in ``doc.paragraphs`` is passed through ``clean_text``;
    paragraphs whose cleaned text is empty are skipped. A ``"\\n"`` string
    is appended after each paragraph that is kept.
    """
    for paragraph in doc.paragraphs:
        text = clean_text(paragraph.text)
        if not text:
            continue
        for piece in (text, "\n"):
            latex_doc.append(piece)

def convert_figures(doc, latex_doc, image_dir='images'):
    """Export the document's inline pictures and add them as LaTeX figures.

    For every inline shape whose type is 3, the embedded image bytes are
    written to *image_dir* and a ``Figure`` referencing that file is added
    to *latex_doc* with a placeholder caption.

    Args:
        doc: The Word document whose ``inline_shapes`` are scanned.
        latex_doc: The LaTeX container that receives the figures.
        image_dir: Directory the image files are written to; it is created
            if it does not exist.
    """
    for shape in doc.inline_shapes:
        # NOTE(review): 3 is presumably WD_INLINE_SHAPE.PICTURE - confirm.
        if shape.type != 3:
            continue
        # The blip's "embed" attribute is a relationship id, not a path.
        rel_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
        image_part = doc.part.related_parts[rel_id]
        # Name the file after the part inside the package (e.g. image1.png).
        # ImagePart.filename is a generic "image.<ext>" for parts loaded from
        # an existing .docx, which would make every picture overwrite the
        # previous one.
        image_name = os.path.basename(str(image_part.partname))
        # Without this, open(..., 'wb') fails when the directory is missing.
        os.makedirs(image_dir, exist_ok=True)
        image_filename = os.path.join(image_dir, image_name)
        with open(image_filename, 'wb') as img_file:
            img_file.write(image_part.blob)
        with latex_doc.create(Figure(position='h!')) as pic:
            pic.add_image(image_filename, width=NoEscape(r'0.8\textwidth'))
            pic.add_caption("Caption for image")

def convert_tables(doc, latex_doc):
    """Render every table in *doc* as a bordered LaTeX ``tabular``.

    Each table becomes a ``Table`` float containing a ``Tabular`` with one
    centred column per cell of the widest row, vertical rules between
    columns and a horizontal rule around every row. Cell text is passed
    through ``clean_text``; rows shorter than the widest row are padded
    with empty strings. Tables without any cells are skipped.

    Args:
        doc: The Word document whose ``tables`` are converted.
        latex_doc: The LaTeX container that receives the tables.
    """
    for table in doc.tables:
        # Clean each row's cell text up front so the widest row is measured
        # from the same data that is emitted.
        rows_text = [
            [clean_text(cell.text) for cell in row.cells]
            for row in table.rows
        ]
        # default=0 keeps max() from raising ValueError on a table with no rows.
        max_cols = max((len(cells) for cells in rows_text), default=0)
        if max_cols == 0:
            continue
        column_format = "|".join(["c"] * max_cols)
        with latex_doc.create(Table(position='h!')) as tab:
            with tab.create(Tabular(f'|{column_format}|')) as data_table:
                for cells in rows_text:
                    data_table.add_hline()
                    # Pad short rows so every row has max_cols entries.
                    cells.extend([""] * (max_cols - len(cells)))
                    data_table.add_row(cells)
                data_table.add_hline()

def create_latex_document(doc):
    """Build and return a LaTeX document from the Word document *doc*.

    Inside a single section titled "Document Content", all paragraphs are
    added first, then all figures, then all tables.
    """
    tex = LatexDocument()
    converters = (convert_paragraphs, convert_figures, convert_tables)
    with tex.create(Section('Document Content')):
        for convert in converters:
            convert(doc, tex)
    return tex

def main(file_path='Busca de Arquivos Interessantes.docx', latex_filename='output'):
    """Convert a Word file to LaTeX and compile it with pdflatex.

    Args:
        file_path: Path of the .docx file to convert.
        latex_filename: Base name (without extension) of the generated
            .tex file that is passed to pdflatex.

    The pdflatex output is printed on success. If compilation fails or
    pdflatex cannot be started, an error message is printed instead of
    raising.
    """
    doc = read_word_file(file_path)
    latex_doc = create_latex_document(doc)
    latex_doc.generate_tex(latex_filename)

    # Compile the LaTeX file using pdflatex
    command = ['pdflatex', '--interaction=nonstopmode', f'{latex_filename}.tex']
    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # Raised when the pdflatex executable itself cannot be launched.
        print("Error during LaTeX compilation")
        print("pdflatex executable not found; is a TeX distribution installed and on PATH?")
    except subprocess.CalledProcessError as e:
        print("Error during LaTeX compilation")
        print(e.output.decode('latin-1'))  # Using 'latin-1' to handle any non-UTF-8 characters
    else:
        # errors='replace' keeps a stray non-UTF-8 byte in the log (e.g. in a
        # file path) from raising UnicodeDecodeError after a successful build.
        print(result.stdout.decode('utf-8', errors='replace'))


if __name__ == "__main__":
    main()


This is pdfTeX, Version 3.141592653-2.6-1.40.26 (MiKTeX 24.4) (preloaded format=pdflatex.fmt)
 restricted \write18 enabled.
entering extended mode
(output.tex
LaTeX2e <2024-06-01> patch level 1
L3 programming layer <2024-05-27>
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/base\article.cls
Document Class: article 2024/02/08 v1.4n Standard LaTeX document class
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/base\size10.clo))
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/base\fontenc.sty)
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/base\inputenc.sty)
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/lm\lmodern.sty)
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/base\textcomp.sty)
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/lastpage\lastpage.sty
(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/lastpage\lastpage2e.sty

(C:\Users\wagne\AppData\Local\Programs\MiKTeX\tex/latex/lastpage\lastpagemodern
.sty) 
)) (C:\Users