<a href="https://colab.research.google.com/github/dioschuarz/ai_utils/blob/main/HTML_to_PPTX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# -*- coding: utf-8 -*-
"""
Este script √© um conversor universal aprimorado que transforma um arquivo HTML
contendo m√∫ltiplos slides em uma apresenta√ß√£o do PowerPoint (.pptx) TOTALMENTE EDIT√ÅVEL.

Como funciona:
1.  Usa BeautifulSoup para dividir o HTML principal em m√∫ltiplos arquivos HTML tempor√°rios, um para cada slide.
2.  Para cada slide tempor√°rio, utiliza um navegador 'headless' (Selenium) para renderiz√°-lo e
    extrair com alta fidelidade a posi√ß√£o, estilo e conte√∫do de cada elemento.
3.  Coleta os dados de todos os slides processados.
4.  Usa a biblioteca python-pptx para montar uma √∫nica apresenta√ß√£o do PowerPoint,
    recriando cada elemento em seu respectivo slide.

O resultado √© uma apresenta√ß√£o que se parece com o HTML, mas √© 100% modific√°vel.
"""

# Step 0: dependency installation and environment setup.
# Every package named in the install command below is probed here, so that a
# missing `lxml` (needed by the BeautifulSoup 'lxml' parser) or `requests`
# triggers the installer instead of failing later in the script.
try:
    import selenium
    import pptx
    from bs4 import BeautifulSoup
    import cairosvg
    import lxml
    import requests
except ImportError:
    print("Instalando bibliotecas necess√°rias: python-pptx, selenium, beautifulsoup4, lxml, cairosvg, requests...")
    import os
    os.system('pip install python-pptx selenium beautifulsoup4 lxml cairosvg requests > /dev/null')
    print("\nConfigurando o ambiente para o navegador (pode levar um minuto)...")
    os.system('apt-get update > /dev/null')
    os.system('apt install -y chromium-chromedriver > /dev/null')
    os.system('cp /usr/lib/chromium-browser/chromedriver /usr/bin')
    print("\nAmbiente pronto! Por favor, execute a c√©lula novamente para iniciar a convers√£o.")
    # Stop here: the freshly installed packages are only importable on the next run.
    exit()

import os
import re
import io
import shutil
import requests
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN, MSO_VERTICAL_ANCHOR
from pptx.enum.shapes import MSO_SHAPE
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# --- SETTINGS ---
# Change the name of your HTML input file here.
HTML_FILE_TO_CONVERT = 'index.html'
# ---------------------------------------------

OUTPUT_PPTX_FILENAME = 'index.pptx'

# Standard 16:9 slide size in inches, followed by the browser viewport in
# pixels that is mapped onto it (same 16:9 aspect ratio).
SLIDE_WIDTH_INCHES, SLIDE_HEIGHT_INCHES = 13.333, 7.5
VIEWPORT_WIDTH, VIEWPORT_HEIGHT = 1280, 720


# --- Fun√ß√µes Auxiliares de Convers√£o ---

def parse_rgba_color(rgba_string):
    """Convert a CSS ``rgb(...)`` / ``rgba(...)`` string to ``((r, g, b), alpha)``.

    Args:
        rgba_string: A colour string such as ``'rgba(255, 0, 0, 0.5)'`` or
            ``'rgb(1, 2, 3)'``. Non-string values (e.g. ``None``) are accepted.

    Returns:
        A tuple ``((r, g, b), alpha)`` where each channel is an ``int`` in
        0..255 and ``alpha`` is a ``float`` in 0.0..1.0.

        * Input that is not a string, or contains no number at all (for
          example ``'transparent'``), yields ``((0, 0, 0), 0.0)``.
        * Input with fewer than three numbers yields ``((0, 0, 0), 1.0)``.
        * A missing alpha component defaults to ``1.0``.
    """
    if not isinstance(rgba_string, str):
        return (0, 0, 0), 0.0

    parts = re.findall(r"\d+(?:\.\d+)?|\.\d+", rgba_string)
    if not parts:
        return (0, 0, 0), 0.0

    try:
        # Channels may be fractional; round and clamp them so the result is
        # always a valid 8-bit triple. Unpacking fewer than three values
        # raises ValueError, which is handled below.
        r, g, b = (min(255, max(0, int(round(float(p))))) for p in parts[:3])
        alpha = float(parts[3]) if len(parts) > 3 else 1.0
    except ValueError:
        return (0, 0, 0), 1.0

    return (r, g, b), min(1.0, max(0.0, alpha))

def parse_font_family(font_family_string):
    """Return the first font name of a CSS ``font-family`` list, without quotes."""
    primary, _, _ = font_family_string.partition(',')
    # Code points 34 and 39 are the double and single quote characters.
    return primary.strip().translate({34: None, 39: None})


def get_elements_data_from_html(file_path):
    """Render ONE HTML file in headless Chrome and extract its element data.

    The page is opened through Selenium. A screenshot is taken while the
    direct children of ``<body>`` are hidden (giving a background-only image),
    and then a JavaScript snippet collects geometry, computed styles and text
    runs for the elements inside ``<body>``.

    Args:
        file_path: Path to the HTML file to render.

    Returns:
        ``{'elements': [...], 'background_png': bytes}`` on success, or
        ``None`` when the file does not exist or the extraction script fails.
        Errors raised while starting the browser, loading the page or taking
        the screenshot propagate to the caller.
    """
    if not os.path.exists(file_path):
        print(f"‚ùå ERRO: Arquivo tempor√°rio '{file_path}' n√£o encontrado.")
        return None

    from pathlib import Path

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument(f'--window-size={VIEWPORT_WIDTH},{VIEWPORT_HEIGHT}')

    js_script = """
    const getFontSizeFromClass = (element) => {
        if (element && typeof element.className === 'string') {
            const match = element.className.match(/text-\\[(\\d+(\\.\\d+)?)px\\]/);
            if (match && match[1]) return match[1] + 'px';
        }
        return null;
    };

    const container = document.body;
    if (!container) return [];
    const elements = Array.from(container.getElementsByTagName('*'));
    const elementData = [];
    const processedElements = new Set();

    for (const el of elements) {
        if (processedElements.has(el)) continue;

        const rect = el.getBoundingClientRect();
        if (rect.width < 1 || rect.height < 1 || el.offsetParent === null) continue;

        const style = window.getComputedStyle(el);
        const tagName = el.tagName.toLowerCase();

        let data = {
            tag: tagName, x: rect.x, y: rect.y, width: rect.width, height: rect.height,
            background_color: style.backgroundColor, border_radius: style.borderRadius,
            border_top_width: style.borderTopWidth, border_top_color: style.borderTopColor,
            border_right_width: style.borderRightWidth, border_right_color: style.borderRightColor,
            border_bottom_width: style.borderBottomWidth, border_bottom_color: style.borderBottomColor,
            border_left_width: style.borderLeftWidth, border_left_color: style.borderLeftColor,
            padding_top: style.paddingTop, padding_right: style.paddingRight,
            padding_bottom: style.paddingBottom, padding_left: style.paddingLeft,
            text_align: style.textAlign, align_items: style.alignItems,
            display: style.display, line_height: style.lineHeight,
            z_index: style.zIndex, svg_content: null, text_runs: [],
            is_bullet_item: tagName === 'li' || style.display === 'list-item'
        };

        if (tagName === 'svg') {
            data.svg_content = el.outerHTML;
            elementData.push(data);
            processedElements.add(el);
        } else if (tagName === 'img') {
            data.tag = 'img';
            data.src = el.src;
            elementData.push(data);
            processedElements.add(el);
        }
        else {
            let runs = [];
            let isContainerOnly = el.children.length > 0 && Array.from(el.children).every(c => ['block', 'flex', 'grid'].includes(window.getComputedStyle(c).display));

            for (const node of el.childNodes) {
                if (node.nodeType === 3 && node.textContent.trim()) {
                    runs.push({
                        text: node.textContent.trim().replace(/\\s+/g, ' '),
                        color: style.color,
                        font_size: getFontSizeFromClass(el) || style.fontSize,
                        font_weight: style.fontWeight, font_family: style.fontFamily
                    });
                } else if (node.nodeType === 1) {
                    const childStyle = window.getComputedStyle(node);
                    if (childStyle.display.includes('inline') && node.textContent.trim()) {
                         runs.push({
                            text: node.textContent.trim().replace(/\\s+/g, ' '),
                            color: childStyle.color,
                            font_size: getFontSizeFromClass(node) || childStyle.fontSize,
                            font_weight: childStyle.fontWeight, font_family: childStyle.fontFamily
                        });
                        node.querySelectorAll('*').forEach(n => processedElements.add(n));
                        processedElements.add(node);
                    }
                }
            }

            const hasVisibleBg = !data.background_color.includes('rgba(0, 0, 0, 0)');
            const hasVisibleBorder = ['Top', 'Right', 'Bottom', 'Left'].some(side => parseFloat(style['border' + side + 'Width']) > 0 && !style['border' + side + 'Color'].includes('rgba(0, 0, 0, 0)'));

            if ((runs.length > 0 && !isContainerOnly) || hasVisibleBg || hasVisibleBorder) {
                data.text_runs = runs;
                elementData.push(data);
            }
            processedElements.add(el);
        }
    }
    return elementData;
    """

    # Sets the inline `visibility` of every direct child of <body> in a single
    # browser round-trip; the value is passed in as arguments[0].
    set_visibility_js = (
        "for (const child of Array.from(document.body.children)) {"
        " child.style.visibility = arguments[0]; }"
    )

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # as_uri() percent-encodes the path, so spaces, '#' and Windows drive
        # letters yield a valid file:// URL.
        driver.get(Path(os.path.abspath(file_path)).as_uri())
        # NOTE(review): an implicit wait only applies to element lookups, so it
        # does not delay the scripts below; kept from the original code.
        driver.implicitly_wait(3)

        driver.execute_script("document.documentElement.style.overflow = 'hidden';")

        # Hide the body's children so the screenshot contains only the background.
        driver.execute_script(set_visibility_js, 'hidden')
        background_png = driver.get_screenshot_as_png()
        driver.execute_script(set_visibility_js, 'visible')

        try:
            elements = driver.execute_script(js_script)
        except Exception as e:
            print(f"‚ùå ERRO ao extrair dados dos elementos: {e}")
            return None
        return {'elements': elements, 'background_png': background_png}
    finally:
        # Always release the browser, including when loading or the screenshot fails.
        driver.quit()

# Order in which the four CSS box sides are examined.
_BOX_SIDES = ('top', 'right', 'bottom', 'left')
# Upper bound, in seconds, for downloading a single <img> source.
_IMAGE_TIMEOUT_SECONDS = 10


def _px_to_float(value):
    """Parse a CSS pixel length such as ``'12.5px'``; return ``None`` if it is not numeric."""
    try:
        return float(str(value).replace('px', ''))
    except ValueError:
        return None


def _z_order(el):
    """Return the element's numeric z-index, treating ``'auto'`` or unparsable values as 0."""
    try:
        return int(el['z_index'])
    except (TypeError, ValueError):
        return 0


def _line_spacing(line_height, first_font_size):
    """Convert a computed CSS line-height into a line-spacing multiple (default 1.2)."""
    if not line_height or line_height == 'normal':
        return 1.2
    if 'px' in line_height:
        lh_px = _px_to_float(line_height)
        fs_px = _px_to_float(first_font_size)
        if lh_px is None or not fs_px or fs_px <= 0:
            return 1.2
        return lh_px / fs_px
    try:
        return float(line_height)
    except ValueError:
        return 1.2


def _fill_text_frame(tf, el, scale_x, scale_y):
    """Write the element's text runs, padding, spacing and alignment into ``tf``."""
    tf.clear()
    tf.word_wrap = True

    # CSS padding becomes the text-frame margin, scaled like the geometry.
    for attr, key, scale in (
        ('margin_top', 'padding_top', scale_y),
        ('margin_right', 'padding_right', scale_x),
        ('margin_bottom', 'padding_bottom', scale_y),
        ('margin_left', 'padding_left', scale_x),
    ):
        padding_px = _px_to_float(el.get(key, '0px'))
        if padding_px is not None:
            setattr(tf, attr, Inches(padding_px * scale))

    if 'center' in [el.get('align_items'), el.get('text_align')]:
        tf.vertical_anchor = MSO_VERTICAL_ANCHOR.MIDDLE
    else:
        tf.vertical_anchor = MSO_VERTICAL_ANCHOR.TOP

    text_runs = el['text_runs']
    p = tf.paragraphs[0]
    p.text = ""
    p.space_before = Pt(0)
    p.space_after = Pt(0)
    p.line_spacing = _line_spacing(el.get('line_height'), text_runs[0].get('font_size'))

    if el.get('is_bullet_item'):
        p.level = 0

    # Points per CSS pixel at the slide's scale (0.75 for 1280px across 13.333in).
    pt_per_px = 72.0 * scale_y
    last_index = len(text_runs) - 1
    for k, run_data in enumerate(text_runs):
        run = p.add_run()
        # Runs are joined by a single space, as in the rendered HTML.
        run.text = run_data['text'] + (' ' if k < last_index else '')
        font = run.font
        font_color, _ = parse_rgba_color(run_data['color'])
        font.color.rgb = RGBColor(*font_color)
        size_px = _px_to_float(run_data['font_size'])
        if size_px:
            font.size = Pt(size_px * pt_per_px)
        weight = run_data['font_weight']
        font.bold = int(weight) >= 600 if weight.isnumeric() else 'bold' in weight
        font.name = parse_font_family(run_data.get('font_family', 'Arial'))

    align_map = {'left': PP_ALIGN.LEFT, 'center': PP_ALIGN.CENTER, 'right': PP_ALIGN.RIGHT, 'justify': PP_ALIGN.JUSTIFY}
    p.alignment = align_map.get(el.get('text_align'), PP_ALIGN.LEFT)


def _add_element(slide, el, scale_x, scale_y):
    """Create the picture, text box or rectangle that represents one extracted element."""
    left = Inches(el['x'] * scale_x)
    top = Inches(el['y'] * scale_y)
    width = Inches(el['width'] * scale_x)
    height = Inches(el['height'] * scale_y)

    if el['svg_content']:
        png_bytes = cairosvg.svg2png(bytestring=el['svg_content'].encode('utf-8'))
        slide.shapes.add_picture(io.BytesIO(png_bytes), left, top, width, height)
        return

    if el.get('tag') == 'img' and el.get('src'):
        try:
            response = requests.get(el['src'], timeout=_IMAGE_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # Best effort: an unreachable image must not abort the slide.
            print(f"AVISO: imagem '{el['src']}' ignorada: {exc}")
            return
        if response.status_code == 200:
            slide.shapes.add_picture(io.BytesIO(response.content), left, top, width, height)
        return

    text_runs = el['text_runs']
    background = el.get('background_color', '')
    has_visible_bg = 'rgba(0, 0, 0, 0)' not in background
    border_widths_px = [_px_to_float(el.get(f'border_{side}_width', '0px')) or 0.0 for side in _BOX_SIDES]
    has_visible_border = any(w > 0 for w in border_widths_px)

    if not (text_runs or has_visible_bg or has_visible_border):
        return

    if text_runs:
        shape = slide.shapes.add_textbox(left, top, width, height)
    else:
        shape = slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, left, top, width, height)

    bg_color, bg_alpha = parse_rgba_color(background) if has_visible_bg else ((0, 0, 0), 0.0)
    if bg_alpha > 0.05:
        shape.fill.solid()
        shape.fill.fore_color.rgb = RGBColor(*bg_color)
    else:
        # Also covers near-transparent backgrounds, so an autoshape never
        # keeps its default theme fill.
        shape.fill.background()

    if has_visible_border:
        # A shape has a single outline, so the widest CSS border side is used for it.
        max_width = max(border_widths_px)
        side = _BOX_SIDES[border_widths_px.index(max_width)]
        color, _ = parse_rgba_color(el.get(f'border_{side}_color') or 'rgb(0, 0, 0)')
        shape.line.color.rgb = RGBColor(*color)
        shape.line.width = Pt(max_width * 72.0 * scale_y)
    else:
        shape.line.fill.background()

    if text_runs:
        _fill_text_frame(shape.text_frame, el, scale_x, scale_y)


def add_slide_to_presentation(prs, slide_data):
    """Append one slide to ``prs`` and rebuild the extracted elements on it.

    The slide uses layout index 6 and receives the background screenshot as a
    full-size picture. Elements are then added in ascending z-index, then
    top-to-bottom, then left-to-right order.

    Pixel geometry is converted to inches with the ratio slide size / viewport
    size. Font sizes, paddings and border widths use that same ratio, so text
    keeps its proportion to the box that contains it.

    Args:
        prs: The python-pptx presentation to extend.
        slide_data: Dict with ``'elements'`` (list of element dicts) and
            ``'background_png'`` (PNG bytes). The list is not modified.

    An element that cannot be built is reported on stdout and skipped, so a
    single bad element does not abort the slide.
    """
    print(f"‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...")
    slide_layout = prs.slide_layouts[6]
    slide = prs.slides.add_slide(slide_layout)

    slide.shapes.add_picture(io.BytesIO(slide_data['background_png']), 0, 0, width=prs.slide_width, height=prs.slide_height)

    scale_x = prs.slide_width.inches / VIEWPORT_WIDTH
    scale_y = prs.slide_height.inches / VIEWPORT_HEIGHT

    # sorted() leaves the caller's list untouched.
    elements = sorted(slide_data['elements'], key=lambda el: (_z_order(el), el['y'], el['x']))

    for el in elements:
        try:
            _add_element(slide, el, scale_x, scale_y)
        except Exception as exc:
            # Per-element boundary: report and continue with the next element.
            print(f"AVISO: elemento <{el.get('tag', '?')}> ignorado: {exc}")

def main():
    """Split the HTML into slides, process each one and build the final .pptx.

    Steps:
        1. Parse ``HTML_FILE_TO_CONVERT`` and write one temporary HTML file per
           ``<section class="slide">`` into ``temp_slides/``.
        2. Extract element data from every temporary file.
        3. Build the presentation and save it as ``OUTPUT_PPTX_FILENAME``.
        4. Remove the temporary directory, including when a step above raises.

    Returns early, after printing an error, when the input file is missing or
    contains no slide sections.
    """
    from pathlib import Path

    if not os.path.exists(HTML_FILE_TO_CONVERT):
        print(f"‚ùå ERRO: Arquivo principal '{HTML_FILE_TO_CONVERT}' n√£o encontrado.")
        return

    # 1. Split the HTML into temporary slides
    print("üî™ Dividindo o HTML em slides individuais...")
    with open(HTML_FILE_TO_CONVERT, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml')

    head_tag = soup.find('head')
    slides_content = soup.find_all('section', class_='slide')
    if not slides_content:
        print("‚ùå ERRO: Nenhuma tag <section class='slide'> encontrada no HTML.")
        return

    # Only the inner HTML of <head> is reused. This avoids nesting a second
    # <head> tag and avoids writing the text "None" when there is no <head>.
    head_html = head_tag.decode_contents() if head_tag is not None else ''
    if head_tag is None or head_tag.find('base') is None:
        # The temporary files live in a sub-directory, so relative URLs
        # (stylesheets, images) must keep resolving against the original
        # file's directory.
        base_href = Path(os.path.abspath(HTML_FILE_TO_CONVERT)).parent.as_uri() + '/'
        head_html = f'<base href="{base_href}">' + head_html

    temp_dir = 'temp_slides'
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    try:
        temp_files = []
        for i, slide_content in enumerate(slides_content):
            # NOTE(review): no doctype is written, so the browser renders these
            # files in quirks mode -- confirm that this is intended.
            temp_html = f"<html><head>{head_html}</head><body style='margin:0; padding:0;'>{slide_content}</body></html>"
            temp_file_path = os.path.join(temp_dir, f"slide_{i}.html")
            with open(temp_file_path, 'w', encoding='utf-8') as f:
                f.write(temp_html)
            temp_files.append(temp_file_path)

        print(f"‚úÖ {len(temp_files)} slides foram extra√≠dos para arquivos tempor√°rios.")

        # 2. Process each slide and collect its data
        all_slides_data = []
        for i, file_path in enumerate(temp_files):
            print(f"\nüö¢ Processando Slide {i + 1}/{len(temp_files)}...")
            slide_data = get_elements_data_from_html(file_path)
            if slide_data:
                all_slides_data.append(slide_data)

        # 3. Build the final presentation from the collected data
        if all_slides_data:
            print(f"\nüõ†Ô∏è  Construindo apresenta√ß√£o final com {len(all_slides_data)} slides...")
            prs = Presentation()
            prs.slide_width = Inches(SLIDE_WIDTH_INCHES)
            prs.slide_height = Inches(SLIDE_HEIGHT_INCHES)

            for slide_data in all_slides_data:
                add_slide_to_presentation(prs, slide_data)

            prs.save(OUTPUT_PPTX_FILENAME)
            print(f"\nüéâ Apresenta√ß√£o edit√°vel '{OUTPUT_PPTX_FILENAME}' criada com sucesso!")
    finally:
        # 4. Remove the temporary files, even when a step above failed
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            print("üóëÔ∏è  Arquivos tempor√°rios foram limpos.")


if __name__ == '__main__':
    main()



üî™ Dividindo o HTML em slides individuais...
‚úÖ 13 slides foram extra√≠dos para arquivos tempor√°rios.

üö¢ Processando Slide 1/13...

üö¢ Processando Slide 2/13...

üö¢ Processando Slide 3/13...

üö¢ Processando Slide 4/13...

üö¢ Processando Slide 5/13...

üö¢ Processando Slide 6/13...

üö¢ Processando Slide 7/13...

üö¢ Processando Slide 8/13...

üö¢ Processando Slide 9/13...

üö¢ Processando Slide 10/13...

üö¢ Processando Slide 11/13...

üö¢ Processando Slide 12/13...

üö¢ Processando Slide 13/13...

üõ†Ô∏è  Construindo apresenta√ß√£o final com 13 slides...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta√ß√£o...
‚û°Ô∏è  Adicionando slide √† apresenta