In [1]:
from pptx import Presentation
import json
import pprint
from collections import OrderedDict

In [2]:
def extract_all_text_by_slide(pptx_path):
    """Collect the non-empty shape texts of a presentation, grouped by slide.

    Args:
        pptx_path: Location of the presentation; passed straight to
            ``Presentation``.

    Returns:
        A list with one dict per slide that yielded at least one text, in
        slide order. Each dict has ``"slide_number"`` (1-based position of
        the slide in the deck) and ``"contents"`` (the stripped texts of the
        slide's shapes, in shape order). Shapes without a text frame and
        shapes whose text is empty after stripping are ignored; slides left
        with no text are omitted entirely.
    """
    deck = Presentation(pptx_path)
    extracted = []

    for position, slide in enumerate(deck.slides, start=1):
        # Only shapes that report a text frame are asked for their text.
        stripped = (
            shape.text.strip() for shape in slide.shapes if shape.has_text_frame
        )
        texts = [chunk for chunk in stripped if chunk]
        if not texts:
            continue
        extracted.append({"slide_number": position, "contents": texts})

    return extracted

In [3]:
# Source deck and the per-slide text extracted from it.
ppt_path = "Charters.pptx"
slides_json = extract_all_text_by_slide(ppt_path)

# Persist the raw extraction as indented UTF-8 JSON, keeping non-ASCII
# characters as-is instead of \u escapes.
with open("output_charters_completo.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(slides_json, indent=2, ensure_ascii=False))


In [13]:
import json
from collections import OrderedDict

# Load the original JSON (list of per-slide dicts) back from disk
with open("output_charters_completo.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Extract blocks delimited by heading entries
def extract_block(normalized, contents, start_title, end_title=None):
    """Join the entries of ``contents`` that lie between two heading entries.

    Args:
        normalized: List in which the headings are looked up by exact match.
            The caller in this file builds it as the lower-cased, stripped
            copy of ``contents``, so indices are shared between the two.
        contents: The original text entries the result is sliced from.
        start_title: Heading that opens the block. It is lower-cased before
            the lookup and is not part of the result.
        end_title: Heading that closes the block (lower-cased before the
            lookup, excluded from the result). It is searched only after the
            opening heading. When falsy, the block runs to the end of
            ``contents``.

    Returns:
        The stripped, non-empty entries of the block joined with newlines, or
        ``""`` when the opening heading is missing or the closing heading does
        not occur after it.
    """
    try:
        start_idx = normalized.index(start_title.lower()) + 1
        # Look for the closing heading only past the opening one, so an
        # earlier duplicate cannot produce an inverted (always empty) slice.
        end_idx = normalized.index(end_title.lower(), start_idx) if end_title else len(contents)
    except ValueError:
        # Only a missing heading is an expected outcome; any other error is a
        # real problem and is allowed to surface.
        return ""
    return "\n".join(x.strip() for x in contents[start_idx:end_idx] if x.strip())

# Extract blocks based on phrases found inside the entries
def extract_fallback_block(contents, start_phrase, stop_phrases=None):
    """Join the entries from the first one containing a phrase up to a stop phrase.

    Args:
        contents: Text entries to scan, in order.
        start_phrase: Substring that marks the first entry of the block. The
            match is case-sensitive and the matching entry is included in the
            result.
        stop_phrases: Optional iterable of substrings. The block ends just
            before the earliest entry, at or after the start entry, that
            contains any of them (case-insensitive). ``None`` or empty means
            the block runs to the end of ``contents``.

    Returns:
        The stripped, non-empty entries of the block joined with newlines, or
        ``""`` when no entry contains ``start_phrase``. Because the stop
        search begins at the start entry itself, a stop phrase occurring in
        that entry yields ``""`` as well.
    """
    start_idx = next((i for i, c in enumerate(contents) if start_phrase in c), None)
    if start_idx is None:
        return ""

    end_idx = len(contents)
    for stop in stop_phrases or ():
        needle = stop.lower()
        # Only entries before the current end can move the boundary earlier,
        # so the scan is limited to [start_idx, end_idx).
        hit = next(
            (i for i in range(start_idx, end_idx) if needle in contents[i].lower()),
            None,
        )
        if hit is not None:
            end_idx = hit
    return "\n".join(x.strip() for x in contents[start_idx:end_idx] if x.strip())

# Processing: turn each slide's flat list of text entries into named charter
# fields. Field order in every OrderedDict follows the assignment order below.
structured_slides = []

for slide in data:
    contents = slide["contents"]
    # Lower-cased, stripped copy used for exact heading lookups; it shares
    # indices with `contents`.
    normalized = [c.lower().strip() for c in contents]
    slide_struct = OrderedDict()

    # Title = last entry
    slide_struct["slide_number"] = slide["slide_number"]
    slide_struct["title"] = contents[-1].strip() if contents else ""

    # Problem statement = first entry
    slide_struct["problem_statement"] = contents[0].strip() if contents else ""

    # Duration + costs: three consecutive header entries followed by their
    # three values in the same order.
    slide_struct["duration"] = ""
    slide_struct["one_time_costs"] = ""
    slide_struct["run_rate_cost"] = ""
    # The range bound keeps i + 5 below the last entry, so the entry used as
    # the title is never read as a cost value.
    # NOTE(review): presumably deliberate - confirm.
    for i in range(len(contents) - 6):
        if (contents[i].strip().lower() == "duration" and
            contents[i + 1].strip().lower() == "one-time costs" and
            contents[i + 2].strip().lower() == "run rate cost"):
            slide_struct["duration"] = contents[i + 3].strip()
            slide_struct["one_time_costs"] = contents[i + 4].strip()
            slide_struct["run_rate_cost"] = contents[i + 5].strip()
            break

    # Goal statement: between the two headings, else fall back to the entry
    # that starts with the "Business Objective:" phrase.
    goal_text = extract_block(normalized, contents, "Goal Statement", "In Scope")
    if not goal_text:
        goal_text = extract_fallback_block(contents, "Business Objective:", ["In Scope", "Out of Scope", "Approach including phasing"])
    slide_struct["goal_statement"] = goal_text

    # In Scope = the entry immediately before the "Out of Scope" heading
    try:
        out_scope_idx = normalized.index("out of scope")
    except ValueError:
        slide_struct["in_scope"] = ""
    else:
        # When the heading is the very first entry there is no preceding
        # entry; index -1 would wrap around and return the title instead.
        in_scope_text = contents[out_scope_idx - 1].strip() if out_scope_idx > 0 else ""
        slide_struct["in_scope"] = in_scope_text

    # Out of Scope
    out_scope_block = extract_block(normalized, contents, "Out of Scope", "Approach including phasing")
    if not out_scope_block:
        out_scope_block = extract_fallback_block(contents, "Out of Scope", ["Phase 1", "Phase 2"])
    slide_struct["out_of_scope"] = out_scope_block

    # Approach
    approach_text = extract_block(normalized, contents, "Approach including phasing", "Benefits overview")
    if not approach_text:
        approach_text = extract_fallback_block(contents, "Phase 1", ["Benefits overview"])
    slide_struct["approach"] = approach_text

    # Benefits
    slide_struct["benefits"] = extract_block(normalized, contents, "Benefits overview & positive impact to qlik", "Dependencies and constraints")

    # Dependencies and constraints
    slide_struct["dependencies_constraints"] = extract_block(normalized, contents, "Dependencies and constraints", "Risks & negative impact to qlik")

    # Risks = everything after the heading up to the second-to-last entry
    # (the last entry is the title)
    try:
        start_idx = normalized.index("risks & negative impact to qlik") + 1
    except ValueError:
        slide_struct["risks"] = ""
    else:
        end_idx = len(contents) - 1
        block = contents[start_idx:end_idx]
        slide_struct["risks"] = "\n".join(x.strip() for x in block if x.strip())

    structured_slides.append(slide_struct)

# Save the result
with open("charters_SLIDES_FINAL.json", "w", encoding="utf-8") as f:
    json.dump(structured_slides, f, indent=2, ensure_ascii=False)

OrderedDict([('slide_number', 1),
             ('title', 'Program Management Automation'),
             ('problem_statement',
              'Although Qlik has dashboards like the Program Health Dashboard, '
              'program and project tracking is still managed across multiple, '
              'disconnected sources (Excel files, Teams chats, emails). This '
              'fragmentation causes inefficiencies, inconsistencies, and '
              'limited visibility into project health and progress. There is '
              'no centralized, automated repository, nor is there a fast way '
              'to query live program data.'),
             ('duration', '6 Months'),
             ('one_time_costs', 'None'),
             ('run_rate_cost', 'None'),
             ('goal_statement',
              'Business Objective: Build a centralized, structured database of '
              'active and historical programs/projects, and develop a '
              'conversational assistant using Qlik