# Previous attempt problem

In the previous attempt, I've found out 2 problems. </br></br>
The first one is related to the hierarchy of the PowerPoint. The hierarchy is like slides->layouts->master. So if the user changes something from the PowerPoint template, the change will be marked down in the slides folder (size, font, bold, etc). But if the user didn’t touch anything and use the default font and size of title, subtitle, content, etc, those things lie in either layouts or masters folder. </br></br>
Knowing the issue, I have approach and create a trace situation in my code to layouts or masters folder, however, it is still not working.</br></br>
--> Therefore, in this second attempt I will try to fix that issue.

# This project attemptation

In [5]:
from lxml import etree
import os, glob

ns = {
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'p': 'http://schemas.openxmlformats.org/presentationml/2006/main'
}

def extract_text_styles(tree, slide_path, ppt_root):
    for shape in tree.xpath('//p:sp', namespaces=ns):
        text_nodes = shape.xpath('.//a:t', namespaces=ns)
        text_content = ''.join(etree.tostring(n, method='text', encoding='unicode') for n in text_nodes)
        if not text_content or not text_content.strip():
            continue

        font = shape.xpath('.//a:rPr//a:latin/@typeface', namespaces=ns)
        size = shape.xpath('.//a:rPr/@sz', namespaces=ns)
        bold = shape.xpath('.//a:rPr/@b', namespaces=ns)
        italic = shape.xpath('.//a:rPr/@i', namespaces=ns)
        underline = shape.xpath('.//a:rPr/@u', namespaces=ns)

        ph_type = shape.xpath('.//p:ph/@type', namespaces=ns)
        ph_idx  = shape.xpath('.//p:ph/@idx', namespaces=ns)
        para_level = get_paragraph_level(shape)

        if not font or not size or not bold or not italic or not underline:
            ph_type_str = ph_type[0] if ph_type else None
            ph_idx_str  = ph_idx[0] if ph_idx else None
            layout_font, layout_size, layout_bold, layout_italic, layout_underline = trace_to_layout_master(
                slide_path, ppt_root, ph_type_str, ph_idx_str, para_level
            )
            if layout_font and isinstance(layout_font, str) and layout_font.startswith('+'):
                layout_font = resolve_theme_font(ppt_root, layout_font)

            font      = font or ([layout_font] if layout_font else [])
            size      = size or ([layout_size] if layout_size else [])
            bold      = bold or ([layout_bold] if layout_bold is not None else [])
            italic    = italic or ([layout_italic] if layout_italic is not None else [])
            underline = underline or ([layout_underline] if layout_underline else [])

        ft=first_nonempty(font)
        sz=first_nonempty(size)
        bd=first_nonempty(bold)
        it=first_nonempty(italic)
        ul=first_nonempty(underline)

        print(
            f"Text: {preview(text_content, 80)} | "
            f"Font: {ft or 'N/A'} | "
            f"Size: {as_pt(sz) if sz is not None else 'N/A'} pt | "
            f"Bold: {as_bool01(bd) if bd is not None else 'N/A'} | "
            f"Italic: {as_bool01(it) if it is not None else 'N/A'} | "
            f"Underline: {ul or 'N/A'} | "
            f"Placeholder: {ph_type[0] if ph_type else 'Non-placeholder'} | "
            f"Level: {para_level}"
        )

def get_paragraph_level(shape):
    pPr = shape.xpath('.//a:pPr/@lvl', namespaces=ns)
    return int(pPr[0]) if pPr else 0

def trace_to_layout_master(slide_path, ppt_root, ph_type, ph_idx, para_level):
    font = size = bold = italic = underline = None
    layout_path = get_linked_file(slide_path, "slideLayout", ppt_root)
    if layout_path:
        layout_tree = etree.parse(layout_path)
        l_font, l_size, l_bold, l_italic, l_underline = find_placeholder_style(layout_tree, ph_type, ph_idx, para_level)
        font      = font or l_font
        size      = size or l_size
        bold      = bold if bold is not None else l_bold
        italic    = italic if italic is not None else l_italic
        underline = underline or l_underline

    master_path = get_linked_file(layout_path, "slideMaster", ppt_root) if layout_path else None
    if not master_path:
        master_path = get_linked_file(slide_path, "slideMaster", ppt_root)

    if master_path:
        master_tree = etree.parse(master_path)
        m_font, m_size, m_bold, m_italic, m_underline = find_placeholder_style(master_tree, ph_type, ph_idx, para_level)
        font      = font or m_font
        size      = size or m_size
        bold      = bold if bold is not None else m_bold
        italic    = italic if italic is not None else m_italic
        underline = underline or m_underline

        if not font:
            t_font, *_ = find_text_styles_in_master(master_tree, ph_type, para_level)
            font = t_font or font

    return font, size, bold, italic, underline

def find_placeholder_style(tree, ph_type, ph_idx, para_level):
    candidates = []
    xps = []
    if ph_type and ph_idx:
        xps.append(f'//p:sp[.//p:ph[@type="{ph_type}" and @idx="{ph_idx}"]]')
    if ph_type:
        xps.append(f'//p:sp[.//p:ph[@type="{ph_type}"]]')
    if ph_idx:
        xps.append(f'//p:sp[.//p:ph[@idx="{ph_idx}"]]')
    xps.extend([
        '//p:sp[.//p:ph[@type="body"]]',
        '//p:sp[.//p:ph[@type="ctrTitle"]]',
        '//p:sp[.//p:ph[@type="title"]]',
        '//p:sp[.//p:ph]'
    ])
    for xp in xps:
        nodes = tree.xpath(xp, namespaces=ns)
        if nodes:
            candidates = nodes
            break

    if not candidates:
        return find_text_styles_in_master(tree, ph_type, para_level)

    lvl_xpath = f'.//a:lstStyle/a:lvl{para_level + 1}pPr/a:defRPr'
    for sp in candidates:
        rPr = sp.xpath(lvl_xpath, namespaces=ns)
        if rPr:
            vals = extract_from_rPr(rPr[0]);  # tuple
            if any(vals): return vals

        rPr = sp.xpath('.//a:defRPr', namespaces=ns)
        if rPr:
            vals = extract_from_rPr(rPr[0])
            if any(vals): return vals

        rPr = sp.xpath('.//a:pPr/a:defRPr', namespaces=ns)
        if rPr:
            vals = extract_from_rPr(rPr[0])
            if any(vals): return vals

    return find_text_styles_in_master(tree, ph_type, para_level)

def find_text_styles_in_master(tree, ph_type, para_level):
    style_map = {
        'title': 'titleStyle',
        'ctrTitle': 'titleStyle',
        'subTitle': 'bodyStyle',
        'body': 'bodyStyle',
        None: 'bodyStyle'
    }
    style_name = style_map.get(ph_type, 'bodyStyle')

    lvl_xpath = f'//p:txStyles/p:{style_name}/a:lvl{para_level + 1}pPr/a:defRPr'
    rPr_list = tree.xpath(lvl_xpath, namespaces=ns)
    if rPr_list:
        return extract_from_rPr(rPr_list[0])

    default_xpath = f'//p:txStyles/p:{style_name}//a:defRPr'
    rPr_list = tree.xpath(default_xpath, namespaces=ns)
    if rPr_list:
        return extract_from_rPr(rPr_list[0])

    any_rPr = tree.xpath('//p:txStyles//a:defRPr', namespaces=ns)
    if any_rPr:
        return extract_from_rPr(any_rPr[0])

    return None, None, None, None, None

def extract_from_rPr(rPr):
    font = size = bold = italic = underline = None
    latin = rPr.xpath('./a:latin/@typeface', namespaces=ns)
    if latin:
        font = latin[0]
    else:
        buFont = rPr.xpath('./a:buFont/@typeface', namespaces=ns)
        if buFont:
            font = buFont[0]
    size = rPr.get('sz')
    bold = rPr.get('b')
    italic = rPr.get('i')
    underline = rPr.get('u')
    return font, size, bold, italic, underline

def get_theme_path(ppt_root):
    """
    Use ppt/_rels/presentation.xml.rels to find the active theme (themeN.xml).
    Fallback to ppt/theme/theme1.xml if needed.
    """
    pres_rels = os.path.join(ppt_root, "_rels", "presentation.xml.rels")
    if os.path.exists(pres_rels):
        rel_tree = etree.parse(pres_rels)
        targets = rel_tree.xpath(
            '//rel:Relationship[contains(@Type,"/theme")]/@Target',
            namespaces={'rel': 'http://schemas.openxmlformats.org/package/2006/relationships'}
        )
        if targets:
            path = os.path.normpath(os.path.join(ppt_root, targets[0]))
            if os.path.exists(path):
                return path

    # Fallback
    default = os.path.join(ppt_root, "theme", "theme1.xml")
    return default if os.path.exists(default) else None


def resolve_theme_font(ppt_root, theme_font_ref):
    theme_path = get_theme_path(ppt_root)
    if not theme_path:
        return theme_font_ref
    theme_tree = etree.parse(theme_path)
    mapping = {
        '+mn-lt': '//a:minorFont/a:latin/@typeface',
        '+mj-lt': '//a:majorFont/a:latin/@typeface',
        '+mn-ea': '//a:minorFont/a:ea/@typeface',
        '+mj-ea': '//a:majorFont/a:ea/@typeface',
        '+mn-cs': '//a:minorFont/a:cs/@typeface',
        '+mj-cs': '//a:majorFont/a:cs/@typeface',
    }
    xp = mapping.get(theme_font_ref)
    if not xp:
        return theme_font_ref
    vals = theme_tree.xpath(xp, namespaces=ns)
    return vals[0] if vals else theme_font_ref

def get_linked_file(xml_path, link_type, ppt_root):
    """
    Resolve the first Relationship whose Type contains `link_type`
    from <part>/ _rels / <name>.rels into an absolute filesystem path.

    Examples of link_type: "slideLayout", "slideMaster", "theme"
    """
    if not xml_path:
        return None

    # 1) Find the .rels path for the *part* (slide, layout, master)
    if str(xml_path).endswith(".rels"):
        rels_path = xml_path
        # Infer the *part* directory from the rels path:
        # e.g. ppt/slides/_rels/slide1.xml.rels  -> part_dir = ppt/slides
        rels_dir = os.path.dirname(rels_path)
        part_dir = os.path.dirname(rels_dir)
        part_name = os.path.basename(rels_path).replace(".rels", "")
        part_path = os.path.join(part_dir, part_name)
    else:
        part_path = xml_path
        part_dir  = os.path.dirname(part_path)
        rels_path = os.path.join(part_dir, "_rels", os.path.basename(part_path) + ".rels")

    if not os.path.exists(rels_path):
        return None

    # 2) Read relationships
    rel_tree = etree.parse(rels_path)
    targets = rel_tree.xpath(
        f'//rel:Relationship[contains(@Type, "{link_type}")]/@Target',
        namespaces={'rel': 'http://schemas.openxmlformats.org/package/2006/relationships'}
    )
    if not targets:
        return None

    target = targets[0]

    # 3) Resolve relative Target against the *part* directory (not the _rels dir)
    abs_path = os.path.normpath(os.path.join(part_dir, target))
    if os.path.exists(abs_path):
        return abs_path

    # 4) Secondary attempt: relative to ppt_root (covers odd Targets)
    candidate = os.path.normpath(os.path.join(ppt_root, target.lstrip("/")))
    if os.path.exists(candidate):
        return candidate

    return None

def first_nonempty(seq):
    if not seq:
        return None
    for x in seq:
        if x is None:
            continue
        s=str(x).strip()
        if s != "":
            return s
    return None

def as_pt(sz):
    try:
        return int(sz) / 100
    except Exception:
        return None

def as_bool01(val):
    if val is None:
        return None
    v = str(val).strip().lower()
    if v in ("1", "true", "on"):  return True
    if v in ("0", "false", "off"): return False
    return None

def preview(s, n):
    return s[:n] + ("..." if len(s) > n else "")

def process_ppt_folder(ppt_root):
    slides_dir = os.path.join(ppt_root, "slides")
    if not os.path.isdir(slides_dir):
        raise RuntimeError(f"slides/ not found under {ppt_root}")
    slide_files = sorted(glob.glob(os.path.join(slides_dir, "slide*.xml")))
    if not slide_files:
        raise RuntimeError(f"No slide*.xml under {slides_dir}")
    for slide_path in slide_files:
        print(f"\n===== {os.path.basename(slide_path)} =====")
        tree = etree.parse(slide_path)
        extract_text_styles(tree, slide_path, ppt_root)

def debug_slide_links_verbose(ppt_root, slide_num=1):
    slide_part = os.path.join(ppt_root, "slides", f"slide{slide_num}.xml")
    slide_rels = os.path.join(ppt_root, "slides", "_rels", f"slide{slide_num}.xml.rels")
    print(f"Slide part: {slide_part}  exists={os.path.exists(slide_part)}")
    print(f"Slide rels: {slide_rels}  exists={os.path.exists(slide_rels)}")

    if os.path.exists(slide_rels):
        rel_tree = etree.parse(slide_rels)
        targets = rel_tree.xpath(
            '//rel:Relationship/@Type | //rel:Relationship/@Target',
            namespaces={'rel': 'http://schemas.openxmlformats.org/package/2006/relationships'}
        )
        # Pretty print pairs Type/Target
        rels = rel_tree.xpath('//rel:Relationship', namespaces={'rel': 'http://schemas.openxmlformats.org/package/2006/relationships'})
        for r in rels:
            print("  - Type:", r.get("Type"), "Target:", r.get("Target"))

    layout = get_linked_file(slide_part, "slideLayout", ppt_root)
    print(f"→ layout: {layout}  exists={os.path.exists(layout) if layout else None}")
    if layout:
        layout_rels = os.path.join(os.path.dirname(layout), "_rels", os.path.basename(layout) + ".rels")
        print(f"  layout .rels: {layout_rels}  exists={os.path.exists(layout_rels)}")
        if os.path.exists(layout_rels):
            rel_tree = etree.parse(layout_rels)
            rels = rel_tree.xpath('//rel:Relationship', namespaces={'rel': 'http://schemas.openxmlformats.org/package/2006/relationships'})
            for r in rels:
                print("    - Type:", r.get("Type"), "Target:", r.get("Target"))

    master = get_linked_file(layout, "slideMaster", ppt_root) if layout else None
    if not master:
        master = get_linked_file(slide_part, "slideMaster", ppt_root)
    print(f"→ master: {master}  exists={os.path.exists(master) if master else None}")

    pres_rels = os.path.join(ppt_root, "_rels", "presentation.xml.rels")
    print(f"presentation.rels: {pres_rels} exists={os.path.exists(pres_rels)}")
    theme = get_theme_path(ppt_root)
    print(f"→ theme:  {theme}   exists={os.path.exists(theme) if theme else None}")



In [6]:
ppt_root = "industrial_group_project/data/raw/PPT 1 - Onjuist 1.pptx"

# Optional: quick link sanity check for slide 1
debug_slide_links(ppt_root, slide_num=1)

# Extract styles for all slides
process_ppt_folder(ppt_root)

NameError: name 'debug_slide_links' is not defined