# 01 â€” Extract features (ULTRA v5, layout `idx` + theme tokens)

This version matches placeholders by `type+idx`, resolves theme token fonts (`+mj/+mn`) to actual families, and captures tables and chart titles. It also keeps a strict DOCX path (style chain + docDefaults). Values show **real** inheritance; `src_*` columns indicate where each value came from.

In [1]:
from pathlib import Path
import pandas as pd, zipfile, re, os, time, uuid
from xml.etree import ElementTree as ET

CWD = Path.cwd()
ROOT = (CWD if CWD.name != "notebooks" else CWD.parent).resolve()
RAW = ROOT / "data" / "raw"
FEAT_OUT = ROOT / "reports" / "features"
FEAT_OUT.mkdir(parents=True, exist_ok=True)

NS = {
    "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
}

def _z_read(z, inner):
    with z.open(inner) as f:
        return ET.parse(f).getroot()

print("Project root:", ROOT)
print("Raw folder:", RAW)
print("Features out:", FEAT_OUT)

Project root: C:\Users\ecsde\Desktop\industrial_group_project
Raw folder: C:\Users\ecsde\Desktop\industrial_group_project\data\raw
Features out: C:\Users\ecsde\Desktop\industrial_group_project\reports\features


In [2]:
def theme_fonts_pptx(z):
    out = {"majorLatin": None, "minorLatin": None}
    names = [n for n in z.namelist() if n.startswith("ppt/theme/theme") and n.endswith(".xml")]
    if not names: return out
    root = _z_read(z, names[0])
    maj = root.find(".//a:themeElements/a:fontScheme/a:majorFont/a:latin", NS)
    if maj is not None and "typeface" in maj.attrib: out["majorLatin"] = maj.attrib["typeface"]
    minf = root.find(".//a:themeElements/a:fontScheme/a:minorFont/a:latin", NS)
    if minf is not None and "typeface" in minf.attrib: out["minorLatin"] = minf.attrib["typeface"]
    return out

def theme_colors_pptx(z):
    names = [n for n in z.namelist() if n.startswith("ppt/theme/theme") and n.endswith(".xml")]
    if not names: return {}
    root = _z_read(z, names[0])
    m = {}
    for cs in root.findall(".//a:clrScheme", NS):
        for child in list(cs):
            name = child.tag.split('}')[1]
            srgb = child.find(".//a:srgbClr", NS)
            sysc = child.find(".//a:sysClr", NS)
            if srgb is not None and "val" in srgb.attrib:
                m[name] = srgb.attrib["val"].upper()
            elif sysc is not None and "lastClr" in sysc.attrib:
                m[name] = sysc.attrib["lastClr"].upper()
    return m

def resolve_typeface_token(tf, theme_fonts):
    if not tf: return None
    tf = tf.strip()
    if tf.startswith("+mj"):  # major Latin
        return theme_fonts.get("majorLatin")
    if tf.startswith("+mn"):  # minor Latin
        return theme_fonts.get("minorLatin")
    return tf

def fontref_to_family(idx, theme_fonts):
    if not idx: return None
    idx = idx.lower()
    if idx == "major": return theme_fonts.get("majorLatin")
    if idx == "minor": return theme_fonts.get("minorLatin")
    return None

In [3]:
def slide_layout_path(z, slide_path):
    relp = slide_path.replace("slides/","slides/_rels/") + ".rels"
    if relp not in z.namelist(): return None
    rels = _z_read(z, relp)
    for rnode in rels.findall(".//r:Relationship", NS):
        if rnode.attrib.get("Type","").endswith("/slideLayout"):
            return f"ppt/slideLayouts/{Path(rnode.attrib['Target']).name}"
    return None

def slide_master_path(z, layout_path):
    if not layout_path: return None
    relp = layout_path.replace("slideLayouts/","slideLayouts/_rels/") + ".rels"
    if relp not in z.namelist(): return None
    rels = _z_read(z, relp)
    for rnode in rels.findall(".//r:Relationship", NS):
        if rnode.attrib.get("Type","").endswith("/slideMaster"):
            return f"ppt/slideMasters/{Path(rnode.attrib['Target']).name}"
    return None

In [4]:
def rpr_props(rpr, theme_fonts, theme_colors):
    if rpr is None: return (None, None, None), ("missing","missing","missing")
    sz = fam = col = None
    src_sz = src_fam = src_col = "missing"
    if "sz" in rpr.attrib:
        sz = float(rpr.attrib["sz"]) / 100.0; src_sz = "rPr"
    lat = rpr.find("a:latin", NS)
    if lat is not None and "typeface" in lat.attrib:
        fam = resolve_typeface_token(lat.attrib["typeface"], theme_fonts)
        if fam: src_fam = "rPr:latin"
    if fam is None:
        fr = rpr.find("a:fontRef", NS)
        if fr is not None and "idx" in fr.attrib:
            fam = fontref_to_family(fr.attrib["idx"], theme_fonts)
            if fam: src_fam = f"rPr:fontRef:{fr.attrib['idx']}"
    sr = rpr.find("a:solidFill/a:srgbClr", NS)
    if sr is not None and "val" in sr.attrib:
        col = sr.attrib["val"].upper(); src_col = "rPr:srgb"
    else:
        sch = rpr.find("a:solidFill/a:schemeClr", NS)
        if sch is not None and "val" in sch.attrib:
            key = sch.attrib["val"]; col = theme_colors.get(key); src_col = f"rPr:scheme:{key}"
    return (sz, fam, col), (src_sz, src_fam, src_col)

def txbody_lststyle(sp, theme_fonts, theme_colors):
    out = {}
    lst = sp.find(".//a:txBody/a:lstStyle", NS)
    if lst is None: return out
    for lvl in range(1,10):
        pp = lst.find(f"a:lvl{lvl}pPr", NS)
        if pp is None: continue
        dr = pp.find("a:defRPr", NS)
        if dr is None: continue
        (sz,fam,col),_ = rpr_props(dr, theme_fonts, theme_colors)
        fr = dr.find("a:fontRef", NS)
        out[lvl-1] = {
            "size_pt": sz,
            "font_family": fam or fontref_to_family(fr.attrib.get("idx") if fr is not None else None, theme_fonts),
            "color_rgb": col,
            "fontRef_idx": (fr.attrib["idx"] if fr is not None and "idx" in fr.attrib else None)
        }
    return out

def placeholder_type_idx(sp):
    ph = sp.find(".//p:nvSpPr/p:nvPr/p:ph", NS)
    if ph is None: return None, None
    return ph.attrib.get("type"), ph.attrib.get("idx")

In [5]:
def master_textstyles(z, master_path, theme_fonts, theme_colors):
    styles = {"title":{}, "body":{}, "other":{}}
    if not master_path or master_path not in z.namelist(): return styles
    root = _z_read(z, master_path)
    for kind, xp in [("title",".//p:txStyles/p:titleStyle"),
                     ("body",".//p:txStyles/p:bodyStyle"),
                     ("other",".//p:txStyles/p:otherStyle")]:
        node = root.find(xp, NS)
        if node is None: continue
        for lvl in range(1,10):
            pp = node.find(f"a:lvl{lvl}pPr", NS)
            if pp is None: continue
            dr = pp.find("a:defRPr", NS)
            if dr is None: continue
            (sz,fam,col),_ = rpr_props(dr, theme_fonts, theme_colors)
            fr = dr.find("a:fontRef", NS)
            styles[kind][lvl-1] = {
                "size_pt": sz,
                "font_family": fam or fontref_to_family(fr.attrib.get("idx") if fr is not None else None, theme_fonts),
                "color_rgb": col,
                "fontRef_idx": (fr.attrib["idx"] if fr is not None and "idx" in fr.attrib else None)
            }
    return styles

def presentation_default_textstyle(z, theme_fonts, theme_colors):
    out = {}
    pres = "ppt/presentation.xml"
    if pres not in z.namelist(): return out
    root = _z_read(z, pres)
    node = root.find(".//p:defaultTextStyle", NS)
    if node is None: return out
    for lvl in range(1,10):
        pp = node.find(f"a:lvl{lvl}pPr", NS)
        if pp is None: continue
        dr = pp.find("a:defRPr", NS)
        if dr is None: continue
        (sz,fam,col),_ = rpr_props(dr, theme_fonts, theme_colors)
        fr = dr.find("a:fontRef", NS)
        out[lvl-1] = {
            "size_pt": sz,
            "font_family": fam or fontref_to_family(fr.attrib.get("idx") if fr is not None else None, theme_fonts),
            "color_rgb": col,
            "fontRef_idx": (fr.attrib["idx"] if fr is not None and "idx" in fr.attrib else None)
        }
    return out

In [6]:
def layout_placeholder_styles(z, layout_path, theme_fonts, theme_colors):
    defr = {}
    lsty = {}
    if not layout_path or layout_path not in z.namelist():
        return defr, lsty
    root = _z_read(z, layout_path)

    for sp in root.findall(".//p:sp", NS):
        ph_type, ph_idx = placeholder_type_idx(sp)
        if ph_type is None: 
            continue
        p = sp.find(".//a:txBody/a:p", NS)
        if p is not None:
            ppr = p.find("a:pPr", NS)
            if ppr is not None:
                dr = ppr.find("a:defRPr", NS)
                if dr is not None:
                    (sz,fam,col),_ = rpr_props(dr, theme_fonts, theme_colors)
                    fr = dr.find("a:fontRef", NS)
                    for lvl in range(0,10):
                        defr[(ph_type, ph_idx, lvl)] = {
                            "size_pt": sz,
                            "font_family": fam or fontref_to_family(fr.attrib.get("idx") if fr is not None else None, theme_fonts),
                            "color_rgb": col
                        }
        lst = txbody_lststyle(sp, theme_fonts, theme_colors)
        for lvl, entry in lst.items():
            lsty[(ph_type, ph_idx, lvl)] = entry
    return defr, lsty

In [7]:
def extract_pptx_runs_v5(path: Path) -> pd.DataFrame:
    rows = []
    with zipfile.ZipFile(path, 'r') as z:
        tf = theme_fonts_pptx(z)
        tc = theme_colors_pptx(z)
        dflt = presentation_default_textstyle(z, tf, tc)

        slide_paths = sorted([n for n in z.namelist() if n.startswith("ppt/slides/slide") and n.endswith(".xml")],
                             key=lambda n: int(re.search(r"slide(\d+)\.xml", n).group(1)))

        for spath in slide_paths:
            sroot = _z_read(z, spath)
            slide_idx = int(re.search(r"slide(\d+)\.xml", spath).group(1))
            layout = slide_layout_path(z, spath)
            master = slide_master_path(z, layout) if layout else None
            mstyles = master_textstyles(z, master, tf, tc) if master else {"title":{}, "body":{}, "other":{}}
            l_defr, l_lsty = layout_placeholder_styles(z, layout, tf, tc)

            for sp in sroot.findall(".//p:sp", NS):
                ph_type, ph_idx = placeholder_type_idx(sp)
                group = "title" if ph_type in ("title","ctrTitle","centeredTitle") else ("body" if ph_type=="body" else "other")
                slide_lsty = txbody_lststyle(sp, tf, tc)

                for p in sp.findall(".//a:p", NS):
                    lvl = 0
                    ppr = p.find("a:pPr", NS)
                    if ppr is not None and "lvl" in ppr.attrib:
                        try: lvl = int(ppr.attrib["lvl"])
                        except: lvl = 0
                    defR = ppr.find("a:defRPr", NS) if ppr is not None else None
                    endR = p.find("a:endParaRPr", NS)

                    (p_sz1,p_fam1,p_col1),_ = rpr_props(defR, tf, tc) if defR is not None else ((None,None,None),("missing","missing","missing"))
                    (p_sz2,p_fam2,p_col2),_ = rpr_props(endR, tf, tc) if endR is not None else ((None,None,None),("missing","missing","missing"))
                    p_sz = p_sz1 if p_sz1 is not None else p_sz2
                    p_fam = p_fam1 if p_fam1 is not None else p_fam2
                    p_col = p_col1 if p_col1 is not None else p_col2
                    p_src_sz = "para:defRPr" if p_sz1 is not None else ("para:endParaRPr" if p_sz2 is not None else "missing")
                    p_src_fam = "para:defRPr" if p_fam1 is not None else ("para:endParaRPr" if p_fam2 is not None else "missing")
                    p_src_col = "para:defRPr" if p_col1 is not None else ("para:endParaRPr" if p_col2 is not None else "missing")

                    sl = slide_lsty.get(lvl, {})
                    sl_sz = sl.get("size_pt")
                    sl_fam = sl.get("font_family") or fontref_to_family(sl.get("fontRef_idx"), tf)
                    sl_col = sl.get("color_rgb")

                    ley_def = l_defr.get((ph_type, ph_idx, lvl)) or l_defr.get((ph_type, None, lvl)) or {}
                    ley_lst = l_lsty.get((ph_type, ph_idx, lvl)) or l_lsty.get((ph_type, None, lvl)) or {}
                    le_sz = ley_def.get("size_pt") or ley_lst.get("size_pt")
                    le_fam = ley_def.get("font_family") or ley_lst.get("font_family") or fontref_to_family(ley_lst.get("fontRef_idx"), tf) if ley_lst else None
                    le_col = ley_def.get("color_rgb") or ley_lst.get("color_rgb")

                    ms = mstyles.get(group, {}).get(lvl, {})
                    ms_sz = ms.get("size_pt"); ms_fam = ms.get("font_family") or fontref_to_family(ms.get("fontRef_idx"), tf); ms_col = ms.get("color_rgb")
                    ds = dflt.get(lvl, {})
                    ds_sz = ds.get("size_pt"); ds_fam = ds.get("font_family") or fontref_to_family(ds.get("fontRef_idx"), tf); ds_col = ds.get("color_rgb")

                    for r in p.findall("a:r", NS):
                        rpr = r.find("a:rPr", NS)
                        (r_sz,r_fam,r_col),_ = rpr_props(rpr, tf, tc)

                        size_pt = r_sz if r_sz is not None else p_sz if p_sz is not None else sl_sz if sl_sz is not None else le_sz if le_sz is not None else ms_sz if ms_sz is not None else ds_sz
                        fam = r_fam if r_fam is not None else p_fam if p_fam is not None else sl_fam if sl_fam is not None else le_fam if le_fam is not None else ms_fam if ms_fam is not None else ds_fam
                        color = r_col if r_col is not None else p_col if p_col is not None else sl_col if sl_col is not None else le_col if le_col is not None else ms_col if ms_col is not None else ds_col

                        src_size = ("run" if r_sz is not None else p_src_sz if p_sz is not None else "slide:lstStyle" if sl_sz is not None else "layout" if le_sz is not None else "master" if ms_sz is not None else "defaultText" if ds_sz is not None else "missing")
                        src_font = ("run" if r_fam is not None else p_src_fam if p_fam is not None else "slide:lstStyle" if sl_fam is not None else "layout" if le_fam is not None else "master" if ms_fam is not None else "defaultText" if ds_fam is not None else "missing")
                        src_color = ("run" if r_col is not None else p_src_col if p_col is not None else "slide:lstStyle" if sl_col is not None else "layout" if le_col is not None else "master" if ms_col is not None else "defaultText" if ds_col is not None else "missing")

                        t = r.find("a:t", NS); text = t.text if t is not None else ""
                        rows.append({
                            "file": path.name, "kind": "pptx_run", "slide_idx": slide_idx,
                            "text": text, "font_family": fam, "font_size_pt": size_pt, "color_rgb": color,
                            "src_font": src_font, "src_size": src_size, "src_color": src_color,
                            "para_level": lvl, "placeholder_type": ph_type or "none", "placeholder_idx": ph_idx or "none"
                        })

            # tables
            for tc in sroot.findall(".//p:graphicFrame//a:tbl//a:tc", NS):
                txb = tc.find("a:txBody", NS)
                if txb is None: continue
                for p in txb.findall("a:p", NS):
                    lvl = 0
                    ppr = p.find("a:pPr", NS)
                    if ppr is not None and "lvl" in ppr.attrib:
                        try: lvl = int(ppr.attrib["lvl"])
                        except: lvl = 0
                    defR = ppr.find("a:defRPr", NS) if ppr is not None else None
                    (p_sz,p_fam,p_col),_ = rpr_props(defR, tf, tc) if defR is not None else ((None,None,None),("missing","missing","missing"))
                    for r in p.findall("a:r", NS):
                        rpr = r.find("a:rPr", NS)
                        (r_sz,r_fam,r_col),_ = rpr_props(rpr, tf, tc)
                        t = r.find("a:t", NS); text = t.text if t is not None else ""
                        rows.append({
                            "file": path.name, "kind": "pptx_table_run", "slide_idx": slide_idx,
                            "text": text, "font_family": r_fam or p_fam, "font_size_pt": r_sz or p_sz, "color_rgb": r_col or p_col,
                            "src_font": "run" if r_fam is not None else "para" if p_fam is not None else "missing",
                            "src_size": "run" if r_sz is not None else "para" if p_sz is not None else "missing",
                            "src_color": "run" if r_col is not None else "para" if p_col is not None else "missing",
                            "para_level": lvl, "placeholder_type": "tbl", "placeholder_idx": "none"
                        })

            # chart titles
            for rp in sroot.findall(".//p:graphicFrame//c:chart//c:title//c:tx//c:rich//a:p", NS):
                ppr = rp.find("a:pPr", NS)
                defR = ppr.find("a:defRPr", NS) if ppr is not None else None
                (p_sz,p_fam,p_col),_ = rpr_props(defR, tf, tc) if defR is not None else ((None,None,None),("missing","missing","missing"))
                for rr in rp.findall("a:r", NS):
                    rpr = rr.find("a:rPr", NS)
                    (r_sz,r_fam,r_col),_ = rpr_props(rpr, tf, tc)
                    t = rr.find("a:t", NS); text = t.text if t is not None else ""
                    rows.append({
                        "file": path.name, "kind": "pptx_chart_title_run", "slide_idx": slide_idx,
                        "text": text, "font_family": r_fam or p_fam, "font_size_pt": r_sz or p_sz, "color_rgb": r_col or p_col,
                        "src_font": "run" if r_fam is not None else "para" if p_fam is not None else "missing",
                        "src_size": "run" if r_sz is not None else "para" if p_sz is not None else "missing",
                        "src_color": "run" if r_col is not None else "para" if p_col is not None else "missing",
                        "para_level": 0, "placeholder_type": "chart", "placeholder_idx": "none"
                    })
    return pd.DataFrame(rows)

In [8]:
# DOCX strict extractor
def word_theme(z):
    fonts = {"majorLatin": None, "minorLatin": None}
    colors = {}
    names = [n for n in z.namelist() if n.startswith("word/theme/theme") and n.endswith(".xml")]
    if names:
        root = _z_read(z, names[0])
        maj = root.find(".//a:themeElements/a:fontScheme/a:majorFont/a:latin", NS)
        if maj is not None and "typeface" in maj.attrib: fonts["majorLatin"] = maj.attrib["typeface"]
        minf = root.find(".//a:themeElements/a:fontScheme/a:minorFont/a:latin", NS)
        if minf is not None and "typeface" in minf.attrib: fonts["minorLatin"] = minf.attrib["typeface"]
        for cs in root.findall(".//a:clrScheme", NS):
            for child in list(cs):
                name = child.tag.split('}')[1]
                srgb = child.find(".//a:srgbClr", NS)
                sysc = child.find(".//a:sysClr", NS)
                if srgb is not None and "val" in srgb.attrib:
                    colors[name] = srgb.attrib["val"].upper()
                elif sysc is not None and "lastClr" in sysc.attrib:
                    colors[name] = sysc.attrib["lastClr"].upper()
    return fonts, colors

def parse_docx_styles(z):
    styles = {}
    defaults = {"font_family": None, "font_size_pt": None, "color_rgb": None}
    if "word/styles.xml" not in z.namelist(): return styles, defaults
    root = _z_read(z, "word/styles.xml")
    rpr_def = root.find(".//w:docDefaults/w:rPrDefault/w:rPr", NS)
    if rpr_def is not None:
        rf = rpr_def.find("w:rFonts", NS)
        if rf is not None:
            defaults["font_family"] = rf.attrib.get("{%s}ascii" % NS["w"]) or rf.attrib.get("{%s}hAnsi" % NS["w"])
        wsz = rpr_def.find("w:sz", NS) or rpr_def.find("w:szCs", NS)
        if wsz is not None and "val" in wsz.attrib:
            try: defaults["font_size_pt"] = float(wsz.attrib["val"])/2.0
            except: pass
        wcol = rpr_def.find("w:color", NS)
        if wcol is not None and "val" in wcol.attrib:
            defaults["color_rgb"] = wcol.attrib["val"].upper()
    for st in root.findall(".//w:style", NS):
        sid = st.attrib.get("{%s}styleId" % NS["w"]); stype = st.attrib.get("{%s}type" % NS["w"])
        if not sid: continue
        entry = {"type": stype, "basedOn": None, "link": None, "font_family": None, "font_size_pt": None, "color_rgb": None}
        bo = st.find("w:basedOn", NS)
        if bo is not None and "{%s}val" % NS["w"] in bo.attrib: entry["basedOn"] = bo.attrib["{%s}val" % NS["w"]]
        lk = st.find("w:link", NS)
        if lk is not None and "{%s}val" % NS["w"] in lk.attrib: entry["link"] = lk.attrib["{%s}val" % NS["w"]]
        rpr = st.find("w:rPr", NS)
        if rpr is not None:
            rf = rpr.find("w:rFonts", NS)
            if rf is not None:
                entry["font_family"] = rf.attrib.get("{%s}ascii" % NS["w"]) or rf.attrib.get("{%s}hAnsi" % NS["w"])
            wsz = rpr.find("w:sz", NS) or rpr.find("w:szCs", NS)
            if wsz is not None and "val" in wsz.attrib:
                try: entry["font_size_pt"] = float(wsz.attrib["val"])/2.0
                except: pass
            wcol = rpr.find("w:color", NS)
            if wcol is not None and "val" in wcol.attrib:
                entry["color_rgb"] = wcol.attrib["val"].upper()
        styles[sid] = entry
    return styles, defaults

def resolve_style_chain(styles, start_sid, prop, guard=40):
    seen = set(); cur = start_sid
    while cur and cur not in seen and guard>0:
        guard -= 1; seen.add(cur)
        val = styles.get(cur, {}).get(prop)
        if val is not None: return val, cur
        link = styles.get(cur, {}).get("link")
        if link and styles.get(link, {}).get(prop) is not None:
            return styles[link][prop], link
        cur = styles.get(cur, {}).get("basedOn")
    return None, None

def resolve_word_theme_font(rfonts, theme_fonts):
    if rfonts is None: return None
    for key in ("asciiTheme","hAnsiTheme","csTheme","eastAsiaTheme"):
        kval = rfonts.attrib.get("{%s}%s" % (NS["w"], key))
        if kval:
            if "minor" in kval.lower(): return theme_fonts.get("minorLatin")
            if "major" in kval.lower(): return theme_fonts.get("majorLatin")
    return (rfonts.attrib.get("{%s}ascii" % NS["w"]) or
            rfonts.attrib.get("{%s}hAnsi" % NS["w"]) or
            rfonts.attrib.get("{%s}eastAsia" % NS["w"]) or
            rfonts.attrib.get("{%s}cs" % NS["w"]))

def resolve_word_color(wcol, theme_colors):
    if wcol is None: return None
    if "val" in wcol.attrib: return wcol.attrib["val"].upper()
    tc = wcol.attrib.get("{%s}themeColor" % NS["w"])
    if tc:
        mapping = {"dark1":"dk1","light1":"lt1","dark2":"dk2","light2":"lt2",
                   "accent1":"accent1","accent2":"accent2","accent3":"accent3",
                   "accent4":"accent4","accent5":"accent5","accent6":"accent6",
                   "hyperlink":"hlink","followedHyperlink":"folHlink"}
        return theme_colors.get(mapping.get(tc))
    return None

def extract_docx_runs_strict(path: Path) -> pd.DataFrame:
    rows = []
    with zipfile.ZipFile(path, 'r') as z:
        styles, defaults = parse_docx_styles(z)
        theme_fonts, theme_colors = word_theme(z)
        if "word/document.xml" not in z.namelist(): return pd.DataFrame(rows)
        root = _z_read(z, "word/document.xml")

    for p_idx, p in enumerate(root.findall(".//w:p", NS)):
        ppr = p.find("w:pPr", NS)
        pstyle_id = None
        ppr_rpr = ppr.find("w:rPr", NS) if ppr is not None else None
        if ppr is not None:
            pstyle = ppr.find("w:pStyle", NS)
            if pstyle is not None and "{%s}val" % NS["w"] in pstyle.attrib:
                pstyle_id = pstyle.attrib["{%s}val" % NS["w"]]
        for r_idx, r in enumerate(p.findall(".//w:r", NS)):
            rpr = r.find("w:rPr", NS)
            rstyle_id = None
            rfam = rsz = rcol = None; bold=italic=underline=None
            if rpr is not None:
                rstyle = rpr.find("w:rStyle", NS)
                if rstyle is not None and "{%s}val" % NS["w"] in rstyle.attrib:
                    rstyle_id = rstyle.attrib["{%s}val" % NS["w"]]
                rfonts = rpr.find("w:rFonts", NS)
                rfam = resolve_word_theme_font(rfonts, theme_fonts)
                wsz = rpr.find("w:sz", NS) or rpr.find("w:szCs", NS)
                if wsz is not None and "val" in wsz.attrib:
                    try: rsz = float(wsz.attrib["val"])/2.0
                    except: rsz = None
                wcol = rpr.find("w:color", NS); rcol = resolve_word_color(wcol, theme_colors)
                if rpr.find("w:b", NS) is not None: bold = True
                if rpr.find("w:i", NS) is not None: italic = True
                wu = rpr.find("w:u", NS)
                if wu is not None and "val" in wu.attrib: underline = wu.attrib["val"]

            p_fam = p_sz = p_col = None
            if ppr_rpr is not None:
                pf = ppr_rpr.find("w:rFonts", NS); p_fam = resolve_word_theme_font(pf, theme_fonts)
                psz = ppr_rpr.find("w:sz", NS) or ppr_rpr.find("w:szCs", NS)
                if psz is not None and "val" in psz.attrib:
                    try: p_sz = float(psz.attrib["val"])/2.0
                    except: p_sz = None
                p_col = resolve_word_color(ppr_rpr.find("w:color", NS), theme_colors)

            s_fam = s_sz = s_col = None; ssrc_fam = ssrc_sz = ssrc_col = None
            if rstyle_id:
                val, sid = resolve_style_chain(styles, rstyle_id, "font_family")
                if val: s_fam, ssrc_fam = val, f"rStyle:{sid}"
                val, sid = resolve_style_chain(styles, rstyle_id, "font_size_pt")
                if val is not None: s_sz, ssrc_sz = val, f"rStyle:{sid}"
                val, sid = resolve_style_chain(styles, rstyle_id, "color_rgb")
                if val: s_col, ssrc_col = val, f"rStyle:{sid}"
            if pstyle_id:
                if s_fam is None:
                    val, sid = resolve_style_chain(styles, pstyle_id, "font_family")
                    if val: s_fam, ssrc_fam = val, f"pStyle:{sid}"
                if s_sz is None:
                    val, sid = resolve_style_chain(styles, pstyle_id, "font_size_pt")
                    if val is not None: s_sz, ssrc_sz = val, f"pStyle:{sid}"
                if s_col is None:
                    val, sid = resolve_style_chain(styles, pstyle_id, "color_rgb")
                    if val: s_col, ssrc_col = val, f"pStyle:{sid}"

            d_fam = defaults.get("font_family"); d_sz = defaults.get("font_size_pt"); d_col = defaults.get("color_rgb")

            fam = rfam or p_fam or s_fam or d_fam
            size_pt = rsz or p_sz or s_sz or d_sz
            col = rcol or p_col or s_col or d_col

            src_font = ("run" if rfam else ("para_rPr" if p_fam else (ssrc_fam or ("docDefaults" if d_fam else "missing"))))
            src_size = ("run" if rsz is not None else ("para_rPr" if p_sz is not None else (ssrc_sz or ("docDefaults" if d_sz is not None else "missing"))))
            src_color = ("run" if rcol else ("para_rPr" if p_col else (ssrc_col or ("docDefaults" if d_col else "missing"))))

            t = r.find("w:t", NS); text = t.text if t is not None else ""

            rows.append({
                "file": path.name, "kind": "docx_run",
                "para_idx": p_idx, "run_idx": r_idx, "text": text,
                "font_family": fam, "font_size_pt": size_pt, "color_rgb": col,
                "bold": bold, "italic": italic, "underline": underline,
                "src_font": src_font, "src_size": src_size, "src_color": src_color
            })
    return pd.DataFrame(rows)

In [9]:
def write_csv_atomic(path, df):
    tmp = path.with_suffix(path.suffix + f".tmp-{uuid.uuid4().hex}")
    try:
        df.to_csv(tmp, index=False)
        try:
            os.replace(tmp, path)  # atomic on Windows
            print("Wrote", path.name, len(df), "rows")
        except PermissionError:
            # CSV is probably open in Excel; fall back to a timestamped file
            alt = path.with_name(f"{path.stem}.{int(time.time())}{path.suffix}")
            os.replace(tmp, alt)
            print(f"Target locked -> wrote {alt.name} instead", len(df), "rows")
    finally:
        if tmp.exists():
            try: tmp.unlink()
            except: pass

# Sweep
files = list(RAW.glob("*.pptx")) + list(RAW.glob("*.docx"))
print("Found", len(files), "files")
for f in files:
    try:
        if f.suffix.lower()==".pptx":
            df = extract_pptx_runs_v5(f)
        else:
            df = extract_docx_runs_strict(f)
        if df.empty:
            print("SKIP empty:", f.name); continue
        out = FEAT_OUT / f"{f.stem}.features.csv"
        write_csv_atomic(out, df)
    except Exception as e:
        print("ERROR on", f.name, "->", e)

Found 14 files
Wrote PPT 1 - Juist.features.csv 117 rows
Wrote PPT 1 - Onjuist 1.features.csv 115 rows
Wrote PPT 1 - Onjuist 2.features.csv 113 rows
Wrote PPT 2 - Juist.features.csv 56 rows
Wrote PPT 2 - Onjuist.features.csv 43 rows
Wrote Presentation_Template.features.csv 273 rows
Wrote Sustainable Agriculture Project Proposal Infographics by Slidesgo.features.csv 15 rows
Wrote Concurrentieanalyse.features.csv 654 rows


  wsz = rpr_def.find("w:sz", NS) or rpr_def.find("w:szCs", NS)
  wsz = rpr.find("w:sz", NS) or rpr.find("w:szCs", NS)
  psz = ppr_rpr.find("w:sz", NS) or ppr_rpr.find("w:szCs", NS)
  wsz = rpr.find("w:sz", NS) or rpr.find("w:szCs", NS)


Wrote Zoekwoordenonderzoek .features.csv 442 rows
Wrote Zoekwoordenonderzoek 2.features.csv 759 rows
Wrote Zoekwoordenonderzoek 3.features.csv 844 rows
Wrote Zoekwoordenonderzoek 4.features.csv 482 rows
Wrote Zoekwoordenonderzoek 5 .features.csv 643 rows
Wrote Zoekwoordenonderzoek juist.features.csv 195 rows


In [None]:
# Helper to spot-check a specific text\n
import pandas as pd\n
def grep_features(stem_substring, text_substring, max_rows=25):\n
    targets = [p for p in (FEAT_OUT).glob("*.features.csv") if stem_substring.lower() in p.name.lower()]\n
    if not targets:\n
        print("No features CSV matching", stem_substring); return\n
    df = pd.read_csv(targets[0])\n
    hit = df[df["text"].astype(str).str.contains(text_substring, na=False)]\n
    cols = ["kind","slide_idx","placeholder_type","placeholder_idx","para_level","text","font_family","font_size_pt","src_font","src_size"]\n
    display(hit[cols].head(max_rows))\n
# Example:\n
# grep_features("Greenhouse Management Project", "Greenhouse")\n