# 01 ‚Äî Extract features (fixed DOCX inheritance)

In Word, runs often inherit font and size from paragraph styles or document defaults. If you only look at the run, you see `NaN`. This version climbs the style ladder: **run ‚Üí paragraph style ‚Üí doc defaults**. Think ‚Äúbrick ‚Üí house blueprint ‚Üí city code.‚Äù We fill the values from the first place where they are explicitly written.

In [1]:
from pathlib import Path
import pandas as pd
import zipfile, re
from xml.etree import ElementTree as ET

CWD = Path.cwd()
ROOT = (CWD if CWD.name != "notebooks" else CWD.parent).resolve()
RAW = ROOT / "data" / "raw"
FEAT_OUT = ROOT / "reports" / "features"
FEAT_OUT.mkdir(parents=True, exist_ok=True)

NS = {
    "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
}

print("Project root:", ROOT)
print("Raw folder:", RAW)
print("Features out:", FEAT_OUT)

Project root: C:\Users\ecsde\Desktop\industrial_group_project
Raw folder: C:\Users\ecsde\Desktop\industrial_group_project\data\raw
Features out: C:\Users\ecsde\Desktop\industrial_group_project\reports\features


In [2]:
def list_slide_paths(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        return sorted([n for n in z.namelist() if n.startswith("ppt/slides/slide") and n.endswith(".xml")],
                      key=lambda n: int(re.search(r"slide(\d+)\.xml", n).group(1)))

def read_xml(zip_path, inner):
    with zipfile.ZipFile(zip_path, 'r') as z:
        with z.open(inner) as f:
            return ET.parse(f).getroot()

def extract_pptx_runs(path: Path) -> pd.DataFrame:
    rows = []
    for sp in list_slide_paths(path):
        root = read_xml(path, sp)
        m = re.search(r"slide(\d+)\.xml", sp)
        slide_idx = int(m.group(1)) if m else None
        for r in root.findall(".//a:r", NS):
            rpr = r.find("a:rPr", NS)
            size_pt = None; typeface=None; bold=None; italic=None; underline=None; color=None
            if rpr is not None:
                if "sz" in rpr.attrib:
                    size_pt = float(rpr.attrib["sz"]) / 100.0
                latin = rpr.find("a:latin", NS)
                if latin is not None and "typeface" in latin.attrib:
                    typeface = latin.attrib["typeface"]
                b = rpr.attrib.get("b"); i = rpr.attrib.get("i"); u = rpr.attrib.get("u")
                bold = True if b in ("1","true") else (False if b in ("0","false") else None)
                italic = True if i in ("1","true") else (False if i in ("0","false") else None)
                underline = u if u is not None else None
                c = rpr.find("a:solidFill/a:srgbClr", NS)
                if c is not None and "val" in c.attrib:
                    color = c.attrib["val"].upper()
            t = r.find("a:t", NS)
            text = t.text if t is not None else ""
            rows.append({
                "file": path.name, "kind": "pptx_run", "slide_idx": slide_idx,
                "text": text, "font_family": typeface, "font_size_pt": size_pt,
                "bold": bold, "italic": italic, "underline": underline, "color_rgb": color
            })
    return pd.DataFrame(rows)

In [3]:
def parse_docx_styles(zip_path: Path):
    styles = {}
    defaults = {"font_family": None, "font_size_pt": None, "color_rgb": None}
    with zipfile.ZipFile(zip_path, 'r') as z:
        if "word/styles.xml" not in z.namelist():
            return styles, defaults
        root = ET.fromstring(z.read("word/styles.xml"))
    # docDefaults
    rpr_def = root.find(".//w:docDefaults/w:rPrDefault/w:rPr", NS)
    if rpr_def is not None:
        rf = rpr_def.find("w:rFonts", NS)
        if rf is not None:
            defaults["font_family"] = rf.attrib.get("{%s}ascii" % NS["w"]) or rf.attrib.get("{%s}hAnsi" % NS["w"])
        wsz = rpr_def.find("w:sz", NS)
        if wsz is not None and "val" in wsz.attrib:
            try:
                defaults["font_size_pt"] = float(wsz.attrib["val"]) / 2.0
            except Exception:
                pass
        wcol = rpr_def.find("w:color", NS)
        if wcol is not None and "val" in wcol.attrib:
            defaults["color_rgb"] = wcol.attrib["val"].upper()

    # style table
    for st in root.findall(".//w:style", NS):
        stype = st.attrib.get("{%s}type" % NS["w"])
        sid = st.attrib.get("{%s}styleId" % NS["w"])
        if not sid or stype not in ("paragraph", "character"):
            continue
        rpr = st.find("w:rPr", NS)
        if rpr is None:
            continue
        entry = {}
        rf = rpr.find("w:rFonts", NS)
        if rf is not None:
            entry["font_family"] = rf.attrib.get("{%s}ascii" % NS["w"]) or rf.attrib.get("{%s}hAnsi" % NS["w"])
        wsz = rpr.find("w:sz", NS)
        if wsz is not None and "val" in wsz.attrib:
            try:
                entry["font_size_pt"] = float(wsz.attrib["val"]) / 2.0
            except Exception:
                pass
        wcol = rpr.find("w:color", NS)
        if wcol is not None and "val" in wcol.attrib:
            entry["color_rgb"] = wcol.attrib["val"].upper()
        if entry:
            styles[sid] = entry
    return styles, defaults

In [4]:
def extract_docx_runs(path: Path) -> pd.DataFrame:
    styles, defaults = parse_docx_styles(path)
    rows = []
    with zipfile.ZipFile(path, 'r') as z:
        if "word/document.xml" not in z.namelist():
            return pd.DataFrame(rows)
        root = ET.fromstring(z.read("word/document.xml"))

    def first(*vals):
        for v in vals:
            if v is not None:
                return v
        return None

    for p_idx, p in enumerate(root.findall(".//w:p", NS)):
        # paragraph style id if any
        ppr = p.find("w:pPr", NS)
        pstyle_id = None
        if ppr is not None:
            pstyle = ppr.find("w:pStyle", NS)
            if pstyle is not None and "{%s}val" % NS["w"] in pstyle.attrib:
                pstyle_id = pstyle.attrib["{%s}val" % NS["w"]]
        pstyle_entry = styles.get(pstyle_id, {})

        for r_idx, r in enumerate(p.findall(".//w:r", NS)):
            rpr = r.find("w:rPr", NS)
            rfam = rsz = rcol = None; bold=italic=underline=None
            if rpr is not None:
                rfonts = rpr.find("w:rFonts", NS)
                if rfonts is not None:
                    rfam = rfonts.attrib.get("{%s}ascii" % NS["w"]) or rfonts.attrib.get("{%s}hAnsi" % NS["w"])
                wsz = rpr.find("w:sz", NS)
                if wsz is not None and "val" in wsz.attrib:
                    try:
                        rsz = float(wsz.attrib["val"]) / 2.0
                    except Exception:
                        rsz = None
                wcol = rpr.find("w:color", NS)
                if wcol is not None and "val" in wcol.attrib:
                    rcol = wcol.attrib["val"].upper()
                if rpr.find("w:b", NS) is not None: bold = True
                if rpr.find("w:i", NS) is not None: italic = True
                wu = rpr.find("w:u", NS)
                if wu is not None and "val" in wu.attrib: underline = wu.attrib["val"]

            fam = first(rfam, pstyle_entry.get("font_family"), defaults.get("font_family"))
            size_pt = first(rsz, pstyle_entry.get("font_size_pt"), defaults.get("font_size_pt"))
            col = first(rcol, pstyle_entry.get("color_rgb"), defaults.get("color_rgb"))

            t = r.find("w:t", NS)
            text = t.text if t is not None else ""

            # Track where values came from (for debugging / trust)
            fam_src = "run" if rfam else ("pStyle" if pstyle_entry.get("font_family") else ("docDefaults" if defaults.get("font_family") else "missing"))
            size_src = "run" if rsz is not None else ("pStyle" if pstyle_entry.get("font_size_pt") is not None else ("docDefaults" if defaults.get("font_size_pt") is not None else "missing"))
            col_src = "run" if rcol else ("pStyle" if pstyle_entry.get("color_rgb") else ("docDefaults" if defaults.get("color_rgb") else "missing"))

            rows.append({
                "file": path.name, "kind": "docx_run", "page_like": None,
                "para_idx": p_idx, "run_idx": r_idx, "text": text,
                "font_family": fam, "font_family_src": fam_src,
                "font_size_pt": size_pt, "font_size_src": size_src,
                "bold": bold, "italic": italic, "underline": underline,
                "color_rgb": col, "color_src": col_src
            })
    return pd.DataFrame(rows)

In [5]:
# Sweep and emit CSVs; skip files that produce 0 rows to avoid empty CSVs
files = list(RAW.glob("*.pptx")) + list(RAW.glob("*.docx"))
print("Found", len(files), "files")
for f in files:
    try:
        if f.suffix.lower() == ".pptx":
            df = extract_pptx_runs(f)
        else:
            df = extract_docx_runs(f)
        if df.empty:
            print("SKIP empty:", f.name)
            continue
        out = FEAT_OUT / f"{f.stem}.features.csv"
        df.to_csv(out, index=False)
        print("Wrote", out.name, len(df), "rows")
    except Exception as e:
        print("ERROR on", f.name, "->", e)

Found 13 files
Wrote PPT 1 - Juist.features.csv 117 rows
Wrote PPT 1 - Onjuist 1.features.csv 115 rows
Wrote PPT 1 - Onjuist 2.features.csv 113 rows
Wrote PPT 2 - Juist.features.csv 56 rows
Wrote PPT 2 - Onjuist.features.csv 43 rows
Wrote Presentation_Template.features.csv 273 rows
Wrote Concurrentieanalyse.features.csv 654 rows
Wrote Zoekwoordenonderzoek .features.csv 442 rows
Wrote Zoekwoordenonderzoek 2.features.csv 759 rows
Wrote Zoekwoordenonderzoek 3.features.csv 844 rows
Wrote Zoekwoordenonderzoek 4.features.csv 482 rows
Wrote Zoekwoordenonderzoek 5 .features.csv 643 rows
Wrote Zoekwoordenonderzoek juist.features.csv 195 rows


In [6]:
# Safe preview: ignore empty files and handle parse errors gracefully
import itertools, pandas as pd
preview = list((FEAT_OUT).glob("*.features.csv"))
for p in itertools.islice(preview, 3):
    try:
        print("\nPreview:", p.name)
        display(pd.read_csv(p).head(10))
    except Exception as e:
        print("Could not preview", p.name, "->", e)


Preview: Concurrentieanalyse.features.csv


Unnamed: 0,file,kind,page_like,para_idx,run_idx,text,font_family,font_family_src,font_size_pt,font_size_src,bold,italic,underline,color_rgb,color_src
0,Concurrentieanalyse.docx,docx_run,,1,0,/,Montserrat,run,,missing,,,,,missing
1,Concurrentieanalyse.docx,docx_run,,1,1,Concurrentieanalyse,Montserrat,run,,missing,,,,,missing
2,Concurrentieanalyse.docx,docx_run,,2,0,03-09-,Montserrat,run,,missing,,,,,missing
3,Concurrentieanalyse.docx,docx_run,,2,1,2025,Montserrat,run,,missing,,,,,missing
4,Concurrentieanalyse.docx,docx_run,,2,2,,Montserrat,run,,missing,,,,,missing
5,Concurrentieanalyse.docx,docx_run,,2,3,/,Montserrat,run,,missing,,,,,missing
6,Concurrentieanalyse.docx,docx_run,,2,4,,Montserrat,run,,missing,,,,,missing
7,Concurrentieanalyse.docx,docx_run,,2,5,Bedrijf x,Montserrat,run,,missing,,,,,missing
8,Concurrentieanalyse.docx,docx_run,,2,6,,Montserrat,run,,missing,,,,,missing
9,Concurrentieanalyse.docx,docx_run,,3,0,SEO Concurrentieanalyse ‚Äì,,missing,,missing,,,,,missing



Preview: Document.features.csv
Could not preview Document.features.csv -> No columns to parse from file

Preview: PPT 1 - Juist.features.csv


Unnamed: 0,file,kind,slide_idx,text,font_family,font_size_pt,bold,italic,underline,color_rgb
0,PPT 1 - Juist.pptx,pptx_run,1,/,Montserrat,60.0,,,,CC0046
1,PPT 1 - Juist.pptx,pptx_run,1,Sprint 0 üîç,Montserrat,60.0,,,,
2,PPT 1 - Juist.pptx,pptx_run,1,,Montserrat,60.0,,,,
3,PPT 1 - Juist.pptx,pptx_run,1,6 april 2023,,18.0,,,,
4,PPT 1 - Juist.pptx,pptx_run,1,Fontys,Montserrat,60.0,,,,
5,PPT 1 - Juist.pptx,pptx_run,2,Road to online success,Montserrat SemiBold,28.0,True,,,
6,PPT 1 - Juist.pptx,pptx_run,2,Gebruikersniveaus,Montserrat SemiBold,28.0,True,,,
7,PPT 1 - Juist.pptx,pptx_run,2,&,Montserrat SemiBold,28.0,True,,,
8,PPT 1 - Juist.pptx,pptx_run,2,acties,Montserrat SemiBold,28.0,True,,,
9,PPT 1 - Juist.pptx,pptx_run,2,Flows,Montserrat SemiBold,28.0,True,,,
