# docx_tools

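This notebook collects utilities for batch-processing `.docx` templates: rewriting placeholder delimiters, highlighting placeholders, listing them, and generating filled-in documents.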

## Replace placeholders

In [34]:
import os
import re
from docx import Document

def _replace_in_paragraphs(paragraphs, pattern, new_pattern, file_path, location):
    """Apply ``pattern -> new_pattern`` to each paragraph in *paragraphs*.

    Args:
        paragraphs: Iterable of python-docx paragraph objects.
        pattern: Compiled regular expression to search for.
        new_pattern: Replacement template passed to ``pattern.subn``.
        file_path: Path of the document, used only in diagnostics.
        location: Label used in diagnostics ("paragraph", "table cell", ...).

    Returns:
        True if at least one paragraph was rewritten.
    """
    changed = False
    for para in paragraphs:
        try:
            new_text, count = pattern.subn(new_pattern, para.text)
        except re.error as e:
            # Raised e.g. when new_pattern references a group the pattern lacks.
            print(f"Regex error in {file_path}, {location}: {str(e)}")
            print(f"Problematic text: {para.text}")
            continue
        if count > 0:
            # NOTE: per the python-docx docs, assigning Paragraph.text collapses
            # the paragraph into a single run, so run-level formatting inside a
            # rewritten paragraph is not preserved.
            para.text = new_text
            changed = True
    return changed


def replace_placeholders(directory, old_pattern, new_pattern):
    """Rewrite placeholders in every ``.docx`` file below *directory*, in place.

    Body paragraphs, paragraphs inside top-level table cells, and the header
    and footer paragraphs of each section are searched. A file is saved only
    when at least one substitution was made. Per-file failures are printed and
    do not stop the walk.

    Table cells are processed paragraph by paragraph instead of through
    ``cell.text``: assigning ``cell.text`` replaces the entire cell content
    with one paragraph, which would discard any other content in the cell.

    Args:
        directory: Root folder that is walked recursively.
        old_pattern: Regular expression (string) matching the placeholders.
        new_pattern: Replacement template; may use group references.

    Raises:
        re.error: If *old_pattern* is not a valid regular expression.
    """
    pattern = re.compile(old_pattern)

    for root, dirs, files in os.walk(directory):
        for file in files:
            # "~$name.docx" files are Word's lock files, not real documents.
            if not file.endswith('.docx') or file.startswith('~$'):
                continue

            file_path = os.path.join(root, file)
            try:
                doc = Document(file_path)

                # Replace placeholders in paragraphs
                modified = _replace_in_paragraphs(
                    doc.paragraphs, pattern, new_pattern, file_path, "paragraph"
                )

                # Replace placeholders in tables. ``|=`` (not ``or``) so the
                # helper is always called even once ``modified`` is True.
                for table in doc.tables:
                    for row in table.rows:
                        for cell in row.cells:
                            modified |= _replace_in_paragraphs(
                                cell.paragraphs, pattern, new_pattern,
                                file_path, "table cell"
                            )

                # Replace placeholders in headers and footers
                for section in doc.sections:
                    modified |= _replace_in_paragraphs(
                        section.header.paragraphs, pattern, new_pattern,
                        file_path, "header"
                    )
                    modified |= _replace_in_paragraphs(
                        section.footer.paragraphs, pattern, new_pattern,
                        file_path, "footer"
                    )

                if modified:
                    doc.save(file_path)
                    print(f"Placeholders replaced in {file_path}")
                else:
                    print(f"No placeholders found in {file_path}")
            except Exception as e:
                # Best effort: report the broken file and continue with the rest.
                print(f"Error processing {file_path}: {str(e)}")

# run example
# Convert «Name» style placeholders into [Name] style placeholders.
# Other source patterns that can be swapped in: r"<[^>]+>" or r"\[[^\]]+\]".
# Other replacement that can be swapped in: r"«\1»".
# A replacement using \1 needs a capturing group in the source pattern.
template_folder = "../templates/ol-dd-template/"
old_pattern = r"«([^»]+)»"
new_pattern = r"[\1]"
replace_placeholders(
    directory=template_folder,
    old_pattern=old_pattern,
    new_pattern=new_pattern,
)

Placeholders replaced in ../templates/ol-dd-template/CompanyName.docx


## Highlight placeholders

In [35]:
import os
import re
from docx import Document
from docx.shared import RGBColor
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

def highlight_text(run, color):
    """Set the highlight colour of a python-docx run.

    The ``w:highlight`` element is obtained through ``get_or_add_highlight()``
    rather than appended blindly, so an existing highlight is updated instead
    of duplicated and a new element is inserted at its proper position among
    the other run properties.

    Args:
        run: python-docx ``Run`` whose underlying ``w:r`` element is modified.
        color: Value written to ``w:val``, e.g. ``'yellow'``.
    """
    rPr = run._r.get_or_add_rPr()
    highlight = rPr.get_or_add_highlight()
    highlight.set(qn('w:val'), color)

def _highlight_matches_in_run(run, pattern, color):
    """Split *run* around every match of *pattern* and highlight the matches.

    Each piece is a deep copy of the original ``w:r`` element, so the pieces
    keep the run's formatting. The pieces are inserted where the original run
    was; nothing else in the paragraph is touched. Only the text of a matching
    run is carried over into the pieces.

    Args:
        run: python-docx ``Run`` to inspect.
        pattern: Compiled regular expression describing a placeholder.
        color: Highlight value written to ``w:val``.

    Returns:
        True if the run contained at least one non-empty match.
    """
    import copy  # local import: only this helper needs it

    text = run.text
    # Zero-width matches would only produce empty highlighted runs.
    spans = [m.span() for m in pattern.finditer(text) if m.end() > m.start()]
    if not spans:
        return False

    r = run._r
    cursor = 0
    for start, end in spans:
        if start > cursor:
            lead = copy.deepcopy(r)
            lead.text = text[cursor:start]
            r.addprevious(lead)
        hit = copy.deepcopy(r)
        hit.text = text[start:end]
        # get_or_add_* updates an existing highlight instead of duplicating it.
        hit.get_or_add_rPr().get_or_add_highlight().set(qn('w:val'), color)
        r.addprevious(hit)
        cursor = end

    if cursor < len(text):
        # Reuse the original element for the text after the last match.
        r.text = text[cursor:]
    else:
        r.getparent().remove(r)
    return True


def highlight_placeholders(folder_path, pattern_regex):
    """Highlight every placeholder in the ``.docx`` files below *folder_path*.

    Body paragraphs and paragraphs inside top-level table cells are scanned.
    Matching is done per run, so a placeholder whose text is split across
    several runs is not detected. Paragraphs without a match are left
    untouched. A file is saved, in place, only when something was highlighted;
    per-file failures are printed and do not stop the walk.

    Args:
        folder_path: Root folder that is walked recursively.
        pattern_regex: Regular expression (string) matching a placeholder.

    Raises:
        re.error: If *pattern_regex* is not a valid regular expression.
    """
    pattern = re.compile(pattern_regex)

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # "~$name.docx" files are Word's lock files, not real documents.
            if not file.endswith('.docx') or file.startswith('~$'):
                continue

            file_path = os.path.join(root, file)
            try:
                doc = Document(file_path)

                # Collect body paragraphs, then paragraphs inside tables.
                paragraphs = list(doc.paragraphs)
                for table in doc.tables:
                    for row in table.rows:
                        for cell in row.cells:
                            paragraphs.extend(cell.paragraphs)

                modified = False
                for para in paragraphs:
                    # para.runs is a snapshot list, so splitting runs while
                    # looping over it is safe.
                    for run in para.runs:
                        if _highlight_matches_in_run(run, pattern, 'yellow'):
                            modified = True

                if modified:
                    doc.save(file_path)
                    print(f"Placeholders highlighted in {file_path}")
                else:
                    print(f"No placeholders found in {file_path}")
            except Exception as e:
                # Best effort: report the broken file and continue with the rest.
                print(f"Error processing {file_path}: {str(e)}")

# Example usage
# Highlight every [Name] style placeholder in the template folder.
template_folder = "../templates/ol-dd-template"
pattern_regex = r"\[[^\]]+\]"
highlight_placeholders(folder_path=template_folder, pattern_regex=pattern_regex)


Placeholders highlighted in ../templates/ol-dd-template/CompanyName.docx


## List placeholders

In [37]:
import re
import json
import os
from docx import Document
from collections import Counter

def extract_placeholders(docx_path, regex_pattern):
    """Return every placeholder match found in one ``.docx`` file.

    Body paragraphs and the text of top-level table cells are searched.
    Matches are returned in document order, duplicates included, in the shape
    produced by ``findall`` (strings, or tuples if the pattern has several
    groups).

    Args:
        docx_path: Path of the document to read.
        regex_pattern: Regular expression matching a placeholder.

    Returns:
        List of matches.

    Raises:
        Exception: On any failure (unreadable file, invalid pattern, ...);
            the original error is attached as ``__cause__``.
    """
    try:
        pattern = re.compile(regex_pattern)
        doc = Document(docx_path)
        placeholders = []

        for paragraph in doc.paragraphs:
            placeholders.extend(pattern.findall(paragraph.text))

        # python-docx reports a merged cell once for every grid position it
        # covers; remember the underlying elements so each cell counts once.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if tc in seen_cells:
                        continue
                    seen_cells.add(tc)
                    placeholders.extend(pattern.findall(cell.text))

        return placeholders
    except Exception as e:
        raise Exception(f"Error extracting placeholders from {docx_path}: {str(e)}") from e

def process_folder(folder_path, regex_pattern):
    """Collect the distinct placeholders used by all ``.docx`` files in a folder.

    The folder is walked recursively. Processing is fail-fast: the first
    document that cannot be read aborts the scan.

    Args:
        folder_path: Root folder to scan.
        regex_pattern: Regular expression matching a placeholder.

    Returns:
        Tuple ``(placeholders, files_processed)`` where *placeholders* lists
        each distinct match once, most frequent first (ties keep first-seen
        order), and *files_processed* is the number of documents read.

    Raises:
        Exception: If a document fails; the original error is attached as
            ``__cause__``.
    """
    all_placeholders = []
    files_processed = 0

    for root, _, files in os.walk(folder_path):
        for file in files:
            # "~$name.docx" files are Word's lock files; trying to parse one
            # would abort the whole scan while a template is open in Word.
            if not file.endswith('.docx') or file.startswith('~$'):
                continue

            file_path = os.path.join(root, file)
            try:
                all_placeholders.extend(extract_placeholders(file_path, regex_pattern))
            except Exception as e:
                raise Exception(f"Error processing {file_path}: {str(e)}") from e
            files_processed += 1

    # most_common() sorts by count, descending, keeping insertion order on ties.
    sorted_placeholders = [
        placeholder for placeholder, _ in Counter(all_placeholders).most_common()
    ]
    return sorted_placeholders, files_processed

# Scan the template folder for [Name] style placeholders and print them as a
# JSON object with empty values, ready to be filled in.
template_folder = "../templates/ol-dd-template"
pattern_regex = r"\[[^\]]+\]"

placeholders, files_processed = process_folder(
    folder_path=template_folder,
    regex_pattern=pattern_regex,
)

if files_processed and placeholders:
    placeholder_dict = dict.fromkeys(placeholders, "")
    print(f"Placeholders extracted from {files_processed} files, output:")
    print(json.dumps(placeholder_dict, indent=2))
elif files_processed:
    print("No placeholders found in the processed files.")
else:
    print(f"No .docx files found in {template_folder} or its subfolders.")

Placeholders extracted from 1 files, output:
{
  "[Year_End]": "",
  "[Director1Name]": "",
  "[AGM_Date]": "",
  "[Address]": "",
  "[CompanyName]": "",
  "[UEN]": "",
  "[Director2Name]": "",
  "[Directorsremuneration]": "",
  "[Director3Name]": "",
  "[ShareHolder1]": "",
  "[ShareHolder2]": "",
  "[ShareHolder3]": "",
  "[ShareHolder4]": ""
}


## Generate documents

In [41]:
import json
import os
from docx import Document
from docx.shared import RGBColor
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn

def generate_documents(template_folder, json_string, output_folder):
    """Fill in every ``.docx`` template and write the results to *output_folder*.

    The template folder is walked recursively and its sub-folder layout is
    reproduced under *output_folder*. Placeholders are replaced in body
    paragraphs and in paragraphs inside top-level table cells; the templates
    themselves are not modified.

    Args:
        template_folder: Root folder containing the templates.
        json_string: JSON object mapping placeholder text to its replacement.
        output_folder: Root folder for the generated documents.

    Raises:
        json.JSONDecodeError: If *json_string* is not valid JSON.
    """
    # Load the placeholders from the JSON string
    placeholders = json.loads(json_string)

    # Walk through the template folder
    for root, _, files in os.walk(template_folder):
        for file in files:
            # "~$name.docx" files are Word's lock files, not real documents;
            # opening one would abort the whole generation run.
            if not file.endswith('.docx') or file.startswith('~$'):
                continue

            # Construct paths
            template_path = os.path.join(root, file)
            relative_path = os.path.relpath(template_path, template_folder)
            output_path = os.path.join(output_folder, relative_path)

            # Create output directory if it doesn't exist. dirname is empty
            # when writing straight into the current directory, and
            # os.makedirs('') would raise.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Load the original document
            doc = Document(template_path)

            # Replace placeholders in paragraphs
            for paragraph in doc.paragraphs:
                replace_and_highlight(paragraph, placeholders)

            # Replace placeholders in tables
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        for paragraph in cell.paragraphs:
                            replace_and_highlight(paragraph, placeholders)

            # Save the new document
            doc.save(output_path)

def replace_and_highlight(paragraph, placeholders):
    """Substitute placeholders in a paragraph's runs and highlight changed runs.

    Every ``w:r`` element below the paragraph is examined on its own, so a
    placeholder whose text is split across several runs is not replaced.
    A run that had at least one placeholder replaced is highlighted once, as a
    whole.

    Args:
        paragraph: python-docx paragraph to process.
        placeholders: Mapping of placeholder text to replacement value.
            Values are converted with ``str``; ``None`` becomes an empty
            string. Empty keys are ignored.
    """
    # str.replace needs string arguments, but JSON values may be numbers,
    # booleans or null. An empty key would match between every character.
    substitutions = [
        (placeholder, "" if value is None else str(value))
        for placeholder, value in placeholders.items()
        if placeholder
    ]

    for r in paragraph._p.xpath('.//w:r'):
        text = r.text
        last_value = None
        for placeholder, value in substitutions:
            if placeholder in text:
                text = text.replace(placeholder, value)
                last_value = value
        if last_value is not None:
            r.text = text
            # One call per run, however many placeholders it contained.
            highlight_run(r, last_value)

def highlight_run(run, text):
    """Give a ``w:r`` element a yellow highlight.

    The ``w:highlight`` element is obtained through ``get_or_add_highlight()``
    rather than appended blindly, so calling this repeatedly on the same run
    does not create duplicate elements and a new element is inserted at its
    proper position among the other run properties.

    Args:
        run: The run's XML element (it must provide ``get_or_add_rPr``).
        text: Unused; kept so existing callers keep working.
    """
    rPr = run.get_or_add_rPr()
    highlight = rPr.get_or_add_highlight()
    highlight.set(qn('w:val'), 'yellow')


# Example run: fill the templates with the values below and write the
# generated documents to a separate output folder.
output_folder = "../output/ol-dd-generated"
template_folder = "../templates/ol-dd-template"

# Placeholder -> value mapping, passed as a JSON string.
json_input = """
{
  "[Director3Name]": "",
  "[ShareHolder4]": "",
  "[AGM_Date]": "2024-01-01",
  "[Directorsremuneration]": "",
  "[ShareHolder2]": "Joyce",
  "[ShareHolder1]": "James",
  "[Director1Name]": "James",
  "[Director2Name]": "Joyce",
  "[Address]": "123, XYZ ST",
  "[CompanyName]": "ABC",
  "[ShareHolder3]": "",
  "[UEN]": "",
  "[Year_End]": "2024-12-31"
}
"""

generate_documents(
    template_folder=template_folder,
    json_string=json_input,
    output_folder=output_folder,
)
print(f"Generated new documents in {output_folder}")


Generated new documents in ../output/ol-dd-generated
