In [1]:
import docx
# Source .docx read by the cells below; a bare filename, so it is resolved
# against the kernel's current working directory.
file_path = "kiritsubo_paragraphs.docx"

In [None]:
import re
import datetime

def extract_links(paragraph_text):
    """
    Split a "[link to original paragraph S.P]" marker off a paragraph.

    The marker is located with a plain substring search (the last occurrence
    wins); the "S.P" reference inside it is then parsed with a regular
    expression.

    Args:
        paragraph_text: Raw paragraph text, possibly containing a link marker.

    Returns:
        A ``(text, links)`` tuple. Without a marker, ``text`` is
        ``paragraph_text`` unchanged and ``links`` is empty. With a marker,
        ``text`` is everything before the last marker with surrounding
        whitespace stripped, and ``links`` holds at most one dict shaped like
        ``{'textCollectionId': 'original', 'hierarchy': {...}}``. If the marker
        carries no "S.P" reference, a message is printed and ``links`` stays
        empty.
    """
    link_marker = "[link to original paragraph"
    links = []

    start_idx = paragraph_text.rfind(link_marker)
    if start_idx == -1:
        # No marker: hand the text back untouched.
        return paragraph_text, links

    # Everything from the marker to the end of the paragraph is the link part.
    link = paragraph_text[start_idx:]
    clean_text = paragraph_text[:start_idx].strip()

    # First "<digits>.<digits>" after the marker, e.g. "1.1".
    match = re.search(r"(\d+)\.(\d+)", link)
    if match is None:
        # Report the real problem instead of surfacing an AttributeError from
        # calling .group() on a failed search.
        print(f"Error parsing link: no section.paragraph reference found in {link!r}")
        return clean_text, links

    section_str, paragraph_str = match.groups()
    links.append({
        'textCollectionId': "original",
        "hierarchy": {
            'section': int(section_str),
            # NOTE(review): chapter mirrors the section number, as in the
            # original code - confirm this is intended.
            'chapter': int(section_str),
            'paragraph': int(paragraph_str)
        }
    })

    return clean_text, links

from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

def get_text_format(paragraph):
    """
    Collect the character ranges of italic and/or underlined runs.

    Args:
        paragraph: Object exposing ``text`` (str) and ``runs``; each run
            exposes ``text``, ``italic`` and ``underline``.

    Returns:
        A list of ``{"start": int, "end": int, "type": [...]}`` dicts, one per
        non-empty run that is italic and/or underlined, in run order.
        ``start``/``end`` are offsets into ``paragraph.text`` and ``type``
        contains ``"italic"`` and/or ``"underlined"``. Bold is not reported.
    """
    full_text = paragraph.text
    formatting = []
    cursor = 0  # offset in full_text just past the previously located run

    for run in paragraph.runs:
        run_text = run.text
        if not run_text:
            # An empty run covers no characters, so it cannot carry a span.
            continue

        # Search forward from the previous run rather than from offset 0, so a
        # run whose text also occurs earlier in the paragraph (e.g. an italic
        # "the") is anchored at its own position, not at the first occurrence.
        start = full_text.find(run_text, cursor)
        if start == -1:
            # Run text not found past the cursor; assume it directly follows
            # the previous run.
            start = cursor
        end = start + len(run_text)
        cursor = end

        styles = []
        if run.italic:
            styles.append("italic")
        if run.underline:
            styles.append("underlined")

        if styles:
            formatting.append({"start": start, "end": end, "type": styles})

    return formatting

def process_indent(value):
    """
    Scale an indent value by 1/914400, treating a missing indent as 0.

    NOTE(review): 914400 is presumably the number of EMUs per inch
    (python-docx lengths are EMU-based), which would make the result
    inches - confirm.

    Args:
        value: Numeric indent, or ``None`` when no indent is set.

    Returns:
        ``0`` when ``value`` is ``None``, otherwise ``value / 914400``.
    """
    return 0 if value is None else value / 914400


# Lookup from python-docx paragraph alignment members to the lowercase labels
# written into the exported paragraph formatting.
alignment_map = dict(
    (
        (WD_PARAGRAPH_ALIGNMENT.LEFT, "left"),
        (WD_PARAGRAPH_ALIGNMENT.CENTER, "center"),
        (WD_PARAGRAPH_ALIGNMENT.RIGHT, "right"),
        (WD_PARAGRAPH_ALIGNMENT.JUSTIFY, "justify"),
    )
)

def get_paragraph_format(paragraph):
    """
    Summarise a paragraph's indentation, alignment and run-level styling.

    Args:
        paragraph: Object exposing ``paragraph_format`` (with ``left_indent``,
            ``right_indent``, ``first_line_indent`` and ``alignment``) plus
            whatever ``get_text_format`` reads from it.

    Returns:
        A dict with keys ``left_indent``, ``right_indent`` and
        ``first_line_indent`` (each passed through ``process_indent``),
        ``alignment`` (a label from ``alignment_map``) and ``text_styles``
        (the result of ``get_text_format``).
    """
    fmt = paragraph.paragraph_format
    alignment = fmt.alignment

    return {
        "left_indent": process_indent(fmt.left_indent),
        "right_indent": process_indent(fmt.right_indent),
        "first_line_indent": process_indent(fmt.first_line_indent),
        # Fall back to "left" both when no alignment is set (None) and when
        # the member is missing from alignment_map; a plain subscript would
        # raise KeyError for such members (e.g. DISTRIBUTE).
        "alignment": alignment_map.get(alignment, "left"),
        "text_styles": get_text_format(paragraph)
    }

def extract_paragraphs(docx_path, text_collection_id, section, chapter):
    """
    Turn the non-blank paragraphs of a .docx file into JSON-ready records.

    Args:
        docx_path: Path handed to ``docx.Document``.
        text_collection_id: Stored as ``textCollectionId`` on every record.
        section: Stored under ``hierarchy.section`` on every record.
        chapter: Stored under ``hierarchy.chapter`` on every record.

    Returns:
        A list of dicts, one per paragraph whose text is not blank. Ids and
        ``hierarchy.paragraph`` use the paragraph's 1-based position in the
        document, so skipped blank paragraphs leave gaps in the numbering.
        All records share one ``created``/``modified`` ISO timestamp taken
        once per call.
    """
    document = docx.Document(docx_path)
    timestamp = datetime.datetime.now().isoformat()

    records = []
    for position, para in enumerate(document.paragraphs, start=1):
        raw_text = para.text
        if not raw_text.strip():
            # Blank paragraph: nothing to export.
            continue

        # Separate the body text from the link marker, if there is one.
        body_text, link_refs = extract_links(raw_text)

        hierarchy = {
            "section": section,
            "chapter": chapter,
            "paragraph": position
        }
        content = {
            "text": body_text,
            "formatting": get_paragraph_format(para)
        }
        stamps = {
            "created": timestamp,
            "modified": timestamp
        }
        records.append({
            "id": f"P{position}",
            "textCollectionId": text_collection_id,
            "hierarchy": hierarchy,
            "content": content,
            "links": link_refs,
            "metadata": stamps
        })

    return records

In [12]:
# Extract every non-blank paragraph as text collection 'TC1', section 1,
# chapter 1, then print the first record as a quick sanity check.
text = extract_paragraphs(file_path, 'TC1', 1, 1)
print(text[0])

{'id': 'P1', 'textCollectionId': 'TC1', 'hierarchy': {'section': 1, 'chapter': 1, 'paragraph': 1}, 'content': {'text': 'IN WHOSE reign was it that a woman of rather undistinguished lineage captured the heart of the Emperor and enjoyed his favor above all the other imperial wives and concubines? Certain consorts, whose high noble status gave them a sense of vain entitlement, despised and reviled her as an unworthy upstart from the very moment she began her service. Ladies of lower rank were even more vexed, for they knew His Majesty would never bestow the same degree of affection and attention on them. As a result, the mere presence of this woman at morning rites or evening ceremonies seemed to provoke hostile reactions among her rivals, and the anxiety she suffered as a consequence of these ever-increasing displays of jealousy was such a heavy burden that gradually her health began to fail.', 'formatting': {'left_indent': 0, 'right_indent': 0, 'first_line_indent': 0, 'alignment': 'left

In [13]:
import json
from pathlib import Path

# Persist the extracted records as pretty-printed UTF-8 JSON. The parent
# directory is created first because open(..., "w") does not create missing
# directories, so the cell would otherwise fail on a fresh checkout.
output_path = Path("data/content/TC1_CH1.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(text, f, ensure_ascii=False, indent=2)

In [43]:
# Scratch inspection: print the indentation, alignment and spacing attributes
# of the first paragraph in the document.
# NOTE(review): per python-docx, a None here presumably means the value is not
# set directly on the paragraph and is inherited from its style - confirm.
doc = docx.Document(file_path)
paragraph = doc.paragraphs[0]

# Accessing paragraph indentation
left_indent = paragraph.paragraph_format.left_indent
right_indent = paragraph.paragraph_format.right_indent
first_line_indent = paragraph.paragraph_format.first_line_indent

# Accessing paragraph alignment
alignment = paragraph.paragraph_format.alignment

# Accessing paragraph spacing
space_before = paragraph.paragraph_format.space_before
space_after = paragraph.paragraph_format.space_after
line_spacing = paragraph.paragraph_format.line_spacing

print(f"Left indent: {left_indent}")
print(f"Right indent: {right_indent}")
print(f"First line indent: {first_line_indent}")
print(f"Alignment: {alignment}")
print(f"Space before: {space_before}")
print(f"Space after: {space_after}")
print(f"Line spacing: {line_spacing}")

Left indent: None
Right indent: None
First line indent: None
Alignment: RIGHT (2)
Space before: None
Space after: None
Line spacing: None


In [51]:
# Display the LEFT member of WD_PARAGRAPH_ALIGNMENT to see its repr and value.
docx.enum.text.WD_PARAGRAPH_ALIGNMENT.LEFT

<WD_PARAGRAPH_ALIGNMENT.LEFT: 0>

In [28]:
# Preview the non-blank paragraphs among the first 30 of the document.
doc = docx.Document(file_path)

# Slice once instead of indexing doc.paragraphs[i] over range(30): a document
# with fewer than 30 paragraphs no longer raises IndexError, and the paragraph
# list is built a single time rather than on every iteration.
for i, paragraph in enumerate(doc.paragraphs[:30]):
    if paragraph.text.strip():
        print(f"Paragraph {i+1}: {paragraph.text}")

Paragraph 1: IN WHOSE reign was it that a woman of rather undistinguished lineage captured the heart of the Emperor and enjoyed his favor above all the other imperial wives and concubines? Certain consorts, whose high noble status gave them a sense of vain entitlement, despised and reviled her as an unworthy upstart from the very moment she began her service. Ladies of lower rank were even more vexed, for they knew His Majesty would never bestow the same degree of affection and attention on them. As a result, the mere presence of this woman at morning rites or evening ceremonies seemed to provoke hostile reactions among her rivals, and the anxiety she suffered as a consequence of these ever-increasing displays of jealousy was such a heavy burden that gradually her health began to fail. [link to original paragraph 1.1]
Paragraph 3: His Majesty could see how forlorn she was, how often she returned to her family home. He felt sorry for her and wanted to help, and though he could scarcely 