In [2]:
import docx
# Input Word document; handed to extract_paragraphs() in a later cell.
file_path = "kiritsubo_paragraphs.docx"

In [4]:
def extract_paragraphs(docx_path):
    """
    Read a Word document and return its non-blank paragraphs.

    Args:
        docx_path: Location of the .docx file, passed straight to docx.Document.

    Returns:
        A list of single-entry dicts of the form {"P<n>": paragraph_text}, in
        document order. <n> is the 1-based position of the paragraph among ALL
        paragraphs of the document, so the numbering has gaps wherever a
        whitespace-only paragraph was skipped.
    """
    document = docx.Document(docx_path)

    # Number first, filter second: a skipped blank paragraph still uses up its number.
    return [
        {f"P{position}": para.text}
        for position, para in enumerate(document.paragraphs, start=1)
        if para.text.strip()
    ]

In [6]:
# Load the document's non-empty paragraphs as a list of {"P<n>": text} dicts.
text = extract_paragraphs(file_path)
# Spot-check the first extracted entry.
print(text[0])

{'P1': 'IN WHOSE reign was it that a woman of rather undistinguished lineage captured the heart of the Emperor and enjoyed his favor above all the other imperial wives and concubines? Certain consorts, whose high noble status gave them a sense of vain entitlement, despised and reviled her as an unworthy upstart from the very moment she began her service. Ladies of lower rank were even more vexed, for they knew His Majesty would never bestow the same degree of affection and attention on them. As a result, the mere presence of this woman at morning rites or evening ceremonies seemed to provoke hostile reactions among her rivals, and the anxiety she suffered as a consequence of these ever-increasing displays of jealousy was such a heavy burden that gradually her health began to fail. [link to original paragraph 1.1]'}


In [33]:
import re
def process_paragraphs_with_links(paragraphs_data):
    """
    Separate each paragraph's body text from its trailing link annotation.

    A link annotation starts with the literal marker
    "[link to original paragraph" and is expected to contain a
    "<section>.<paragraph>" number pair, e.g. "[link to original paragraph 1.1]".
    The marker is located with plain string search; the number pair is then
    parsed with a regular expression.

    Args:
        paragraphs_data: Iterable of dicts mapping a paragraph id to its raw
            text (the shape returned by extract_paragraphs()).

    Returns:
        Dict mapping each paragraph id to
        {'text': <text with the annotation removed>, 'links': [<link dict>]}.
        'links' is empty when the paragraph has no annotation or when the
        annotation contains no "<section>.<paragraph>" number. If the same id
        occurs more than once in the input, the last occurrence wins.
    """
    link_marker = "[link to original paragraph"
    # Capture both numbers directly so no second split/parse step is needed.
    number_pattern = re.compile(r"(\d+)\.(\d+)")
    processed_dict = {}

    for para_dict in paragraphs_data:
        for para_id, text in para_dict.items():
            links = []
            clean_text = text

            # rfind: only the LAST marker in the paragraph is treated as the link;
            # everything from that marker to the end of the text is cut off.
            start_idx = text.rfind(link_marker)
            if start_idx != -1:
                link = text[start_idx:]
                clean_text = text[:start_idx].strip()

                match = number_pattern.search(link)
                if match is None:
                    # Best effort: report the malformed annotation and keep going;
                    # the annotation is still stripped from the text.
                    print(
                        f"Error parsing link in {para_id}: "
                        f"no 'section.paragraph' number found in {link!r}"
                    )
                else:
                    links.append({
                        'text_collection': "original",
                        'section': int(match.group(1)),
                        'paragraph': int(match.group(2))
                    })

            processed_dict[para_id] = {
                'text': clean_text,
                'links': links
            }

    return processed_dict

In [34]:
# Split every paragraph into its clean text and its parsed link reference.
data_with_links = process_paragraphs_with_links(text)
# Spot-check the result for paragraph 'P1'.
print(data_with_links['P1'])

{'text': 'IN WHOSE reign was it that a woman of rather undistinguished lineage captured the heart of the Emperor and enjoyed his favor above all the other imperial wives and concubines? Certain consorts, whose high noble status gave them a sense of vain entitlement, despised and reviled her as an unworthy upstart from the very moment she began her service. Ladies of lower rank were even more vexed, for they knew His Majesty would never bestow the same degree of affection and attention on them. As a result, the mere presence of this woman at morning rites or evening ceremonies seemed to provoke hostile reactions among her rivals, and the anxiety she suffered as a consequence of these ever-increasing displays of jealousy was such a heavy burden that gradually her health began to fail.', 'links': [{'text_collection': 'original', 'section': 1, 'paragraph': 1}]}


In [None]:
import re
import datetime
from typing import List, Dict, Any

def process_paragraphs_to_json_format(
    paragraphs_data: List[Dict[str, str]],
    section: int = 1,
    chapter: int = 1,
    link_marker: str = "[link to original paragraph",
) -> List[Dict[str, Any]]:
    """
    Convert extracted paragraphs into the JSON record structure, one record per paragraph.

    Each paragraph's trailing link annotation (everything from the last
    occurrence of `link_marker` to the end of the text) is removed from the
    text and, when it contains a "<section>.<paragraph>" number such as "1.1",
    parsed into an entry of the record's "links" list.

    Args:
        paragraphs_data: List of dicts mapping a paragraph id to its raw text
            (the shape returned by extract_paragraphs()).
        section: Value stored under hierarchy.section of every record.
            Defaults to 1.
        chapter: Value stored under hierarchy.chapter of every record.
            Defaults to 1.
        link_marker: Literal text that starts a link annotation.

    Returns:
        List of dictionaries, in input order, each with the keys "id",
        "textCollectionId", "hierarchy", "content", "links" and "metadata".
        "links" is empty when the paragraph has no annotation or when the
        annotation contains no "<section>.<paragraph>" number.
    """
    result_list = []
    # Capture both numbers directly so no second split/parse step is needed.
    number_pattern = re.compile(r"(\d+)\.(\d+)")
    # One timestamp (local time, no timezone) shared by every record of this run.
    current_time_iso = datetime.datetime.now().isoformat()

    for i, para_dict in enumerate(paragraphs_data, 1):
        for para_id, text in para_dict.items():
            # Records are renumbered by position in paragraphs_data; the incoming
            # para_id is only used in the error message below.
            # NOTE(review): the "P1_" prefix is hard-coded and does not follow
            # `section`/`chapter` - confirm whether that is intended.
            formatted_id = f"P1_{i}"

            links = []
            clean_text = text

            if link_marker in text:
                # rfind: only the LAST marker in the paragraph is treated as the link.
                start_idx = text.rfind(link_marker)
                link = text[start_idx:]
                clean_text = text[:start_idx].strip()

                match = number_pattern.search(link)
                if match is None:
                    # Best effort: report the malformed annotation and keep going;
                    # the annotation is still stripped from the text.
                    print(
                        f"Error parsing link in {para_id}: "
                        f"no 'section.paragraph' number found in {link!r}"
                    )
                else:
                    links.append({
                        'text_collection': "original",
                        'section': int(match.group(1)),
                        'paragraph': int(match.group(2))
                    })

            result_list.append({
                "id": formatted_id,
                "textCollectionId": "TC1",
                "hierarchy": {
                    "section": section,
                    "chapter": chapter,
                    "paragraph": i  # 1-based index
                },
                "content": {
                    "text": clean_text,
                    "formatting": []
                },
                "links": links,
                "metadata": {
                    "created": current_time_iso,
                    "modified": current_time_iso
                }
            })

    return result_list

In [37]:
# Pass the hierarchy position explicitly: this notebook converts section 1, chapter 1.
out = process_paragraphs_to_json_format(text, section=1, chapter=1)

In [38]:
# Display the first generated record to check its structure.
out[0]

{'id': 'P1_1',
 'textCollectionId': 'TC1',
 'hierarchy': {'section': 1, 'chapter': 1, 'paragraph': 1},
 'content': {'text': 'IN WHOSE reign was it that a woman of rather undistinguished lineage captured the heart of the Emperor and enjoyed his favor above all the other imperial wives and concubines? Certain consorts, whose high noble status gave them a sense of vain entitlement, despised and reviled her as an unworthy upstart from the very moment she began her service. Ladies of lower rank were even more vexed, for they knew His Majesty would never bestow the same degree of affection and attention on them. As a result, the mere presence of this woman at morning rites or evening ceremonies seemed to provoke hostile reactions among her rivals, and the anxiety she suffered as a consequence of these ever-increasing displays of jealousy was such a heavy burden that gradually her health began to fail.',
  'formatting': []},
 'links': [{'text_collection': 'original', 'section': 1, 'paragraph'

In [39]:
import json
import os

output_path = "data/content/TC1_CH1.json"
# Create the target directory first so the write also succeeds when
# data/content/ does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)