In [119]:
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import Table
from docx.text.paragraph import Paragraph
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.lib.enums import TA_LEFT, TA_RIGHT
from reportlab.platypus import Paragraph, Table, TableStyle, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.enums import TA_CENTER
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.text.paragraph import Paragraph as DocxParagraph
from docx.table import Table as DocxTable
from reportlab.lib.pagesizes import letter
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer, Image
)
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle, Spacer, Image


In [120]:
def read_docx_paragraphs(file_path):
    """
    Open a Word (.docx) file and collect the text of every non-blank paragraph.

    Parameters:
    - file_path (str): Location of the .docx file to open

    Returns:
    - List[str]: Paragraph texts with surrounding whitespace removed, in the
      order ``document.paragraphs`` yields them; paragraphs that are empty
      after stripping are left out
    """
    non_empty = []
    for para in Document(file_path).paragraphs:
        stripped = para.text.strip()
        if stripped:
            non_empty.append(stripped)
    return non_empty

In [121]:
# Forward slashes avoid the invalid "\P" escape sequence the backslash form
# produced, and the same relative path also resolves on non-Windows systems.
file_path = "demo_pdfs/PLACEMENT SLIP.docx"
paragraphs = read_docx_paragraphs(file_path)

In [122]:
# Preview the first 15 paragraphs, numbered from 1.
for position, line in enumerate(paragraphs[:15], start=1):
    print(f"{position}. {line}")

1. Placement Slip – P_GMC_21086
2. POLICY DETAILS
3. PROPOSED INSURER DETAILS
4. POLICY PERIOD
5. INSURED BUSINESS
6. SUM INSURED & MEMBER COUNT
7. GENERAL INFORMATION
8. COVERAGE DETAILS
9. MATERNITY BENEFITS
10. ADD-ON COVERAGES
11. COST CONTAINMENT
12. OTHER CLAUSES
13. CLAIM EXPERIENCE
14. PREMIUM DETAILS
15. PAYMENT DETAILS


In [123]:
def extract_tables(file_path):
    """
    Read every table of a .docx file.

    Returns a list with one entry per table. Each entry is a list of rows,
    and each row is a list holding the stripped text of its cells.
    """
    return [
        [[cell.text.strip() for cell in row.cells] for row in table.rows]
        for table in Document(file_path).tables
    ]

In [124]:
# Reuse the path defined earlier instead of repeating the literal, which
# contained an invalid "\P" escape sequence.
tables = extract_tables(file_path)

# Preview the first 10 rows of the first table.
for row in tables[0][:10]:
    print(row)

['Field', 'Value']
['Type of Policy', 'GROUP HEALTH POLICY - EMPLOYER/EMPLOYEE']
['Insured Name', 'BHUBANESHWARI COAL MINING LTD \nand/or affiliated and/or interrelated and/or subsidiary companies and/or corporations as they now are or may hereafter be created and/or constituted and/or for whom the Insured receive instructions to insure and/or for whom the Insured have or assume a responsibility to arrange insurance contractually, as their respective rights and interest may appear hereinafter known as the Insured.']
['Communication Address', 'PLOT NO 1554, FLAT NO 2, 1ST FLOOR, SHARMACHHAK , TALCHER, ORISSA TALCHER-759100']
['Proposal Type', 'Renewal']
['Current Policy Number', 'Expiring Policy No. 12120034240400000004']
['Current TPA', 'MEDI ASSIST INDIA TPA PVT. LTD.']


In [125]:


def iter_block_items(parent):
    """
    Yield the body-level paragraphs and tables of ``parent`` in order.

    Walks the direct children of ``parent.element.body``. A ``CT_P`` element
    is wrapped in ``DocxParagraph`` and a ``CT_Tbl`` element in ``DocxTable``;
    children of any other type are skipped.
    """
    wrappers = ((CT_P, DocxParagraph), (CT_Tbl, DocxTable))
    for element in parent.element.body.iterchildren():
        for xml_type, wrapper in wrappers:
            if isinstance(element, xml_type):
                yield wrapper(element, parent)
                break

def extract_ordered_blocks(file_path):
    """
    Read a .docx file into a flat list of paragraph and table blocks.

    Blocks appear in the order ``iter_block_items`` yields them:
    - non-blank paragraphs become ``{"type": "paragraph", "text": <stripped text>}``
    - tables become ``{"type": "table", "data": <rows of stripped cell text>}``

    Paragraphs that are empty after stripping are dropped.
    """
    ordered = []
    for item in iter_block_items(Document(file_path)):
        if isinstance(item, DocxParagraph):
            stripped = item.text.strip()
            if stripped:
                ordered.append({"type": "paragraph", "text": stripped})
            continue
        if isinstance(item, DocxTable):
            grid = [[cell.text.strip() for cell in row.cells] for row in item.rows]
            ordered.append({"type": "table", "data": grid})
    return ordered


In [126]:
# Paragraph and table blocks of the document, in body order.
blocks = extract_ordered_blocks(file_path)


In [127]:
# Dump every block: paragraph text as-is, table rows indented under "Table:".
for block in blocks:
    if block["type"] != "paragraph":
        print("Table:")
        for row in block["data"]:
            print("   ", row)
        continue
    print(block["text"])


Placement Slip – P_GMC_21086
POLICY DETAILS
Table:
    ['Field', 'Value']
    ['Type of Policy', 'GROUP HEALTH POLICY - EMPLOYER/EMPLOYEE']
    ['Insured Name', 'BHUBANESHWARI COAL MINING LTD \nand/or affiliated and/or interrelated and/or subsidiary companies and/or corporations as they now are or may hereafter be created and/or constituted and/or for whom the Insured receive instructions to insure and/or for whom the Insured have or assume a responsibility to arrange insurance contractually, as their respective rights and interest may appear hereinafter known as the Insured.']
    ['Communication Address', 'PLOT NO 1554, FLAT NO 2, 1ST FLOOR, SHARMACHHAK , TALCHER, ORISSA TALCHER-759100']
    ['Proposal Type', 'Renewal']
    ['Current Policy Number', 'Expiring Policy No. 12120034240400000004']
    ['Current TPA', 'MEDI ASSIST INDIA TPA PVT. LTD.']
PROPOSED INSURER DETAILS
Table:
    ['Name of Insurer', 'City of Issuing Office', 'Divisional Office Number']
    ['THE NEW INDIA ASSURANCE

In [128]:
def parse_blocks_to_json(blocks):
    """
    Parse ordered Word blocks into a JSON-like dictionary keyed by section.

    Parameters:
    - blocks (list[dict]): Items shaped like
      ``{"type": "paragraph", "text": str}`` or
      ``{"type": "table", "data": list[list[str]]}``.

    Returns:
    - dict: Maps each section title (the header text in title case) to one of
      * a dict of key/value pairs plus ``Note_<n>`` entries for free text,
      * a list of strings, for sections whose title contains "exclusion" or
        "maternity benefits",
      * a list of row dicts, when a table follows the header. A table
        replaces whatever the section held before it.

    Handles:
    - Section headers (ALL CAPS, at most 5 words)
    - Key-value pairs ("key: value", optionally several separated by "|")
    - Bullet points
    - Tables (first row is the header, remaining rows become dicts)

    Paragraphs and tables that appear before the first header are ignored.
    """
    import re

    data = {}
    current_section = None
    bullet_index = 1  # numbering for Note_<n> keys, restarted per section

    for block in blocks:
        if block["type"] == "paragraph":
            text = block["text"]

            # Section header (ALL CAPS, e.g. "CLAIMS EXPERIENCE")
            if text.isupper() and len(text.split()) <= 5:
                current_section = text.title()
                is_list_section = any(
                    keyword in current_section.lower()
                    for keyword in ["exclusion", "maternity benefits"]
                )
                data[current_section] = [] if is_list_section else {}
                bullet_index = 1
                continue

            if current_section is None:
                continue  # skip orphan paragraphs at the top

            section = data[current_section]
            is_bullet = bool(text.startswith("•") or re.match(r"^[\-\*•] ", text))

            # List-valued sections (including sections already replaced by a
            # table) cannot take string keys, so every line is appended as
            # text; only the bullet marker is removed.
            if isinstance(section, list):
                section.append(text[1:].strip() if is_bullet else text)
                continue

            if is_bullet:
                section[f"Note_{bullet_index}"] = text[1:].strip()
                bullet_index += 1

            # Pipe-split counts (like "Employees: 198 | Dependents: 516");
            # parts without a colon are dropped.
            elif "|" in text:
                for part in text.split("|"):
                    if ":" in part:
                        k, v = part.split(":", 1)
                        section[k.strip()] = v.strip()

            # Key-value lines like "Sum Insured: ₹1,00,000"
            elif ":" in text:
                k, v = text.split(":", 1)
                section[k.strip()] = v.strip()

            # Fallback: keep free text as a numbered note
            else:
                section[f"Note_{bullet_index}"] = text
                bullet_index += 1

        elif block["type"] == "table":
            table_data = block["data"]
            # Ignore tables before the first header and tables with no rows.
            if not current_section or not table_data:
                continue

            headers = [col.strip() for col in table_data[0]]
            data[current_section] = [
                {col: cell.strip() for col, cell in zip(headers, row)}
                for row in table_data[1:]
            ]

    return data


In [129]:
# Section-keyed structure built from the ordered blocks.
parsed_data = parse_blocks_to_json(blocks)

In [130]:
# Pretty-print the parsed structure for inspection.
from pprint import pprint
pprint(parsed_data)

{'Add-On Coverages': [{'Add On Coverages': 'Chemotherapy (IPD/Day)',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'Air Ambulance',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'Dental Treatment (non-accidental '
                                           'OPD)',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'OPD Benefit',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'Critical Illness Benefit',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'Increase in Family SI',
                       'Expired Terms (OPT-1)': 'Not covered'},
                      {'Add On Coverages': 'Lasik +/- 7.0',
                       'Expired Terms (OPT-1)': 'NA'},
                      {'Add On Coverages': 'D

In [131]:
# Section names found by the parser, one per line.
parsed_sections = list(parsed_data)
print("\n".join(parsed_sections))

Policy Details
Proposed Insurer Details
Policy Period
Insured Business
Sum Insured & Member Count
General Information
Coverage Details
Maternity Benefits
Add-On Coverages
Cost Containment
Other Clauses
Claim Experience
Premium Details
Payment Details
Approvals


Report generation

In [132]:

# Paragraph styles shared by the report: wrapped cell text, centred title,
# and section headings. Each derives from a stock ReportLab sample style.
styles = getSampleStyleSheet()
wrap_style = ParagraphStyle(
    name="wrap",
    parent=styles["Normal"],
    fontSize=9,
    leading=12,
    spaceAfter=4,
)
title_style = ParagraphStyle(
    name="Title",
    parent=styles["Title"],
    fontSize=14,
    alignment=TA_CENTER,
)
section_style = ParagraphStyle(
    name="Section",
    parent=styles["Heading2"],
    fontSize=12,
    spaceBefore=12,
    spaceAfter=6,
)


In [139]:


def _markup_safe(value):
    """Return ``value`` as text with &, < and > escaped for Paragraph markup."""
    from xml.sax.saxutils import escape
    return escape(str(value))


def _clean_cell_text(value):
    """Turn a body-cell value into display text: no rupee sign, no NBSP, markup-escaped."""
    text = "" if value is None else str(value)
    return _markup_safe(text.replace("₹", "").replace('\xa0', ' ').strip())


def _build_section_table(rows, keys):
    """Build a styled table with ``keys`` as the header row and one line per dict in ``rows``."""
    header = [Paragraph(_markup_safe(k), wrap_style) for k in keys]
    body = [
        [Paragraph(_clean_cell_text(row.get(k, "")), wrap_style) for k in keys]
        for row in rows
    ]

    # Tables with more than two columns use a slightly narrower total width,
    # a smaller font and a lighter grid; one- and two-column tables share the
    # 7-inch layout (3.5 inch per column for two columns).
    wide = len(keys) > 2
    total_width = 6.5 * inch if wide else 7 * inch
    table = Table([header] + body, colWidths=[total_width / len(keys)] * len(keys))
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#4472C4")),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
        ("FONTNAME", (0, 1), (-1, -1), "Helvetica"),
        ("FONTSIZE", (0, 0), (-1, -1), 8 if wide else 9),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.lightgrey if wide else colors.grey),
        ("VALIGN", (0, 0), (-1, -1), "TOP")
    ]))
    return table


def generate_pdf(parsed_data, output_path="placement_slip_fixed.pdf",
                 title="Placement Slip – P_GMC_21086", logo_path="Edme_logo.png"):
    """
    Render the parsed placement-slip sections into a PDF file.

    Parameters:
    - parsed_data (dict): Section title -> section content. A list of row
      dicts is rendered as a table, a dict as a two-column Field/Value table,
      and plain strings as wrapped paragraphs.
    - output_path (str): Where the PDF is written.
    - title (str): Heading printed at the top of the document.
    - logo_path (str): Image placed above the title; skipped with a message
      when the file does not exist.

    Only the sections named in the fixed order below are rendered, in that
    order. Sections that are missing, empty, or have nothing renderable
    produce no heading. Prints a confirmation line once the file is built.
    """
    import os

    doc = SimpleDocTemplate(output_path, pagesize=letter,
                            leftMargin=0.5*inch, rightMargin=0.5*inch,
                            topMargin=0.5*inch, bottomMargin=0.5*inch)
    elements = []

    # --- Logo and Title ---
    # Check for the file up front: the image may not be opened until the
    # document is built, so wrapping the constructor in try/except would not
    # reliably catch a missing logo.
    if logo_path and os.path.isfile(logo_path):
        elements.append(Image(logo_path, width=7.5*inch, height=1*inch))
    else:
        print(f"Logo not found, skipping: {logo_path}")
    elements.append(Paragraph(_markup_safe(title), title_style))
    elements.append(Spacer(1, 10))

    # --- Define correct section order ---
    section_order = [
        "Policy Details", "Proposed Insurer Details", "Policy Period", "Insured Business",
        "Sum Insured & Member Count", "General Information", "Coverage Details",
        "Maternity Benefits", "Add-On Coverages", "Cost Containment", "Other Clauses",
        "Claim Experience", "Premium Details", "Payment Details", "Approvals"
    ]

    # --- Render Loop ---
    for section in section_order:
        data = parsed_data.get(section)
        if not data:
            continue

        # Normalise the section into table rows plus free-text notes.
        if isinstance(data, dict):
            rows = [{"Field": key, "Value": value} for key, value in data.items()]
            notes = []
        elif isinstance(data, list):
            rows = [item for item in data if isinstance(item, dict)]
            notes = [item for item in data if not isinstance(item, dict)]
        else:
            continue

        body = []
        # Column names come from the first row; later rows fall back to ""
        # for any column they lack.
        keys = list(rows[0].keys()) if rows else []
        if keys:
            body.append(_build_section_table(rows, keys))
            body.append(Spacer(1, 10))
        for note in notes:
            body.append(Paragraph(_clean_cell_text(note), wrap_style))

        # Emit the heading only when something follows it.
        if not body:
            continue
        elements.append(Paragraph(_markup_safe(section.replace("-", " ")), section_style))
        elements.extend(body)

    doc.build(elements)
    print(f"PDF successfully generated: {output_path}")


In [140]:
generate_pdf(parsed_data)


PDF successfully generated: placement_slip_fixed.pdf
