# Text-Based PDF Extraction + PDF Generation

1. Extract structured data from a **text-based PDF** using:

   - `pdfplumber` for text extraction
   - `rapidfuzz` for fuzzy keyword matching

2. Generate a PDF report using:
   - `reportlab` to convert extracted fields into a formatted PDF

The code extracts the following fields:

- Policy Number
- Insured Name
- Sum Insured
- Premium
- Policy Start
- Policy End

Then, it creates a nicely formatted PDF titled "Insurance Policy Summary" containing those values.

Output:

- Extracted data printed in the notebook
- Extracted data saved to an Excel file: `parsed_policy_data.xlsx`
- Generated PDF file: `policy_summary_from_textpdf.pdf`


In [4]:
import pandas as pd
import pdfplumber
from rapidfuzz import process, fuzz
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas


In [5]:
# Path of the policy PDF to parse, relative to the notebook's working directory.
# A forward slash is used instead of "\": the backslash form only worked because
# "\p" is not a recognised escape sequence (Python warns about such escapes), and
# a backslash is a path separator on Windows only.
pdf_path = "demo_pdfs/policy_1.pdf"

In [6]:
# Open the source document and pull the text of its first page only;
# the context manager closes the file once the text has been read.
with pdfplumber.open(pdf_path) as document:
    raw_text = document.pages[0].extract_text()

# print() is used instead of a bare expression so line breaks show as lines.
print(raw_text)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Insurance Policy Document
Policy Number: EDME-00123-XY
Insured Name: Rahul Sharma
Sum Insured: 5,00,000
Premium: 12,500
Policy Start: 01-Apr-2024
Policy End: 31-Mar-2025


In [7]:
# Fields to extract. Each label maps to the keywords that must ALL appear
# (case-insensitively) in the fuzzy-matched line before its value is accepted;
# this rejects lines that score well but belong to a different field.
fields = {
    "Policy Number": ["number"],
    "Insured Name": ["name"],
    "Sum Insured": ["sum", "insured"],
    "Premium": ["premium"],
    "Policy Start": ["start"],
    "Policy End": ["end"]
}

# Candidate lines to search. `or ""` means a None `raw_text` yields no lines
# instead of raising AttributeError.
lines = (raw_text or "").splitlines()

# label -> extracted value (None when the field could not be found or parsed)
parsed_data = {}

for label, required_keywords in fields.items():
    # Best fuzzy match for the label among the lines; None when no line
    # reaches the score cutoff of 50.
    result = process.extractOne(label, lines, scorer=fuzz.token_sort_ratio, score_cutoff=50)

    if result is None:
        print(f"No close match found for '{label}'")
        parsed_data[label] = None
        continue

    # Only the matched text is needed; the score and index are not used.
    match_line = result[0]
    line_lower = match_line.lower()

    if not all(keyword.lower() in line_lower for keyword in required_keywords):
        print(f"Match score was high, but context mismatch for '{label}'. Ignored: {match_line}")
        parsed_data[label] = None
        continue

    # Split on the FIRST colon only, so a value that itself contains ":"
    # (e.g. a time such as "10:30") is kept whole instead of being truncated.
    # A line without any colon has no value part and is recorded as None.
    _, separator, remainder = match_line.partition(":")
    value = remainder.strip() if separator else None

    parsed_data[label] = value
    print(f"{label}: {value}")


Policy Number: EDME-00123-XY
Insured Name: Rahul Sharma
Sum Insured: 5,00,000
Premium: 12,500
Policy Start: 01-Apr-2024
Policy End: 31-Mar-2025


In [8]:
# Requires `parsed_data` from the extraction cell above.
# Destination workbook for the extracted fields.
output_path = "parsed_policy_data.xlsx"

# A single-element list of the dict gives one row with one column per field.
df = pd.DataFrame([parsed_data])

# index=False leaves the DataFrame's row index out of the sheet.
df.to_excel(output_path, index=False)

print(f"Data saved to {output_path}")


Data saved to parsed_policy_data.xlsx


In [9]:

# Generate a one-page PDF report from `parsed_data`.
# The output path gets its own name so the input `pdf_path` is not overwritten;
# otherwise re-running the extraction cell after this one would open the
# generated summary instead of the source policy document.
summary_pdf_path = "policy_summary_from_textpdf.pdf"
c = canvas.Canvas(summary_pdf_path, pagesize=A4)

# A4 is a (width, height) pair; only the height is needed to place text
# relative to the top of the page.
height = A4[1]
y = height - 50

# Title line
c.setFont("Helvetica-Bold", 16)
c.drawString(50, y, "Insurance Policy Summary")
y -= 40

# One "field: value" line per extracted field, stepping down the page
c.setFont("Helvetica", 12)
for key, value in parsed_data.items():
    c.drawString(50, y, f"{key}: {value}")
    y -= 25

# save() writes the finished document to summary_pdf_path
c.save()
print("PDF summary saved as:", summary_pdf_path)


PDF summary saved as: policy_summary_from_textpdf.pdf
