In [1]:
import random
from io import BytesIO
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_JUSTIFY
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer,
    Table, TableStyle, PageBreak, Image
)
from reportlab.lib import colors
from PIL import Image as PILImage, ImageDraw, ImageFont

class UltimateRAGBenchmarkGenerator:
    """Build a synthetic multi-section PDF intended to stress RAG pipelines.

    The document consists of a cover page followed by one page-broken section
    per entry in ``_SECTION_TITLES``. Each section mixes randomly assembled
    prose, one randomly chosen table and one or two hand-drawn raster figures,
    with a fixed header and page-number footer on every page.

    Args:
        filename: Path of the PDF written by :meth:`generate`.
        seed: Optional seed. When given, all random draws come from a private
            ``random.Random(seed)`` so two generators built with the same seed
            draw identical text, table choices and figure geometry. When
            ``None`` (default) the module-level ``random`` functions are used,
            so a caller's own ``random.seed(...)`` keeps working as before.
            Only the drawn content is covered; any metadata ReportLab itself
            writes into the file is outside this class's control.
    """

    # Pixel size (width, height) of every generated figure.
    _FIGURE_SIZE = (600, 300)

    # Sentence pool that paragraph() samples from.
    _SENTENCE_POOL = (
        "Operational throughput exhibited non-linear adjustments over the observation window, influenced by regional scheduling constraints.",
        "Certain dependencies introduced latency that could not be isolated to a single functional unit.",
        "Comparative analysis against prior intervals suggests gradual stabilization rather than abrupt correction.",
        "Internal coordination benefited from informal escalation paths that were not formally documented.",
        "These conditions persisted without materially altering aggregate outcomes.",
        "A multi-step procedure required careful cross-checks between related sections to maintain consistency.",
        "Minor inconsistencies in notation arose across the different submodules, impacting interpretation.",
        "Embedded diagrams provided contextual information not easily referenced in the surrounding prose.",
        "Tables contained numeric sequences that, when extracted incorrectly, reversed intended meaning.",
        "Some sections referenced visual data with ambiguous labels, complicating automated retrieval.",
    )

    # One section (and one page break) is emitted per title, in this order.
    _SECTION_TITLES = (
        "Operational Overview",
        "Regional Observations",
        "Infrastructure Summary",
        "Extended Records",
        "Financial Notes",
        "Analytics Highlights",
        "Supplementary Data",
    )

    def __init__(self, filename="RAG_BENCHMARK.pdf", seed=None):
        self.filename = filename
        # The `random` module exposes the same randint/shuffle/choice API as a
        # Random instance, so it can stand in as the unseeded default.
        self._rng = random if seed is None else random.Random(seed)
        self.styles = getSampleStyleSheet()
        self._setup_styles()

    # ---------- STYLES ----------
    def _setup_styles(self):
        """Register the justified 10pt ``MainText`` body style on ``self.styles``."""
        self.styles.add(ParagraphStyle(
            name='MainText',
            parent=self.styles['Normal'],
            fontSize=10,
            leading=14,
            alignment=TA_JUSTIFY,
            spaceAfter=10
        ))

    # ---------- HEADER / FOOTER ----------
    def _header_footer(self, canvas, doc):
        """Draw the confidentiality header and centred page number on a page.

        Used as both the ``onFirstPage`` and ``onLaterPages`` callback of
        ``doc.build``. Positions are derived from ``doc.pagesize`` so they
        follow whatever page size the document was created with.
        """
        page_width, page_height = doc.pagesize
        canvas.saveState()
        canvas.setFont("Helvetica", 8)
        canvas.drawString(
            inch,
            page_height - 0.5 * inch,
            "ACME CORPORATION — INTERNAL USE ONLY"
        )
        canvas.drawCentredString(
            page_width / 2,
            0.5 * inch,
            f"Page {doc.page}"
        )
        canvas.restoreState()

    # ---------- FIGURE GENERATORS ----------
    def _new_canvas(self, title):
        """Return ``(image, draw)`` for a blank white figure with ``title`` drawn top-left."""
        img = PILImage.new("RGB", self._FIGURE_SIZE, "white")
        d = ImageDraw.Draw(img)
        d.text((20, 10), title, fill="black")
        return img, d

    def line_plot(self, title):
        """Return a PNG buffer with two axes and a six-point random polyline."""
        img, d = self._new_canvas(title)
        d.line([(60, 250), (560, 250)], fill="black", width=2)  # x axis
        d.line([(60, 40), (60, 250)], fill="black", width=2)    # y axis
        pts = [(60 + i * 80, 250 - self._rng.randint(20, 200)) for i in range(6)]
        d.line(pts, fill="black", width=3)
        return self._img_buf(img)

    def bar_chart(self, title):
        """Return a PNG buffer with six outlined bars of random height."""
        img, d = self._new_canvas(title)
        base = 260  # y coordinate of the shared bar baseline
        for i in range(6):
            h = self._rng.randint(40, 200)
            d.rectangle([80 + i * 70, base - h, 120 + i * 70, base], outline="black", fill=None)
        return self._img_buf(img)

    def step_plot(self, title):
        """Return a PNG buffer with a five-step staircase line of random run and level."""
        img, d = self._new_canvas(title)
        x, y = 60, 250
        for _ in range(5):
            nx = x + self._rng.randint(60, 100)
            ny = self._rng.randint(80, 240)
            d.line([(x, y), (nx, y)], fill="black", width=2)    # horizontal run
            d.line([(nx, y), (nx, ny)], fill="black", width=2)  # vertical jump
            x, y = nx, ny
        return self._img_buf(img)

    def scatter_plot(self, title):
        """Return a PNG buffer with 40 randomly placed dots."""
        img, d = self._new_canvas(title)
        for _ in range(40):
            x = self._rng.randint(70, 550)
            y = self._rng.randint(60, 260)
            d.ellipse([x - 2, y - 2, x + 2, y + 2], fill="black")
        return self._img_buf(img)

    def timeline_diagram(self, title):
        """Return a PNG buffer with a horizontal line carrying five ticks labelled T1..T5."""
        img, d = self._new_canvas(title)
        d.line([(80, 150), (520, 150)], fill="black", width=2)
        for i in range(5):
            x = 80 + i * 100
            d.line([(x, 140), (x, 160)], fill="black", width=2)
            d.text((x - 10, 170), f"T{i+1}", fill="black")
        return self._img_buf(img)

    def _img_buf(self, img):
        """Encode ``img`` as PNG into a ``BytesIO`` rewound to the start."""
        buf = BytesIO()
        img.save(buf, format="PNG")
        buf.seek(0)
        return buf

    # ---------- TEXT GENERATORS ----------
    def paragraph(self):
        """Return 3-5 distinct sentences from the pool, in random order, joined by spaces."""
        blocks = list(self._SENTENCE_POOL)
        # Shuffle first, then draw the length: this keeps the draw order of the
        # original implementation, so globally seeded runs are unchanged.
        self._rng.shuffle(blocks)
        return " ".join(blocks[:self._rng.randint(3, 5)])

    # ---------- TABLES (ALL DIFFERENT SHAPES) ----------
    def table_a(self):
        """Return a gridded 3-column table: header plus three unit/score/rank rows."""
        return Table(
            [["Unit", "Score", "Rank"],
             ["A1", "78.4", "3"],
             ["B2", "91.2", "1"],
             ["C7", "66.9", "5"]],
            colWidths=[2 * inch, 2 * inch, 1 * inch],
            style=[('GRID', (0, 0), (-1, -1), 0.5, colors.black)]
        )

    def table_b(self):
        """Return a gridded 5-column table: header plus two regional index rows."""
        return Table(
            [["Region", "Index α", "Index β", "Index γ", "Status"],
             ["North", "0.82", "1.14", "0.77", "Open"],
             ["East", "0.64", "1.02", "0.69", "Limited"]],
            colWidths=[1.2 * inch] * 5,
            style=[('GRID', (0, 0), (-1, -1), 0.5, colors.black)]
        )

    def table_c(self):
        """Return a gridded 2-column metric/value table: header plus three rows."""
        return Table(
            [["Metric", "Value"],
             ["Throughput", "1243"],
             ["Deviation", "-3.2%"],
             ["Adjusted Baseline", "ZX-4419"]],
            colWidths=[3 * inch, 2 * inch],
            style=[('GRID', (0, 0), (-1, -1), 0.5, colors.black)]
        )

    # ---------- DOCUMENT GENERATOR ----------
    def _section_flowables(self, index, title, figure, tables):
        """Return the flowables for one section, ending with a page break.

        Args:
            index: Zero-based section number; used for figure captions and to
                decide whether the section gets the extra paragraph/figure.
            title: Section heading text.
            figure: Figure generator method to call for this section.
            tables: Table factory methods; one is picked at random.
        """
        flowables = [Paragraph(title, self.styles['Heading2'])]

        # 3-5 paragraphs of shuffled sentences (chunking challenge)
        for _ in range(self._rng.randint(3, 5)):
            flowables.append(Paragraph(self.paragraph(), self.styles['MainText']))

        flowables.append(Spacer(1, 0.2 * inch))

        # One randomly chosen table; consecutive sections may pick the same one
        flowables.append(self._rng.choice(tables)())
        flowables.append(Spacer(1, 0.3 * inch))

        flowables.append(Image(figure(f"Figure {index+1}"), width=5 * inch, height=2.5 * inch))

        # Every even-indexed section gets one more paragraph and a freshly
        # drawn variant of the same figure type, producing near-duplicates
        if index % 2 == 0:
            flowables.append(Paragraph(self.paragraph(), self.styles['MainText']))
            flowables.append(Image(figure(f"Figure {index+1}-alt"), width=5 * inch, height=2.5 * inch))

        flowables.append(PageBreak())
        return flowables

    def generate(self):
        """Assemble the full story, write the PDF to ``self.filename`` and print a confirmation."""
        doc = SimpleDocTemplate(
            self.filename,
            pagesize=letter,
            topMargin=90,
            bottomMargin=72
        )

        # Cover page
        story = [
            Paragraph(
                "Internal Operations & Knowledge Consolidation 2024",
                self.styles['Heading1']
            ),
            PageBreak(),
        ]

        # Figure types rotate through the sections in this fixed order
        figures = [self.line_plot, self.bar_chart, self.step_plot, self.scatter_plot, self.timeline_diagram]
        tables = [self.table_a, self.table_b, self.table_c]

        for i, title in enumerate(self._SECTION_TITLES):
            story.extend(self._section_flowables(i, title, figures[i % len(figures)], tables))

        # Build PDF with header/footer
        doc.build(story, onFirstPage=self._header_footer, onLaterPages=self._header_footer)
        print(f"Generated: {self.filename}")


In [2]:
# Seed the module-level RNG so that a fresh "Restart & Run All" draws the same
# paragraphs, table choices and figure geometry every time.
random.seed(42)
UltimateRAGBenchmarkGenerator().generate()

Generated: RAG_BENCHMARK.pdf
