In [2]:
!pip install python-pptx

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xlsxwriter-3.2.5-py3-none-any.whl (172 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.5 python-pptx-1.0.2


In [3]:
import google.generativeai as genai
import os
from PIL import Image
import json
import io

# --- New Requirement for PPTX processing ---
# Requires the python-pptx library (install with: pip install python-pptx).


from pptx import Presentation
from pptx.util import Inches



# --- Configuration ---
# IMPORTANT: Never hardcode the API key in the notebook. Provide it through
# the GOOGLE_API_KEY environment variable instead.
# In your terminal, run: export GOOGLE_API_KEY='YOUR_API_KEY'
try:
    # Read the key from the environment; .strip() makes a whitespace-only
    # value count as "not set" instead of being sent to the API.
    api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set.")
    genai.configure(api_key=api_key)
except ValueError as e:
    print(e)
    # Handle the error gracefully in a real application
    exit()

# --- 1. KNOWLEDGE BASE & RETRIEVAL SIMULATION (The "R" in RAG) ---
# In-memory rule store: each guideline id maps to the guideline text
# ("content") and the lowercase words that select it ("keywords").
KNOWLEDGE_BASE = {
    "disclaimer_guidelines": dict(
        content=(
            "All marketing content for 'Innovate Inc.' must include the official "
            "legal disclaimer: '© 2025 Innovate Inc. All rights reserved. "
            "Performance results are not guaranteed.' The disclaimer must be "
            "clearly visible at the footer of the document or in the description "
            "of a social media post."
        ),
        keywords=["disclaimer", "legal", "footer", "copyright"],
    ),
    "client_type_guidelines": dict(
        content=(
            "Content must be tailored to one of two client types: 'Enterprise' or "
            "'Startup'. Enterprise content should use formal language and focus on "
            "scalability and security. Startup content should use more casual "
            "language and focus on growth and innovation."
        ),
        keywords=["client", "enterprise", "startup", "tone", "audience"],
    ),
    "chart_label_guidelines": dict(
        content=(
            "All charts, graphs, and visual data representations must be fully "
            "labeled. This includes: a descriptive title, a label for the X-axis, "
            "and a label for the Y-axis. A legend is required if more than one "
            "data series is present."
        ),
        keywords=["chart", "graph", "image", "axis", "labels", "title", "legend"],
    ),
    "table_completeness_guidelines": dict(
        content=(
            "Tables used in documentation must be complete. Every row must have a "
            "value for every column. If a value is not applicable, use 'N/A'. No "
            "cells should be left blank."
        ),
        keywords=["table", "data", "complete", "cell", "blank"],
    ),
}

def retrieve_relevant_guidelines(query: str) -> str:
    """Simulates retrieving relevant documents from a vector database.

    A guideline in KNOWLEDGE_BASE is selected when at least one of its
    keywords equals a word of the query. The query is lowercased, split on
    whitespace, and each word has surrounding punctuation removed, so
    "disclaimer," and "(chart)" still match their keywords.

    Args:
        query: Free-text description of what should be audited.

    Returns:
        The "content" of every matching guideline, in KNOWLEDGE_BASE order,
        joined by a "---" separator line; an empty string if nothing matches.
    """
    print(f"\n[RAG] Retrieving guidelines for query: '{query}'")

    # Strip punctuation from both ends of each token; a set gives O(1) lookups.
    punctuation = ".,;:!?\"'()[]{}<>"
    query_words = {word.strip(punctuation) for word in query.lower().split()}

    relevant_docs = [
        data["content"]
        for data in KNOWLEDGE_BASE.values()
        if any(keyword in query_words for keyword in data["keywords"])
    ]
    return "\n\n---\n\n".join(relevant_docs)


# --- 2. GENERATION (The "G" in RAG) using Gemini API ---

def get_audit_prompt_template() -> str:
    """Returns the master prompt template for auditing a single slide.

    The returned text is meant to be filled in with ``str.format``: it has a
    single ``retrieved_context`` placeholder for the compliance guidelines,
    and the braces of the example JSON structure are doubled so that they
    come out as literal braces after formatting.
    """
    return """
    You are a meticulous and highly skilled marketing compliance auditor. Your task is to audit the provided marketing content from a single presentation slide based *only* on the specific guidelines provided to you.

    **Your Persona:**
    - You are precise and objective.
    - You reference the guidelines to justify your findings.
    - You provide constructive feedback for the specific slide you are analyzing.

    **Instructions:**
    1.  Carefully review the "Retrieved Compliance Guidelines".
    2.  Analyze the "Marketing Content to Audit" (which may include text, table data, and an image from a single slide).
    3.  Complete the audit by providing a response in a strict JSON format. Do not add any text or formatting outside of the JSON structure.

    **Retrieved Compliance Guidelines:**
    ```
    {retrieved_context}
    ```

    **Your JSON Output Structure Must Be:**
    ```json
    {{
      "audit_summary": {{
        "overall_finding": "Compliant", "Non-Compliant", or "Partial-Compliance"
      }},
      "checklist": [
        {{"check_name": "Disclaimer Presence", "finding": "Pass", "Fail", or "N/A"}},
        {{"check_name": "Client Type Tone", "finding": "Pass", "Fail", or "N/A"}},
        {{"check_name": "Image/Chart Analysis", "finding": "Pass", "Fail", or "N/A"}},
        {{"check_name": "Table Completeness", "finding": "Pass", "Fail", or "N/A"}}
      ],
      "detailed_feedback": [
        {{"area": "Disclaimer Presence", "is_compliant": true/false, "comment": "Your analysis for the disclaimer check."}},
        {{"area": "Client Type Tone", "is_compliant": true/false, "comment": "Your analysis for the client type check."}},
        {{"area": "Image/Chart Analysis", "is_compliant": true/false, "comment": "Your analysis for the image. If no image was provided, state that."}},
        {{"area": "Table Completeness", "is_compliant": true/false, "comment": "Your analysis for the table. If no table was provided, state that."}}
      ]
    }}
    ```
    """

def audit_slide_content(slide_number: int, slide_text: str, slide_images: list):
    """Orchestrates the RAG pipeline for a single slide's content.

    Retrieves the compliance guidelines, fills them into the audit prompt and
    sends the prompt, the slide text and any slide images to the Gemini model.

    Args:
        slide_number: Slide number, used in log messages and the content header.
        slide_text: Text extracted from the slide.
        slide_images: Images found on the slide; appended after the text.

    Returns:
        The audit result parsed from the model's JSON reply, or None when no
        guidelines were retrieved, the API call failed, or the reply could not
        be parsed as JSON.
    """
    print(f"\n----- Auditing Slide {slide_number} -----")

    query = "audit disclaimer client tone chart labels table completeness"
    retrieved_context = retrieve_relevant_guidelines(query)

    if not retrieved_context:
        print(f"Could not audit slide {slide_number} as no relevant guidelines were found.")
        return None

    prompt_template = get_audit_prompt_template()
    prompt = prompt_template.format(retrieved_context=retrieved_context)

    model = genai.GenerativeModel('gemini-1.5-flash')

    content_to_audit = [f"**Marketing Content for Slide {slide_number}:**\n", slide_text]
    if slide_images:
        print(f"[AI] {len(slide_images)} image(s) found on slide {slide_number}. Including in audit.")
        # Add all images from the slide to the context
        content_to_audit.extend(slide_images)
    else:
        print(f"[AI] No images found on slide {slide_number}.")

    print(f"[AI] Sending request to Gemini API for slide {slide_number}...")

    # Captured once inside the try block. Reading `response.text` can itself
    # raise, so the error handler must not read it a second time: doing so
    # would raise again from inside the handler and abort the whole run.
    raw_text = None
    try:
        response = model.generate_content([prompt] + content_to_audit)
        raw_text = response.text
        # The model may wrap its JSON in a Markdown code fence; drop the fence.
        json_text = raw_text.strip().replace("```json", "").replace("```", "")
        return json.loads(json_text)
    except Exception as e:
        print(f"\n--- ERROR on Slide {slide_number} ---")
        print(f"An error occurred during API call or JSON parsing: {e}")
        if raw_text is not None:
            print("Raw response from API:", raw_text)
        return None

# --- 3. NEW: PRESENTATION PROCESSING LOGIC ---
def _table_to_markdown(table) -> str:
    """Renders a table shape's cells as Markdown, using the first row as the header."""
    grid = [[cell.text for cell in row.cells] for row in list(table.rows)]
    if not grid:
        return ""
    header, body_rows = grid[0], grid[1:]
    lines = [
        f"| {' | '.join(header)} |",
        f"|{'|'.join(['---'] * len(header))}|",
    ]
    lines.extend(f"| {' | '.join(row)} |" for row in body_rows)
    return "\n\n**Table Data:**\n" + "\n".join(lines) + "\n"


def _load_shape_image(shape, slide_number: int):
    """Returns the shape's picture as a PIL image, or None if it has none or it is unreadable."""
    try:
        image_bytes = shape.image.blob
    except AttributeError:
        # Not a picture shape.
        return None
    except ValueError as e:
        # NOTE(review): python-pptx appears to raise ValueError for pictures
        # without an embedded image (e.g. linked pictures) - confirm.
        print(f"[WARN] Skipping picture without embedded image on slide {slide_number}: {e}")
        return None

    try:
        return Image.open(io.BytesIO(image_bytes))
    except (OSError, ValueError) as e:
        # One unreadable picture must not abort the audit of the whole deck.
        print(f"[WARN] Skipping unreadable image on slide {slide_number}: {e}")
        return None


def _extract_slide_content(slide, slide_number: int):
    """Collects a slide's text (including tables as Markdown) and its images."""
    slide_text_parts = []
    slide_images = []

    for shape in slide.shapes:
        if shape.has_text_frame:
            slide_text_parts.append(shape.text_frame.text)

        pil_image = _load_shape_image(shape, slide_number)
        if pil_image is not None:
            slide_images.append(pil_image)

        if shape.has_table:
            slide_text_parts.append(_table_to_markdown(shape.table))

    return "\n".join(slide_text_parts), slide_images


def process_presentation(pptx_path: str):
    """
    Opens a .pptx file, extracts content from each slide, and audits it.

    For every slide, the text of all shapes, the tables (converted to
    Markdown) and the pictures are passed to `audit_slide_content`. Pictures
    that cannot be read are skipped with a warning instead of aborting the run.

    Args:
        pptx_path: Path of the presentation to audit.

    Returns:
        A dict mapping "slide_<n>" to that slide's audit result, containing
        only slides whose audit returned a non-empty result; None if the
        presentation could not be opened. The report is also printed as JSON.
    """
    banner = "========================================================="
    print(banner)
    print(f"STARTING AUDIT FOR PRESENTATION: {pptx_path}")
    print(banner)

    try:
        prs = Presentation(pptx_path)
    except Exception as e:
        print(f"Error opening presentation file: {e}")
        return

    full_audit_report = {}

    for slide_number, slide in enumerate(prs.slides, start=1):
        full_slide_text, slide_images = _extract_slide_content(slide, slide_number)

        slide_audit = audit_slide_content(slide_number, full_slide_text, slide_images)
        if slide_audit:
            full_audit_report[f"slide_{slide_number}"] = slide_audit

    print("\n\n" + banner)
    print("              FULL PRESENTATION AUDIT REPORT             ")
    print(banner)
    print(json.dumps(full_audit_report, indent=2))
    return full_audit_report

# --- 4. EXAMPLE USAGE ---
def create_dummy_presentation(filename="dummy_presentation.pptx"):
    """Builds a three-slide sample deck, saves it to `filename` and returns `filename`.

    The slides are audit fixtures: a title-only slide, a slide with a
    generated bar image plus a table whose last cell is left unset, and a
    title-and-content slide whose body ends with the legal disclaimer.
    """
    deck = Presentation()

    # Slide 1: Non-compliant title slide (layout 5 = title only)
    first_slide = deck.slides.add_slide(deck.slide_layouts[5])
    first_slide.shapes.title.text = "Our new tech is awesome for startups!"

    # Slide 2: Non-compliant chart and table
    second_slide = deck.slides.add_slide(deck.slide_layouts[5])
    second_slide.shapes.title.text = "Performance Metrics"

    # Two bare bars on a white canvas stand in for an unlabeled chart.
    try:
        from PIL import ImageDraw
        canvas = Image.new('RGB', (400, 250), color=(255, 255, 255))
        pen = ImageDraw.Draw(canvas)
        bars = (
            ([70, 80, 120, 200], 'cyan'),
            ([170, 120, 220, 200], 'magenta'),
        )
        for box, colour in bars:
            pen.rectangle(box, fill=colour)
        png_buffer = io.BytesIO()
        canvas.save(png_buffer, format='PNG')
        png_buffer.seek(0)
        second_slide.shapes.add_picture(png_buffer, Inches(1), Inches(2.0))
    except ImportError:
        print("Pillow library not found. Cannot generate image for dummy PPTX.")

    # Table contents in row-major order; None marks the cell that is
    # deliberately never written, i.e. (2, 2).
    grid = [
        ['Feature', 'Our Processor', 'Competitor A'],
        ['Speed', '100 TFlops', '50 TFlops'],
        ['Power', '75W', None],
    ]
    frame = second_slide.shapes.add_table(
        len(grid), len(grid[0]), Inches(1), Inches(5.0), Inches(6), Inches(0.8)
    )
    table = frame.table
    for row_idx, row_values in enumerate(grid):
        for col_idx, value in enumerate(row_values):
            if value is not None:
                table.cell(row_idx, col_idx).text = value

    # Slide 3: Compliant text slide (layout 1 = title and content)
    third_slide = deck.slides.add_slide(deck.slide_layouts[1])
    third_slide.shapes.title.text = "Enterprise Grade Security"
    third_slide.placeholders[1].text = "Our platform provides robust security for enterprise clients.\n\n© 2025 Innovate Inc. All rights reserved. Performance results are not guaranteed."

    deck.save(filename)
    print(f"\nCreated a dummy presentation: {filename}")
    return filename

# Entry point: build the sample deck, then run the audit on it.
if __name__ == "__main__":
    # Path of the generated .pptx file, as returned by the builder.
    dummy_pptx_file = create_dummy_presentation()
    process_presentation(dummy_pptx_file)



Created a dummy presentation: dummy_presentation.pptx
STARTING AUDIT FOR PRESENTATION: dummy_presentation.pptx

----- Auditing Slide 1 -----

[RAG] Retrieving guidelines for query: 'audit disclaimer client tone chart labels table completeness'
[AI] No images found on slide 1.
[AI] Sending request to Gemini API for slide 1...

----- Auditing Slide 2 -----

[RAG] Retrieving guidelines for query: 'audit disclaimer client tone chart labels table completeness'
[AI] 1 image(s) found on slide 2. Including in audit.
[AI] Sending request to Gemini API for slide 2...

----- Auditing Slide 3 -----

[RAG] Retrieving guidelines for query: 'audit disclaimer client tone chart labels table completeness'
[AI] No images found on slide 3.
[AI] Sending request to Gemini API for slide 3...


              FULL PRESENTATION AUDIT REPORT             
{
  "slide_1": {
    "audit_summary": {
      "overall_finding": "Partial-Compliance"
    },
    "checklist": [
      {
        "check_name": "Disclaimer Prese