In [None]:
# ------------------ SETUP ------------------
# Colab-only setup: ask the user for the invoice files, then install the
# third-party libraries the rest of the notebook imports.
from google.colab import files
# `uploaded` is keyed by filename; the main parsing loop iterates
# `uploaded.keys()` to decide which files to process.
uploaded = files.upload()

# Quietly (-q) install or upgrade (-U) the Gemini SDK and PyMuPDF.
# NOTE(review): versions are not pinned, so the installed versions can differ
# between runs.
!pip install -q -U google-generativeai pymupdf

Saving ModernSOlutions.jpg to ModernSOlutions.jpg
Saving sample_invoice_2.pdf to sample_invoice_2.pdf
Saving sample_invoice_1.pdf to sample_invoice_1.pdf
Saving Amazoninvoice.png to Amazoninvoice.png
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.4/155.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

import json
import os
import re
from pathlib import Path

import fitz  # PyMuPDF
import google.generativeai as genai
import pandas as pd
from google.colab import userdata
from IPython.display import Markdown

In [None]:

# ------------------ CONFIG ------------------
# Read the Gemini API key from Colab's `userdata` store instead of hard-coding
# it in the notebook, then register it with the SDK so the models created
# later can authenticate.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:

# Generation parameters passed as `generation_config` to every model below.
MODEL_CONFIG = dict(
    temperature=0.2,
    top_p=1,
    top_k=32,
    max_output_tokens=4096,
)

# One entry per harm category, all using the same blocking threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [None]:

# Create the two Gemini clients used by the parsing helpers. Both use the
# gemini-1.5-flash model with the shared generation and safety settings; they
# are separate objects bound to separate names (one for image input, one for
# extracted PDF text).
_shared_model_kwargs = dict(
    model_name="models/gemini-1.5-flash",
    generation_config=MODEL_CONFIG,
    safety_settings=safety_settings,
)

image_model = genai.GenerativeModel(**_shared_model_kwargs)
text_model = genai.GenerativeModel(**_shared_model_kwargs)

In [None]:

# ------------------ HELPERS ------------------
def clean_and_parse_json(raw_output, csv_file_path="/content/parsed_invoice_items.csv"):
    """Extract and parse the JSON payload from a model response.

    The response may be bare JSON or JSON wrapped in a Markdown code fence
    (```` ``` ```` or ```` ```json ````, any letter case), optionally
    surrounded by prose. The fence is located with a regular expression
    instead of ``str.strip("```json")``, which removes any of the characters
    `` ` j s o n`` from both ends rather than the literal fence marker.

    Args:
        raw_output: Text returned by the model.
        csv_file_path: Where to write the ``"items"`` list as CSV when the
            parsed object contains that key. Each call overwrites the file.
            Pass ``None`` to skip writing. Defaults to the path the notebook
            has always used.

    Returns:
        The parsed JSON value, or ``None`` when ``raw_output`` is not a string
        or does not contain valid JSON.
    """
    if not isinstance(raw_output, str):
        print(f"Error parsing JSON: expected text, got {type(raw_output).__name__}")
        print(f"Raw output: {raw_output}")
        return None

    cleaned_output = raw_output.strip()

    # Text that already starts like a JSON object/array is used as-is, so
    # backticks inside string values are never mistaken for a fence.
    if not cleaned_output.startswith(("{", "[")):
        # Capture what sits between the opening fence and the closing fence
        # (or end of text, if the model's output was cut off mid-fence).
        fenced = re.search(
            r"```(?:json)?\s*(.*?)\s*(?:```|\Z)",
            cleaned_output,
            re.DOTALL | re.IGNORECASE,
        )
        if fenced:
            cleaned_output = fenced.group(1)

    # Debug the cleaned output to ensure it's valid JSON
    print(f"Cleaned Raw Output:\n{cleaned_output}")

    try:
        parsed = json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("Check the output format to ensure it matches valid JSON.")
        print(f"Raw output: {raw_output}")
        return None

    # Only JSON objects can carry an "items" key worth exporting.
    if csv_file_path is not None and isinstance(parsed, dict) and "items" in parsed:
        df = pd.DataFrame(parsed["items"])
        df.to_csv(csv_file_path, index=False)
        print(f"CSV file saved at {csv_file_path}")

    return parsed
import mimetypes
from pathlib import Path

def image_format(image_path):
    """Load an image file as the inline-data part sent to the Gemini API.

    The MIME type is guessed from the file name. The file is read only after
    the guess confirms an ``image/*`` type.

    Args:
        image_path: Path to the image file.

    Returns:
        A one-element list holding ``{"mime_type": ..., "data": ...}``, where
        ``data`` is the raw file bytes. This is the format required by the
        Gemini API when image data is sent programmatically (for example from
        a Python client rather than a chat UI).

    Raises:
        ValueError: If the MIME type is unknown or is not an image type.
    """
    image_file = Path(image_path)
    guessed_type = mimetypes.guess_type(image_file)[0]

    if not (guessed_type and guessed_type.startswith("image/")):
        raise ValueError(f"Unsupported or unknown image type for: {image_path}")

    image_part = {"mime_type": guessed_type, "data": image_file.read_bytes()}
    return [image_part]

def pdf_format(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined with
        no separator. Empty when no page yields any text.
    """
    # Open the document as a context manager so it is closed even if
    # extraction fails part-way; the previous version never closed it.
    with fitz.open(pdf_path) as pdf:
        # join() avoids rebuilding the accumulated string on every page.
        return "".join(page.get_text() for page in pdf)

def gemini_image_parse(image_path, prompt):
    """Send a prompt and an image to the image model and return its text reply.

    Args:
        image_path: Path to the image file, loaded through ``image_format``.
        prompt: Instruction text placed before the image in the request.

    Returns:
        The ``text`` attribute of the model response.
    """
    # The request is the prompt followed by the first image part.
    request_parts = [prompt, image_format(image_path)[0]]
    return image_model.generate_content(request_parts).text

def gemini_text_parse(text, prompt):
    """Send a prompt followed by document text to the text model.

    Args:
        text: Document text to analyse (the main loop passes extracted PDF text).
        prompt: Instruction text placed before the document text.

    Returns:
        The ``text`` attribute of the model response.
    """
    # Prompt first, then a blank line, then the document text.
    combined_request = "{}\n\n{}".format(prompt, text)
    reply = text_model.generate_content(combined_request)
    return reply.text

In [None]:
# ------------------ PROMPT SETS ------------------
# One prompt per extraction pass. Each prompt shows the model a small JSON
# template to fill in. The main parsing loop runs every prompt against each
# file and merges the returned objects into a single dict per invoice, so the
# templates use distinct top-level keys.
prompt_sets = [
    # Pass 1: invoice-level metadata (number, date, seller, buyer).
    """
You are an invoice parser. Extract general metadata as JSON:
{
  "invoice_number": "",
  "invoice_date": "",
  "seller_name": "",
  "buyer_name": ""
}
""",
    # Pass 2: line items, returned as a list under the "items" key.
    """
Extract all item line details as JSON:
{
  "items": [
    {"description": "", "quantity": 0, "price": 0.0, "total": 0.0}
  ]
}
""",
    # Pass 3: totals, tax and payment method.
    """
Extract totals and tax values as JSON:
{
  "subtotal": 0.0,
  "tax": 0.0,
  "total_amount": 0.0,
  "payment_method": ""
}
"""
]

In [None]:
# ------------------ MAIN PARSING LOOP ------------------
# For each uploaded file, run every prompt in `prompt_sets` through the
# matching Gemini helper and merge the per-prompt JSON objects into one dict
# per invoice. A failure in one prompt or one file is reported and skipped so
# the rest of the batch is still processed.
final_parsed_results = []
for filename in uploaded.keys():
    print(f"\n🔍 Processing: {filename}")
    file_path = f"/content/{filename}"
    is_image = filename.lower().endswith(('.png', '.jpg', '.jpeg'))
    is_pdf = filename.lower().endswith('.pdf')

    parsed_data = {}
    raw_text = None
    # Unsupported extensions run no prompts and fall through to the
    # "Could not parse" message below.
    prompts_to_run = prompt_sets if (is_image or is_pdf) else []

    if is_pdf:
        # Extract the PDF text once and reuse it for every prompt.
        try:
            raw_text = pdf_format(file_path)
        except Exception as exc:
            print(f"❌ Could not read text from {filename}: {exc}")
            prompts_to_run = []
        else:
            if not raw_text.strip():
                # No text was extracted (possibly a scanned PDF). Sending an
                # empty document would leave the model nothing to extract from.
                print(f"⚠️ No extractable text in {filename}; skipping prompts")
                prompts_to_run = []

    for idx, prompt in enumerate(prompts_to_run):
        try:
            if is_image:
                result = gemini_image_parse(file_path, prompt)
            else:
                result = gemini_text_parse(raw_text, prompt)
        except Exception as exc:
            # Covers API errors and `response.text` raising when the response
            # has no usable content; neither should stop the batch.
            print(f"❌ Prompt {idx+1} failed for {filename}: {exc}")
            continue

        # Clean and parse JSON from the response
        parsed = clean_and_parse_json(result)
        # Only a non-empty JSON object can be merged with dict.update().
        if parsed and isinstance(parsed, dict):
            parsed_data.update(parsed)
        else:
            print(f"❌ Prompt {idx+1} failed for {filename}")

    if parsed_data:
        parsed_data['source_file'] = filename
        final_parsed_results.append(parsed_data)
        display(Markdown(f"### ✅ Parsed `{filename}`\n```json\n{json.dumps(parsed_data, indent=2)}\n```"))
    else:
        print(f"❌ Could not parse {filename}")



🔍 Processing: ModernSOlutions.jpg
Cleaned Raw Output:
{
  "invoice_number": "1001",
  "invoice_date": "March 20, 2024",
  "seller_name": "MODERN SOLUTIONS",
  "buyer_name": "JOHN DOE\nJohn Doe\nJane Smith"
}
Cleaned Raw Output:
{
  "items": [
    {
      "description": "Consultation",
      "quantity": 1,
      "price": 150.00,
      "total": 150.00
    },
    {
      "description": "Website Design",
      "quantity": 1,
      "price": 1200.00,
      "total": 1200.00
    },
    {
      "description": "SEO Optimization",
      "quantity": "3/h",
      "price": 300.00,
      "total": 300.00
    }
  ]
}
CSV file saved at /content/parsed_invoice_items.csv
Cleaned Raw Output:
{
  "subtotal": 1650.0,
  "tax": 182.0,
  "total_amount": 1782.0,
  "payment_method": ""
}


### ✅ Parsed `ModernSOlutions.jpg`
```json
{
  "invoice_number": "1001",
  "invoice_date": "March 20, 2024",
  "seller_name": "MODERN SOLUTIONS",
  "buyer_name": "JOHN DOE\nJohn Doe\nJane Smith",
  "items": [
    {
      "description": "Consultation",
      "quantity": 1,
      "price": 150.0,
      "total": 150.0
    },
    {
      "description": "Website Design",
      "quantity": 1,
      "price": 1200.0,
      "total": 1200.0
    },
    {
      "description": "SEO Optimization",
      "quantity": "3/h",
      "price": 300.0,
      "total": 300.0
    }
  ],
  "subtotal": 1650.0,
  "tax": 182.0,
  "total_amount": 1782.0,
  "payment_method": "",
  "source_file": "ModernSOlutions.jpg"
}
```


🔍 Processing: sample_invoice_2.pdf
Cleaned Raw Output:
{
  "invoice_number": "INV-1002",
  "invoice_date": "2024-04-03",
  "seller_name": "Office World",
  "buyer_name": "ACME Corp."
}

Cleaned Raw Output:
{
  "items": [
    {
      "description": "Office Desk",
      "quantity": 3,
      "price": 120.00,
      "total": 360.00
    },
    {
      "description": "Ergonomic Chair",
      "quantity": 2,
      "price": 150.00,
      "total": 300.00
    }
  ]
}

CSV file saved at /content/parsed_invoice_items.csv
Cleaned Raw Output:
{
  "subtotal": 660.0,
  "tax": 66.0,
  "total_amount": 726.0,
  "payment_method": "Bank Transfer"
}



### ✅ Parsed `sample_invoice_2.pdf`
```json
{
  "invoice_number": "INV-1002",
  "invoice_date": "2024-04-03",
  "seller_name": "Office World",
  "buyer_name": "ACME Corp.",
  "items": [
    {
      "description": "Office Desk",
      "quantity": 3,
      "price": 120.0,
      "total": 360.0
    },
    {
      "description": "Ergonomic Chair",
      "quantity": 2,
      "price": 150.0,
      "total": 300.0
    }
  ],
  "subtotal": 660.0,
  "tax": 66.0,
  "total_amount": 726.0,
  "payment_method": "Bank Transfer",
  "source_file": "sample_invoice_2.pdf"
}
```


🔍 Processing: sample_invoice_1.pdf
Cleaned Raw Output:
{
  "invoice_number": "INV-1001",
  "invoice_date": "2024-04-01",
  "seller_name": "Tech Supplies Ltd.",
  "buyer_name": "John Doe"
}

Cleaned Raw Output:
{
  "items": [
    {
      "description": "Laptop - Model X123",
      "quantity": 1,
      "price": 800.00,
      "total": 800.00
    },
    {
      "description": "Wireless Mouse",
      "quantity": 2,
      "price": 25.00,
      "total": 50.00
    }
  ]
}

CSV file saved at /content/parsed_invoice_items.csv
Cleaned Raw Output:
{
  "subtotal": 850.00,
  "tax": 85.00,
  "total_amount": 935.00,
  "payment_method": "Credit Card"
}



### ✅ Parsed `sample_invoice_1.pdf`
```json
{
  "invoice_number": "INV-1001",
  "invoice_date": "2024-04-01",
  "seller_name": "Tech Supplies Ltd.",
  "buyer_name": "John Doe",
  "items": [
    {
      "description": "Laptop - Model X123",
      "quantity": 1,
      "price": 800.0,
      "total": 800.0
    },
    {
      "description": "Wireless Mouse",
      "quantity": 2,
      "price": 25.0,
      "total": 50.0
    }
  ],
  "subtotal": 850.0,
  "tax": 85.0,
  "total_amount": 935.0,
  "payment_method": "Credit Card",
  "source_file": "sample_invoice_1.pdf"
}
```


🔍 Processing: Amazoninvoice.png
Cleaned Raw Output:
{
  "invoice_number": "354-993",
  "invoice_date": "2024-04-15",
  "seller_name": "Amazon.com Services LLC",
  "buyer_name": "David Wilson"
}
Cleaned Raw Output:
{
  "items": [
    {
      "description": "Echo Dot",
      "quantity": 2,
      "price": 45.00,
      "total": 90.00
    },
    {
      "description": "Fire TV Stick",
      "quantity": 1,
      "price": 40.00,
      "total": 40.00
    },
    {
      "description": "Batteries",
      "quantity": 4,
      "price": 10.00,
      "total": 40.00
    }
  ]
}
CSV file saved at /content/parsed_invoice_items.csv
Cleaned Raw Output:
{
  "subtotal": 170.00,
  "tax": 14.03,
  "total_amount": 184.03,
  "payment_method": "Visa"
}


### ✅ Parsed `Amazoninvoice.png`
```json
{
  "invoice_number": "354-993",
  "invoice_date": "2024-04-15",
  "seller_name": "Amazon.com Services LLC",
  "buyer_name": "David Wilson",
  "items": [
    {
      "description": "Echo Dot",
      "quantity": 2,
      "price": 45.0,
      "total": 90.0
    },
    {
      "description": "Fire TV Stick",
      "quantity": 1,
      "price": 40.0,
      "total": 40.0
    },
    {
      "description": "Batteries",
      "quantity": 4,
      "price": 10.0,
      "total": 40.0
    }
  ],
  "subtotal": 170.0,
  "tax": 14.03,
  "total_amount": 184.03,
  "payment_method": "Visa",
  "source_file": "Amazoninvoice.png"
}
```

In [10]:
import os
from google.colab import files

# ------------------ SAVE TO CSV ------------------
# Flatten the parsed invoices into one row per line item, repeating the
# invoice-level fields on every row, then write the rows to CSV and offer the
# file for download.
all_items = []
for invoice in final_parsed_results:
    # Invoice-level columns appended to each item row, in CSV column order:
    # basic metadata first, then totals and payment details.
    invoice_fields = {
        key: invoice.get(key)
        for key in (
            "invoice_number",
            "invoice_date",
            "source_file",
            "subtotal",
            "tax",
            "total_amount",
            "payment_method",
        )
    }

    # `or []` tolerates a missing or null "items" value from the model.
    for item in invoice.get("items") or []:
        if not isinstance(item, dict):
            continue  # skip entries that are not item objects
        # Build a new row instead of writing into `item`, so re-running this
        # cell leaves `final_parsed_results` exactly as the parsing loop made it.
        all_items.append({**item, **invoice_fields})

# Save and download if items exist
csv_file_path = "/content/parsed_invoice_items.csv"
if all_items:
    df = pd.DataFrame(all_items)
    df.to_csv(csv_file_path, index=False)
    print(f"\n✅ All invoice items saved at: {csv_file_path}")

    # Check and trigger download
    if os.path.exists(csv_file_path):
        print(f"📁 File ready for download: {csv_file_path}")
        files.download(csv_file_path)
    else:
        print("❌ File not found or failed to save.")
else:
    # Nothing was written, so there is no file to check or download.
    print("⚠️ No items found to save.")
    csv_file_path = None



✅ All invoice items saved at: /content/parsed_invoice_items.csv
📁 File ready for download: /content/parsed_invoice_items.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>