In [1]:
# !pip install pytesseract

In [2]:
import os
from pdf2img import convert_from_path
from pytesseract import image_to_string
import google.generativeai as genai
from PIL import Image
import key # API Key (obtain this from google API and store it in .py file)

genai.configure(api_key=key.api_key)

In [3]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [4]:
# Path to poppler
poppler_path = r"D:\CODING\Test\Python\poppler-24.02.0\Library\bin"

In [5]:
def convert_pdf2img(pdf_file):
    return convert_from_path(pdf_file, poppler_path=poppler_path)

def convert_img2txt(file):
    txt = image_to_string(file)
    return txt

# Function to convert PDF to text using Tesseract
def get_txt_from_pdf(pdf_file):
    images = convert_pdf2img(pdf_file)
    final_txt = ""
    for pg, img in enumerate(images):
        final_txt += image_to_string(img)
    return final_txt


In [6]:
## Safety Settings of Model
## Make Sure to update the threshold according to your requirements
safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_NONE"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_NONE"
  }
]
model = genai.GenerativeModel(model_name="gemini-pro", safety_settings=safety_settings)

In [7]:
def gemini_output(extracted_text, system_prompt, user_prompt):
    input_prompt = [system_prompt, extracted_text, user_prompt]
    response = model.generate_content(input_prompt)
    return response.text

def gemini_output_feedback(extracted_text, system_prompt, user_prompt):
    input_prompt = [system_prompt, extracted_text, user_prompt]
    response = model.generate_content(input_prompt)
    return response.prompt_feedback

In [8]:
system_prompt = """
               Your expertise lies in understanding receipts and invoices. You'll be given pictures or text containing information. 
               Your job is to answer questions based on what you find in these images or texts.
               """

user_prompt = """
Extract the details from the invoice/text and convert them into clean JSON format with appropriate tags:
"""

In [9]:
from pathlib import Path
pdf_path = r"D:\CODING\Test\Python\pdfs\test_pdfs\1_eTJ5_mezfc-K3WKyxpls8Q (1).pdf"
extracted_text = get_txt_from_pdf(pdf_path)
# genai_output_result = gemini_output(extracted_text, system_prompt, user_prompt)
# print(extracted_text)

In [10]:
gemini_output_feedback(extracted_text, system_prompt, user_prompt)

safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: NEGLIGIBLE
}

In [11]:
print(extracted_text)

D. Brawn Manufacture

Invoice no. DVT-AX-345678

Payment date: 03/12/2006

Reference Designation Qty Unit price Total CHF Sales
Work

SERVICE D COMPLETE OVERHAUL 4 5500.00 5500.00 220
SERVICE D REFRESHING COMPLETE CASE 1 380.00 380.00 220

AND RHODIUM BATH
Exterior parts:

JO.297,065.FP FLAT GASKET 1 3.00 3.00 220
JO.197.075.FP FLAT GASKET 1 4.00 4.00 220
JO.199.059.0S FLAT ROUND GASKET 1 6.00 6.00 220
VI.261.036.BC W.G.FIXATION SCREWS 10 4.00 40.00 220
Al.465.055.BC WHITE GOLD "FOIL" 1 70.00 70.00 220
PAIR OF HAND
LENGTH: 10/13.50MM
CALIBRE 2868
SPECIAL DISCOUNT -3003.00 -3003.00
Discount -900.00 -900.00
Total CHF 2100.00
RETURN AFTER REPAIR
NO COMMERCIAL VALUE
Payment:
Mr. John Doe
Green Street 15, Office 4
1234 Vermut

New Caledonia

Credit Card: Visa
Card No: 112345678



In [12]:
genai_output_result = gemini_output(extracted_text, system_prompt, user_prompt)
print(genai_output_result)

```JSON
{
  "invoice_number": "DVT-AX-345678",
  "payment_date": "03/12/2006",
  "customer": {
    "name": "Mr. John Doe",
    "address": "Green Street 15, Office 4\n1234 Vermut\nNew Caledonia"
  },
  "payment": {
    "type": "Credit Card",
    "card_number": "112345678"
  },
  "services": [
    {
      "description": "SERVICE D COMPLETE OVERHAUL",
      "quantity": 4,
      "unit_price": 5500.00,
      "total_price": 5500.00,
      "sales_ref": 220
    },
    {
      "description": "SERVICE D REFRESHING COMPLETE CASE AND RHODIUM BATH",
      "quantity": 1,
      "unit_price": 380.00,
      "total_price": 380.00,
      "sales_ref": 220
    }
  ],
  "parts": [
    {
      "description": "JO.297,065.FP FLAT GASKET",
      "quantity": 1,
      "unit_price": 3.00,
      "total_price": 3.00,
      "sales_ref": 220
    },
    {
      "description": "JO.197.075.FP FLAT GASKET",
      "quantity": 1,
      "unit_price": 4.00,
      "total_price": 4.00,
      "sales_ref": 220
    },
    {
      

In [13]:
from pathlib import Path
pdf_folder = r"D:\CODING\Test\Python\pdfs\test_pdfs"

# Create a list of PDF file paths
pdf_paths = [file for file in Path(pdf_folder).glob("*.pdf")]
print(pdf_paths)

def process_pdf(pdf_path):
    extracted_text = get_txt_from_pdf(pdf_path)
    genai_output_result = gemini_output(extracted_text, system_prompt, user_prompt)
    return genai_output_result

[WindowsPath('D:/CODING/Test/Python/pdfs/test_pdfs/1_eTJ5_mezfc-K3WKyxpls8Q (1).pdf'), WindowsPath('D:/CODING/Test/Python/pdfs/test_pdfs/invoice.pdf'), WindowsPath('D:/CODING/Test/Python/pdfs/test_pdfs/wordpress-pdf-invoice-plugin-sample.pdf')]


In [19]:
import json
import re

for pdf_path in pdf_paths:
    output = process_pdf(str(pdf_path))

    # Removes triple backticks
    output = output.strip()
    
    # Use re.search to find the first match
    match = re.search(r'\{([^}]+)\}', output)

    # Extract the value within curly braces
    result = ""
    if match:
        result = match.group(0)
        print(result)  # Print the JSON data
    else:
        print("No match found.")

    try:
        # Parsing JSON output
        parsed_output = json.loads(result)
        print(json.dumps(parsed_output, indent=2))  # Print the parsed JSON with indentation
        
    except json.JSONDecodeError as e:
        print("\n\n")
        print(f"Error parsing JSON for {pdf_path}: {e}")
        print("JSON Output:", output)

{
"invoice_number": "DVT-AX-345678",
"payment_date": "03/12/2006",
"customer": {
"name": "Mr. John Doe",
"address": {
"street_address": "Green Street 15, Office 4",
"city": "1234 Vermut",
"country": "New Caledonia"
}



Error parsing JSON for D:\CODING\Test\Python\pdfs\test_pdfs\1_eTJ5_mezfc-K3WKyxpls8Q (1).pdf: Expecting ',' delimiter: line 10 column 2 (char 216)
JSON Output: ```JSON
{
"invoice_number": "DVT-AX-345678",
"payment_date": "03/12/2006",
"customer": {
"name": "Mr. John Doe",
"address": {
"street_address": "Green Street 15, Office 4",
"city": "1234 Vermut",
"country": "New Caledonia"
}
},
"payment_method": {
"type": "Credit Card",
"card_number": "112345678",
"card_type": "Visa"
},
"items": [
{
"description": "SERVICE D COMPLETE OVERHAUL",
"quantity": 4,
"unit_price": 5500.00,
"total_price": 5500.00,
"tax": 220
},
{
"description": "SERVICE D REFRESHING COMPLETE CASE AND RHODIUM BATH",
"quantity": 1,
"unit_price": 380.00,
"total_price": 380.00,
"tax": 220
},
{
"description": 