In [17]:
import cv2
import numpy as np

def preprocess_image(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.GaussianBlur(img, (5, 5), 0)
    _, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return img

In [18]:
image = preprocess_image("/InvoiceProcessing/data/invoice2.jpg")
cv2.imwrite("/InvoiceProcessing/data/pp2_invoice2.jpg", image)

True

In [20]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

In [21]:
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decode

In [None]:
def extract_text(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    print(generated_ids)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    #return text

In [27]:
extract_text("/InvoiceProcessing/data/pp2_invoice2.jpg")


tensor([[ 2, 35,  2]])


':'

In [28]:
def eextract_text(image_path):
    try:
        image = Image.open(image_path).convert("RGB")

        # Check if image is processed correctly
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        print("Pixel values shape:", pixel_values.shape)  # Should not be empty

        # Generate text
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Debug output
        print("Extracted Text:", text)
        return text
    except Exception as e:
        print(f"Error during OCR: {e}")
        return None

text_output = eextract_text("/InvoiceProcessing/data/pp2_invoice2.jpg")  # Change to your image path

Pixel values shape: torch.Size([1, 3, 384, 384])
Extracted Text: :


In [29]:


# Use a test image from the internet
image = Image.open("/InvoiceProcessing/data/pp2_invoice2.jpg").convert("RGB")  # Use your image

pixel_values = processor(images=image, return_tensors="pt").pixel_values
print("Pixel values shape:", pixel_values.shape)  # Should be (1, 3, 384, 384)

# Generate text
generated_ids = model.generate(pixel_values)
print("Generated IDs:", generated_ids)  # Check if IDs are generated

# Decode text
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Extracted Text:", text)

Pixel values shape: torch.Size([1, 3, 384, 384])
Generated IDs: tensor([[ 2, 35,  2]])
Extracted Text: :


In [39]:
from paddleocr import PaddleOCR

def extract_text_paddle(image_path):
    ocr = PaddleOCR(lang='en')  # Supports multiple languages
    result = ocr.ocr(image_path, cls=True)
    
    extracted_text = []
    for line in result:
        for word in line:
            extracted_text.append(word[1][0])  # Get detected text
    
    return " ".join(extracted_text)


In [43]:
text_paddle = extract_text_paddle("/InvoiceProcessing/data/pp2_invoice2.jpg")
print("PaddleOCR Extracted Text:", text_paddle)

[2025/03/11 20:21:48] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 3

In [42]:
text_paddle1 = extract_text_paddle("/InvoiceProcessing/data/pp2_invoice1.jpg")
print("PaddleOCR Extracted Text:", text_paddle1)

[2025/03/11 20:21:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 3

In [44]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch

In [53]:
processorl = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
modell = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
def extract_invoice_fields(image_path, text):
    encoding = processorl(image_path, text, return_tensors="pt")
    outputs = modell(**encoding)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions

In [56]:
def extract_invoice_fields(image_path, text):
    # Load image using PIL
    image = Image.open(image_path).convert("RGB")  # Convert to RGB format

    # Encode image and text
    encoding = processorl(images=image, text=text, return_tensors="pt")

    # Run model
    outputs = modell(**encoding)
    predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions


In [57]:
# Example usage
image_path = "/InvoiceProcessing/data/pp2_invoice1.jpg"
#text_paddle1 = "Your extracted OCR text here"  # Replace with actual text


In [60]:
from PIL import Image
from paddleocr import PaddleOCR
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch

# Initialize PaddleOCR
ocr = PaddleOCR(lang='en')

# Initialize LayoutLMv3
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")

def extract_invoice_fields(image_path):
    # Load image with PIL
    image = Image.open(image_path).convert("RGB")

    # Extract text using PaddleOCR
    ocr_result = ocr.ocr(image_path, cls=True)
    extracted_text = " ".join([line[1][0] for line in ocr_result[0]])

    # Encode image and extracted text
    encoding = processor(images=image, text=extracted_text, return_tensors="pt")

    # Run model
    outputs = model(**encoding)
    predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions

# Example usage
image_path = "/InvoiceProcessing/data/pp2_invoice1.jpg"
predictions = extract_invoice_fields(image_path)
print(predictions)


[2025/03/11 20:41:53] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Devaraj/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 3

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025/03/11 20:42:16] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.47299623489379883
[2025/03/11 20:42:22] ppocr DEBUG: rec_res num  : 42, elapsed : 6.0560033321380615


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.