In [21]:
import os
import csv
import requests
import pytesseract
from PIL import Image
from io import BytesIO
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import re

In [22]:
# Load constants
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

In [23]:
# Function to download images from URLs
def download_image(url):
    response = requests.get(url)
    if response.status_code == 200:
        return Image.open(BytesIO(response.content))
    return None

# Function to apply OCR and extract text
def extract_text_from_image(image):
    return pytesseract.image_to_string(image)

# Function to clean and extract value + unit
def extract_value_and_unit(ocr_text, entity_name):
    entity_units = entity_unit_map.get(entity_name, allowed_units)
    print(f"OCR Text: {ocr_text}")  # Debugging print for OCR output
    
    # Regular expression to find numbers followed by units
    for unit in entity_units:
        pattern = rf"(\d+\.?\d*)\s*{unit}"
        match = re.search(pattern, ocr_text, re.IGNORECASE)
        if match:
            value = match.group(1)
            print(f"Match found: {value} {unit}")  # Debugging print for matched value and unit
            return f"{value} {unit}"
    
    print(f"No match found for entity {entity_name}")  # Debugging print if no match
    return ""


# Fallback function to extract probable numbers if no entity name is found
def extract_probable_numbers(ocr_text):
    # Regular expression to capture any number, including floating-point
    pattern = r"(\d+\.?\d*)"
    matches = re.findall(pattern, ocr_text)
    
    if matches:
        probable_numbers = ", ".join(matches)  # Join all numbers found
        print(f"Probable numbers found: {probable_numbers}")  # Debugging print for probable numbers
        return probable_numbers
    else:
        print("No numbers found")  # Debugging print if no numbers found
        return ""

# Hugging Face model pipeline for image captioning (optional)
def image_captioning(image):
    caption_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    caption = caption_pipeline(image)[0]['generated_text']
    return caption

In [24]:
# Initialize GPT-2 tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad_token by default, so we add one
tokenizer.pad_token = tokenizer.eos_token  # Use the eos_token as the pad_token

model = AutoModelForCausalLM.from_pretrained("gpt2")

In [25]:
def generate_entity_extraction(text, entity_name):
    # Prepare the input with attention mask
    inputs = tokenizer(
        f"Extract the {entity_name} from the following text:\n\n{text}\n\n",
        return_tensors="pt",  # Return as PyTorch tensors
        padding=True,         # Explicitly enable padding
        truncation=True,      # Truncate if the text is too long
        add_special_tokens=True  # Add special tokens if necessary
    )

    # Add attention mask to avoid warning
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate response
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,  # Pass attention mask
        max_new_tokens=50,              # Set max tokens to control length
        do_sample=True,                 # Sampling for diverse outputs
        temperature=0.7                 # Adjust temperature for creativity
    )

    # Decode the generated tokens
    gpt_output = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"GPT Output: {gpt_output}")  # Debugging print for GPT output

    # Post-process GPT output to extract numeric values and allowed units
    entity_units = entity_unit_map.get(entity_name, allowed_units)
    for unit in entity_units:
        pattern = rf"(\d+\.?\d*)\s*{unit}"
        match = re.search(pattern, gpt_output, re.IGNORECASE)
        if match:
            value = match.group(1)
            print(f"Match found in GPT: {value} {unit}")  # Debugging print for matched value and unit from GPT
            return f"{value} {unit}"
    
    print(f"No match found in GPT for entity {entity_name}")  # Debugging print if no match in GPT output
    return ""

In [26]:
# Main prediction function
def generate_predictions(test_file, output_file):
    # Read the test data
    with open(test_file, 'r') as test_f, open(output_file, 'w') as out_f:
        test_reader = csv.DictReader(test_f)
        fieldnames = ['index', 'prediction', 'fallback_numbers']
        writer = csv.DictWriter(out_f, fieldnames=fieldnames)
        writer.writeheader()

        for row in test_reader:
            index = row['index']
            image_url = row['image_link']
            entity_name = row['entity_name']
            
            # Download image
            image = download_image(image_url)
            if image is None:
                writer.writerow({'index': index, 'prediction': '', 'fallback_numbers': ''})
                continue

            # Apply OCR to extract text from image
            ocr_text = extract_text_from_image(image)

            # Optional: Generate image caption (if needed)
            caption = image_captioning(image)
            ocr_text += " " + caption  # Append caption to OCR results

            print(f"Final OCR + Caption Text: {ocr_text}")  # Debugging print for combined OCR and caption text

            # Use OCR to extract value + unit
            prediction = extract_value_and_unit(ocr_text, entity_name)
            
            # If no prediction from OCR, attempt using GPT model
            if not prediction:
                prediction = generate_entity_extraction(ocr_text, entity_name)

            # Fallback to extract probable numbers if no entity name match
            if not prediction:
                fallback_numbers = extract_probable_numbers(ocr_text)
            else:
                fallback_numbers = ''

            writer.writerow({'index': index, 'prediction': prediction, 'fallback_numbers': fallback_numbers})

In [27]:
# Usage
test_file_path = 'dataset/sample_test.csv'
output_file_path = 'output_new.csv'
generate_predictions(test_file_path, output_file_path)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: = 7 O| i

 a black and white photo of a snow covered area 
OCR Text: = 7 O| i

 a black and white photo of a snow covered area 
No match found for entity height
GPT Output: Extract the height from the following text:

= 7 O| i

 a black and white photo of a snow covered area 


= 7

I will try to correct this text.

= 7

= 7

= 7

= 7


= 7

= 7


= 7

= 7

= 7


No match found in GPT for entity height
Probable numbers found: 7


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
OCR Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
No match found for entity width
GPT Output: Extract the width from the following text:

Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 


The size of the table is 16.54" and the height is 23"

The table is 4.16" and the width is 9"

A table with a camera in this format is 16.54" and the height
No match found in GPT for entity width
Probable numbers found: 42, 16.54, 200, 78.74


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
OCR Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
No match found for entity height
GPT Output: Extract the height from the following text:

Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 


We have a large group of professional photographers who like to post photos and share them with the world. Join our group and get involved in the community. If so, you may enjoy the image. We are here to help out the community.


No match found in GPT for entity height
Probable numbers found: 42, 16.54, 200, 78.74


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
OCR Text: Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 
No match found for entity depth
GPT Output: Extract the depth from the following text:

Size Width Length
One Size 42cm/16.54" 200cmi78.74"
 a collage of photos of a person with a camera 


Download your photo file and extract it to your PC.


You can see that the image is not a square. You may want to change the resolution to be around the size of your image file.

You can also see that the
No match found in GPT for entity depth
Probable numbers found: 42, 16.54, 200, 78.74


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 
OCR Text: Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 
No match found for entity depth
GPT Output: Extract the depth from the following text:

Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 


Now it will open up a number of other ways to do things like make the walls wider or taller.

So let's say we want to make the walls wider or taller this way:

We can do this by using an algorithm
No match found in GPT for entity depth
Probable numbers found: 10.50, 4.13, 90, 35.43


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Final OCR + Caption Text: Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 
OCR Text: Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 
No match found for entity height
GPT Output: Extract the height from the following text:

Size Width Length
One Size 10.50emi/4.13" 90cm/35.43"
 a collage of photos of people standing in front of a wall 


To remove the picture, simply double-click on the photo and take the photo off your computer.

To remove the picture, simply double-click on the photo and take the photo off your computer. To remove the photo, simply double
No match found in GPT for entity height
Probable numbers found: 10.50, 4.13, 90, 35.43
