In [10]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import re
from paddleocr import PaddleOCR

# Load the train dataset
train_file = r'C:\Users\singh\Desktop\Amazon ML Challenge 2024\66e31d6ee96cd_student_resource_3 (1)\student_resource 3\dataset\train - Copy.csv'  
train_df = pd.read_csv(train_file)

# Add an index column to uniquely identify rows
train_df['index'] = range(1, len(train_df) + 1)

# Create a directory for images if it doesn't exist
image_dir = 'downloaded_images'
os.makedirs(image_dir, exist_ok=True)

# Function to download images
def download_image(image_url, image_name, save_dir):
    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img.save(os.path.join(save_dir, image_name))
        else:
            print(f"Failed to download image: {image_url}, status: {response.status_code}")
    except Exception as e:
        print(f"Error downloading image {image_url}: {e}")

# Download images for the train dataset
for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
    image_url = row['image_link']
    image_name = f"{row['index']}.jpg"
    image_path = os.path.join(image_dir, image_name)
    if not os.path.exists(image_path):
        download_image(image_url, image_name, image_dir)

# Initialize the PaddleOCR model
ocr_model = PaddleOCR(use_angle_cls=True, lang='en')

# Preprocess the OCR text
def preprocess_text(text):
    text = re.sub(r'[^\d\w\s%]', '', text)  # Remove special characters except %
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    return text.lower()

# Function to handle common OCR mistakes
def common_mistake(unit):
    if unit in allowed_units:
        return unit
    # Handle common abbreviations and replacements
    if unit == 'g':  # Convert 'g' to 'gram'
        return 'gram'
    if unit == 'mg':  # Convert 'mg' to 'milligram'
        return 'milligram'
    if unit == 'lb' or unit == 'lbs': 
        return 'pound'
    if unit == 'w':  
        return 'watt'
    if unit.replace('ter', 'tre') in allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in allowed_units:
        return unit.replace('feet', 'foot')
    if unit.replace('gms', 'grams') in allowed_units:
        return unit.replace('gms', 'grams')
    return unit

# Entity-unit map with abbreviations
entity_unit_map = {
    'width': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'depth': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'height': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'item_weight': {'gram', 'g', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton'},
    'maximum_weight_recommendation': {'gram', 'g', 'kilogram', 'kg', 'microgram', 'µg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton'},
    'voltage': {'kilovolt', 'kv', 'millivolt', 'mv', 'volt', 'v'},
    'wattage': {'kilowatt', 'kw', 'watt', 'w'},
    'item_volume': {'centilitre', 'cl', 'cubic foot', 'ft³', 'cubic inch', 'in³', 'cup', 'cup', 'decilitre', 'dl', 'fluid ounce', 'fl oz', 'gallon', 'gal', 'imperial gallon', 'imp gal', 'litre', 'l', 'microlitre', 'µl', 'millilitre', 'ml', 'pint', 'pt', 'quart', 'qt'}
}

allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Function to extract text using OCR from an image
def extract_text_with_ocr(image_path):
    try:
        result = ocr_model.ocr(image_path)
        text = ' '.join([line[1][0] for line in result[0]]).lower()  # Extract detected text
        print(f"OCR Extracted Text: {text}")  # Debug: print the extracted text
        return text
    except Exception as e:
        print(f"Error extracting OCR from {image_path}: {e}")
        return ""

# Function to extract values and units from OCR text
def extract_entity_value(ocr_text, entity_name):
    ocr_text = preprocess_text(ocr_text)  # Clean the text first

    if entity_name in entity_unit_map:
        allowed_units_for_entity = entity_unit_map[entity_name]
    else:
        allowed_units_for_entity = allowed_units

    # Regex to find numerical values followed by units
    pattern = rf"(\d+\.?\d*)\s*({'|'.join(allowed_units_for_entity)})"
    matches = re.findall(pattern, ocr_text)

    if matches:
        print(f"Matches found for entity '{entity_name}': {matches}")
        value, unit = matches[0]
        unit = common_mistake(unit)  # Apply common_mistake here
        return f"{value} {unit}"
    else:
        print(f"No matches found for entity '{entity_name}' in text: {ocr_text}")
        return ""


# Process each row to extract entity values
def process_row(row):
    image_path = os.path.join(image_dir, f"{row['index']}.jpg")
    entity_name = row['entity_name']

    if os.path.exists(image_path):
        ocr_text = extract_text_with_ocr(image_path)
        extracted_value = extract_entity_value(ocr_text, entity_name)
        return extracted_value if extracted_value else "NA"
    else:
        return "Image not found"

# Apply the entity extraction on the entire dataset
train_df['predicted_entity_value'] = train_df.apply(process_row, axis=1)
print("Entity extraction complete!")

# Save the results
train_df[['index', 'predicted_entity_value']].to_csv('submission.csv', index=False)
print("Submission file saved!")

100%|██████████| 49/49 [00:00<00:00, 9799.78it/s]

[2024/09/15 09:44:33] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\singh/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\singh/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=




[2024/09/15 09:44:35] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.16343045234680176
[2024/09/15 09:44:35] ppocr DEBUG: cls num  : 19, elapsed : 0.13264918327331543
[2024/09/15 09:44:36] ppocr DEBUG: rec_res num  : 19, elapsed : 1.1949717998504639
OCR Extracted Text: propos nature ingredient menager multi-usage terrede sommieres 100%naturel argile 100% pure et naturelle, la terre de sommieres presente des proprietesabsorbantes qui permettent le nettoyage a sec des taches recalcitrantes sur toutes les surfaces (moquette, tapis, parquet...). elle est aussi efficace pour desodoriser le linge. ingredient bentonite dosage conseill selon usage ge fermobrde la chorte hm 100% 500g laboratoire propos'naturee
Matches found for entity 'item_weight': [('500', 'g')]
[2024/09/15 09:44:36] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.13700413703918457
[2024/09/15 09:44:36] ppocr DEBUG: cls num  : 19, elapsed : 0.05676841735839844
[2024/09/15 09:44:37] ppocr DEBUG: rec_res num  : 19, elapsed : 0.7798650

In [16]:
train_df

Unnamed: 0,image_link,group_id,entity_name,entity_value,index,predicted_entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,1,500 g
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,2,
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,3,0709 g
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,4,051 g
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,5,
5,https://m.media-amazon.com/images/I/61QsBSE7jg...,731432,item_weight,1400 milligram,6,1400 mg
6,https://m.media-amazon.com/images/I/81xsq6vf2q...,731432,item_weight,1400 milligram,7,1400 mg
7,https://m.media-amazon.com/images/I/71DiLRHeZd...,731432,item_weight,1400 milligram,8,1400 mg
8,https://m.media-amazon.com/images/I/91Cma3Rzse...,731432,item_weight,1400 milligram,9,1400 mg
9,https://m.media-amazon.com/images/I/71jBLhmTNl...,731432,item_weight,1400 milligram,10,1400 mg
