In [None]:
# Install necessary packages
!apt-get install -y tesseract-ocr
!pip install pytesseract pillow requests pandas
import requests
from PIL import Image, ImageOps, ImageEnhance
import pytesseract
from io import BytesIO
import re
import pandas as pd
import multiprocessing
from tqdm import tqdm

# Entity to unit mapping with abbreviations and plural forms
# Unit spellings shared by the three linear-dimension entities.
_LENGTH_UNITS = (
    'centimetre', 'centimetres', 'cm',
    'foot', 'feet', 'ft',
    'inch', 'inches', 'in',
    'metre', 'metres', 'm',
    'millimetre', 'millimetres', 'mm',
    'yard', 'yards', 'yd',
)

# Unit spellings shared by the two weight entities; this includes the
# non-standard spellings 'gms', 'z' and 'ibs'.
_WEIGHT_UNITS = (
    'gram', 'grams', 'g', 'gms',
    'kilogram', 'kilograms', 'kg',
    'microgram', 'micrograms', 'μg',
    'milligram', 'milligrams', 'mg',
    'ounce', 'ounces', 'oz', 'z',
    'pound', 'pounds', 'lb', 'lbs', 'ibs',
    'ton', 'tons', 't',
)

# Entity name -> set of accepted unit spellings (base names, plurals and
# abbreviations). Every entity gets its own independent set object.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {
        'volt', 'volts', 'v',
        'kilovolt', 'kilovolts', 'kv',
        'millivolt', 'millivolts', 'mv',
    },
    'wattage': {
        'watt', 'watts', 'w',
        'kilowatt', 'kilowatts', 'kw',
    },
    'item_volume': {
        'centilitre', 'centilitres', 'cl',
        'cubic foot', 'cubic feet',
        'cubic inch', 'cubic inches',
        'cup', 'cups',
        'decilitre', 'decilitres', 'dl',
        'fluid ounce', 'fluid ounces', 'fl oz',
        'gallon', 'gallons',
        'imperial gallon', 'imperial gallons',
        'litre', 'litres', 'l',
        'microlitre', 'microlitres', 'µl',
        'millilitre', 'millilitres', 'ml',
        'pint', 'pints',
        'quart', 'quarts',
    },
}

# Mapping for abbreviations/plural forms to base units
# Maps every accepted abbreviation, plural and variant spelling to its
# singular base unit name. Base names themselves need no entry: the lookup in
# normalize_unit falls back to the input when a key is missing.
unit_to_base = {
    # Length
    'cm': 'centimetre', 'cms': 'centimetre', 'centimetres': 'centimetre',
    'ft': 'foot', 'feet': 'foot',
    'in': 'inch', 'inches': 'inch',
    'm': 'metre', 'metres': 'metre',
    'mm': 'millimetre', 'millimetres': 'millimetre',
    'yd': 'yard', 'yards': 'yard',
    # Weight
    'g': 'gram', 'gms': 'gram', 'grams': 'gram',
    'kg': 'kilogram', 'kgs': 'kilogram', 'kilograms': 'kilogram',
    'μg': 'microgram', 'micrograms': 'microgram',
    'mg': 'milligram', 'milligrams': 'milligram',
    # 'z' is accepted as a weight unit next to 'oz'; presumably a truncated
    # OCR reading of 'oz' -- confirm.
    'oz': 'ounce', 'z': 'ounce', 'ounces': 'ounce',
    # 'ibs' is presumably an OCR misreading of 'lbs'.
    'lb': 'pound', 'lbs': 'pound', 'ibs': 'pound', 'pounds': 'pound',
    't': 'ton', 'tons': 'ton',
    # Voltage
    'v': 'volt', 'volts': 'volt',
    'kv': 'kilovolt', 'kilovolts': 'kilovolt',
    'mv': 'millivolt', 'millivolts': 'millivolt',
    # Wattage
    'w': 'watt', 'watts': 'watt',
    'kw': 'kilowatt', 'kilowatts': 'kilowatt',
    # Volume
    'cl': 'centilitre', 'centilitres': 'centilitre',
    'cubic feet': 'cubic foot',
    'cubic inches': 'cubic inch',
    'cups': 'cup',
    'dl': 'decilitre', 'decilitres': 'decilitre',
    'fl oz': 'fluid ounce', 'fluid ounces': 'fluid ounce',
    'gallons': 'gallon',
    'imperial gallons': 'imperial gallon',
    'l': 'litre', 'litres': 'litre',
    'µl': 'microlitre', 'microlitres': 'microlitre',
    'ml': 'millilitre', 'millilitres': 'millilitre',
    'pints': 'pint',
    'quarts': 'quart',
}

# Flat set of every unit spelling accepted by any entity.
allowed_units = set().union(*entity_unit_map.values())

# Function to normalize units to their base form
def normalize_unit(unit):
    """Return the base-unit name for ``unit``.

    ``unit`` is looked up in ``unit_to_base``; a unit that has no entry there
    is returned unchanged.
    """
    if unit in unit_to_base:
        return unit_to_base[unit]
    return unit

# Function to extract text from image URL and try different preprocessing steps
def extract_text_from_url(image_url, timeout=30):
    """Download an image and OCR it, retrying on an inverted high-contrast copy.

    The image is converted to grayscale and passed to Tesseract. If that text
    contains no "<number><unit>" token whose unit is a known unit, the
    grayscale image is inverted, its contrast is boosted, and OCR is run
    again; the result of that second pass is returned as-is.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The lower-cased OCR text, or the literal string "Error" if the
        download, image decoding or OCR raised an exception.
    """
    try:
        # Without a timeout a single stalled connection would block the
        # calling worker indefinitely.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        gray_image = ImageOps.grayscale(image)
        text = pytesseract.image_to_string(gray_image, lang='eng').lower()

        # Accept the first pass only if it holds a number followed by a known
        # unit. A plain substring test against allowed_units is satisfied by
        # almost any text, because the set contains one-letter units such as
        # 'g', 'm' and 't', so the fallback below would practically never run.
        candidate_units = re.findall(r'\d+\.?\d*\s*([a-z]+)', text)
        if any(normalize_unit(unit) in allowed_units for unit in candidate_units):
            return text

        # Fallback: invert the grayscale image and boost its contrast before
        # running OCR a second time.
        inverted_image = ImageOps.invert(gray_image)
        enhancer = ImageEnhance.Contrast(inverted_image)
        inverted_image = enhancer.enhance(8)
        text = pytesseract.image_to_string(inverted_image, lang='eng').lower()
        return text
    except Exception as e:
        # Best-effort: one bad image must not abort the whole batch, so the
        # failure is reported and a sentinel string is returned instead.
        print(f"An error occurred in extract_text_from_url: {e}")
        return "Error"

# Function to clean the extracted text
def clean_text(text):
    """Collapse whitespace runs to single spaces, trim the ends, and lower-case."""
    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# Function to find specific entities (like height, weight, wattage) in the text
def find_specified_entity(text, specified_entity):
    """Return the first "<value> <unit>" in ``text`` valid for ``specified_entity``.

    The text is cleaned with ``clean_text`` and scanned for a number followed
    by a run of ASCII letters. Each letter run is normalised with
    ``normalize_unit``; the first one that is an allowed unit and belongs to
    the unit set of ``specified_entity`` wins.

    Args:
        text: OCR text to search.
        specified_entity: Entity name to look up in ``entity_unit_map``.

    Returns:
        ``"<value> <normalized unit>"`` for the first qualifying match, or an
        empty string when nothing qualifies.
    """
    measurement_re = re.compile(r'(\d+\.?\d*)\s*([a-zA-Z]+)')

    for number, raw_unit in measurement_re.findall(clean_text(text)):
        # The pattern only captures letters, so this tidy-up is purely defensive.
        tidy_unit = raw_unit.strip().lower().replace(' ', '')
        canonical_unit = normalize_unit(tidy_unit)

        if canonical_unit not in allowed_units:
            continue

        belongs_to_entity = any(
            canonical_unit in units and entity == specified_entity
            for entity, units in entity_unit_map.items()
        )
        if belongs_to_entity:
            return f"{number} {canonical_unit}"

    return ''

# Function to process each row
def process_row(row):
    """OCR the image referenced by one dataset row and extract its entity value.

    Args:
        row: Mapping-like row providing ``'image_link'`` and ``'entity_name'``.

    Returns:
        A ``(text, prediction)`` tuple: the OCR text and the value found for
        the row's entity (empty string when none was found).
    """
    ocr_text = extract_text_from_url(row['image_link'])
    prediction = find_specified_entity(ocr_text, row['entity_name'])
    return ocr_text, prediction

# Multiprocessing handler
def process_data_multiprocessing(df, num_workers):
    """Run ``process_row`` over every row of ``df`` in a process pool.

    Args:
        df: DataFrame whose rows are handed to ``process_row`` one by one.
        num_workers: Number of worker processes in the pool.

    Returns:
        A list with one ``process_row`` result per row, in row order
        (``Pool.imap`` yields results in input order).
    """
    with multiprocessing.Pool(num_workers) as pool:
        rows = [record for _label, record in df.iterrows()]
        pending = pool.imap(process_row, rows)
        # tqdm only wraps the iterator to show progress while results arrive.
        progress = tqdm(pending, total=len(df))
        results = list(progress)
    return results

# Driver: load the dataset, run OCR + entity extraction over one slice of it,
# and save the predictions.
# The __main__ guard keeps this pipeline from being re-executed when
# multiprocessing workers re-import the main module (required under the
# "spawn" start method).
if __name__ == "__main__":
    # Load dataset from CSV
    # NOTE: despite its name, DATASET_FOLDER holds the path of the CSV file itself.
    DATASET_FOLDER = '/content/test.csv'
    try:
        test = pd.read_csv(DATASET_FOLDER)
        # Only rows 45000-59999 are processed in this run.
        test = test.iloc[45000:60000]
        print("Dataset loaded successfully.")
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")
        test = pd.DataFrame()  # Initialize an empty DataFrame in case of error

    # Run the multiprocessing handler to extract text and detect entities
    if not test.empty:
        try:
            results = process_data_multiprocessing(test, num_workers=64)
            # Each result is a (text, prediction) pair, in row order.
            test['text'] = [result[0] for result in results]
            test['predictions'] = [result[1] for result in results]
            print("Text and predictions added to the DataFrame.")
        except Exception as e:
            print(f"An error occurred during processing: {e}")

        # Save the results to a new CSV file. This is attempted even when the
        # processing step above failed; a failure here is reported, not raised.
        # Assumes the input CSV provides an 'index' column -- TODO confirm.
        output_filename = '/content/test_out120000.csv'
        try:
            test[['index', 'predictions']].to_csv(output_filename, index=False)
            print(f"Results saved to {output_filename}.")
        except Exception as e:
            print(f"An error occurred while saving the results: {e}")
    else:
        print("DataFrame is empty. Processing not performed.")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Dataset loaded successfully.


100%|██████████| 15000/15000 [3:01:12<00:00,  1.38it/s]


Text and predictions added to the DataFrame.
Results saved to /content/test_out.csv4.
