In [2]:
import pandas as pd
import requests
from PIL import Image
import pytesseract
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_image(url):
    """Download ``url`` and open the response body as a PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        Other ``requests`` errors (e.g. a timeout) and any error raised by
        ``Image.open`` propagate unchanged.
    """
    response = requests.get(url, timeout=10)
    # Fail on HTTP error statuses here; otherwise the error page body is
    # passed to PIL and surfaces as an unrelated image-decoding failure.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

def extract_text_from_image(image):
    """Run ``pytesseract.image_to_string`` on ``image`` and return its result."""
    recognised = pytesseract.image_to_string(image)
    return recognised

def process_image_row(row):
    """OCR one ``(index, image_url)`` pair.

    Returns ``(index, text)`` where ``text`` is the lower-cased OCR result.
    If downloading or reading the image raises, the error is printed and
    ``(index, '')`` is returned so one bad image does not abort the batch.
    """
    index, image_url = row
    try:
        # Download, OCR and lower-case in a single step.
        text = extract_text_from_image(download_image(image_url)).lower()
    except Exception as e:
        print(f"Error processing {image_url}: {str(e)}")
        text = ''
    return index, text

def process_csv(csv_file, output_file='ocr_results.csv', max_workers=8):
    """OCR every image listed in a CSV file and save the recognised text.

    Args:
        csv_file: Path of a CSV file that has an ``image_link`` column.
        output_file: Where the result CSV is written. Defaults to
            ``'ocr_results.csv'``.
        max_workers: Number of threads used to process rows concurrently.

    Raises:
        ValueError: If the CSV has no ``image_link`` column.

    The output keeps every input column except ``image_link`` and adds an
    ``extracted_text`` column holding the value returned by
    ``process_image_row`` for that row.
    """
    df = pd.read_csv(csv_file)

    if 'image_link' not in df.columns:
        raise ValueError("CSV file must have a column named 'image_link'")

    df['extracted_text'] = ''
    total = len(df)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only the link column is needed, so iterate it directly instead of
        # materialising whole rows.
        futures = [
            executor.submit(process_image_row, (index, image_url))
            for index, image_url in df['image_link'].items()
        ]

        # Results arrive in completion order, so progress is reported as a
        # running count rather than derived from the row label.
        for done, future in enumerate(as_completed(futures), start=1):
            index, text = future.result()
            df.at[index, 'extracted_text'] = text
            print(f"Processed image {done}/{total}")

    df = df.drop('image_link', axis=1)

    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Run the OCR pass over "train_1000.csv" when this code is executed as the
# main module.
if __name__ == "__main__":
    process_csv("train_1000.csv")


In [8]:
import pandas as pd
import re


def clean_text(text):
    """Return ``text`` as an ASCII-only string with whitespace squeezed.

    Non-string input is converted with ``str``. Each run of non-ASCII
    characters becomes one space, whitespace runs collapse to a single
    space, and leading/trailing whitespace is removed.
    """
    raw = text if isinstance(text, str) else str(text)
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', raw)
    squeezed = re.sub(r'\s+', ' ', ascii_only)
    return squeezed.strip()


def extract_measurement(text, patterns):
    """Find every ``<number> <unit>`` occurrence described by ``patterns``.

    Args:
        text: Text to search; it is passed through ``clean_text`` first.
        patterns: Iterable of ``(regex, unit_map)`` pairs. Each regex must
            have exactly two capture groups (value, unit) and is applied
            case-insensitively. ``unit_map`` maps lower-cased unit spellings
            to the abbreviation to report.

    Returns:
        The matches formatted as ``"value unit"`` and joined with ``', '``,
        or ``'N/A'`` when nothing matched. Units are always lower-case.
    """
    text = clean_text(text)
    found = []
    for pattern, unit_map in patterns:
        for value, unit in re.findall(pattern, text, re.IGNORECASE):
            # A captured value can look like "1, 5" or "1,5". Left as-is,
            # its ", " collides with the separator used for the joined
            # result, so drop inner whitespace and use '.' as the decimal
            # mark.
            value = re.sub(r'\s+', '', value).replace(',', '.')
            # Squeeze whitespace inside multi-word units before the lookup.
            unit = ' '.join(unit.lower().split())
            unit = unit_map.get(unit, unit).lower()
            found.append(f"{value} {unit}")
    return ', '.join(found) if found else 'N/A'

def convert_to_grams(value, unit):
    """Multiply ``value`` by the grams-per-unit factor of ``unit``.

    Known units are 'g', 'kg', 'lb', 'oz' and 't'; any other unit uses a
    factor of 1.
    """
    grams_per_unit = dict(g=1, kg=1000, lb=453.592, oz=28.3495, t=1000000)
    if unit in grams_per_unit:
        return value * grams_per_unit[unit]
    return value * 1

def extract_weight(text):
    """Extract weight measurements (g, kg, lb, oz, t and spelled-out forms).

    Returns whatever ``extract_measurement`` produces for the weight
    pattern below.
    """
    patterns = [
        # ``\d+`` instead of ``\d{1,3}``: the bounded form skipped plain
        # numbers such as "1500 g" and cut "1234.5 kg" down to "5 kg".
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(g|gram|grams|gm|kg|kilogram|kilograms|lb|lbs|pound|pounds|oz|ounce|ounces|t|ton|tons)\b',
         # 'gm' and 'lbs' are matched by the regex, so they need a
         # canonical abbreviation like every other alias.
         {'gram': 'g', 'grams': 'g', 'gm': 'g', 'kilogram': 'kg', 'kilograms': 'kg', 'lbs': 'lb', 'pound': 'lb', 'pounds': 'lb', 'ounce': 'oz', 'ounces': 'oz', 'ton': 't', 'tons': 't'})
    ]
    return extract_measurement(text, patterns)

def extract_max_weight(text):
    """Return the heaviest weight mentioned in ``text``.

    All weights found by ``extract_measurement`` are compared in grams via
    ``convert_to_grams``. The heaviest one is reported in the unit it was
    written in, formatted as ``"<value:.2f> <unit>"``.

    Returns:
        The formatted measurement, or ``''`` when no usable weight is found.
    """
    patterns = [
        # ``\d+`` so that numbers with four or more digits are matched whole.
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(g|gram|grams|gm|kg|kilogram|kilograms|lb|lbs|pound|pounds|oz|ounce|ounces|t|ton|tons)\b',
         # 'gm' and 'lbs' must be normalised too; otherwise "5 lbs" reaches
         # convert_to_grams with an unknown unit and is weighed as 5 grams.
         {'gram': 'g', 'grams': 'g', 'gm': 'g', 'kilogram': 'kg', 'kilograms': 'kg', 'lbs': 'lb', 'pound': 'lb', 'pounds': 'lb', 'ounce': 'oz', 'ounces': 'oz', 'ton': 't', 'tons': 't'})
    ]

    measurements = extract_measurement(text, patterns)

    # extract_measurement reports "nothing found" as the literal 'N/A'.
    if not measurements or measurements == 'N/A':
        return ''

    # (weight in grams, value as written, unit as written) per measurement.
    candidates = []
    for measurement in measurements.split(', '):
        parts = measurement.split(' ')
        if len(parts) != 2:
            continue
        raw_value, unit = parts
        try:
            # Accept a decimal comma as well as a decimal point.
            value = float(raw_value.replace(',', '.'))
        except ValueError:
            continue  # Not a number after all; ignore this entry.
        candidates.append((convert_to_grams(value, unit), value, unit))

    if not candidates:
        return ''

    # Compare in grams, but answer in the unit that appeared in the text;
    # converting to another unit would turn "500 g" into "1.10 lb".
    _, max_value, max_unit = max(candidates, key=lambda item: item[0])
    return f"{max_value:.2f} {max_unit}"


def extract_dimension(text):
    """Extract length measurements (m, cm, mm, ft, in and spelled-out forms).

    Returns whatever ``extract_measurement`` produces for the length
    pattern below. Note that the unit 'in' also matches the English word
    "in" when it follows a number.
    """
    patterns = [
        # * ``\d+`` so numbers with four or more digits are matched whole.
        # * ``c\.m`` has its dots escaped; the former ``C.M.`` treated them
        #   as wildcards and accepted words such as "camp" as a unit.
        # * Upper-case variants are omitted because extract_measurement
        #   applies the pattern with re.IGNORECASE.
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(m|meter|meters|c\.m|cm|centimeter|centimeters|mm|millimeter|millimeters|ft|foot|feet|in|inch|inches)\b',
         {'meter': 'm', 'meters': 'm', 'c.m': 'cm', 'centimeter': 'cm', 'centimeters': 'cm', 'millimeter': 'mm', 'millimeters': 'mm', 'foot': 'ft', 'feet': 'ft', 'inch': 'in', 'inches': 'in'})
    ]
    return extract_measurement(text, patterns)

def extract_voltage(text):
    """Extract voltage measurements (V, kV, mV and spelled-out forms).

    Returns whatever ``extract_measurement`` produces for the voltage
    pattern below.
    """
    patterns = [
        # ``\d+`` instead of ``\d{1,3}``: the bounded form could not match
        # a plain number of four or more digits such as "1000V".
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(V|volt|volts|kV|kilovolt|kilovolts|mV|millivolt|millivolts)\b',
         {'volt': 'V', 'volts': 'V', 'kilovolt': 'kV', 'kilovolts': 'kV', 'millivolt': 'mV', 'millivolts': 'mV'})
    ]
    return extract_measurement(text, patterns)

def extract_wattage(text):
    """Extract power measurements (W, kW, mW and spelled-out forms).

    Returns whatever ``extract_measurement`` produces for the wattage
    pattern below.
    """
    patterns = [
        # ``\d+`` instead of ``\d{1,3}``: the bounded form could not match
        # a plain number of four or more digits such as "1500W".
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(W|watt|watts|kW|kilowatt|kilowatts|mW|milliwatt|milliwatts)\b',
         {'watt': 'W', 'watts': 'W', 'kilowatt': 'kW', 'kilowatts': 'kW', 'milliwatt': 'mW', 'milliwatts': 'mW'})
    ]
    return extract_measurement(text, patterns)

def extract_volume(text):
    """Extract volume measurements (liquid units and cubic units).

    Returns whatever ``extract_measurement`` produces for the two patterns
    below: liquid measures first, then cubic measures.
    """
    liquid_units = {
        'liter': 'l', 'liters': 'l', 'milliliter': 'ml', 'milliliters': 'ml',
        'gallon': 'gal', 'gallons': 'gal',
        'fl oz': 'fl oz', 'fluid ounce': 'fl oz', 'fluid ounces': 'fl oz',
    }
    cubic_units = {
        'cu in': 'cu in', 'cubic inch': 'cu in', 'cubic inches': 'cu in',
        'cu ft': 'cu ft', 'cubic foot': 'cu ft', 'cubic feet': 'cu ft',
        'cu m': 'cu m', 'cubic meter': 'cu m', 'cubic meters': 'cu m',
        'cu cm': 'cu cm', 'cubic centimeter': 'cu cm', 'cubic centimeters': 'cu cm',
    }
    # The regexes use ``\s*`` between the two words of a unit, so "floz" or
    # "cuft" also match. Register the space-less spellings so those matches
    # are normalised as well.
    for unit_map in (liquid_units, cubic_units):
        unit_map.update({alias.replace(' ', ''): short for alias, short in list(unit_map.items())})

    patterns = [
        # ``\d+`` instead of ``\d{1,3}`` so numbers with four or more digits
        # (e.g. "2500 ml") are matched whole.
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(l|liter|liters|ml|milliliter|milliliters|gal|gallon|gallons|fl\s*oz|fluid\s*ounce|fluid\s*ounces)\b',
         liquid_units),
        (r'\b(\d+(?:[.,]\s*\d+)?)\s*(cu\s*in|cubic\s*inch|cubic\s*inches|cu\s*ft|cubic\s*foot|cubic\s*feet|cu\s*m|cubic\s*meter|cubic\s*meters|cu\s*cm|cubic\s*centimeter|cubic\s*centimeters)\b',
         cubic_units)
    ]
    return extract_measurement(text, patterns)

import pandas as pd
import re



def process_csv(input_file, output_file):
    """Predict a measurement for every row of an OCR result file.

    Args:
        input_file: CSV with the columns ``extracted_text``, ``entity_name``
            and ``entity_value``.
        output_file: Destination CSV; it receives the columns ``index`` and
            ``prediction``.

    Raises:
        ValueError: If any of the required columns is missing.

    The extractor is chosen by the lower-cased ``entity_name``. Rows with an
    unknown entity name, or for which the extractor finds nothing, get an
    empty prediction.
    """
    df = pd.read_csv(input_file)

    # Validate before touching any column so that a missing column raises the
    # ValueError below instead of a KeyError.
    required_columns = ['extracted_text', 'entity_name', "entity_value"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"CSV file must have columns: {', '.join(required_columns)}")

    df['extracted_text'] = df['extracted_text'].fillna('')

    extraction_functions = {
        'item_weight': extract_weight,
        'width': extract_dimension,
        'height': extract_dimension,
        'depth': extract_dimension,
        'maximum_weight_recommendation': extract_max_weight,
        'voltage': extract_voltage,
        'wattage': extract_wattage,
        'item_volume': extract_volume
    }

    def predict(entity_name, text):
        # str() keeps a missing (NaN) entity name from raising on .lower().
        extractor = extraction_functions.get(str(entity_name).lower())
        return extractor(text) if extractor is not None else 'N/A'

    # A plain list keeps this well-defined for an empty frame as well.
    df['prediction'] = [
        predict(entity_name, text)
        for entity_name, text in zip(df['entity_name'], df['extracted_text'])
    ]

    # Replace 'N/A' with empty string and ensure all values are strings
    df['prediction'] = df['prediction'].replace('N/A', '').astype(str)

    # The row label becomes an explicit 'index' column next to the prediction.
    result_df = df[['prediction']].reset_index()

    result_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Example usage: read the OCR results and write one prediction per row.
# NOTE(review): the output recorded for this cell is a FileNotFoundError for
# 'ocr_results_preprocessed.csv', which is not the path used here; the saved
# output looks stale. Re-run to confirm.
input_file = 'ocr_results.csv'
output_file = 'output.csv'
process_csv(input_file, output_file)



FileNotFoundError: [Errno 2] No such file or directory: 'ocr_results_preprocessed.csv'

In [25]:
import pandas as pd
import re

# Allowed unit names per entity. Entities that share a unit family are listed
# together; every entity still gets its own independent set object.
entity_unit_map = {
    entity: set(units)
    for entities, units in (
        (('width', 'depth', 'height'),
         ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')),
        (('item_weight', 'maximum_weight_recommendation'),
         ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')),
        (('voltage',),
         ('kilovolt', 'millivolt', 'volt')),
        (('wattage',),
         ('kilowatt', 'watt')),
        (('item_volume',),
         ('centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
          'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart')),
    )
    for entity in entities
}

# Dictionary mapping short forms to long forms of units.
# convert_units lower-cases a unit before looking it up here, so every
# abbreviation needs a lower-case key to be reachable. The mixed-case
# spellings 'L' and 'µL' are kept for any code that indexes them directly.
unit_conversions = {
    'cm': 'centimetre',
    'ft': 'foot',
    'in': 'inch',
    'm': 'metre',
    'mm': 'millimetre',
    'yd': 'yard',
    'g': 'gram',
    'gm': 'gram',  # the weight patterns accept 'gm' as a unit
    'kg': 'kilogram',
    'mcg': 'microgram',
    'mg': 'milligram',
    'oz': 'ounce',
    'lb': 'pound',
    'lbs': 'pound',
    't': 'ton',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'v': 'volt',
    'kw': 'kilowatt',
    'w': 'watt',
    'cl': 'centilitre',
    'cu ft': 'cubic foot',
    'cu in': 'cubic inch',
    'dl': 'decilitre',
    'fl oz': 'fluid ounce',
    'gal': 'gallon',
    'imp gal': 'imperial gallon',
    'L': 'litre',
    'l': 'litre',
    'µL': 'microlitre',
    'µl': 'microlitre',
    'ml': 'millilitre',
    'pt': 'pint',
    'qt': 'quart'
}

# Function to convert units
def convert_units(text):
    """Replace unit abbreviations that follow a number with their long form.

    Every ``<number><optional whitespace><abbreviation>`` occurrence whose
    abbreviation is a key of ``unit_conversions`` is rewritten as
    ``"<number> <long form>"``. Matching is case-insensitive and also covers
    multi-word abbreviations such as 'fl oz' or 'cu ft'. An abbreviation that
    is immediately followed by another ASCII letter is left untouched.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    # Case-insensitive view of the table; keys such as 'L' become reachable.
    lookup = {short.lower(): long for short, long in unit_conversions.items()}
    if not lookup:
        return text

    # One alternation over all abbreviations, longest first. Words of a
    # multi-word abbreviation may be separated by any amount of whitespace.
    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in short.split())
        for short in sorted(lookup, key=len, reverse=True)
    )
    # The look-ahead keeps "3 inches" from being read as "3 in" + "ches".
    pattern = rf'(\d+(?:\.\d+)?)\s*({alternatives})(?![a-zA-Z])'

    def replace_unit(match):
        number, unit = match.groups()
        key = ' '.join(unit.lower().split())
        long_form = lookup.get(key)
        if long_form is None:
            # Case-insensitive matching can accept a character whose
            # lower-case form is not a key; leave such text unchanged.
            return match.group(0)
        return f"{number} {long_form}"

    return re.sub(pattern, replace_unit, text, flags=re.IGNORECASE)

# Function to process the CSV file
def process_csv(input_file, output_file):
    """Expand unit abbreviations in the ``prediction`` column of a CSV file.

    Reads ``input_file``, replaces missing predictions with ``''``, applies
    ``convert_units`` to every prediction and writes the frame to
    ``output_file`` without the row index.
    """
    frame = pd.read_csv(input_file)

    # Fill gaps first so that no NaN reaches convert_units.
    predictions = frame['prediction'].fillna('')
    frame['prediction'] = predictions.apply(convert_units)

    frame.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Example usage: expand the unit abbreviations in the prediction file.
# NOTE(review): no visible cell writes 'output_preprocessed.csv' (the
# extraction stage above saves 'output.csv'); presumably this file is prepared
# outside the notebook. Confirm before relying on Restart & Run All.
input_file = 'output_preprocessed.csv'
output_file = 'final_preprocessed.csv'
process_csv(input_file, output_file)


Results saved to final_preprocessed.csv


In [26]:
import pandas as pd


# Re-open the converted predictions.
df = pd.read_csv('final_preprocessed.csv')

# Keep only the first measurement of each prediction (the text before the
# first ', '); non-string cells such as NaN pass through unchanged. Any comma
# that is still left afterwards is turned into a dot.
df['prediction'] = (
    df['prediction']
    .map(lambda cell: cell.split(', ')[0] if isinstance(cell, str) else cell)
    .replace(',', '.', regex=True)
)

# Write the result back over the same file.
df.to_csv('final_preprocessed.csv', index=False)

print("CSV file updated successfully!")


CSV file updated successfully!


In [None]:
import pandas as pd

# Load the post-processed predictions. The import comes first so this cell
# does not depend on `pd` left over from a previously executed cell.
df = pd.read_csv("final_preprocessed.csv")

# Define a function to calculate Precision, Recall, and F1 Score
def calculate_f1_score(df):
    """Score predictions against ground truth.

    Args:
        df: DataFrame with the columns ``entity_value`` (ground truth) and
            ``prediction``. Missing values (NaN/None) count as empty.

    Returns:
        ``(precision, recall, f1_score)``. Each value is 0 when its
        denominator is 0.

    Per-row classification:
        * prediction equals ground truth (both non-empty): true positive
        * non-empty prediction that differs from the ground truth, including
          a prediction where no value was expected: false positive
        * empty prediction but non-empty ground truth: false negative
        * both empty: true negative (not used by any of the metrics)
    """
    # Empty CSV cells are read back as NaN, and NaN != "" is True. Without
    # this normalisation every missing prediction is counted as a false
    # positive and the false-negative branch can never fire.
    truths = df['entity_value'].fillna('').astype(str)
    predictions = df['prediction'].fillna('').astype(str)

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for gt, out in zip(truths, predictions):
        if out == "":
            if gt != "":
                false_negatives += 1
        elif out == gt:
            true_positives += 1
        else:
            false_positives += 1

    # Calculate Precision, Recall, and F1 Score
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

# Score the loaded predictions; the result is the tuple
# (precision, recall, f1_score).
calculate_f1_score(df)