# Amazon ML Challenge HYPER FAST APPROACH

## Importing packages

In [1]:
import numpy as np
import pandas as pd


## Loading the data

In [3]:
test = pd.read_csv("../dataset/sample_test.csv")
test_out = pd.read_csv("../dataset/sample_test_out.csv")
test.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,width
1,1,https://m.media-amazon.com/images/I/41-NCxNuBx...,658003,depth
2,2,https://m.media-amazon.com/images/I/417NJrPEk+...,939426,maximum_weight_recommendation
3,3,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,voltage
4,4,https://m.media-amazon.com/images/I/417SThj+Sr...,276700,wattage


In [4]:
IMAGE_DIRECTORY = "../images/sample/"
test['image_link'] = [i.strip("https://m.media-amazon.com/images/I/")[:-2] for i in test['image_link']]

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        88 non-null     int64 
 1   image_link   88 non-null     object
 2   group_id     88 non-null     int64 
 3   entity_name  88 non-null     object
dtypes: int64(2), object(2)
memory usage: 2.9+ KB


## Performing OCR Using Tesseract

In [6]:
from PIL import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"D:\Programs\TesseractOCR\tesseract.exe"

In [9]:
%time
output = []
for image_name in test['image_link']:
    output.append(pytesseract.image_to_string(IMAGE_DIRECTORY + image_name + '.jpg'))


CPU times: total: 172 ms
Wall time: 49.3 s


In [14]:
print(output)

CPU times: total: 0 ns
Wall time: 0 ns
['20CM\n\nL5G\n\n', '20CM\n\nL5G\n\n', 'Deodorizing module Cat litter shovel\n\n‘Adsorb harmful substances and reduce odor Regardless of soll sand,\n\n‘wood sand, crystal sand,\nfilter unused cat litter\n\n', 'Blade Diameter 05am\nRated Voltage: nav\nFrequency: son\nRated input Power 2100W\n\nfoLoad Speed 500\n\n9° HID W330\n\nMax Curing Capacity: Gr HD WI6oan\n\n‘Max Cuting Capacity {9 HO)aW20an\n\n', 'Blade Diameter 05am\nRated Voltage: nav\nFrequency: son\nRated input Power 2100W\n\nfoLoad Speed 500\n\n9° HID W330\n\nMax Curing Capacity: Gr HD WI6oan\n\n‘Max Cuting Capacity {9 HO)aW20an\n\n', '', 'sow=40W\n470 Im\n\n', 'sow=40W\n470 Im\n\n', 'Ideal Bed & Armchair Cover\n\nNeat size foldaway\n\nReversible; black on one\nside & grey on the other\n\n9010\n\n29cm\n\nSturdy & lightweight\n\nQ\n\nein 2.6 Ibs 1.2 kg\n\n60cm\n', '', '“0.2”\n\n', '“0.2”\n\n', '', '', '7\n\n44"\n\n6.75 LBS\n\n', 'i“\n\nDesktop Size\n\n40"\n\n<u\n\no> 9 Ly XeUl ~, QZ" UIL

## Extracting Numbers and Units from the text

In [12]:
import re
unit_map = {
    "width": {"centimetre", "cm", "foot", "ft", "millimetre", "mm", "metre", "m", "inch", "in", "yard", "yd"},
    "depth": {"centimetre", "cm", "foot", "ft", "millimetre", "mm", "metre", "m", "inch", "in", "yard", "yd"},
    "height": {"centimetre", "cm", "foot", "ft", "millimetre", "mm", "metre", "m", "inch", "in", "yard", "yd"},
    "item_weight": {"milligram", "mg", "kilogram", "kg", "microgram", "µg", "gram", "g", "ounce", "oz", "ton", "pound", "lb"},
    "maximum_weight_recommendation": {"milligram", "mg", "kilogram", "kg", "microgram", "µg", "gram", "g", "ounce", "oz", "ton", "pound", "lb"},
    "voltage": {"millivolt", "mv", "kilovolt", "kv", "volt", "v"},
    "wattage": {"kilowatt", "kw", "watt", "w"},
    "item_volume": {"cubic foot", "ft3", "microlitre", "µL", "cup", "fluid ounce", "fl oz", "centilitre", "cL", "imperial gallon", "gal", "pint", "pt", "decilitre", "dL", "litre", "L", "millilitre", "mL", "quart", "qt", "cubic inch", "in3", "gallon", "gal"}
}

entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Generate regex pattern from unit_map
def generate_unit_pattern(unit_map):
    units = set()
    for unit_set in unit_map.values():
        units.update(unit_set)
    
    # Escape special regex characters and join units into a pattern
    escaped_units = [re.escape(unit) for unit in units]
    pattern = r'\d+(?:[\.,]\d+)?\s*(?:'+'|'.join(escaped_units) + r')?'
    return pattern

# Compile the regular expression pattern
unit_pattern = generate_unit_pattern(unit_map)

def extract_numbers_and_units(text, correct_ocr_errors=True):
    # Use the generated regex pattern to find numbers and units
    if correct_ocr_errors:
      corrections = {
          'O': '0',  # O mistaken for zero
          'D': '0',  # D mistaken for zero
          'Q': '0',  # Q mistaken for zero
          'Z': '2',  # Z mistaken for two
          'S': '5',  # S mistaken for five
          'B': '8',  # B mistaken for eight
          'G': '6',  # G mistaken for six
          'I': '1',  # Uppercase I mistaken for one
          'l': '1',  # Lowercase l mistaken for one
          'T': '7'   # T mistaken for seven
      }
      text = ''.join([corrections.get(c, c) for c in text])
    matches = re.findall(unit_pattern, text, re.IGNORECASE)
    return matches

In [43]:
def standardize_unit(unit: str, category: str):
    # Normalize the unit by stripping spaces and converting to lowercase
    unit = unit.lower().strip()

    # Conversion mapping for shorthand to full unit names
    conversion_map = {
        'kv': 'kilovolt',
        'v': 'volt',
        'kw': 'kilowatt',
        'w': 'watt',
        'mg': 'milligram',
        'g': 'gram',
        'kg': 'kilogram',
        'µg': 'microgram',
        'lb': 'pound',
        'cm': 'centimetre',
        'mm': 'millimetre',
        'm': 'metre',
        'ft': 'foot',
        'yd': 'yard',
        'in': 'inch',
        'oz': 'ounce'
    }

    # Check for possible shorthand conversions
    if unit.lower() in conversion_map:
        unit = conversion_map[unit]
        return unit

    # Match the converted unit to the standard name from entity_unit_map
    for standard_unit in entity_unit_map[category]:
        if unit in standard_unit:
            return standard_unit

    return None

# Function to extract value and unit based on the required measurement type
def extract_measurement(value_list, category, unit_map, entity_unit_map):
    out = []
    for value in value_list:
        # Search for the numerical value and the unit in the string
        smatch = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([a-zA-Zµ³]+)", value)
        if smatch:
            
            number = smatch.group(1)
            unit = smatch.group(2)

            if number[0] == '0':
               print(number, 'hey', "0." + number[1:])
               number = "0." + number[1:]

            # Check if the unit belongs to the specified category
            if unit.lower() in unit_map[category]:
                standardized_unit = standardize_unit(unit, category)
                if standardized_unit:
                    # Format the result as required
                    return f"{float(number):.2f} {standardized_unit}"
        else:
          try:
            if int(value):
              out.append(value)
          except:
            pass
    if out:
      for e in entity_unit_map[category]:
        break
      return out[0] + " " + e
    return ""  # Return empty string if no match

extract_measurement(['100g', '25.6', '070v'], 'voltage', unit_map, entity_unit_map)

100 <class 'str'> 1
070 <class 'str'> 0
070 hey 0.70


'0.70 volt'

In [38]:
%time
prediction = []
for i in range(len(output)):
    value_list = extract_numbers_and_units(output[i])
    category = test['entity_name'][i]
    result = extract_measurement(value_list, category, unit_map, entity_unit_map)
    result = result.replace('\n', '')
    prediction.append(result)
for i in range(len(prediction)):
    print(i, ":", prediction[i])

CPU times: total: 0 ns
Wall time: 0 ns
0 : 20.00 centimetre
1 : 20.00 centimetre
2 : 1 kilogram
3 : 81 volt
4 : 2100.00 watt
5 : 
6 : 470  volt
7 : 40.00 watt
8 : 29.00 centimetre
9 : 
10 : 
11 : 
12 : 
13 : 
14 : 7 kilogram
15 : 5 centimetre
16 : 5 centimetre
17 : 6.20 inch
18 : 6.20 inch
19 : 9.00 metre
20 : 9.00 metre
21 : 9.00 metre
22 : 69.00 centimetre
23 : 69.00 centimetre
24 : 12.00 volt
25 : 1 watt
26 : 
27 : 
28 : 
29 : 
30 : 
31 : 
32 : 6 centimetre
33 : 6 centimetre
34 : 7 kilogram
35 : 
36 : 11.40 centimetre
37 : 12.00 inch
38 : 5 centimetre
39 : 26.00 metre
40 : 26.00 metre
41 : 26.00 metre
42 : 6 kilogram
43 : 7 volt
44 : 7 watt
45 : 
46 : 
47 : 
48 : 6 kilogram
49 : 45.00 watt
50 : 6 volt
51 : 1 watt
52 : 1 centimetre
53 : 
54 : 
55 : 
56 : 120.00 volt
57 : 280.00 watt
58 : 
59 : 
60 : 
61 : 15.00 metre
62 : 51.00 centimetre
63 : 5.00 gram
64 : 5.00 gram
65 : 05  volt
66 : 4 kilogram
67 : 500.00 watt
68 : 80  kilogram
69 : 
70 : 
71 : 6 volt
72 : 6 watt
73 : 1 volt
74 :

## Checking for accuracy

In [19]:
y = pd.read_csv('../dataset/sample_test_out.csv')
y.head()

Unnamed: 0,index,prediction
0,0,21.9 foot
1,1,10 foot
2,2,
3,3,289.52 kilovolt
4,4,1078.99 kilowatt


In [37]:
test.iloc[75]

index                                     75
image_link                       71afEPoRGsL
group_id                              701880
entity_name    maximum_weight_recommendation
Name: 75, dtype: object