## Image-based Entity Value Extraction for Automated Data Recognition

#### 1. Importing data

Using Paddle OCR for text extraction from images

In [1]:
#importing the training data
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from paddleocr import PaddleOCR

# Initialize the PaddleOCR reader
ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Supports more options like 'ch', 'en'

# Load the training CSV file
train_file = r'C:\Users\singh\Desktop\Amazon ML Challenge 2024\66e31d6ee96cd_student_resource_3 (1)\student_resource 3\dataset\train - Copy.csv'  
train_df = pd.read_csv(train_file)

[2024/09/14 15:37:53] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\singh/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\singh/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [2]:
train_df.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


#### 2. Extracting text from each image without downloading them

In [3]:
# Function to download and process an image from a URL using PaddleOCR
def process_image_from_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content))   # Open the image from the response content
            image.save("temp_image.jpg")  # Save image temporarily
            results = ocr.ocr("temp_image.jpg")   # Use PaddleOCR to extract text from the saved image

            # Extract and concatenate text from results
            text = ' '.join([result[1][0] for result in results[0]])  # Joining recognized text
            return text.strip()  # Return the extracted text

        else:
            print(f"Failed to download image from {url}")
            return ""
    except Exception as e:
        print(f"Error processing image from {url}: {e}")
        return ""

In [4]:
from tqdm import tqdm

for idx, row in tqdm(train_df.iterrows(), total=train_df.shape[0], desc="Processing Images"):
    image_url = row['image_link']

    # Process the image and extract text
    extracted_text = process_image_from_url(image_url)

    # Print or store the extracted text as needed
    print(f"Image URL: {image_url}")
    print(f"Extracted Text: {extracted_text}")

Processing Images:   0%|          | 0/49 [00:00<?, ?it/s]

[2024/09/14 15:38:18] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.7460968494415283
[2024/09/14 15:38:18] ppocr DEBUG: cls num  : 19, elapsed : 0.12832355499267578
[2024/09/14 15:38:19] ppocr DEBUG: rec_res num  : 19, elapsed : 1.0871057510375977


Processing Images:   2%|▏         | 1/49 [00:02<01:42,  2.14s/it]

Image URL: https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg
Extracted Text: PROPOS NATURE INGREDIENT MENAGER MULTI-USAGE TERREDE SOMMIERES 100%NATUREL Argile 100% pure et naturelle, la terre de Sommieres presente des proprietesabsorbantes qui permettent le nettoyage a sec des taches recalcitrantes sur toutes les surfaces (moquette, tapis, parquet...). Elle est aussi efficace pour desodoriser le linge. Ingredient Bentonite Dosage conseill Selon usage ge fermobrde la chorte hm 100% 500g LABORATOIRE PROPOS'NATUREE
[2024/09/14 15:38:20] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.1441800594329834
[2024/09/14 15:38:20] ppocr DEBUG: cls num  : 19, elapsed : 0.03389143943786621
[2024/09/14 15:38:20] ppocr DEBUG: rec_res num  : 19, elapsed : 0.6729884147644043


Processing Images:   4%|▍         | 2/49 [00:03<01:08,  1.47s/it]

Image URL: https://m.media-amazon.com/images/I/71gSRbyXmoL.jpg
Extracted Text: TEARRIFIC LEBENSMITTELECHT HDAY GEPRAGTES DESIGN AS YOULIKE BEST HITT Designed in OR caee Berlin OR 4 celd LIZENZIERTE UND GESCHUTZTE DESIGNS
[2024/09/14 15:38:21] ppocr DEBUG: dt_boxes num : 53, elapsed : 0.14397644996643066
[2024/09/14 15:38:21] ppocr DEBUG: cls num  : 53, elapsed : 0.10918784141540527
[2024/09/14 15:38:23] ppocr DEBUG: rec_res num  : 53, elapsed : 2.435746669769287


Processing Images:   6%|▌         | 3/49 [00:05<01:35,  2.08s/it]

Image URL: https://m.media-amazon.com/images/I/61BZ4zrjZXL.jpg
Extracted Text: COMPOSITION Serving Size:1 Tablet 0.709 g)Each serving contains (Approx.Values Ingredient Qty./Serving %RDA" *PHOSPHOcomplexSilybin (Sillybum marianum) 200 mg ** Dandelion Taraxacum officinale) leaf extract-101 100 mg ** Kutki Picrorhiza kurroa)rhizome extract-0.5%Bitters 50 mg ** Kasani Cichorium intybus seed extract-1%Bitters 25 mg ** Punarnava Boerhavia diffusa root extract-0.07%alkaloids 25 mg ** Bhui amla Phyllanthus amarus WP extract-0.5%Bitters 25 mg Amla (Phyllanthus emblica) fruit extract-10%Tannins 25 mg Licorice (Glycyrrhiza glabra) root extract-5%Glycyrrhizin 25 mg Vitamin E 10 mg 100 Piper nigrum fruit extract-95%Piperine 5 mg ** NUTRITIONAL INFORMATION PER SERVING (APPROX.VALUES) Nutrients Qty./Serving %RDA# Energy 3.04kcal 0.13 Carbohydrate 0.51 g ** (Sugars) 0.2 g ** Protein 0.04g 0.07 Fat 0.09 g ** %RDA values established as per ICMR 2010 for sedentary lifestyle-Men **%RDA not established by

Processing Images:   8%|▊         | 4/49 [00:08<01:44,  2.31s/it]

Image URL: https://m.media-amazon.com/images/I/612mrlqiI4L.jpg
[2024/09/14 15:38:26] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.13933086395263672
[2024/09/14 15:38:26] ppocr DEBUG: cls num  : 13, elapsed : 0.11302375793457031
[2024/09/14 15:38:27] ppocr DEBUG: rec_res num  : 13, elapsed : 0.5528507232666016


Processing Images:  10%|█         | 5/49 [00:09<01:20,  1.83s/it]

Image URL: https://m.media-amazon.com/images/I/617Tl40LOXL.jpg
Extracted Text: Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANTAGO OVATA MG PLANT SEEDS FOOD 365 VEGAN SUPPLEMENT CAPSULES
[2024/09/14 15:38:27] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.1493217945098877
[2024/09/14 15:38:27] ppocr DEBUG: cls num  : 17, elapsed : 0.0794672966003418
[2024/09/14 15:38:28] ppocr DEBUG: rec_res num  : 17, elapsed : 0.8393740653991699


Processing Images:  12%|█▏        | 6/49 [00:10<01:09,  1.62s/it]

Image URL: https://m.media-amazon.com/images/I/61QsBSE7jgL.jpg
Extracted Text: Horbaach HIGHSTRENGTH Naturally-Sourced Psyllium PSYLLIOM HUSK High strength 1400mg per serving 1400MG PLANTAGO OVATA PLANT SEEDS Suitable for Vegans & Vegetarians FOOD SUPPLEMENT 365 VEGAN CAPSULES Horbaach
[2024/09/14 15:38:29] ppocr DEBUG: dt_boxes num : 37, elapsed : 0.26262378692626953
[2024/09/14 15:38:29] ppocr DEBUG: cls num  : 37, elapsed : 0.11513876914978027
[2024/09/14 15:38:32] ppocr DEBUG: rec_res num  : 37, elapsed : 2.7132959365844727


Processing Images:  14%|█▍        | 7/49 [00:14<01:36,  2.29s/it]

Image URL: https://m.media-amazon.com/images/I/81xsq6vf2qL.jpg
Extracted Text: Horbaach Directions: For adults, take two (2) vegan capsules daily, preferably with a meal. Do not exceed stated dose Nutrition Information Typically Per Daily Dose HIGHSTRENGTH 1400mg Psyllium Husk Powder PSYLLIOM Ingredients:Psyllium Husk Powder,Capsule Shell (Hy- droxypropylmethylcellulose), Anti-Caking Agents (Mag nesium Salts of Fatty Acids, Silicon Dioxide). HUSK May contain Sesame Seeds & Mustard.For allergens, see the ingredients in bold. Notice: Take this product with 220ml of fluids. Taking this product without adequate fluid may cause the pos- 1400MG sibility of choking.Do not use this product if you have PLANTAGO OVATA difficulty swallowing.If you experience chest pain, vom- PLANT SEEDS iting or difficulty in swallowing or breathing after taking this product, seek immediate medical attention. Do not take before sleeping.Fibre products can affect the ab- sorption of many medications.Do not take th

Processing Images:  16%|█▋        | 8/49 [00:15<01:17,  1.90s/it]

Image URL: https://m.media-amazon.com/images/I/71DiLRHeZdL.jpg
Extracted Text: VEGAN Horbaach WHEAT FREE HIGH STRENGTH 000 PSYLLIOM SOY HUSK FREE PLANTAGO OVATA 1400MG PLANT SEEDS DAIRY VEGAN FREE 365 CAPSULES FOOD SUPPLEMENT FREEFROM PRESERVATIVES Horbaach
[2024/09/14 15:38:34] ppocr DEBUG: dt_boxes num : 23, elapsed : 0.13808679580688477
[2024/09/14 15:38:34] ppocr DEBUG: cls num  : 23, elapsed : 0.07596516609191895
[2024/09/14 15:38:34] ppocr DEBUG: rec_res num  : 23, elapsed : 0.6913423538208008


Processing Images:  18%|█▊        | 9/49 [00:17<01:12,  1.81s/it]

Image URL: https://m.media-amazon.com/images/I/91Cma3RzseL.jpg
Extracted Text: Horbaach 100% HIGHEST QUALITY Horbaach TO GMP HIGHSTRENGTH PSYLLIUM GMP HUSK PLANTAGO OVATA 1400MG PLANT SEEDS VEGAN 365 CAPSULES FOOD SUPPLEMENT Horbaach
[2024/09/14 15:38:35] ppocr DEBUG: dt_boxes num : 30, elapsed : 0.2857687473297119
[2024/09/14 15:38:35] ppocr DEBUG: cls num  : 30, elapsed : 0.08731818199157715
[2024/09/14 15:38:36] ppocr DEBUG: rec_res num  : 30, elapsed : 1.1025233268737793


Processing Images:  20%|██        | 10/49 [00:18<01:09,  1.77s/it]

Image URL: https://m.media-amazon.com/images/I/71jBLhmTNlL.jpg
Extracted Text: NEWOOK SAME TRUSTED OUALITY OLD NEW Horbaach Horbaach HIGHSTRENGTH PSYLLIOM HIGHSTRENGTH HUSK PSYLLIOM HUSK 1400MG oer serving 1400MG PLANTAGO OVATA PLANT SEEDS FOOD 365 VEGAN SUPPLEMENT CAPSULES FOOD 365 VEGAN SUPPLEMENT CAPSULES Horbaach PACKAGING MAY VARY
[2024/09/14 15:38:37] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.13439273834228516
[2024/09/14 15:38:37] ppocr DEBUG: cls num  : 7, elapsed : 0.06919431686401367
[2024/09/14 15:38:38] ppocr DEBUG: rec_res num  : 7, elapsed : 1.2798490524291992


Processing Images:  22%|██▏       | 11/49 [00:20<01:08,  1.80s/it]

Image URL: https://m.media-amazon.com/images/I/81N73b5khVL.jpg
Extracted Text: GroBe Kapazitat, Tragfahigkeit bis zu 30KG Das Material ist ca.5mm dick und die Fugen sind verstarkt, so dass es sehr stark ist Tragfahigkeit bis zu 30KG, kann eine Vielzahl von Gegenstanden 30 KG aufnehmen.
[2024/09/14 15:38:38] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.13141250610351562
[2024/09/14 15:38:38] ppocr DEBUG: cls num  : 20, elapsed : 0.08686256408691406
[2024/09/14 15:38:39] ppocr DEBUG: rec_res num  : 20, elapsed : 0.5322444438934326


Processing Images:  24%|██▍       | 12/49 [00:21<00:55,  1.51s/it]

Image URL: https://m.media-amazon.com/images/I/61oMj2iXOuL.jpg
Extracted Text: WILDFANG XS S XL 1,3 cm 2 cm 2,5cm 3 cm 3,5 cm 12 cm 13 cm 14 cm 15 cm 16cm <10 kg 10-15 kg 15-25 kg 25-45 kg >45 kg
[2024/09/14 15:38:39] ppocr DEBUG: dt_boxes num : 28, elapsed : 0.13351964950561523
[2024/09/14 15:38:40] ppocr DEBUG: cls num  : 28, elapsed : 0.09337687492370605
[2024/09/14 15:38:40] ppocr DEBUG: rec_res num  : 28, elapsed : 0.724130392074585


Processing Images:  27%|██▋       | 13/49 [00:22<00:53,  1.48s/it]

Image URL: https://m.media-amazon.com/images/I/91LPf6OjV9L.jpg
Extracted Text: ERICH Helps in CRICH Helps in osteoarthritis digestion Reduces inflammation USDA ORGANIC Anti-bacterial properties ERFOOD RICH ORGANICS SED Helps in PCOS SPEARMINT GREEN TEA 0 Fe WEIGHT NE
[2024/09/14 15:38:41] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.2595677375793457
[2024/09/14 15:38:41] ppocr DEBUG: cls num  : 19, elapsed : 0.07987022399902344
[2024/09/14 15:38:42] ppocr DEBUG: rec_res num  : 19, elapsed : 0.9132556915283203


Processing Images:  29%|██▊       | 14/49 [00:24<00:52,  1.50s/it]

Image URL: https://m.media-amazon.com/images/I/81fOxWWWKYL.jpg
Extracted Text: USDA ORGANIC SSRICHS PROMOTES ORGANICS DIGESTVE HEALTH PLANT BASED ORGANIC SUPERFOOD SPEARMINT GREEN TEA HIGH IN ANTIOXIDANTS HIGH IN VITAMIN ANTI-BACTERIAL IN NATURE +COMBATS DISEASES Fe GREAT SOURCEOFIRON NET WEIGHT100 g3.53 oz
[2024/09/14 15:38:42] ppocr DEBUG: dt_boxes num : 58, elapsed : 0.17284941673278809
[2024/09/14 15:38:42] ppocr DEBUG: cls num  : 58, elapsed : 0.12934327125549316
[2024/09/14 15:38:44] ppocr DEBUG: rec_res num  : 58, elapsed : 1.8997471332550049


Processing Images:  31%|███       | 15/49 [00:27<01:02,  1.83s/it]

Image URL: https://m.media-amazon.com/images/I/81dzao1Ob4L.jpg
Extracted Text: PACKAGNG  CHANGED OOD FOR THE MODERN LIFE SRICH ORGANICS RICH ORGANICS NTO USE PSAPERFOSED Issa SPEARMINT GREEN TEA USDA ORGANIC OODFOR THE MODERN LIFE BEFORE SRICH fssai ORGANICS SRICH LIC.No.12721999000135 ORGANIC SPEARMINT HERBAL TEAE ORGANICS kee DIRECTION OF USE an be made into herba CHORGANICS.COM IngredientsOrganic Spearmint Pure Herb Country of Origin: Indic PLANT BASED Nutritionl Facts ORGANIC SUPERFOOD Calories 80kcal SPEARMINT GREEN TEA ated fal MANUFACTURED AND MARKETED BY: Omg HIGH IN VITAMINA Sorich Organics Pt.d RICHINANTIOXIDANTS NET WEIGHT : 100 g (3.53 oz) ANTI-BACTERIALIN NATURE MRP Rs. FGREAT SOURCEOFIRON BATCH NO. : NET WEIGHT100g3.53oz PKD. : USE BY : AFTER Note: Packaging may vary for a brief period of time.
[2024/09/14 15:38:45] ppocr DEBUG: dt_boxes num : 24, elapsed : 0.13906311988830566
[2024/09/14 15:38:45] ppocr DEBUG: cls num  : 24, elapsed : 0.043155670166015625
[2024/09/14 15:

Processing Images:  33%|███▎      | 16/49 [00:28<00:57,  1.74s/it]

Image URL: https://m.media-amazon.com/images/I/91-iahVGEDL.jpg
Extracted Text: XX XX L.oose packaging Impurities added Other Does not brew easily brands Added preservatiyes Processed chemically VS USDA ORGANIC SSRICH Air-tight packaging ERICH ORGANICS Impurities free Brews easily SPEARMINT GREEN TEA Noadded preservative HIGHIN VITAMIN ANTI-BACTERIAL INNATURE Drectly from farm Fe GREAT SOURCE OFIRON
[2024/09/14 15:38:47] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.40456342697143555
[2024/09/14 15:38:47] ppocr DEBUG: cls num  : 10, elapsed : 0.06782317161560059
[2024/09/14 15:38:47] ppocr DEBUG: rec_res num  : 10, elapsed : 0.29754638671875


Processing Images:  35%|███▍      | 17/49 [00:29<00:49,  1.54s/it]

Image URL: https://m.media-amazon.com/images/I/81S2+GnYpTL.jpg
Extracted Text: FREE Glucon-D Regular 200gpack Glucon-D R Instant Energy Regular Vitamin
[2024/09/14 15:38:47] ppocr DEBUG: dt_boxes num : 38, elapsed : 0.08905839920043945
[2024/09/14 15:38:48] ppocr DEBUG: cls num  : 38, elapsed : 0.14922690391540527
[2024/09/14 15:38:49] ppocr DEBUG: rec_res num  : 38, elapsed : 1.8010764122009277


Processing Images:  37%|███▋      | 18/49 [00:32<00:55,  1.79s/it]

Image URL: https://m.media-amazon.com/images/I/81e2YtCOKvL.jpg
Extracted Text: FREE Glucon-D Regular 200gpack Glucon-D Net Weight Instant Energy Ikg eaves youtredyour family needs the Instant Energyof Glucon-D 99.4% pure Glucose-Glucose is an Instant energy source for the body and is the only energy source for the brain. Vitamin Glucon-D has Vitamin D and Calcium that provides strength to the bones and body. Minerals -Glucon-D has Phosphors ttis essential or energy production and storage. So experience the Instant Energyof Glucon-D with your family! i  n66 R   RAARORT ffe a k t.ksferae. Offer available in specifically marked packs only M.R.P RS/INCLOFALLTAXES) LOT No MFD. (See space below/bottom of jar.)
[2024/09/14 15:38:50] ppocr DEBUG: dt_boxes num : 43, elapsed : 0.39749670028686523
[2024/09/14 15:38:50] ppocr DEBUG: cls num  : 43, elapsed : 0.11713075637817383
[2024/09/14 15:38:52] ppocr DEBUG: rec_res num  : 43, elapsed : 1.5179011821746826


Processing Images:  39%|███▉      | 19/49 [00:34<00:59,  1.97s/it]

Image URL: https://m.media-amazon.com/images/I/81RNsNEM1EL.jpg
Extracted Text: FREE Glucon-D Regular 200gpack Glucon-D Instant Energy NUTRITIONALVALUE* Per 100g Per Serve (35g) Energy Value. 360kcal 126kcal Prot.e. Og Carbohydrates. 31.5g of which sugarr (sucrose...g. 0g Fat&All types of fatty aid..g Og Calc.iu. .170mg 59.5mg Phosphorus. .100mg 35mg Vitamin D. .3001.U. 1051.U. *Approx.Value GREIENTSce99.4%MirlsCamhsphtesanditainD Glucose based beverage mi Proprtary Food Storage ConditionStore ina cool dry place away from strong odours. BEST BEFORE 24 MONTHS FROM MANUFACTURE. Heinzy Manufactured forHeinz India Pt.td.7FlooDShivsagar WorliMumbai-400018. fssai LiC.No.10013022001417 Frt  ft hf.ee e FSSAILic.No.10014012000278 FSSAILic.No.10014062000273 Net Weight:
[2024/09/14 15:38:53] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.7182931900024414
[2024/09/14 15:38:53] ppocr DEBUG: cls num  : 19, elapsed : 0.08077812194824219
[2024/09/14 15:38:54] ppocr DEBUG: rec_res num  : 19, elapsed : 0.72

Processing Images:  41%|████      | 20/49 [00:36<00:56,  1.96s/it]

Image URL: https://m.media-amazon.com/images/I/91prZeizZnL.jpg
Extracted Text: FREE Glucon-D Regular 200gpack Glucon-D ? Instant Energy Regular Glucon-D Instant Energy Regular Minerals Vitamin Minerals Vitamin
[2024/09/14 15:38:54] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.3239874839782715
[2024/09/14 15:38:54] ppocr DEBUG: cls num  : 3, elapsed : 0.053795814514160156
[2024/09/14 15:38:54] ppocr DEBUG: rec_res num  : 3, elapsed : 0.15224266052246094


Processing Images:  43%|████▎     | 21/49 [00:37<00:43,  1.56s/it]

Image URL: https://m.media-amazon.com/images/I/31EvJszFVfL.jpg
Extracted Text: Glucon-D 9.4in 23cm
[2024/09/14 15:38:55] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.7174777984619141
[2024/09/14 15:38:55] ppocr DEBUG: cls num  : 15, elapsed : 0.03120112419128418
[2024/09/14 15:38:56] ppocr DEBUG: rec_res num  : 15, elapsed : 0.7688577175140381


Processing Images:  45%|████▍     | 22/49 [00:38<00:43,  1.61s/it]

Image URL: https://m.media-amazon.com/images/I/61wzlucTREL.jpg
Extracted Text: ONE SIZE FITS 6 ALL STRETCHABLE AND BLACKFABRIC BLOCKS FORMFITTING THELIGHT EXPERT-GRADE GEL NON-TOXIC NO SMELL FLEXIBLE COMFORTABLE FOR COLD ORHOT THERAPY 3.1 in 5.3 5 22.2 in
[2024/09/14 15:38:56] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.13201427459716797
[2024/09/14 15:38:56] ppocr DEBUG: cls num  : 12, elapsed : 0.017060041427612305
[2024/09/14 15:38:57] ppocr DEBUG: rec_res num  : 12, elapsed : 0.8455119132995605


Processing Images:  47%|████▋     | 23/49 [00:39<00:37,  1.46s/it]

Image URL: https://m.media-amazon.com/images/I/61sQ+qAKr4L.jpg
Extracted Text: 925 SterlingSilver 18K Gold Plated 0.28INCH (7MM) High Polish 0.07 INCH (1.8MM) 16+2 INCH Extend Chain Net Weight: 2.7G (405MM+50MM) Manual measurement, the size may be slightly different from the physical
[2024/09/14 15:38:58] ppocr DEBUG: dt_boxes num : 22, elapsed : 0.14519309997558594
[2024/09/14 15:38:58] ppocr DEBUG: cls num  : 22, elapsed : 0.12405204772949219
[2024/09/14 15:38:59] ppocr DEBUG: rec_res num  : 22, elapsed : 0.8208179473876953


Processing Images:  49%|████▉     | 24/49 [00:41<00:37,  1.49s/it]

Image URL: https://m.media-amazon.com/images/I/81x77l2T5NL.jpg
Extracted Text: Creating a sustainablefoodfuture for you and the planet Visit www.knorr.com/uk How to use Knorr Stock Pots: S GLUTEN NO ADDED AND FREE MSG PRESERVATIVES 4x28g=112g e eiher:dissolve the pot into 500ml of boiling or:add the water. pot directly to dish. Best Before End:
[2024/09/14 15:38:59] ppocr DEBUG: dt_boxes num : 25, elapsed : 0.15143132209777832
[2024/09/14 15:38:59] ppocr DEBUG: cls num  : 25, elapsed : 0.09134364128112793
[2024/09/14 15:39:00] ppocr DEBUG: rec_res num  : 25, elapsed : 0.9833052158355713


Processing Images:  51%|█████     | 25/49 [00:42<00:35,  1.47s/it]

Image URL: https://m.media-amazon.com/images/I/71nywfWZUwL.jpg
Extracted Text: 9.1cm/3.58" 36.8cm/ 14.48" 48V 13Ah 48V 17.5Ah 11.1cm/4.37 Downtube Battery Type 13Ah:2500m-A-h Battery Cell 17.5Ah:Sam-sung/L-G/Cell 3500mAh Send it randomly Charger 48V 13Ah (2A) Charger Output Current 48V 17.5Ah (4A) USB port BMS 30A Battery Weight. KG 4.1KG/4.4KG
[2024/09/14 15:39:01] ppocr DEBUG: dt_boxes num : 25, elapsed : 0.26202940940856934
[2024/09/14 15:39:01] ppocr DEBUG: cls num  : 25, elapsed : 0.04999709129333496
[2024/09/14 15:39:01] ppocr DEBUG: rec_res num  : 25, elapsed : 0.7024602890014648


Processing Images:  53%|█████▎    | 26/49 [00:44<00:31,  1.37s/it]

Image URL: https://m.media-amazon.com/images/I/71nywfWZUwL.jpg
Extracted Text: 9.1cm/3.58" 36.8cm/ 14.48" 48V 13Ah 48V 17.5Ah 11.1cm/4.37 Downtube Battery Type 13Ah:2500m-A-h Battery Cell 17.5Ah:Sam-sung/L-G/Cell 3500mAh Send it randomly Charger 48V 13Ah (2A) Charger Output Current 48V 17.5Ah (4A) USB port BMS 30A Battery Weight. KG 4.1KG/4.4KG
[2024/09/14 15:39:02] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.12297177314758301
[2024/09/14 15:39:02] ppocr DEBUG: cls num  : 17, elapsed : 0.07632160186767578
[2024/09/14 15:39:02] ppocr DEBUG: rec_res num  : 17, elapsed : 0.5677950382232666


Processing Images:  55%|█████▌    | 27/49 [00:44<00:26,  1.23s/it]

Image URL: https://m.media-amazon.com/images/I/51WsuKKAVrL.jpg
Extracted Text: Product Name Harvest Festival Doll Product Size 15926CM Qty./Ctn. 96PCS Carton Size 52x42x40CM Material Polyester material Craftsmanship Manual Colour BrownYellow GreyOrange Product Weight 158g
[2024/09/14 15:39:02] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.12725043296813965
[2024/09/14 15:39:02] ppocr DEBUG: cls num  : 3, elapsed : 0.054651737213134766
[2024/09/14 15:39:03] ppocr DEBUG: rec_res num  : 3, elapsed : 0.14874649047851562


Processing Images:  57%|█████▋    | 28/49 [00:45<00:20,  1.01it/s]

Image URL: https://m.media-amazon.com/images/I/61XGDKap+JL.jpg
Extracted Text: 158g 1580
[2024/09/14 15:39:03] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.13344907760620117
[2024/09/14 15:39:03] ppocr DEBUG: cls num  : 9, elapsed : 0.06230449676513672
[2024/09/14 15:39:04] ppocr DEBUG: rec_res num  : 9, elapsed : 0.5438277721405029


Processing Images:  59%|█████▉    | 29/49 [00:46<00:19,  1.02it/s]

Image URL: https://m.media-amazon.com/images/I/715vVcWJxGL.jpg
Extracted Text: GMP NUVIDA NATURAL HEALTH CRANBERRY EXTRACT High Strength Extract for Natural Urinary Tract and Bladder Support SUITABLE FOR VEGETARIANS AND VEGANS 5000mg 90 Tablets FoodSupplement
[2024/09/14 15:39:04] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.1626451015472412
[2024/09/14 15:39:04] ppocr DEBUG: cls num  : 14, elapsed : 0.07792496681213379
[2024/09/14 15:39:05] ppocr DEBUG: rec_res num  : 14, elapsed : 0.6923151016235352


Processing Images:  61%|██████    | 30/49 [00:47<00:19,  1.02s/it]

Image URL: https://m.media-amazon.com/images/I/613v+2W4UwL.jpg
Extracted Text: 50 mI Carvomin R Verdauungstropfen 18,55 g/20 ml Flussigkeit zum Einnehmen Wirkstoff: Auszug aus einer Mischung von Angelikawurzeln, Benediktenkraut und Pfefferminzblattern KLONGE PHARMA
[2024/09/14 15:39:05] ppocr DEBUG: dt_boxes num : 47, elapsed : 0.28184986114501953
[2024/09/14 15:39:05] ppocr DEBUG: cls num  : 47, elapsed : 0.12166404724121094
[2024/09/14 15:39:07] ppocr DEBUG: rec_res num  : 47, elapsed : 1.4758741855621338


Processing Images:  63%|██████▎   | 31/49 [00:49<00:24,  1.35s/it]

Image URL: https://m.media-amazon.com/images/I/71+fn9TWQmL.jpg
Extracted Text: 50n 50 mI Carvomin R arvom Verdauungstropfen Verdauungstropten hmel 18,55 g/20 ml m Einnehi 18.55g/20m Bkeit z Flussigkeit zum Einnehmen Zu 157 50 ml Wirkstoff Ase Benediktenkrau! Carvomin pflanzliches A Verdauungstuco Verdauungstropfen keit enthalten 16 Mischung von A 18,55 g/20 ml ferminzblattem Flussigkeit zum Wirkstoff: Enthalt 58V Einnehmen Auszug aus einer Mischung mittel fur Kine lagern.Das A von Angelikawurzeln ausschlieBlichar Benediktenkraut und dungsgebietre Pfefferminzblattern Krankheitssymco Packungsberag eine andere n KLONGE tieren.RegN. Zur Anwendung bei Erwachsenen Klinge Pharma PHARMA
[2024/09/14 15:39:07] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.13248038291931152
[2024/09/14 15:39:07] ppocr DEBUG: cls num  : 42, elapsed : 0.10579323768615723
[2024/09/14 15:39:09] ppocr DEBUG: rec_res num  : 42, elapsed : 1.2125062942504883


Processing Images:  65%|██████▌   | 32/49 [00:51<00:25,  1.49s/it]

Image URL: https://m.media-amazon.com/images/I/71aKgRRQ2wL.jpg
Extracted Text: Carvonin 50 mI Ierdaungstonfien Carvomin 1855g20m Verdauungstropfen wendune 18,55 g/20 ml Flussigkeit zum Einnehmen OCKAUn &SDeil 50 ml Wirkstoff: Asu Benediktenkrau! Carvomin pflanzliches Am Verdauungstro Verdauungstropfen keit enthalten Mischung von  18,55 g/20 ml Wirkstoff: ferminzblattem Flussigkeit zum Enthalt 58Va Auszug aus einer Mischung Einnehmen mittel fur Kine von Angelikawurzeln, lagern.Das A Benediktenkraut und ausschlieBlichar. dungsgebiere Pfefferminzblattern Krankheitssymo Packungstelag  KLONGE eine andere ng tieren.Reg-N. PHARMA Zur Anwendung Klinge Pharma bei Erwachsenen
[2024/09/14 15:39:09] ppocr DEBUG: dt_boxes num : 26, elapsed : 0.14240050315856934
[2024/09/14 15:39:09] ppocr DEBUG: cls num  : 26, elapsed : 0.09335160255432129
[2024/09/14 15:39:10] ppocr DEBUG: rec_res num  : 26, elapsed : 0.923058271408081


Processing Images:  67%|██████▋   | 33/49 [00:52<00:23,  1.44s/it]

Image URL: https://m.media-amazon.com/images/I/71rKXZJrh4L.jpg
Extracted Text: 50.m 50 ml Carvomin arvom Verdauungstropfen Veroaungstropten 18,55 g/20 ml 18.55g/20m Flussigkeit zum Einnehmen FliSsie AnWer 552 LUSZUS 13 Wirkstoff: Auszug aus einer Mischung von Angelikawurzeln, Benediktenkraut und Pfefferminzblattern KLONGE PHARMA
[2024/09/14 15:39:11] ppocr DEBUG: dt_boxes num : 20, elapsed : 0.26769351959228516
[2024/09/14 15:39:11] ppocr DEBUG: cls num  : 20, elapsed : 0.03805232048034668
[2024/09/14 15:39:11] ppocr DEBUG: rec_res num  : 20, elapsed : 0.7346229553222656


Processing Images:  69%|██████▉   | 34/49 [00:54<00:21,  1.42s/it]

Image URL: https://m.media-amazon.com/images/I/71D824lbRvL.jpg
Extracted Text: carvonin 50 mI lerdauungstronfien Carvomin 1855g20m Verdauungstropfen 18,55g/20 ml Flussigkeit zum Einnehmen Wirkstoff: Auszug aus einer Mischung von Angelikawurzeln Benediktenkraut und Pfefferminzblattern KLONGE PHARMA
[2024/09/14 15:39:13] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.12976360321044922
[2024/09/14 15:39:13] ppocr DEBUG: cls num  : 9, elapsed : 0.06987738609313965
[2024/09/14 15:39:13] ppocr DEBUG: rec_res num  : 9, elapsed : 0.3103775978088379


Processing Images:  71%|███████▏  | 35/49 [00:55<00:20,  1.46s/it]

Image URL: https://m.media-amazon.com/images/I/71004c9tzfL.jpg
Extracted Text: 65:62 USB CHARGING  PORTS 50 MODE UP DOWN
[2024/09/14 15:39:13] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.12252235412597656
[2024/09/14 15:39:13] ppocr DEBUG: cls num  : 7, elapsed : 0.06766009330749512
[2024/09/14 15:39:14] ppocr DEBUG: rec_res num  : 7, elapsed : 0.6039149761199951


Processing Images:  73%|███████▎  | 36/49 [00:56<00:16,  1.30s/it]

Image URL: https://m.media-amazon.com/images/I/51bQPPtMqYL.jpg
Extracted Text: How to Use. Place the curtain between the counterweight and the magnet 4cm/1.6in Weight:26g Load-bearing metalblock
[2024/09/14 15:39:14] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.12074971199035645
[2024/09/14 15:39:14] ppocr DEBUG: cls num  : 5, elapsed : 0.05532717704772949
[2024/09/14 15:39:15] ppocr DEBUG: rec_res num  : 5, elapsed : 0.46061038970947266


Processing Images:  76%|███████▌  | 37/49 [00:57<00:13,  1.13s/it]

Image URL: https://m.media-amazon.com/images/I/61o2ntPNNgL.jpg
Extracted Text: 28" ZFREEWHEELS 800W36V GABELMINDESTBREITE135MM HINTERRADMITBRUSHLESSMOTOR
[2024/09/14 15:39:15] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.12929368019104004
[2024/09/14 15:39:15] ppocr DEBUG: cls num  : 5, elapsed : 0.012994766235351562
[2024/09/14 15:39:15] ppocr DEBUG: rec_res num  : 5, elapsed : 0.3413546085357666


Processing Images:  78%|███████▊  | 38/49 [00:57<00:10,  1.03it/s]

Image URL: https://m.media-amazon.com/images/I/61o2ntPNNgL.jpg
Extracted Text: 28" ZFREEWHEELS 800W36V GABELMINDESTBREITE135MM HINTERRADMITBRUSHLESSMOTOR
[2024/09/14 15:39:16] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.14949250221252441
[2024/09/14 15:39:16] ppocr DEBUG: cls num  : 18, elapsed : 0.06990575790405273
[2024/09/14 15:39:18] ppocr DEBUG: rec_res num  : 18, elapsed : 1.9868476390838623


Processing Images:  80%|███████▉  | 39/49 [01:00<00:14,  1.44s/it]

Image URL: https://m.media-amazon.com/images/I/71IUuTJ8QwL.jpg
Extracted Text: Superior Quality Put durability,safety,and environmental protection first Honeycomb Surface Offers solid performance and ensures that the yoga ball won't burst suddenly. Explosion-Proof Design Provides the best protection and prevents injuries, EnvironmentallyFriendly Material Just use it with confidence knowing it's made >poI of environmentally friendly material. No Sticking Rest assuredthat the surface won't stickto your skin, clothing, floor, and much more. 330lbs Load Bearing Never worry about the load-bearing capacity of this yoga ball as it can support up to 330lbs
[2024/09/14 15:39:18] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.2553577423095703
[2024/09/14 15:39:18] ppocr DEBUG: cls num  : 9, elapsed : 0.0702829360961914
[2024/09/14 15:39:19] ppocr DEBUG: rec_res num  : 9, elapsed : 0.398998498916626


Processing Images:  82%|████████▏ | 40/49 [01:01<00:12,  1.35s/it]

Image URL: https://m.media-amazon.com/images/I/915JHkwtcrL.jpg
Extracted Text: 51*70 IN 31 0Z GARDEN soft  ventilate warm and Particularly smooth Comfortable to skin
[2024/09/14 15:39:21] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.5630090236663818
[2024/09/14 15:39:21] ppocr DEBUG: cls num  : 18, elapsed : 0.03936648368835449
[2024/09/14 15:39:22] ppocr DEBUG: rec_res num  : 18, elapsed : 1.1561882495880127


Processing Images:  84%|████████▎ | 41/49 [01:04<00:14,  1.82s/it]

Image URL: https://m.media-amazon.com/images/I/71cjrYndwIL.jpg
Extracted Text: SKINFOOD SALMON DARK CIRCLE CONCEALER This concentrated and creamy concealer contains salmon ingredients which effectively help conceal dark under eye circles and brighten the eyes. To use : After applying your foundation dispense an adequate amount and gently tap onto dark circles around your eye area #2 809221274243 Dist.SKINFOOD USA,INC.1 Technology Drive Bldg.B,Suite B113 Irvine,California,92618 www.theskinfood.com 10g/Net wt.0.35oz Made in Korea OPEN HERE
[2024/09/14 15:39:23] ppocr DEBUG: dt_boxes num : 72, elapsed : 0.5323212146759033
[2024/09/14 15:39:23] ppocr DEBUG: cls num  : 72, elapsed : 0.15299177169799805
[2024/09/14 15:39:26] ppocr DEBUG: rec_res num  : 72, elapsed : 3.4676413536071777


Processing Images:  86%|████████▌ | 42/49 [01:09<00:18,  2.65s/it]

Image URL: https://m.media-amazon.com/images/I/81hnk2WXO3L.jpg
Extracted Text: hot'chocolate BOMBS Made with Belgian Chocolate inc/udes Mini Marshmallows Milk Double Salted Peppermint Chocolate Chocolate Caramel ORIGINAL INGREDIENTS:SUGARCOCOA BUTTERWHOLE MILK POWDER Nutrition Facts MARSHMALLOWS SUGARGLUCOSE SYRUPGELATINSORBITOLMODIFIED CORN STARCH.DEXTROSE,ARTFICIAL FLAVOR,TETRASODIUM PYROPHOSPHATE.COCOA MASSEMULSIFIERS (SOY LECITHIN.PGPR 16 serving per container. VANILLA EXTRACT Serving size1 Package (35g) CONTAINS:MILK,SOY. Amount per serving SALTED CARAMEL INGREDIENTS:SUGAR.COCOA BUTTER,WHOLE MILK 180 POWDER.MARSHMALLOWS SUGAR.GLUCOSE SYRUP.GELATIN.SORBITOL Calories MODIFIED CORN STARCH.DEXTROSE,ARTFICIAL FLAVOR.TETRASODIUM PYROPHOSPHATE.COCOA MASS.EMULSIFIERS (SOY LECITHINPGPR) VANILLAEXTRACT.SALT.CARAMEL FLAVOR % Daily Value* CONTAINS:MILK,SOY. Total Fat 9g 12% PEPPERMINT INGREDIENTS:SUGARCOCOA BUTTER.WHOLE MILK POWDER Saturated Fat 5g 25% MARSHMALLOWS (SUGAR.GLUCOSE SYRUP.GELATI

Processing Images:  88%|████████▊ | 43/49 [01:10<00:13,  2.18s/it]

Image URL: https://m.media-amazon.com/images/I/61HXgujoxpL.jpg
Extracted Text: IP65 150W
[2024/09/14 15:39:28] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.13818645477294922
[2024/09/14 15:39:28] ppocr DEBUG: cls num  : 10, elapsed : 0.12021327018737793
[2024/09/14 15:39:28] ppocr DEBUG: rec_res num  : 10, elapsed : 0.4501457214355469


Processing Images:  90%|████████▉ | 44/49 [01:11<00:09,  1.82s/it]

Image URL: https://m.media-amazon.com/images/I/613G8GOyLSL.jpg
Extracted Text: 220mm/8.66in 150W IP65 305mm/12.00in 32mm/1.25in Ground Wall Ceiling
[2024/09/14 15:39:29] ppocr DEBUG: dt_boxes num : 11, elapsed : 0.15043854713439941
[2024/09/14 15:39:29] ppocr DEBUG: cls num  : 11, elapsed : 0.06997466087341309
[2024/09/14 15:39:29] ppocr DEBUG: rec_res num  : 11, elapsed : 0.5900824069976807


Processing Images:  92%|█████████▏| 45/49 [01:12<00:06,  1.57s/it]

Image URL: https://m.media-amazon.com/images/I/71YyZ2iPyZL.jpg
Extracted Text: Durable & Flexible LED Floodlight Thickened Glass High-Quality LEDs Die-Cast Alumium Frame IP65 30W Mental Bracket
[2024/09/14 15:39:30] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.1230318546295166
[2024/09/14 15:39:30] ppocr DEBUG: cls num  : 9, elapsed : 0.06964397430419922
[2024/09/14 15:39:30] ppocr DEBUG: rec_res num  : 9, elapsed : 0.33678340911865234


Processing Images:  94%|█████████▍| 46/49 [01:12<00:03,  1.31s/it]

Image URL: https://m.media-amazon.com/images/I/81K3JwUCnQL.jpg
Extracted Text: 1P65 30W IP65 waterproof Dustproof Heat-resisting Frost-resistant
[2024/09/14 15:39:31] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.4753532409667969
[2024/09/14 15:39:31] ppocr DEBUG: cls num  : 8, elapsed : 0.07367515563964844
[2024/09/14 15:39:31] ppocr DEBUG: rec_res num  : 8, elapsed : 0.2714855670928955


Processing Images:  96%|█████████▌| 47/49 [01:13<00:02,  1.19s/it]

Image URL: https://m.media-amazon.com/images/I/41wvffSxB4L.jpg
Extracted Text: 15.5g 50cm 4cm BF EST FRIF ANDS 3.5cm
[2024/09/14 15:39:32] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.24298906326293945
[2024/09/14 15:39:32] ppocr DEBUG: cls num  : 6, elapsed : 0.059409379959106445
[2024/09/14 15:39:32] ppocr DEBUG: rec_res num  : 6, elapsed : 0.5231935977935791


Processing Images:  98%|█████████▊| 48/49 [01:14<00:01,  1.20s/it]

Image URL: https://m.media-amazon.com/images/I/91cErO-KbLL.jpg
Extracted Text: JACQUARD KNITTED MATERIAL Super soft and comfortable creating a custom upholstered look 200GSM Microfiber
[2024/09/14 15:39:33] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.1219325065612793
[2024/09/14 15:39:33] ppocr DEBUG: cls num  : 12, elapsed : 0.0300445556640625
[2024/09/14 15:39:34] ppocr DEBUG: rec_res num  : 12, elapsed : 1.6798467636108398


Processing Images: 100%|██████████| 49/49 [01:17<00:00,  1.57s/it]

Image URL: https://m.media-amazon.com/images/I/817vo3DcCNL.jpg
Extracted Text: KOMFORT-PAKET DAS HERZSTUCK : STARKER UND EFFIZIENTER MOTOR FUR ANGENEHMEN FAHRKOMFORT KRAFTVOLLE UND EFFIZIENTE EXTREM LEISERUND UNGLAUBLICHES DREHMOMENT UNTERSTUTZUNG BIS LEISTUNGSSTARKERMOTOR MIT VON 25 KM/H 250 W 45 NM





In [5]:
results = train_df.copy()
results['extracted_text'] = results['image_link'].apply(process_image_from_url)

[2024/09/14 15:39:58] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.14313936233520508
[2024/09/14 15:39:58] ppocr DEBUG: cls num  : 19, elapsed : 0.08128881454467773
[2024/09/14 15:39:59] ppocr DEBUG: rec_res num  : 19, elapsed : 1.044745922088623
[2024/09/14 15:40:00] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.14001226425170898
[2024/09/14 15:40:00] ppocr DEBUG: cls num  : 19, elapsed : 0.03688502311706543
[2024/09/14 15:40:01] ppocr DEBUG: rec_res num  : 19, elapsed : 0.7748618125915527
[2024/09/14 15:40:01] ppocr DEBUG: dt_boxes num : 53, elapsed : 0.26578593254089355
[2024/09/14 15:40:01] ppocr DEBUG: cls num  : 53, elapsed : 0.22244668006896973
[2024/09/14 15:40:03] ppocr DEBUG: rec_res num  : 53, elapsed : 2.2711856365203857
[2024/09/14 15:40:04] ppocr DEBUG: dt_boxes num : 45, elapsed : 0.15004873275756836
[2024/09/14 15:40:04] ppocr DEBUG: cls num  : 45, elapsed : 0.11779165267944336
[2024/09/14 15:40:06] ppocr DEBUG: rec_res num  : 45, elapsed : 2.357625722885132
[2024/09/14 15

In [6]:
results.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS NATURE INGREDIENT MENAGER MULTI-USAGE T...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,TEARRIFIC LEBENSMITTELECHT HDAY GEPRAGTES DESI...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size:1 Tablet 0.709 g)Each...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,WarningConsult your physician before using thi...
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANT...


#### 3. Performing Entity Mapping

In [7]:
import re
entity_patterns = {
    'item_weight': {
        'gram': r'(\d+\.?\d*)\s*(gram)',
        'kilogram': r'(\d+\.?\d*)\s*(kilogram|kg)',
        'microgram': r'(\d+\.?\d*)\s*(microgram|mcg)',
        'milligram': r'(\d+\.?\d*)\s*(milligram|mg)',
        'ounce': r'(\d+\.?\d*)\s*(ounce|oz)',
        'pound': r'(\d+\.?\d*)\s*(pound|lb)',
        'ton': r'(\d+\.?\d*)\s*(ton|t)'
    },
    'item_volume': {
        'centilitre': r'(\d+\.?\d*)\s*(centilitre|cl)',
        'cubic foot': r'(\d+\.?\d*)\s*(cubic foot|cu ft)',
        'cubic inch': r'(\d+\.?\d*)\s*(cubic inch|cu in)',
        'cup': r'(\d+\.?\d*)\s*(cup)',
        'decilitre': r'(\d+\.?\d*)\s*(decilitre|dl)',
        'fluid ounce': r'(\d+\.?\d*)\s*(fluid ounce|fl oz)',
        'gallon': r'(\d+\.?\d*)\s*(gallon|gal)',
        'imperial gallon': r'(\d+\.?\d*)\s*(imperial gallon|imp gal)',
        'litre': r'(\d+\.?\d*)\s*(litre|liter|l)',
        'microlitre': r'(\d+\.?\d*)\s*(microlitre|µl)',
        'millilitre': r'(\d+\.?\d*)\s*(millilitre|ml)',
        'pint': r'(\d+\.?\d*)\s*(pint)',
        'quart': r'(\d+\.?\d*)\s*(quart)'
    },
    'depth': {
        'centimetre': r'(\d+\.?\d*)\s*(centimetre|cm)',
        'foot': r'(\d+\.?\d*)\s*(foot|ft)',
        'inch': r'(\d+\.?\d*)\s*(inch|in)',
        'metre': r'(\d+\.?\d*)\s*(metre|meter|m)',
        'millimetre': r'(\d+\.?\d*)\s*(millimetre|mm)',
        'yard': r'(\d+\.?\d*)\s*(yard|yd)'
    },
    'height': {
        'centimetre': r'(\d+\.?\d*)\s*(centimetre|cm)',
        'foot': r'(\d+\.?\d*)\s*(foot|ft)',
        'inch': r'(\d+\.?\d*)\s*(inch|in)',
        'metre': r'(\d+\.?\d*)\s*(metre|meter|m)',
        'millimetre': r'(\d+\.?\d*)\s*(millimetre|mm)',
        'yard': r'(\d+\.?\d*)\s*(yard|yd)'
    },
    'maximum_weight_recommendation': {
        'gram': r'(\d+\.?\d*)\s*(gram)',
        'kilogram': r'(\d+\.?\d*)\s*(kilogram|kg)',
        'microgram': r'(\d+\.?\d*)\s*(microgram|mcg)',
        'milligram': r'(\d+\.?\d*)\s*(milligram|mg)',
        'ounce': r'(\d+\.?\d*)\s*(ounce|oz)',
        'pound': r'(\d+\.?\d*)\s*(pound|lb)',
        'ton': r'(\d+\.?\d*)\s*(ton|t)'
    },
    'voltage': {
        'kilovolt': r'(\d+\.?\d*)\s*(kilovolt|kV)',
        'millivolt': r'(\d+\.?\d*)\s*(millivolt|mV)',
        'volt': r'(\d+\.?\d*)\s*(volt|V)'
    },
    'wattage': {
        'kilowatt': r'(\d+\.?\d*)\s*(kilowatt|kW)',
        'watt': r'(\d+\.?\d*)\s*(watt|W)'
    }
}


# Function to extract the value based on the entity_name
def extract_entity_value(extracted_text, entity_name):
    patterns = entity_patterns.get(entity_name, {})
    for unit, pattern in patterns.items():
        match = re.search(pattern, extracted_text)
        if match:
            return match.group(1) + ' ' + unit
    return ''
results['extracted_value'] = results.apply(lambda row: extract_entity_value(row['extracted_text'], row['entity_name']), axis=1)

results

Unnamed: 0,image_link,group_id,entity_name,entity_value,extracted_text,extracted_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS NATURE INGREDIENT MENAGER MULTI-USAGE T...,
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,TEARRIFIC LEBENSMITTELECHT HDAY GEPRAGTES DESI...,
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size:1 Tablet 0.709 g)Each...,200 milligram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,WarningConsult your physician before using thi...,
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH PSYLLIOM HUSK 1400 PLANT...,
5,https://m.media-amazon.com/images/I/61QsBSE7jg...,731432,item_weight,1400 milligram,Horbaach HIGHSTRENGTH Naturally-Sourced Psylli...,1400 milligram
6,https://m.media-amazon.com/images/I/81xsq6vf2q...,731432,item_weight,1400 milligram,"Horbaach Directions: For adults, take two (2) ...",1400 milligram
7,https://m.media-amazon.com/images/I/71DiLRHeZd...,731432,item_weight,1400 milligram,VEGAN Horbaach WHEAT FREE HIGH STRENGTH 000 PS...,
8,https://m.media-amazon.com/images/I/91Cma3Rzse...,731432,item_weight,1400 milligram,Horbaach 100% HIGHEST QUALITY Horbaach TO GMP ...,
9,https://m.media-amazon.com/images/I/71jBLhmTNl...,731432,item_weight,1400 milligram,NEWOOK SAME TRUSTED OUALITY OLD NEW Horbaach H...,


#### 4. Using BERT NER

In [None]:
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
ner_results = nlp_ner(extracted_text)