In [None]:
!pip install paddlepaddle paddleocr
!pip install gdown

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting shapely (from paddleocr)
  Downloading shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting imgaug (from paddleocr)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp3

**1.** **Libraries and Imports**
<br/>
This section imports various libraries that enable functionality such as downloading images, performing OCR (Optical Character Recognition), manipulating images, working with data in CSV format, and downloading files from Google Drive. These libraries are essential for processing images, extracting relevant text, and handling the results.


In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from paddleocr import PaddleOCR
import numpy as np
from docx import Document
from docx.shared import Inches
from PIL import Image
import io
import gdown
import re
import os

**2. PaddleOCR Initialization**
<br/>
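PaddleOCR is initialized in this section with angle classification enabled and English as the recognition language. The use_gpu parameter is set to True to request GPU acceleration. Note that the initialization log printed below reports use_gpu=False, so this particular run fell back to the CPU; a GPU-enabled PaddlePaddle build is presumably required for the flag to take effect.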

In [None]:
# use_angle_cls=True loads the text-direction classifier that is applied when
# ocr.ocr(..., cls=True) is called.
# use_gpu=True only *requests* GPU inference: the log recorded for this cell
# shows use_gpu=False, i.e. that run executed on the CPU.
# show_log=False silences the per-image "ppocr DEBUG" lines, which otherwise
# flood the output when thousands of images are processed.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True, show_log=False)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:17<00:00, 234kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:18<00:00, 567kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:15<00:00, 141kiB/s]

[2024/09/14 12:26:57] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




In [None]:
# exist_ok=True keeps this cell re-runnable: without it, running the cell a
# second time raises FileExistsError because the folder is already there.
os.makedirs("dataset", exist_ok=True)

**3. Google Drive File Downloading**
<br />
The script uses the gdown library to download a CSV file from Google Drive and saves it locally as downloaded_file.csv. The file lists the images to process: its image_link, group_id, entity_name, and index columns are read later when the predictions are generated.



In [None]:
# Google Drive file ID of the input CSV; it is passed as the `id` query parameter.
vol = '1vlTp8dPrBYkvsv045Sdp6GrexcGv-bjZ'
url = f'https://drive.google.com/uc?id={vol}'
output_file = 'downloaded_file.csv'

# gdown.download returns the path it wrote to; a falsy result means nothing was
# saved, so stop here instead of printing a misleading success message.
downloaded_path = gdown.download(url, output_file, quiet=False)
if not downloaded_path:
    raise RuntimeError(f"Could not download {url} to {output_file}")

print(f"File downloaded and saved as {output_file}")


Downloading...
From: https://drive.google.com/uc?id=1vlTp8dPrBYkvsv045Sdp6GrexcGv-bjZ
To: /content/downloaded_file.csv
100%|██████████| 298k/298k [00:00<00:00, 100MB/s]

File downloaded and saved as downloaded_file.csv





**4. Image Downloading and Text Extraction**
<br/>
A function is defined to download an image from a URL using the requests library. The image is then processed with PaddleOCR to extract text, which is stored in a list. This extracted text is processed in later steps to identify volume values and units.

In [None]:
def download_image(url, timeout=30):
    """Fetch an image over HTTP and open it with Pillow.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, one unresponsive host would stall the whole run.

    Returns:
        The ``PIL.Image.Image`` opened from the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response.
        PIL.UnidentifiedImageError: If the response body is not a readable image.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of handing an error page to Pillow.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

In [None]:
def get_text(url):
    """Download the image at ``url`` and return the text PaddleOCR recognises in it.

    Failures are handled best-effort so that one bad row does not abort a long
    batch run: the error is printed and an empty list is returned.

    Args:
        url: Address of the image to run OCR on.

    Returns:
        A list with one recognised string per detected text line (possibly empty).
        The list is also printed, as a progress trace.
    """
    res = []
    try:
        image = download_image(url)
    except OSError as e:
        # requests' and Pillow's exceptions both derive from OSError, so this
        # covers network failures as well as undecodable image data.
        print(f"An error occurred: {e}")
    else:
        result = ocr.ocr(np.array(image), cls=True)
        try:
            # `result` holds one entry per input image; an entry may be None (or
            # empty) when no text was detected. Each detected line has the form
            # [bounding_box, (text, confidence)].
            for page in result or []:
                for line in page or []:
                    text = line[1][0]
                    if isinstance(text, str):
                        res.append(text)
        except (TypeError, IndexError) as e:
            print(f"An error occurred: {e}")
    print(res)
    return res

**5. Mapping Volume Units**
<br/>
The map_to_volume_unit() function handles the conversion of various volume units into standardized ones, such as "liter", "gallon", and "cup". This ensures consistency when analyzing and reporting volume data.

**6. Regular Expression for Volume Detection**
<br/>
The find_value() function uses a regular expression to find and extract volume values along with their units from the text. The regular expression accounts for a wide range of unit formats, including abbreviations like "ml", "gal", and "fl oz".

In [None]:
# Alias -> standard unit name. Keys are lower-case with single spaces between
# words. This table is the single source of truth: map_to_volume_unit() looks
# units up in it and the regular expression used by find_value() is generated
# from its keys, so the two can never drift apart.
_VOLUME_UNITS = {
    "cubic foot": "cubic foot",
    "cf": "cubic foot",
    "microlitre": "microliter",
    "microliter": "microliter",
    "μl": "microliter",
    "\u00b5l": "microliter",  # same abbreviation written with the MICRO SIGN
    "ml": "milliliter",
    "millilitre": "milliliter",
    "milliliter": "milliliter",
    "centilitre": "centiliter",
    "centiliter": "centiliter",
    "cl": "centiliter",
    "cup": "cup",
    "fluid ounce": "fluid ounce",
    "fl oz": "fluid ounce",
    "floz": "fluid ounce",
    "imperial gallon": "imperial gallon",
    "pint": "pint",
    "decilitre": "deciliter",
    "deciliter": "deciliter",
    "dl": "deciliter",
    "litre": "liter",
    "liter": "liter",
    "l": "liter",
    "quart": "quart",
    "cubic inch": "cubic inch",
    "ci": "cubic inch",
    "gallon": "gallon",
    "gal": "gallon",
}


def _alias_pattern(alias):
    """Build the regex fragment that recognises one alias from ``_VOLUME_UNITS``.

    Words of a multi-word alias may be separated by a dot and/or whitespace
    ("fl oz", "fl.oz", "fl. oz"), and aliases longer than two characters may
    carry a plural "s" ("cups", "litres").
    """
    fragment = r"(?:\.\s*|\s+)".join(re.escape(word) for word in alias.split())
    return fragment + "s?" if len(alias) > 2 else fragment


# number, optional separator characters, then one of the known unit aliases.
# Longer aliases are listed first so that they are preferred over their prefixes.
_VOLUME_PATTERN = re.compile(
    r"(?P<value>\d+(?:\.\d+)?)\s*(?:[:\-,./\\|'\s]*)\s*(?P<unit>"
    + "|".join(_alias_pattern(alias)
               for alias in sorted(_VOLUME_UNITS, key=len, reverse=True))
    + r")\b"
)


def _unit_key(unit):
    """Normalise a raw unit string to the key format used by ``_VOLUME_UNITS``."""
    # Lower-case and collapse dots/whitespace runs: "FL.OZ." -> "fl oz".
    key = re.sub(r"[.\s]+", " ", unit.lower()).strip()
    # Drop a plural "s" from spelled-out names: "litres" -> "litre".
    if key not in _VOLUME_UNITS and len(key) > 3 and key.endswith("s") \
            and key[:-1] in _VOLUME_UNITS:
        key = key[:-1]
    return key


def map_to_volume_unit(unit):
    """Map a volume unit alias to its standard name.

    Matching ignores case, surrounding whitespace, dots and a plural "s".

    Args:
        unit: Unit text such as "ml", "Litres" or "FL.OZ.".

    Returns:
        The standard unit name (e.g. "milliliter", "liter", "fluid ounce"), or
        ``unit`` unchanged when it is not a known alias.
    """
    return _VOLUME_UNITS.get(_unit_key(unit), unit)


def find_value(text_list):
    """Extract every "<number> <volume unit>" occurrence from OCR text.

    Args:
        text_list: Strings recognised in one image; they are joined with spaces
            and lower-cased before matching.

    Returns:
        A list of "<value> <standard unit>" strings in order of appearance,
        e.g. ["5 fluid ounce", "147 milliliter"]. Empty when nothing matches.
    """
    text = ' '.join(text_list).lower()
    return [
        f"{match.group('value')} {map_to_volume_unit(match.group('unit'))}"
        for match in _VOLUME_PATTERN.finditer(text)
    ]


**7. Prediction Function**
<br/>
The predictor() function processes each image URL, passing it through the OCR pipeline to extract volume data. If at least one volume is found, the first match is printed and returned; otherwise, it prints "no" and returns a single space (" ").
<br/>
**8. Main Program Execution**
<br/>
This section loads the downloaded CSV file, processes each image to predict the volume values, and saves the results in the test_out.csv file under the dataset folder.
<br/>
**9. Execution**
<br/>
To execute the above script, run the cells in order. The script first downloads a CSV file of image links.
Each image is then processed to extract volume information.
Finally, the extracted data is saved in an output CSV file for further use.

In [None]:
# Number of rows handled so far; printed by predictor() as a progress trace.
i = 0


def predictor(url, category_id, entity_name):
    """Predict the volume shown on the image at ``url``.

    The image is run through OCR and the recognised text is searched for
    volume expressions; the first one found is the prediction.

    Args:
        url: Link to the product image.
        category_id: Not used by this approach.
        entity_name: Not used by this approach.

    Returns:
        The first extracted "<value> <unit>" string, or a single space (" ")
        when no volume was found.
    """
    global i
    i = i + 1
    print(i)

    candidates = find_value(get_text(url))
    if not candidates:
        print("no")
        return " "

    best = candidates[0]
    print(best)
    return best

if __name__ == "__main__":
    # Relative to the working directory, like the input CSV below and the
    # `dataset` folder this notebook creates earlier, so the cell does not
    # depend on running under /content.
    DATASET_FOLDER = 'dataset'
    # Make sure the output folder exists *before* the long OCR pass, so a
    # missing directory cannot discard the results at the very end.
    os.makedirs(DATASET_FOLDER, exist_ok=True)

    test = pd.read_csv('downloaded_file.csv')

    # One OCR prediction per row; predictor() prints its own progress trace.
    test['prediction'] = test.apply(
        lambda row: predictor(row['image_link'], row['group_id'], row['entity_name']), axis=1)

    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    test[['index', 'prediction']].to_csv(output_filename, index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
355 milliliter
3001
[2024/09/14 13:43:26] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.26097536087036133
[2024/09/14 13:43:26] ppocr DEBUG: cls num  : 8, elapsed : 0.05312156677246094
[2024/09/14 13:43:27] ppocr DEBUG: rec_res num  : 8, elapsed : 0.5268621444702148
['12', 'x', '11.2FL.OZ.', 'BOTTLES', 'GUINNESS', "ST JAMES'S GATE DUBLIN,IRELAND", 'DRAUGHT', 'STOUT']
no
3002
[2024/09/14 13:43:27] ppocr DEBUG: dt_boxes num : 34, elapsed : 0.2599785327911377
[2024/09/14 13:43:28] ppocr DEBUG: cls num  : 34, elapsed : 0.09923887252807617
[2024/09/14 13:43:29] ppocr DEBUG: rec_res num  : 34, elapsed : 1.5053584575653076
["ELMER'S", "ELMER'S", "ELMER'S", "ELMER'S", 'WASHABLE', 'WASHABLE', 'WASHABLE', 'WASHABLE', 'COLOR', 'COLOR', 'COLOR', 'COLOR', 'GLUE', 'GLUE', 'GLUE', 'GLUE', 'Techg', 'Tehg', 'Techgs', "Teche'", 'Safe|Nontoxic', 'Safe|Nontoxic', 'Safe|Nontoxic', 'Safe|Nontoxic', '5 fl oz (147 mL)', '5 fl oz (147 mL)', '5 fl oz