In [None]:
!pip install paddlepaddle paddleocr
!pip install gdown

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting shapely (from paddleocr)
  Downloading shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting imgaug (from paddleocr)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp3

**1.** **Libraries and Imports**
<br/>
This section imports various libraries that enable functionality such as downloading images, performing OCR (Optical Character Recognition), manipulating images, working with data in CSV format, and downloading files from Google Drive. These libraries are essential for processing images, extracting relevant text, and handling the results.


In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from paddleocr import PaddleOCR
import numpy as np
from docx import Document
from docx.shared import Inches
from PIL import Image
import io
import gdown
import re
import os

**2. PaddleOCR Initialization**
<br/>
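PaddleOCR is initialized in this section with angle classification enabled and English as the recognition language. The `use_gpu` parameter is set to `True`, which requests GPU acceleration to speed up OCR when it is available. Note that the initialization log below reports `use_gpu=False`, which suggests this run actually fell back to CPU inference (likely because the `paddlepaddle` package installed above is the CPU build).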

In [None]:
# Shared OCR engine used by get_text(): angle classification on, English models, GPU requested.
# NOTE(review): the recorded initialization log reports use_gpu=False — confirm whether a
# GPU build of paddlepaddle is required for this flag to take effect.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:15<00:00, 256kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:16<00:00, 613kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:15<00:00, 141kiB/s]

[2024/09/14 14:44:23] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




In [None]:
# Create the output directory. exist_ok=True keeps this cell safe to re-run:
# without it a second execution raises FileExistsError.
os.makedirs("dataset", exist_ok=True)

**3. Google Drive File Downloading**
<br />
The script uses the gdown library to download a CSV file from Google Drive. The file lists the items to predict on; the later cells read its `index`, `image_link`, `group_id` and `entity_name` columns. It is saved locally as `downloaded_file.csv`.



In [None]:
# Google Drive file IDs. Only `weight` is used to build the URL below;
# `maxweight` is not referenced anywhere else in this cell.
maxweight = '1z3iLz2R7-gGvRWwJ2rMWtDBq1A4hozfK'
weight = '1F7emO5QqHPNwFlZhz7nvq3kH3L9h2r1G'
# Direct-download URL for the selected file ID.
url = f'https://drive.google.com/uc?id={weight}'
# Local file name; the main cell reads the same 'downloaded_file.csv' path.
output_file = 'downloaded_file.csv'
# quiet=False makes gdown show download progress.
gdown.download(url, output_file, quiet=False)

# NOTE(review): printed unconditionally — the return value of gdown.download is not
# checked, so this message does not prove the download succeeded.
print(f"File downloaded and saved as {output_file}")


Downloading...
From: https://drive.google.com/uc?id=1F7emO5QqHPNwFlZhz7nvq3kH3L9h2r1G
To: /content/downloaded_file.csv
100%|██████████| 1.71M/1.71M [00:00<00:00, 195MB/s]

File downloaded and saved as downloaded_file.csv





**4. Image Downloading and Text Extraction**
<br/>
A function is defined to download an image from a URL using the requests library. The image is processed using PaddleOCR to extract text, which is then stored in a list. This extracted text will be processed in later steps to identify weight values and units.

In [None]:
def download_image(url, timeout=30):
    """Fetch an image over HTTP and return it as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a single stalled connection would block the whole run.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: If the response body cannot be opened as an image.
    """
    response = requests.get(url, timeout=timeout)
    # Surface the real HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

**5. Text Parsing for Weight Units**
<br />
Using regular expressions, the script identifies weight-related measurements from the extracted text. It looks for patterns of numerical values followed by units such as grams, kilograms, pounds, ounces, etc. These values are then parsed and prepared for further processing.

In [None]:
def get_text(url):
    """Download the image at ``url``, run OCR on it and return the recognized strings.

    The OCR result is walked as pages -> lines -> parts. Any part that is a
    list/tuple whose first element is a string is treated as recognized text
    (PaddleOCR lines look like ``[box, (text, score)]``, so the box is skipped
    because its first element is a coordinate pair, not a string).

    Args:
        url: Address of the image to process.

    Returns:
        A list of recognized text strings in the order OCR reported them;
        empty when nothing was recognized.
    """
    image = download_image(url)
    image_np = np.array(image)
    result = ocr.ocr(image_np, cls=True)

    res = []
    # A page entry can be None when no text was detected. Skip it explicitly
    # rather than relying on an exception, which would also discard any
    # remaining pages.
    for page in result or []:
        if not page:
            continue
        for line in page:
            if not isinstance(line, (list, tuple)):
                continue
            for part in line:
                if isinstance(part, (list, tuple)) and part and isinstance(part[0], str):
                    res.append(part[0])
    print(res)
    return res

In [None]:
def find_value(text_list):
    """Extract "<number> <standard unit>" strings from OCR text.

    The strings are joined with spaces, lower-cased and scanned for a number
    (optionally with a decimal part) followed by a weight-related token.

    Args:
        text_list: Iterable of text fragments recognized in one image.

    Returns:
        A list such as ``["12 ounce", "2.5 kilogram"]`` in order of appearance.
        Matches whose token has no standard unit are skipped.
    """
    text = ' '.join(text_list).lower()
    pattern = r'(\d+(\.\d+)?)\s*(weight|g|kg|grams?|gram|kilogram|kilograms?|pound|pounds?|gm|gms|oz|milligram|mg|milligrams?|mgs|grains?|gr|grs|ounce|ounces?|lbs?)\b'
    found = []
    for number, _fraction, unit in re.findall(pattern, text, re.IGNORECASE):
        standard_unit = map_to_standard_unit(unit)
        # The pattern accepts tokens (e.g. the word "weight") that have no
        # standard unit. Skip them instead of calling .strip() on None, which
        # would raise and lose every valid match from the same image.
        if standard_unit is None:
            continue
        found.append(number + ' ' + standard_unit.strip())
    return found

**6. Mapping to Standard Units**
<br/>
This section maps various weight units (like grams, kilograms, pounds, etc.) to standardized units using a dictionary. This mapping ensures that all weight units are consistent and comparable, regardless of how they are presented in the text.

In [None]:
from collections import Counter

# Lookup table from lower-case unit spellings to the standard unit name.
# NOTE(review): "gr"/"grs"/"grain"/"grains" are folded into "gram" although a
# grain is a distinct unit — presumably deliberate because labels abbreviate
# grams as "gr"; confirm.
_UNIT_MAPPINGS = {
    "mg": "milligram",
    "milligram": "milligram",
    "milligrams": "milligram",
    "mgs": "milligram",
    "g": "gram",
    "gram": "gram",
    "grams": "gram",
    "gm": "gram",
    "gms": "gram",
    "kg": "kilogram",
    "kilogram": "kilogram",
    "kilograms": "kilogram",
    "pound": "pound",
    "pounds": "pound",
    "lb": "pound",
    "lbs": "pound",
    "ibs": "pound",
    "oz": "ounce",
    "ounce": "ounce",
    "ounces": "ounce",
    "gr": "gram",
    "grs": "gram",
    "grain": "gram",
    "grains": "gram",
    "ton": "ton",
    "tons": "ton"
}


def map_to_standard_unit(weight_unit):
    """Translate a weight-unit spelling into its standard unit name.

    The lookup ignores case and surrounding whitespace.

    Args:
        weight_unit: Unit text such as ``"kg"``, ``"Lbs"`` or ``"milligrams"``.

    Returns:
        The standard unit name (e.g. ``"kilogram"``), or ``None`` when the
        spelling is not in the table.
    """
    return _UNIT_MAPPINGS.get(weight_unit.strip().lower(), None)

def predictor1(url, entity):
    """Predict the item weight shown in an image as "<number> <unit>".

    The image is OCR'd, weight expressions are extracted, and the expression
    that occurs most often wins. Ties are broken by the largest numeric value.

    Args:
        url: Address of the image to analyse.
        entity: Requested entity name; anything other than ``"item_weight"``
            yields an empty prediction.

    Returns:
        The chosen "<number> <unit>" string, or ``""`` when the entity is not
        ``"item_weight"``, no weight was found, or any step failed.
    """
    try:
        if entity != "item_weight":
            return ""
        values = find_value(get_text(url))
        print(values)
        if not values:
            return ""

        # Count complete "<number> <unit>" strings so the unit always belongs
        # to the value that was actually counted (keying units by the bare
        # number let the last-seen unit overwrite the others).
        counts = Counter(values)
        top_count = max(counts.values())
        candidates = [value for value, count in counts.items() if count == top_count]
        # Compare numerically: as strings, "9" would sort above "100".
        best = max(candidates, key=lambda value: float(value.split()[0]))

        print(best)
        return best
    except Exception as e:
        # Best effort: one bad image (network error, unreadable file, ...)
        # must not abort the whole run.
        print(f"An error occurred: {e}")
        return ""

**7. Prediction Logic**
<br/>
The core function of this script predicts the weight of an item from an image based on the extracted text. It processes the extracted text, maps the weight units, and finds the most frequent or largest weight value from the identified results. This predicted value is then returned.

**8. Main Program Execution**
<br/>
In the main execution block:

- A dataset is loaded from the previously downloaded CSV file.
- The script processes the dataset, downloading images from the URLs provided in the CSV file, extracting text, and predicting weight values.
- The results are then saved to a new CSV file, with the predicted weights added as a `prediction` column.
<br/>
**9. File Processing and Output**
<br/>
Finally, the processed data, including the predicted weights, is saved to a CSV file in the dataset folder. The output file contains both the index and the predicted values, allowing easy access to the results for further analysis or validation.

In [None]:
import os
import random
import pandas as pd

i = 0  # running count of rows handed to predictor()


def predictor(url, category_id, entity_name):
    """Print a running row counter and delegate the prediction to ``predictor1``.

    ``category_id`` is accepted as part of the signature but is not used.
    """
    global i
    i = i + 1
    print(i)
    prediction = predictor1(url, entity_name)
    return prediction

if __name__ == "__main__":
    DATASET_FOLDER = '/content/dataset'
    INPUT_CSV = 'downloaded_file.csv'
    # Number of leading lines of the CSV (header line included) to skip, so
    # processing starts part-way through the file.
    SKIP_ROWS = 14999

    # Make sure the output folder exists before the long OCR pass; otherwise a
    # missing directory would only surface at to_csv, after all the work is done.
    os.makedirs(DATASET_FOLDER, exist_ok=True)

    # The header line falls inside the skipped range, so read the column
    # names separately and re-apply them.
    header = pd.read_csv(INPUT_CSV, nrows=1).columns

    test = pd.read_csv(INPUT_CSV, skiprows=SKIP_ROWS, header=None, names=header)

    # One prediction per row; predictor() downloads and OCRs the row's image.
    test['prediction'] = test.apply(
        lambda row: predictor(row['image_link'], row['group_id'], row['entity_name']), axis=1)

    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    test[['index', 'prediction']].to_csv(output_filename, index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2024/09/14 17:44:09] ppocr DEBUG: dt_boxes num : 9, elapsed : 0.07991433143615723
[2024/09/14 17:44:09] ppocr DEBUG: cls num  : 9, elapsed : 0.05376863479614258
[2024/09/14 17:44:10] ppocr DEBUG: rec_res num  : 9, elapsed : 0.5022926330566406
['Durable zipper', 'Internal pouch', 'for Makeup brush', 'Lipstick', 'Eyebrow pencil', 'Mascara', '12oz Cotton Canvas', 'Jacquard pattern']
['12 ounce']
12 ounce
6475
[2024/09/14 17:44:10] ppocr DEBUG: dt_boxes num : 21, elapsed : 0.08192157745361328
[2024/09/14 17:44:10] ppocr DEBUG: cls num  : 21, elapsed : 0.05424976348876953
[2024/09/14 17:44:11] ppocr DEBUG: rec_res num  : 21, elapsed : 1.2962355613708496
['Directions', ' FOR USE', ' Administer 1g per 5kg body weight Orally', 'Give twice daily ( AM & PM )', 'INGREDIENTS', 'PRODUCTFACTS', 'Calcium Carbonate', '250 mg', 'Chitosan', '200 mg', 'Inactive Ingredients:Lactose, Hydrolyzed Soy Protein.', 'Aluminum', 'Hydroxide', 'FREE',