In [None]:
!pip install paddlepaddle paddleocr
!pip install gdown

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting shapely (from paddleocr)
  Downloading shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting imgaug (from paddleocr)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp3

**1.** **Libraries and Imports**
<br/>
This section imports various libraries that enable functionality such as downloading images, performing OCR (Optical Character Recognition), manipulating images, working with data in CSV format, and downloading files from Google Drive. These libraries are essential for processing images, extracting relevant text, and handling the results.


In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
from paddleocr import PaddleOCR
import numpy as np
from docx import Document
from docx.shared import Inches
from PIL import Image
import io
import gdown
import re
import os

**2. PaddleOCR Initialization**
<br/>
PaddleOCR is initialized in this section. The model is set to use angle classification and English language recognition. The use_gpu parameter is set to True, allowing the model to leverage GPU acceleration if available, improving the speed of OCR.

In [None]:
# Shared OCR engine used by every later cell: English recognition with the
# text-angle classifier switched on, and GPU inference requested.
# NOTE(review): the start-up log recorded below this cell reports use_gpu=False,
# so confirm a GPU build of paddlepaddle is installed if acceleration matters.
ocr = PaddleOCR(
    lang='en',
    use_angle_cls=True,
    use_gpu=True,
)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:16<00:00, 245kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:19<00:00, 521kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:14<00:00, 148kiB/s]

[2024/09/14 13:13:39] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




In [None]:
# Create the output folder; exist_ok keeps this cell safe to re-run
# (a plain makedirs raises FileExistsError the second time).
os.makedirs("dataset", exist_ok=True)

**3. Google Drive File Downloading**
<br />
The script uses the gdown library to download a CSV file from Google Drive. The file lists the images to process; later cells read its `index`, `image_link`, `group_id` and `entity_name` columns to produce predictions. The file is downloaded and saved locally as `downloaded_file.csv`.



In [None]:
# Google Drive file id of the input CSV and the direct-download URL built from it.
volt = '1Oe7QYgbejBWNre4g43gn_13lQUqOjKgt'
url = f'https://drive.google.com/uc?id={volt}'
output_file = 'downloaded_file.csv'

# Keep the return value: some gdown releases report a failed download by
# returning None instead of raising, and the success message below must not be
# printed in that case.
downloaded_path = gdown.download(url, output_file, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Could not download {url} to {output_file}")

print(f"File downloaded and saved as {output_file}")


Downloading...
From: https://drive.google.com/uc?id=1Oe7QYgbejBWNre4g43gn_13lQUqOjKgt
To: /content/downloaded_file.csv
100%|██████████| 401k/401k [00:00<00:00, 116MB/s]

File downloaded and saved as downloaded_file.csv





**4. Image Downloading and Text Extraction**
<br/>
A function is defined to download an image from a URL using the requests library. The downloaded image is later passed to PaddleOCR to extract text, which is stored in a list. This extracted text is processed in later steps to identify voltage values and units.

In [None]:
def download_image(url, timeout=30):
    """Fetch an image over HTTP and return it as a PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx here instead of handing an error page to the image decoder.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

**5. Text Extraction Using PaddleOCR**
<br/>
The function get_text() performs OCR on the downloaded image. It extracts all text in the image using PaddleOCR and organizes the results into a list. This extracted text will be processed to find voltage-related values using regular expressions.



In [None]:
def get_text(url):
    """Download the image at ``url`` and return the text PaddleOCR recognizes in it.

    Args:
        url: Address of the image to run OCR on.

    Returns:
        A list of recognized strings in the order PaddleOCR reports them.
        The list is empty when no text is found.
    """
    res = []
    image_np = np.array(download_image(url))
    result = ocr.ocr(image_np, cls=True)

    # The result holds one entry per image, and that entry is None when no text
    # was detected. Skip it explicitly rather than relying on an exception.
    for page in result or []:
        if not page:
            continue
        for line in page:
            try:
                # A line pairs the box corners with a (text, score) tuple; the
                # text is the only member whose first element is a string.
                for part in line:
                    if isinstance(part[0], str):
                        res.append(part[0])
            except (TypeError, IndexError) as e:
                # Malformed entry: report it and keep the rest of the text.
                print(f"An error occurred: {e}")
    print(res)
    return res

**6. Mapping Voltage Units**
<br/>
The function map_to_standard_unit() normalizes various representations of voltage units into a standard format, such as converting "mv" to "millivolt" and "kv" to "kilovolt". This ensures consistency in the units, making it easier to process and analyze the extracted values.

**7. Regular Expression for Voltage Detection**
<br/>
The find_value() function uses a regular expression to find voltage values in the extracted text. It searches for patterns involving numbers followed by voltage units (e.g., "V", "kV", "mV", etc.) and returns them in a standardized format. This ensures that voltage values from different images are extracted accurately.



In [None]:
def map_to_standard_unit(unit):
    """Normalize a voltage unit spelling to "volt", "millivolt" or "kilovolt".

    The lookup ignores letter case, surrounding whitespace and leading/trailing
    separator characters (colon, slash, dot, hyphen, backslash). Because case is
    ignored, "MV" cannot be told apart from "mV" and maps to "millivolt".

    Args:
        unit: Unit text as captured from OCR output, e.g. "V", "kV", "Volts.".

    Returns:
        The canonical unit name, or ``unit`` unchanged when the spelling is not
        recognized.
    """
    # Keys are stored in their normalized form (lower-case, no punctuation).
    # Mixed-case or dotted keys could never be hit, because the lookup key
    # below is always lower-cased and stripped first.
    conversion_map = {
        "mv": "millivolt",
        "millivolt": "millivolt",
        "millivolts": "millivolt",
        "kv": "kilovolt",
        "kilovolt": "kilovolt",
        "kilovolts": "kilovolt",
        "v": "volt",
        "volt": "volt",
        "volts": "volt",
    }
    key = unit.lower().strip(' \t\r\n:/.-\\')
    return conversion_map.get(key, unit)
def find_value(text_list):
    """Find voltage readings in OCR text and return them as "<number> <unit>".

    Args:
        text_list: Strings recognized in one image.

    Returns:
        One "<number> <unit>" string per reading found, in order of appearance,
        with the unit normalized by ``map_to_standard_unit``
        (e.g. ["24 volt"] for ["Rated Voltage:24V"]). Empty when nothing matches.
    """
    text = ' '.join(text_list).lower()
    # A number, an optional separator, then a unit spelling. The long unit names
    # accept a plural "s": without it the trailing \b rejected "millivolts" and
    # "kilovolts" outright, so those readings were silently dropped.
    pattern = (
        r'(\d+(?:\.\d+)?)\s*[\/:\\.\-]?\s*'
        r'([mkv]?v|[mk]?v?olts?|millivolts?|kilovolts?)'
        r'\s*[\/:\\.\-]?\b'
    )
    matches = re.findall(pattern, text, re.IGNORECASE)
    return [number + ' ' + map_to_standard_unit(unit).strip() for number, unit in matches]

**8. Prediction Logic**
<br/>
In the predictor() function, images are processed one by one, and text is extracted from each image. The function looks for voltage-related values using the regular expression and returns the first detected value. If no value is found, it returns a blank placeholder string (a single space).
<br/>
**9. Main Program Execution**
<br/>
In the main block of the program:
The CSV file is loaded, and each row is processed.
For each row, the image URL is passed to the predictor function, which returns the extracted voltage value.
The predicted voltage values are added to the CSV file under a new "prediction" column.
<br/>
**10. Saving the Results**
<br/>
The script saves the processed data into a new CSV file within the dataset folder. This file contains the index and the predicted voltage values, allowing easy access to the results for analysis or validation.

In [None]:
i = 0  # running count of processed rows, used only for progress output

def predictor(url, category_id, entity_name):
    """Predict the voltage shown in one image.

    Args:
        url: Link to the image to analyse.
        category_id: Not used by this approach; kept so existing callers work.
        entity_name: Not used by this approach; kept so existing callers work.

    Returns:
        The first detected reading as "<number> <unit>", or a single space when
        no reading is found or the image cannot be processed.
    """
    global i
    i += 1
    print(i)
    try:
        found = find_value(get_text(url))
    except Exception as e:
        # Deliberately broad: one unreachable or corrupt image must not abort a
        # run over thousands of rows. Report it and fall back to the blank value.
        print(f"failed on {url}: {e}")
        return " "
    if found:
        print(found[0])
        return found[0]
    print("no")
    return " "

if __name__ == "__main__":
    # Resolved against the working directory, like the input CSV below, so the
    # run does not depend on a Colab-specific absolute path.
    DATASET_FOLDER = 'dataset'
    # Make sure the output folder exists before the long OCR pass, so the
    # final write cannot fail after all the work is done.
    os.makedirs(DATASET_FOLDER, exist_ok=True)

    test = pd.read_csv('downloaded_file.csv')

    # One prediction per row; predictor downloads the image and runs OCR on it.
    test['prediction'] = test.apply(
        lambda row: predictor(row['image_link'], row['group_id'], row['entity_name']), axis=1)

    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    test[['index', 'prediction']].to_csv(output_filename, index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['3000RPM Loading Speed', '3000RPM loading speed helps dissipate high', 'heat to a great extent.']
no
4656
[2024/09/14 14:26:01] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.2574741840362549
[2024/09/14 14:26:01] ppocr DEBUG: cls num  : 5, elapsed : 0.04064750671386719
[2024/09/14 14:26:02] ppocr DEBUG: rec_res num  : 5, elapsed : 0.2520740032196045
['iKzF4028', 'Dual ball', 'MADE IN CHINA', '12V', 'bearing']
12 volt
4657
[2024/09/14 14:26:02] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.24441933631896973
[2024/09/14 14:26:02] ppocr DEBUG: cls num  : 18, elapsed : 0.04520463943481445
[2024/09/14 14:26:03] ppocr DEBUG: rec_res num  : 18, elapsed : 1.200380802154541
['120x120x38mm', 'Noise:42dBA', 'Rated Voltage:24V', 'Life: 35000 hours', 'Rated Current: 0.35A', 'Bearing type: Sleeve Bearing.', 'Speed: 3000RPM', 'Air Pressure (In H2O) :0.39', 'Air flow:125CFM', 'Connecter: XH2.54-2Pin', 'Cable Length: 25cm Or', '9.85 inches', 