In [None]:
# %pip installs into the environment of the running kernel (a bare !pip may
# target a different interpreter). Versions are pinned to the ones this
# notebook was last run with, so a re-run resolves the same packages.
%pip install -q paddlepaddle==2.6.2 paddleocr==2.8.1

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting httpx (from paddlepaddle)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.6.0.tar

**1. Library Imports**
<br/>
This cell imports the libraries used throughout the notebook: image manipulation (cv2, numpy, PIL), text extraction (paddleocr), and data handling (pandas), plus mathematical operations (math), HTTP requests (requests), regular expressions (re), in-memory byte streams (io.BytesIO), filesystem access (os), and Google Drive downloads (gdown).

In [None]:
import cv2
import numpy as np
from paddleocr import PaddleOCR
import math
import requests
from io import BytesIO
from PIL import Image
import os
import pandas as pd
import gdown


import re

**2. Unit Mapping Function**
<br/>
The map_to_length_unit() function standardizes various representations of length units to a consistent format, such as converting "cm" to "centimeter" and "mm" to "millimeter".

In [None]:
def map_to_length_unit(unit):
    """Normalize a length-unit string to its canonical singular name.

    The input is stripped of the punctuation characters ``. , : / \\``,
    trimmed, and lower-cased before lookup, so ``"CM."`` and ``"cm"`` both
    map to ``"centimeter"``. Abbreviations, British spellings, canonical
    spellings and common plurals are all recognized.

    Args:
        unit: Raw unit text (for example the letters following a number in
            OCR output).

    Returns:
        The canonical unit name (``"centimeter"``, ``"millimeter"``,
        ``"meter"``, ``"inch"``, ``"foot"`` or ``"yard"``), or ``unit``
        unchanged when it is not a known length unit.
    """
    length_units = {
        # centimeter
        "centimetre": "centimeter",
        "centimetres": "centimeter",
        "centimeter": "centimeter",
        "centimeters": "centimeter",
        "cm": "centimeter",
        "cms": "centimeter",
        # millimeter
        "millimetre": "millimeter",
        "millimetres": "millimeter",
        "millimeter": "millimeter",
        "millimeters": "millimeter",
        "mm": "millimeter",
        # meter
        "metre": "meter",
        "metres": "meter",
        "meter": "meter",
        "meters": "meter",
        "m": "meter",
        # inch
        "inch": "inch",
        "inches": "inch",
        "i": "inch",
        "in": "inch",
        # foot
        "foot": "foot",
        "feet": "foot",
        "ft": "foot",
        # yard
        "yard": "yard",
        "yards": "yard",
        "yd": "yard",
        "yds": "yard",
    }

    # Drop punctuation that tends to trail or separate abbreviations
    # ("cm.", "in/") so the lookup key is just the letters.
    clean_unit = re.sub(r'[.,:/\\]', '', unit).strip().lower()
    # Unknown units are returned exactly as given so callers can still see them.
    return length_units.get(clean_unit, unit)

**3. PaddleOCR Initialization**
<br/>
PaddleOCR is initialized to perform OCR tasks. It is configured with angle classification and English language support, suitable for extracting text from images.

In [None]:
# Module-level OCR engine, created once and reused for every image by
# extract_text_with_coordinates(). Configured with angle classification
# enabled (use_angle_cls=True) and the English language setting (lang='en').
ocr = PaddleOCR(use_angle_cls=True, lang='en')

**4. Image Conversion Function**
<br/>
The url_to_image() function downloads an image from a provided URL, converts it to RGB format using PIL, and then converts it to an OpenCV-compatible BGR format for further processing.



In [None]:
def url_to_image(url, timeout=30):
    """Download an image and return it as an OpenCV-style BGR array.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole batch.

    Returns:
        A numpy array holding the image in BGR channel order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to PIL, which
    # would otherwise surface as a confusing "cannot identify image" error.
    response.raise_for_status()

    # Force three RGB channels so palette, grayscale and RGBA inputs all
    # produce an array that the RGB -> BGR conversion accepts.
    image = Image.open(BytesIO(response.content)).convert('RGB')
    open_cv_image = np.array(image)
    return cv2.cvtColor(open_cv_image, cv2.COLOR_RGB2BGR)

**5. Vertical Line Detection**
<br/>
The detect_vertical_lines() function identifies vertical lines in the image using edge detection and Hough Line Transform techniques. It returns a list of vertical lines with their midpoints and lengths.



In [None]:
def detect_vertical_lines(image):
    """Find near-vertical line segments in a BGR image.

    The image is converted to grayscale, run through Canny edge detection,
    and passed to the probabilistic Hough transform. A segment is kept when
    its two endpoints differ by fewer than 10 pixels in x.

    Args:
        image: BGR image array.

    Returns:
        A list of tuples ``(x_mid, y_mid, length, x1, y1, x2, y2)``, one per
        kept segment, where ``length`` is the absolute difference of the
        endpoint y coordinates. The list is empty when no segment qualifies.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(grayscale, 50, 150, apertureSize=3)
    segments = cv2.HoughLinesP(edge_map, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)

    # HoughLinesP yields None rather than an empty array when nothing is found.
    if segments is None:
        return []

    kept = []
    for segment in segments:
        xa, ya, xb, yb = segment[0]
        if abs(xa - xb) >= 10:
            # Too much horizontal drift to count as vertical.
            continue
        height = abs(yb - ya)
        center_x = (xa + xb) / 2
        center_y = (ya + yb) / 2
        kept.append((center_x, center_y, height, xa, ya, xb, yb))

    return kept

**6. Text Extraction with Coordinates**
<br/>
The extract_text_with_coordinates() function extracts text from an image using PaddleOCR and calculates the center coordinates of each text bounding box.



In [None]:
def extract_text_with_coordinates(image):
    """Run OCR on an image and return each recognized text with its center.

    Args:
        image: Image array passed straight to the module-level ``ocr`` engine.

    Returns:
        A list of ``(text, x_center, y_center)`` tuples. The center is the
        midpoint of the first and third corner points of the text's bounding
        box (presumably opposite corners; confirm against the PaddleOCR
        result format). The list is empty when no text is recognized.
    """
    result = ocr.ocr(image)

    text_coordinates = []
    # The engine can return None, or a list containing None, for an image
    # with no detected text; iterating that directly raises TypeError.
    for line in result or []:
        if not line:
            continue
        for word_info in line:
            text = word_info[1][0]
            coordinates = word_info[0]
            x_center = (coordinates[0][0] + coordinates[2][0]) / 2
            y_center = (coordinates[0][1] + coordinates[2][1]) / 2
            text_coordinates.append((text, x_center, y_center))

    return text_coordinates

**7. Distance Calculation**
<br/>
The calculate_distance() function computes the Euclidean distance between two points in the image. This is used to measure the proximity between text and detected vertical lines.



In [None]:
def calculate_distance(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        point1: ``(x, y)`` pair.
        point2: ``(x, y)`` pair.

    Returns:
        The straight-line distance between the points as a float.
    """
    (start_x, start_y), (end_x, end_y) = point1, point2
    delta_x = end_x - start_x
    delta_y = end_y - start_y
    return math.sqrt(delta_x**2 + delta_y**2)

**8. Minimum Distance Text Finder**
<br/>
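The find_minimum_distance_text() function selects the text most likely to label a vertical line. For each recognized text it measures the distance to the longest detected vertical line (choosing the nearest one when several share that length), and it returns the text with the smallest such distance together with that distance. If no vertical lines are detected it returns (None, None) without running OCR.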

In [None]:
def find_minimum_distance_text(image):
    """Pick the recognized text that sits nearest to the longest vertical line.

    For every recognized text, the reference line is the longest detected
    vertical line (the nearest one when several share that length). The text
    with the smallest distance to its reference line wins; an exact distance
    tie goes to the text whose reference line is longer.

    Args:
        image: BGR image array.

    Returns:
        ``(text, distance)`` for the winning text. ``(None, None)`` when no
        vertical line is detected (OCR is not run in that case), and
        ``(None, float('inf'))`` when lines exist but no text is recognized.
    """
    line_records = detect_vertical_lines(image)
    if not line_records:
        return None, None

    winner_text = None
    winner_gap = float('inf')
    winner_length = -1

    for label, label_x, label_y in extract_text_with_coordinates(image):
        candidates = [
            (calculate_distance((label_x, label_y), (mid_x, mid_y)), seg_length)
            for mid_x, mid_y, seg_length, _, _, _, _ in line_records
        ]
        # Longest line first; among equally long lines, the nearest one.
        gap, length = min(candidates, key=lambda pair: (-pair[1], pair[0]))

        strictly_closer = gap < winner_gap
        tie_with_longer_line = gap == winner_gap and length > winner_length
        if strictly_closer or tie_with_longer_line:
            winner_text, winner_gap, winner_length = label, gap, length

    return winner_text, winner_gap

**9. Number and Text Extraction**
<br/>
The extract_number_and_text() function extracts a number and a text unit from an input string using regular expressions. It captures numeric values and associated units like "cm" or "inch".

In [None]:
def extract_number_and_text(input_string):
    """Split the first "<number><letters>" occurrence into its two parts.

    The number is one or more digits with an optional decimal fraction; it
    may be separated from the letters by whitespace.

    Args:
        input_string: Text to search.

    Returns:
        ``(number, letters)`` as strings for the first occurrence, or
        ``(None, None)`` when the text contains no such occurrence.
    """
    found = re.search(r'(\d+(\.\d+)?)\s*([a-zA-Z]+)', input_string)
    if found is None:
        return None, None
    # Group 2 is only the optional fraction; the caller wants groups 1 and 3.
    return found.group(1, 3)

**10. Main Processing Function**
<br/>
The get_vertical() function processes the image URL to find the closest text to vertical lines and extracts the number with its associated length unit. It then returns the formatted length measurement.

In [None]:
def get_vertical(image_url):
    """Extract the length measurement printed next to a vertical line.

    Downloads the image, finds the recognized text nearest to the longest
    vertical line, splits it into a number and a unit, and normalizes the
    unit name.

    Args:
        image_url: Address of the image to analyse.

    Returns:
        The measurement formatted as ``"<number> <unit>"``, or an empty
        string when no text is found near a vertical line or the text does
        not contain a number followed by a unit. The return value is always
        a string, so callers never have to handle ``None``.
    """
    image = url_to_image(image_url)

    closest_text, _ = find_minimum_distance_text(image)
    if not closest_text:
        return ""

    number, unit = extract_number_and_text(closest_text)
    if not unit:
        # extract_number_and_text signals "no match" with (None, None)
        # rather than raising, so the failure is reported here.
        print("Text format is incorrect")
        return ""

    measurement = f"{number} {map_to_length_unit(unit.strip())}"
    print(measurement)
    return measurement

**11. Prediction Function**
<br/>
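The predictor() function is a per-image wrapper used for batch processing: it counts and logs each processed image and returns the measurement extracted for the image URL. The extraction routine defined in this notebook is get_vertical() (section 10); no get_horizontal() is defined here.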

**12. Save Predictions Function**
<br/>
The save_predictions() function saves or appends batch predictions to a CSV file. It ensures that predictions are properly written to the output file, handling both new and existing files.



In [None]:

# Running count of images handed to predictor(); predictor() increments it
# through `global i` and prints it as a progress message.
i = 0
# Number of rows predicted and written to the output CSV per loop iteration.
batch_size = 5

def predictor(url, category_id, entity_name):
    """Predict the measurement for one dataset row and log progress.

    Args:
        url: Address of the row's image.
        category_id: Accepted for the row-wise call signature; not used.
        entity_name: Accepted for the row-wise call signature; not used.

    Returns:
        Whatever ``get_vertical(url)`` returns for the image.
    """
    global i
    i += 1
    print(f"Processing {i}th image...")
    # get_vertical() is the extraction routine this notebook defines; the
    # previously referenced get_horizontal() does not exist here and raised
    # NameError on a fresh kernel.
    return get_vertical(url)

def save_predictions(df):
    """Write a batch's ``index`` and ``prediction`` columns to the output CSV.

    The output file is ``test_out.csv`` inside ``DATASET_FOLDER``. When the
    file does not exist yet it is created with a header row; otherwise the
    rows are appended without a header.

    Args:
        df: DataFrame containing at least the ``index`` and ``prediction``
            columns.
    """
    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')

    # Only the first write creates the file, so only it carries the header.
    is_first_write = not os.path.exists(output_filename)
    df[['index', 'prediction']].to_csv(
        output_filename,
        index=False,
        mode='w' if is_first_write else 'a',
        header=is_first_write,
    )

    print(f"Batch predictions appended to {output_filename}")

if __name__ == "__main__":
    DATASET_FOLDER = 'dataset'

    # Create dataset folder if it doesn't exist
    os.makedirs(DATASET_FOLDER, exist_ok=True)

    # save_predictions() appends to an existing output file, and this loop
    # always starts from the first row, so output left over from an earlier
    # (possibly interrupted) run would end up duplicated. Start clean.
    stale_output = os.path.join(DATASET_FOLDER, 'test_out.csv')
    if os.path.exists(stale_output):
        os.remove(stale_output)

    # Download a sample CSV file
    # Google Drive file ids; only `height` is downloaded below, `width` is
    # kept for switching datasets by hand.
    height = '1wDYH_D-S3WhkSZ_pudlEe9OR86d79JZT'
    width = '1RPn7I70mlus2OV1-cAh7pRsrtcana7C8'
    url = f'https://drive.google.com/uc?id={height}'
    output_file = 'downloaded_file.csv'
    gdown.download(url, output_file, quiet=False)

    print(f"File downloaded and saved as {output_file}")

    # Load and process the CSV file
    test = pd.read_csv(output_file, nrows=13000)

    # Prepare to save predictions in batches
    test['prediction'] = None
    num_rows = len(test)

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        # Work on an independent copy: assigning a column to a slice of
        # `test` triggers SettingWithCopyWarning and never reaches `test`.
        batch = test.iloc[start:end].copy()

        # Apply the predictor function to each row of the batch
        batch['prediction'] = batch.apply(lambda row: predictor(row['image_link'], row['group_id'], row['entity_name']), axis=1)

        # Mirror the results into `test` so the full frame can be inspected
        # after the loop.
        test.loc[batch.index, 'prediction'] = batch['prediction']

        # Save the batch predictions to the same file
        save_predictions(batch)

    # Every row is written exactly once by the loop above, so there is no
    # follow-up pass: the earlier "remaining rows" pass re-appended rows that
    # had already been saved, with empty predictions.


Downloading...
From: https://drive.google.com/uc?id=1wDYH_D-S3WhkSZ_pudlEe9OR86d79JZT
To: /content/downloaded_file.csv
100%|██████████| 2.32M/2.32M [00:00<00:00, 151MB/s]


File downloaded and saved as downloaded_file.csv
Processing 1th image...
Processing 2th image...
Processing 3th image...
Processing 4th image...
Processing 5th image...
[2024/09/15 08:59:57] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.4040226936340332
[2024/09/15 08:59:57] ppocr DEBUG: cls num  : 3, elapsed : 0.017586946487426758
[2024/09/15 08:59:57] ppocr DEBUG: rec_res num  : 3, elapsed : 0.1402435302734375
15 centimeter
Batch predictions appended to dataset/test_out.csv
Processing 6th image...
Processing 7th image...
[2024/09/15 08:59:57] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.1260673999786377
[2024/09/15 08:59:57] ppocr DEBUG: cls num  : 2, elapsed : 0.02766704559326172
[2024/09/15 08:59:57] ppocr DEBUG: rec_res num  : 2, elapsed : 0.11901569366455078
14 centimeter
Processing 8th image...
Processing 9th image...
Processing 10th image...
[2024/09/15 08:59:58] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.22827696800231934
[2024/09/15 08:59:58] ppocr DEBUG: cls num  : 4, elapsed : 0

KeyboardInterrupt: 