In [1]:
# Image processing and OCR
from PIL import Image, ImageDraw
import cv2
import pytesseract
from paddleocr import PaddleOCR, draw_ocr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [14]:
img_path1='C:/Users/user/Documents/capstone/Test Dataset__2/132302/132302_31 Cropped Images/132302_31_table_1.png' 
img_path='C:/Users/user/Documents/capstone/Test Dataset__2/108156/108156_16 Cropped Images/108156_16_table_0.png'

def table_text_extraction(img_path):
    # Read the image in grayscale
    img = cv2.imread(img_path, 0)
    # Apply thresholding to convert the image to binary
    thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY)
    img_bin = 255 - img_bin 

    # Invert the binary image
    img_bin2 = 255-img
    # Apply Otsu's thresholding to the inverted image
    thresh1,img_bin_otsu = cv2.threshold(img_bin2,128,255,cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Define structuring elements for morphological operations
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode and dilate to detect vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, np.array(img).shape[1]//150))
    eroded_image = cv2.erode(img_bin_otsu, vertical_kernel, iterations=5)
    vertical_lines = cv2.dilate(eroded_image, vertical_kernel, iterations=5)

    # Erode and dilate to detect horizontal lines
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (np.array(img).shape[1]//150, 1))
    image_2 = cv2.erode(img_bin, hor_kernel, iterations=5)
    horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=5)

    # Combine vertical and horizontal lines
    vertical_horizontal_lines = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    vertical_horizontal_lines = cv2.erode(~vertical_horizontal_lines, kernel, iterations=3)

    # Apply thresholding to the combined lines image
    thresh, vertical_horizontal_lines = cv2.threshold(vertical_horizontal_lines,128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    b_image = cv2.bitwise_not(cv2.bitwise_xor(img,vertical_horizontal_lines))

    # Find contours in the binary image
    contours, hierarchy = cv2.findContours(vertical_horizontal_lines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Sort contours based on their Y-coordinate
    boundingBoxes = [cv2.boundingRect(c) for c in contours]
    (contours, boundingBoxes) = zip(*sorted(zip(contours, boundingBoxes),
    key=lambda x:x[1][1]))

    # Draw rectangles around identified contours
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if (w<1000 and h<500):
            image = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
            boxes.append([x,y,w,h])

    rows=[]
    columns=[]
    # Calculate the heights of the bounding boxes
    heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
    mean = np.mean(heights)  # Calculate the mean height of the bounding boxes

    # Initialise the 'columns' list with the first bounding box
    columns.append(boxes[0])
    previous=boxes[0]

    for i in range(1,len(boxes)):
        # Check if the current box is within half of the mean height of the previous box
        if(boxes[i][1]<=previous[1]+mean/2):
            columns.append(boxes[i])
            previous=boxes[i]

            # If it's the last box, add the columns to 'rows'
            if(i==len(boxes)-1):
                rows.append(columns)

        else:
           # If the current box is not in the same row, start a new 'columns' list
            rows.append(columns)
            columns=[]
            previous = boxes[i]
            columns.append(boxes[i])

    total_cells=0 # Number of cells (column)
    # Iterate through the identified rows to find the row with the maximum number of cells
    for i in range(len(rows)):
        # Check if the current row has more cells than the previous maximum
       if len(rows[i]) > total_cells:
           total_cells = len(rows[i])

    # Initialise an empty list to store the center coordinates of cells
    center = [int(rows[i][j][0]+rows[i][j][2]/2) for j in range(len(rows[i])) if rows[0]]
    center=np.array(center)
    center.sort()

    boxes_list = []
    for i in range(len(rows)):
        l=[]

        # Create an empty list for each cell in the row
        for k in range(total_cells):
           l.append([])
        for j in range(len(rows[i])):
            # Calculate the horizontal distance of the cell's center to the center of the column
            diff = abs(center-(rows[i][j][0]+rows[i][j][2]/4))

            # Find the column index (indexing) with the minimum horizontal distance
            minimum = min(diff)
            indexing = list(diff).index(minimum)

            # Append the cell's bounding box to the appropriate column in the row
            l[indexing].append(rows[i][j])
        boxes_list.append(l)


    dataframe_final = []
    ocr = PaddleOCR(use_angle_cls=True, lang='en')

    for i in range(len(boxes_list)):
        for j in range(len(boxes_list[i])):
            s = ''
            # Check if there are no bounding boxes in the current cell
            if len(boxes_list[i][j]) == 0:
               dataframe_final.append(' ')
            else:
                for k in range(len(boxes_list[i][j])):
                    y, x, w, h = boxes_list[i][j][k][0], boxes_list[i][j][k][1], boxes_list[i][j][k][2], boxes_list[i][j][k][3]
                    # Extract the region of interest (ROI) from the binary image
                    roi = b_image[x:x+h, y:y+w]
                    # Apply morphological operations to enhance text quality
                    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
                    border = cv2.copyMakeBorder(roi, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
                    resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel, iterations=1)
                    erosion = cv2.erode(dilation, kernel, iterations=2)
                    # Perform OCR on the preprocessed image to recognize text
                    results = ocr.ocr(erosion)
                    # Extract recognized text from the OCR results
                    out = ' '.join([result[1][0] for result in results[0]])
                    s += " " + out

                dataframe_final.append(s)

    arr = np.array(dataframe_final)

    dataframe = pd.DataFrame(arr.reshape(len(rows), total_cells))
    data = dataframe.style.set_properties(align="left")

    return dataframe

df=table_text_extraction(img_path1)
df

[2023/10/26 23:05:41] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\user/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\user/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_lengt

Unnamed: 0,0,1,2,3,4,5
0,Element,Range,Element,Range,Element,Range
1,Ag,0.05-500ppm,Hf,0.05-2000ppm,Sb,0.05ppm-1%
2,AI,50ppm-15%,In,0.01-2000ppm,Sc,0.1-5000ppm
3,As,0.5ppm-1%,K,20ppm-10%,Se,0.5ppm-1%
4,Ba,0.1-5000ppm,La,0.01-5000ppm,Sn,0.1-2000ppm
5,Be,0.05-2000ppm,Li,0.1-5000ppm,Sr,0.05ppm-1%
6,Bi,0.01ppm-1%,Mg,20ppm-40%,Ta,0.01-2000ppm
7,Ca,50ppm-40%,Mn,1ppm-5%,Te,0.2-2000ppm
8,Cd,0.02-2000ppm,Mo,0.1ppm-1%,Th,0.01-5000ppm
9,Ce,0.01ppm-1%,Na,20ppm-10%,Ti,5ppm-2%


In [15]:
img_path='C:/Users/user/Documents/capstone/Test Dataset__2/135347/135347_8 Cropped Images/135347_8_table_0.png'

df1=table_text_extraction(img_path)

[2023/10/26 23:06:05] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\user/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\user/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_lengt

[2023/10/26 23:06:06] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.09012222290039062
[2023/10/26 23:06:06] ppocr DEBUG: cls num  : 1, elapsed : 0.018524169921875
[2023/10/26 23:06:07] ppocr DEBUG: rec_res num  : 1, elapsed : 0.15528225898742676
[2023/10/26 23:06:07] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.06561660766601562
[2023/10/26 23:06:07] ppocr DEBUG: cls num  : 2, elapsed : 0.02486419677734375
[2023/10/26 23:06:07] ppocr DEBUG: rec_res num  : 2, elapsed : 0.2603154182434082
[2023/10/26 23:06:07] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.11484122276306152
[2023/10/26 23:06:07] ppocr DEBUG: cls num  : 2, elapsed : 0.02419137954711914
[2023/10/26 23:06:07] ppocr DEBUG: rec_res num  : 2, elapsed : 0.2616105079650879
[2023/10/26 23:06:08] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.1482090950012207
[2023/10/26 23:06:08] ppocr DEBUG: cls num  : 1, elapsed : 0.013051271438598633
[2023/10/26 23:06:08] ppocr DEBUG: rec_res num  : 1, elapsed : 0.1349010467529297
[2023/10/26 23:06:08] ppo

TypeError: 'NoneType' object is not iterable