In [70]:
#imports
import os
import pytesseract
import cv2
import pandas as pd
from tqdm import tqdm
from openpyxl import Workbook

In [71]:
# Configuration for the OCR preprocessing pipeline.
# These module-level names are read by extract_table_from_image.

# --- grayscale / thresholding ---
binarize = True
# cut-off used by the global (fixed) threshold
threshold_value = 127
# Otsu binarization picks the threshold automatically from the image histogram
# (can work better than a fixed cut-off for some images); intended to run after denoising
otsu_binarization = True

# --- contrast ---
contrast_increase = True
# alpha = contrast control (1 to 3); beta = brightness control (0 to 100)
contrast_alpha = 1.2
contrast_beta = 0

# --- denoising ---
denoise = True
# h value passed to the denoiser (higher = stronger denoising, but it chops data)
h_value = 10

# --- Tesseract ---
# page segmentation mode passed as --psm
psm_v = 4

# --- deskew ---
# NOTE(review): no code visible in this notebook reads this flag -- confirm whether
# a deskew step is still planned or the flag can be dropped.
deskew = True


In [72]:
def extract_table_from_image(image_path):
    """Preprocess one image, OCR it with Tesseract and return the text as table rows.

    The stages are chained, each one operating on the output of the previous
    one, and are toggled by the module-level configuration flags:

    1. grayscale conversion (always applied: thresholding needs a single channel)
    2. contrast / brightness scaling   -- ``contrast_increase``
       (``contrast_alpha``, ``contrast_beta``)
    3. non-local-means denoising        -- ``denoise`` (``h_value``)
    4. binarization                     -- ``binarize``; Otsu on a 5x5
       Gaussian-blurred image when ``otsu_binarization`` is set, otherwise a
       global threshold at ``threshold_value``
    5. a single Tesseract call restricted to the characters ``0123456789-.``

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        A list of rows; each row is the list of whitespace-separated tokens of
        one non-blank line of the OCR output.

    Raises:
        OSError: If OpenCV cannot load the image (``cv2.imread`` returned None).
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside cvtColor.
        raise OSError(f"Could not read image: {image_path}")

    # Always work on a single-channel image: Otsu/global thresholding below
    # cannot operate on the 3-channel BGR array returned by imread.
    processed = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    if contrast_increase:
        processed = cv2.convertScaleAbs(processed, alpha=contrast_alpha, beta=contrast_beta)

    if denoise:
        # fastNlMeansDenoising(src, dst, h, template window, search window):
        # h = denoising strength (higher is stronger but chops data);
        # 21 and 7 are passed positionally exactly as before.
        # The result is fed forward; previously it was computed and discarded.
        processed = cv2.fastNlMeansDenoising(processed, None, h_value, 21, 7)

    if binarize:
        if otsu_binarization:
            # Otsu picks the threshold from the histogram; a light blur first
            # smooths the histogram it works on.
            blurred = cv2.GaussianBlur(processed, (5, 5), 0)
            _, processed = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        else:
            _, processed = cv2.threshold(processed, threshold_value, 255, cv2.THRESH_BINARY)

    # One OCR pass on the final image (the old code ran Tesseract twice when
    # Otsu was enabled and threw the first result away).
    ocr_config = f'--psm {psm_v} --oem 1 -c tessedit_char_whitelist=0123456789-.'
    extracted_data = pytesseract.image_to_string(processed, config=ocr_config)

    # One row per non-blank OCR line, one cell per whitespace-separated token.
    return [line.split() for line in extracted_data.split('\n') if line.strip()]
# df.to_csv(f'{original_name}.csv')


In [73]:
def save_table_to_excel(table_data, output_path, sheet_name):
    """Write ``table_data`` to the sheet ``sheet_name`` of the workbook at ``output_path``.

    If the workbook already exists the sheet is appended to it, replacing any
    existing sheet of the same name. If it does not exist yet it is created,
    so callers no longer have to pre-create an empty workbook.

    Args:
        table_data: Rows of cells (list of lists); passed to ``pd.DataFrame``.
        output_path: Path of the ``.xlsx`` file to create or update.
        sheet_name: Name of the sheet that receives the data.
    """
    df = pd.DataFrame(table_data)

    if os.path.exists(output_path):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        # Append mode fails on a missing file, and pandas rejects
        # if_sheet_exists outside append mode, so use plain write mode here.
        writer_options = {'mode': 'w'}

    # No header row and no index column: the sheet holds only the OCR cells.
    with pd.ExcelWriter(output_path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)


In [76]:
# Batch-run settings: where to read images, where to write workbooks,
# and how many images (one sheet each) go into one workbook.
image_directory = 'selected'
output_directory = 'output'
batch_size = 200

# Collect the .png / .jpg files (extension matched case-insensitively).
# Sorted because os.listdir returns entries in arbitrary order, which would
# make the contents of each batch differ from run to run.
image_files = sorted(
    f for f in os.listdir(image_directory)
    if f.lower().endswith(('.png', '.jpg'))
)

total_images = len(image_files)
# Ceiling division: a final, partially filled batch still counts as a batch.
total_batches = (total_images + batch_size - 1) // batch_size

In [None]:
# The batch workbooks are saved into output_directory; create it up front so
# the first workbook.save() does not fail when the folder is missing.
os.makedirs(output_directory, exist_ok=True)

batch_starts = range(0, total_images, batch_size)
for batch_number, start in enumerate(tqdm(batch_starts, desc="Processing Batches"), start=1):
    batch_files = image_files[start:start + batch_size]
    batch_output_path = os.path.join(output_directory, f'output_batch_{batch_number}.xlsx')

    # Start every batch from a fresh, empty workbook on disk so that sheets
    # from a previous run of the same batch are not carried over.
    workbook = Workbook()
    workbook.save(batch_output_path)

    for image_file in tqdm(batch_files, desc=f"Batch {batch_number}"):
        image_path = os.path.join(image_directory, image_file)

        table_data = extract_table_from_image(image_path)

        # Use the last 20 characters of the file name as the sheet name.
        # NOTE(review): two files whose names end in the same 20 characters
        # map to the same sheet, and save_table_to_excel replaces an existing
        # sheet -- confirm the file-name tails are unique within a batch.
        sheet_name = image_file[-20:]

        save_table_to_excel(table_data, batch_output_path, sheet_name)

Processing Batches:   0%|                                | 0/11 [00:00<?, ?it/s]
Batch 1:   0%|                                          | 0/200 [00:00<?, ?it/s][A
Batch 1:   0%|▏                                 | 1/200 [00:03<10:05,  3.04s/it][A
Batch 1:   1%|▎                                 | 2/200 [00:06<09:54,  3.00s/it][A
Batch 1:   2%|▌                                 | 3/200 [00:08<09:38,  2.94s/it][A
Batch 1:   2%|▋                                 | 4/200 [00:11<09:22,  2.87s/it][A
Batch 1:   2%|▊                                 | 5/200 [00:14<09:01,  2.77s/it][A
Batch 1:   3%|█                                 | 6/200 [00:17<09:13,  2.85s/it][A
Batch 1:   4%|█▏                                | 7/200 [00:20<09:16,  2.88s/it][A
Batch 1:   4%|█▎                                | 8/200 [00:23<10:00,  3.13s/it][A
Batch 1:   4%|█▌                                | 9/200 [00:25<08:58,  2.82s/it][A
Batch 1:   5%|█▋                               | 10/200 [00:29<09:15,  2.93s/it