In [4]:
import os
import pandas as pd
import cv2
import pytesseract
from tqdm import tqdm
from pathlib import Path

# Path to the folder containing images
image_folder = Path(r'D:\smarterdevOCRScanner\templates\images')

# Words whose Tesseract confidence is below this value are discarded.
MIN_CONFIDENCE = 30


def preprocess_image(img):
    """Return a binarised, denoised single-channel version of ``img`` for OCR.

    ``img`` is expected to be the BGR array produced by ``cv2.imread``.
    """
    # cv2.imread already returns BGR, so convert straight to grayscale; an
    # extra RGB->BGR pass would swap the red/blue channels and apply the
    # luminance weights to the wrong channels.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    # NOTE(review): denoising is applied after thresholding, as in the
    # original pipeline; the order is kept so OCR results stay comparable.
    return cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)


def extract_confident_words(tsv, min_conf=MIN_CONFIDENCE):
    """Return the ``text`` cells of a TSV table whose ``conf`` is >= ``min_conf``.

    ``tsv`` is tab-separated text with a header line containing at least the
    ``conf`` and ``text`` columns (the format ``pytesseract.image_to_data``
    returns by default). Rows that do not have exactly as many cells as the
    header, and rows whose ``conf`` is not numeric, are ignored instead of
    raising.

    Returns a ``pandas.Series`` of strings named ``text`` (possibly empty).
    """
    empty = pd.Series([], dtype=object, name='text')

    lines = [line.rstrip('\r') for line in tsv.split('\n')]
    header = lines[0].split('\t')
    if 'conf' not in header or 'text' not in header:
        return empty

    # Drop blank/truncated lines (e.g. the trailing newline) up front so the
    # DataFrame constructor never sees ragged rows.
    rows = [
        cells
        for cells in (line.split('\t') for line in lines[1:])
        if len(cells) == len(header)
    ]
    if not rows:
        return empty

    table = pd.DataFrame(rows, columns=header)
    # Unparseable confidences become NaN, which compares False below, so a
    # malformed row is skipped rather than breaking the comparison.
    conf = pd.to_numeric(table['conf'], errors='coerce')
    return table.loc[conf >= min_conf, 'text']


# Check that the folder exists (a plain file at this path cannot be iterated)
if not image_folder.is_dir():
    print(f"The folder {image_folder} does not exist. Please check the path.")
else:
    # Only process files with .png extension (case-insensitive); sorting makes
    # the CSV row order reproducible and gives tqdm a known total.
    png_paths = sorted(
        path
        for path in image_folder.iterdir()
        if path.is_file() and path.suffix.lower() == '.png'
    )

    # Per-image results are collected and concatenated once at the end rather
    # than growing a DataFrame inside the loop.
    frames = []
    for img_path in tqdm(png_paths, desc='WorkOrder'):
        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Skipping unreadable image: {img_path}")
            continue

        # OCR extraction
        data = pytesseract.image_to_data(preprocess_image(img))
        words = extract_confident_words(data)
        if words.empty:
            continue

        frames.append(pd.DataFrame({'id': img_path.name, 'text': words}))

    if frames:
        allWorkOrder = pd.concat(frames)
    else:
        allWorkOrder = pd.DataFrame(columns=['id', 'text'])

    # Save the results to a CSV file
    allWorkOrder.to_csv('workOrders.csv', index=False)
    print("OCR completed.")


WorkOrder: 39it [08:38, 13.28s/it]

OCR completed.



