In [7]:
!sudo apt-get update
!sudo apt-get install tesseract-ocr -y

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 60.5 kB/128 kB 47%] [3 InRelease 108 kB/129 kB 84%] [Connected 0% [2 InRelease 79.3 kB/128 kB 62%] [Connected to r2u.stat.illinois.edu (192.170% [Connected to r2u.stat.illinois.edu (192.17.190.167)] [Waiting for headers]                                                                                Get:5 http://archive.ubuntu.com/ubuntu jamm

In [18]:
import cv2
import easyocr
import pytesseract
from PIL import Image
import numpy as np
import os
import re
import csv

# Input folder path containing images
folder_path = input("Enter the folder path containing images: ")

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Function to adjust brightness and contrast
def adjust_brightness_contrast(image, brightness_factor, contrast_factor):
    """
    Adjusts the brightness and contrast of an image.
    """
    return cv2.convertScaleAbs(image, alpha=contrast_factor, beta=brightness_factor)

# Function to extract the date with dynamic threshold adjustment
def extract_date_with_dynamic_threshold(image_path, save_images=False):
    test_img = cv2.imread(image_path)

    if test_img is None:
        print(f"Error: Image not found! {image_path}")
        return "Image not found"

    date_region = test_img[270:300, 200:350]
    result = reader.readtext(date_region, detail=1)

    if result:
        _, extracted_date, confidence = result[0]
        if confidence > 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
            return extracted_date

    date_gray = cv2.cvtColor(date_region, cv2.COLOR_BGR2GRAY)
    date_gray = adjust_brightness_contrast(date_gray, brightness_factor=15, contrast_factor=1.0)

    threshold_value = 150
    iteration = 0
    while True:
        _, date_thresh = cv2.threshold(date_gray, threshold_value, 255, cv2.THRESH_BINARY)

        if save_images:
            image_name = f"processed_image_iter_{iteration}.png"
            cv2.imwrite(image_name, date_thresh)

        result = reader.readtext(date_thresh, detail=1)
        if result:
            _, extracted_date, confidence = result[0]
            if confidence > 0.65 and confidence < 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
                return extracted_date
            elif confidence < 0.50:
                reduced_brightness = (date_gray * 0.75).astype("uint8")
                _, date_thresh_tesseract = cv2.threshold(reduced_brightness, 45, 255, cv2.THRESH_BINARY)
                tesseract_result = pytesseract.image_to_string(date_thresh_tesseract, config=r'--oem 3 --psm 6').strip()
                if len(tesseract_result) == 3 and tesseract_result.isalpha():
                    return tesseract_result

        threshold_value -= 5
        iteration += 1
        if threshold_value <= 25:
            return "No date found"

# Function to clean and process MICR text
def clean_and_extract(text):
    if text and text[0].isalpha():
        text = text[1:]
    digits_only = re.sub(r'\D', '', text)
    cleaned_text = digits_only[:6] if len(digits_only) >= 6 else None

    if not cleaned_text:
        return None

    if len(cleaned_text) == 6:
        return cleaned_text
    elif len(cleaned_text) == 8:
        return cleaned_text[1:-1]
    elif len(cleaned_text) == 9:
        return cleaned_text[1:-2]
    elif len(cleaned_text) == 10:
        return cleaned_text[2:-2]
    elif len(cleaned_text) == 11:
        return cleaned_text[1:-4]
    else:
        return None

# Function to extract MICR text
def process_image(image_path):
    image = Image.open(image_path)
    check_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    micr_img = check_img[320:380, 180:310]
    micr_text = pytesseract.image_to_string(micr_img, lang='mcr')
    return clean_and_extract(micr_text)

# Output dictionary for results
results = []

# Process all images in the folder
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.JPG'))]

for file_name in image_files:
    file_path = os.path.join(folder_path, file_name)
    try:
        extracted_code = extract_date_with_dynamic_threshold(file_path, save_images=False)
        micr_text = process_image(file_path)
        results.append({"File Name": file_name, "Extracted Code": extracted_code, "MICR Text": micr_text})
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

# Save results to CSV file
output_csv_path = "extracted_data2.csv"
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["File Name", "Extracted Code", "MICR Text"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")


Enter the folder path containing images: /content/drive/MyDrive/new_cheque_img




Results saved to extracted_data2.csv


In [9]:
! apt install tesseract-ocr
! apt install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 53 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev
0 upgraded, 3 newly installed, 0 to remove and 53 not upgraded.
Need to get 3,743 kB of archives.
After this operation, 16.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.3 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB

In [10]:
import pytesseract
from PIL import Image
import requests
from io import BytesIO
import cv2
import numpy as np

In [11]:
!ls -alrt /usr/bin/tesseract

-rwxr-xr-x 1 root root 35128 Feb  9  2022 /usr/bin/tesseract


In [12]:
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [13]:
!wget https://raw.githubusercontent.com/BigPino67/Tesseract-MICR-OCR/master/Tessdata/mcr.traineddata

--2024-11-28 05:32:37--  https://raw.githubusercontent.com/BigPino67/Tesseract-MICR-OCR/master/Tessdata/mcr.traineddata
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147363 (144K) [application/octet-stream]
Saving to: ‘mcr.traineddata’


2024-11-28 05:32:37 (35.1 MB/s) - ‘mcr.traineddata’ saved [147363/147363]



In [14]:
!cp mcr.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

In [20]:
import cv2
import easyocr
import pytesseract
from PIL import Image
import numpy as np
import os
import re
import csv

# Input folder path containing images
folder_path = input("Enter the folder path containing images: ")

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Function to adjust brightness and contrast
def adjust_brightness_contrast(image, brightness_factor, contrast_factor):
    """
    Adjusts the brightness and contrast of an image.
    """
    return cv2.convertScaleAbs(image, alpha=contrast_factor, beta=brightness_factor)

# Function to extract the date with dynamic threshold adjustment
def extract_date_with_dynamic_threshold(image_path, save_images=False):
    test_img = cv2.imread(image_path)

    if test_img is None:
        print(f"Error: Image not found! {image_path}")
        return "Image not found"

    date_region = test_img[270:300, 200:350]
    result = reader.readtext(date_region, detail=1)

    if result:
        _, extracted_date, confidence = result[0]
        if confidence > 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
            return extracted_date

    date_gray = cv2.cvtColor(date_region, cv2.COLOR_BGR2GRAY)
    date_gray = adjust_brightness_contrast(date_gray, brightness_factor=15, contrast_factor=1.0)

    threshold_value = 150
    iteration = 0
    while True:
        _, date_thresh = cv2.threshold(date_gray, threshold_value, 255, cv2.THRESH_BINARY)

        if save_images:
            image_name = f"processed_image_iter_{iteration}.png"
            cv2.imwrite(image_name, date_thresh)

        result = reader.readtext(date_thresh, detail=1)
        if result:
            _, extracted_date, confidence = result[0]
            if confidence > 0.65 and confidence < 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
                return extracted_date
            elif confidence < 0.50:
                reduced_brightness = (date_gray * 0.75).astype("uint8")
                _, date_thresh_tesseract = cv2.threshold(reduced_brightness, 45, 255, cv2.THRESH_BINARY)
                tesseract_result = pytesseract.image_to_string(date_thresh_tesseract, config=r'--oem 3 --psm 6').strip()
                if len(tesseract_result) == 3 and tesseract_result.isalpha():
                    return tesseract_result

        threshold_value -= 5
        iteration += 1
        if threshold_value <= 25:
            return "No date found"

# Function to clean and process MICR text
def clean_and_extract(text):
    if text and text[0].isalpha():
        text = text[1:]
    digits_only = re.sub(r'\D', '', text)
    cleaned_text = digits_only[:6] if len(digits_only) >= 6 else None

    if not cleaned_text:
        return None

    if len(cleaned_text) == 6:
        return cleaned_text
    elif len(cleaned_text) == 8:
        return cleaned_text[1:-1]
    elif len(cleaned_text) == 9:
        return cleaned_text[1:-2]
    elif len(cleaned_text) == 10:
        return cleaned_text[2:-2]
    elif len(cleaned_text) == 11:
        return cleaned_text[1:-4]
    else:
        return None

# Function to extract MICR text
def process_image(image_path):
    image = Image.open(image_path)
    check_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    micr_img = check_img[320:380, 180:310]
    micr_text = pytesseract.image_to_string(micr_img, lang='mcr')
    return clean_and_extract(micr_text)

# Output dictionary for results
results = []

# Process all images in the folder
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.JPG'))]

for file_name in image_files:
    file_path = os.path.join(folder_path, file_name)
    try:
        extracted_code = extract_date_with_dynamic_threshold(file_path, save_images=False)
        micr_text = process_image(file_path)

        # Add logic to check confidence and print the output
        result = reader.readtext(cv2.imread(file_path)[270:300, 200:350], detail=1)  # Re-read the raw result
        if result:
            _, easyocr_raw, confidence = result[0]
            if confidence > 0.80:
                extracted_code = easyocr_raw  # Update extracted_code if confidence is high
                print(f"Image: {file_name}, Alpha Code: {easyocr_raw}")
            else:
                print(f"Image: {file_name}, Alpha Code: {extracted_code}")

        # Append extracted data to the results list with "Alpha Code" as the header
        results.append({"File Name": file_name, "Alpha Code": extracted_code, "MICR Text": micr_text})
    except Exception as e:
        print(f"Error processing {file_name}: {e}")

# Save results to CSV file
output_csv_path = "extracted_data4.csv"
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["File Name", "Alpha Code", "MICR Text"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")


Enter the folder path containing images: /content/drive/MyDrive/new_cheque_img




Image: 1.JPG, Alpha Code: No date found
Image: 3.JPG, Alpha Code: LCC
Image: 5.JPG, Alpha Code: IUP
Image: 6.JPG, Alpha Code: IVB
Image: 7.JPG, Alpha Code: IND
Image: 8.JPG, Alpha Code: VAW
Image: 10.JPG, Alpha Code: shinpu
Image: 11.jpg, Alpha Code: IBX
Image: 12.jpg, Alpha Code: UID
Image: 2.JPG, Alpha Code: IGR
Image: 4.png, Alpha Code: IWW
Results saved to extracted_data4.csv


In [23]:
import cv2
import easyocr
import pytesseract
from PIL import Image
import numpy as np
import os
import re
import csv

# Input folder path containing images
folder_path = input("Enter the folder path containing images: ")

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Function to adjust brightness and contrast
def adjust_brightness_contrast(image, brightness_factor, contrast_factor):
    """
    Adjusts the brightness and contrast of an image.
    """
    return cv2.convertScaleAbs(image, alpha=contrast_factor, beta=brightness_factor)

# Function to extract the date with dynamic threshold adjustment
def extract_date_with_dynamic_threshold(image_path, save_images=False):
    test_img = cv2.imread(image_path)

    if test_img is None:
        print(f"Error: Image not found! {image_path}")
        return "Image not found"

    date_region = test_img[270:300, 200:350]
    result = reader.readtext(date_region, detail=1)

    if result:
        _, extracted_date, confidence = result[0]
        if confidence > 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
            return extracted_date

    date_gray = cv2.cvtColor(date_region, cv2.COLOR_BGR2GRAY)
    date_gray = adjust_brightness_contrast(date_gray, brightness_factor=15, contrast_factor=1.0)

    threshold_value = 150
    iteration = 0
    while True:
        _, date_thresh = cv2.threshold(date_gray, threshold_value, 255, cv2.THRESH_BINARY)

        if save_images:
            image_name = f"processed_image_iter_{iteration}.png"
            cv2.imwrite(image_name, date_thresh)

        result = reader.readtext(date_thresh, detail=1)
        if result:
            _, extracted_date, confidence = result[0]
            if confidence > 0.65 and confidence < 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
                return extracted_date
            elif confidence < 0.50:
                reduced_brightness = (date_gray * 0.75).astype("uint8")
                _, date_thresh_tesseract = cv2.threshold(reduced_brightness, 45, 255, cv2.THRESH_BINARY)
                tesseract_result = pytesseract.image_to_string(date_thresh_tesseract, config=r'--oem 3 --psm 6').strip()
                if len(tesseract_result) == 3 and tesseract_result.isalpha():
                    return tesseract_result

        threshold_value -= 5
        iteration += 1
        if threshold_value <= 25:
            return "No date found"

# Function to clean and process MICR text
def clean_and_extract(text):
    if text and text[0].isalpha():
        text = text[1:]
    digits_only = re.sub(r'\D', '', text)
    cleaned_text = digits_only[:6] if len(digits_only) >= 6 else None

    if not cleaned_text:
        return None

    if len(cleaned_text) == 6:
        return cleaned_text
    elif len(cleaned_text) == 8:
        return cleaned_text[1:-1]
    elif len(cleaned_text) == 9:
        return cleaned_text[1:-2]
    elif len(cleaned_text) == 10:
        return cleaned_text[2:-2]
    elif len(cleaned_text) == 11:
        return cleaned_text[1:-4]
    else:
        return None

# Function to extract MICR text
def process_image(image_path):
    image = Image.open(image_path)
    check_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    micr_img = check_img[320:380, 180:310]
    micr_text = pytesseract.image_to_string(micr_img, lang='mcr')
    return clean_and_extract(micr_text)

# Output dictionary for results
results = []

# Process all images in the folder
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.JPG'))]

for file_name in image_files:
    file_path = os.path.join(folder_path, file_name)
    try:
        extracted_code = extract_date_with_dynamic_threshold(file_path, save_images=False)
        micr_text = process_image(file_path)

        # Add logic to update extracted_code with high-confidence result from EasyOCR
        result = reader.readtext(cv2.imread(file_path)[270:300, 200:350], detail=1)  # Re-read the raw result
        if result:
            _, easyocr_raw, confidence = result[0]
            if confidence > 0.80:
                extracted_code = easyocr_raw  # Update extracted_code if confidence is high

        # Append extracted data to the results list with "Alpha Code" as the header
        results.append({"File Name": file_name, "Alpha Code": extracted_code, "Cheque No": micr_text})
    except Exception as e:
        # Skip printing errors but still proceed with other images
        pass

# Save results to CSV file
output_csv_path = "extracted_data5.csv"
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["File Name", "Alpha Code", "Cheque No"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")


Enter the folder path containing images: /content/drive/MyDrive/new_cheque_img




Results saved to extracted_data5.csv


In [26]:
import cv2
import easyocr
import pytesseract
from PIL import Image
import numpy as np
import os
import re
import csv
from datetime import datetime  # Added for current date

# Input folder path containing images
folder_path = input("Enter the folder path containing images: ")

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Function to adjust brightness and contrast
def adjust_brightness_contrast(image, brightness_factor, contrast_factor):
    """
    Adjusts the brightness and contrast of an image.
    """
    return cv2.convertScaleAbs(image, alpha=contrast_factor, beta=brightness_factor)

# Function to extract the date with dynamic threshold adjustment
def extract_date_with_dynamic_threshold(image_path, save_images=False):
    test_img = cv2.imread(image_path)

    if test_img is None:
        print(f"Error: Image not found! {image_path}")
        return "Image not found"

    date_region = test_img[270:300, 200:350]
    result = reader.readtext(date_region, detail=1)

    if result:
        _, extracted_date, confidence = result[0]
        if confidence > 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
            return extracted_date

    date_gray = cv2.cvtColor(date_region, cv2.COLOR_BGR2GRAY)
    date_gray = adjust_brightness_contrast(date_gray, brightness_factor=15, contrast_factor=1.0)

    threshold_value = 150
    iteration = 0
    while True:
        _, date_thresh = cv2.threshold(date_gray, threshold_value, 255, cv2.THRESH_BINARY)

        if save_images:
            image_name = f"processed_image_iter_{iteration}.png"
            cv2.imwrite(image_name, date_thresh)

        result = reader.readtext(date_thresh, detail=1)
        if result:
            _, extracted_date, confidence = result[0]
            if confidence > 0.65 and confidence < 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
                return extracted_date
            elif confidence < 0.50:
                reduced_brightness = (date_gray * 0.75).astype("uint8")
                _, date_thresh_tesseract = cv2.threshold(reduced_brightness, 45, 255, cv2.THRESH_BINARY)
                tesseract_result = pytesseract.image_to_string(date_thresh_tesseract, config=r'--oem 3 --psm 6').strip()
                if len(tesseract_result) == 3 and tesseract_result.isalpha():
                    return tesseract_result

        threshold_value -= 5
        iteration += 1
        if threshold_value <= 25:
            return "No date found"

# Function to clean and process MICR text
def clean_and_extract(text):
    if text and text[0].isalpha():
        text = text[1:]
    digits_only = re.sub(r'\D', '', text)
    cleaned_text = digits_only[:6] if len(digits_only) >= 6 else None

    if not cleaned_text:
        return None

    if len(cleaned_text) == 6:
        return cleaned_text
    elif len(cleaned_text) == 8:
        return cleaned_text[1:-1]
    elif len(cleaned_text) == 9:
        return cleaned_text[1:-2]
    elif len(cleaned_text) == 10:
        return cleaned_text[2:-2]
    elif len(cleaned_text) == 11:
        return cleaned_text[1:-4]
    else:
        return None

# Function to extract MICR text
def process_image(image_path):
    image = Image.open(image_path)
    check_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    micr_img = check_img[320:380, 180:310]
    micr_text = pytesseract.image_to_string(micr_img, lang='mcr')
    return clean_and_extract(micr_text)

# Output dictionary for results
results = []

# Process all images in the folder
image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.JPG'))]

for file_name in image_files:
    file_path = os.path.join(folder_path, file_name)
    try:
        extracted_code = extract_date_with_dynamic_threshold(file_path, save_images=False)
        micr_text = process_image(file_path)

        # Add logic to update extracted_code with high-confidence result from EasyOCR
        result = reader.readtext(cv2.imread(file_path)[270:300, 200:350], detail=1)  # Re-read the raw result
        if result:
            _, easyocr_raw, confidence = result[0]
            if confidence > 0.80:
                extracted_code = easyocr_raw  # Update extracted_code if confidence is high

        # Append extracted data to the results list with "Alpha Code" as the header
        results.append({"File Name": file_name, "Alpha Code": extracted_code, "Cheque No": micr_text})
    except Exception as e:
        # Skip printing errors but still proceed with other images
        pass

# Generate the current date for the CSV file name
current_date = datetime.now().strftime("%Y-%m-%d")
output_csv_path = f"{current_date}.csv"

# Save results to CSV file
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["File Name", "Alpha Code", "Cheque No"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")


Enter the folder path containing images: /content/drive/MyDrive/new_cheque_img




Results saved to 2024-11-28.csv


In [30]:
import cv2
import easyocr
import pytesseract
from PIL import Image
import numpy as np
import os
import re
import csv
from datetime import datetime
import sys
import pickle  # To save progress and avoid duplicate processing

# Define a lock file for ensuring that the process isn't edited or stopped
lock_file_path = "process_lock.pkl"

# Function to create or check a lock file to prevent process tampering
def check_lock():
    if os.path.exists(lock_file_path):
        print("Process is already running or has been tampered with. Exiting.")
        sys.exit()
    else:
        with open(lock_file_path, 'wb') as lock_file:
            pickle.dump(True, lock_file)

# Function to release the lock
def release_lock():
    if os.path.exists(lock_file_path):
        os.remove(lock_file_path)

# Validate and check the folder path
def validate_folder_path(folder_path):
    if not os.path.isdir(folder_path):
        print("Error: Provided path is not a valid directory.")
        sys.exit()

    # Check for duplicate image paths
    processed_images_file = "processed_images.txt"
    if os.path.exists(processed_images_file):
        with open(processed_images_file, 'r') as f:
            processed_images = set(f.read().splitlines())
    else:
        processed_images = set()

    new_images = {f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.JPG'))}
    if new_images & processed_images:
        print("Warning: Some images have already been processed.")
        sys.exit()

    return new_images

# Save processed images to a file to avoid duplication
def save_processed_image(image_path):
    processed_images_file = "processed_images.txt"
    with open(processed_images_file, 'a') as f:
        f.write(image_path + '\n')

# Input folder path containing images
folder_path = input("Enter the folder path containing images: ")

# Lock the process
check_lock()

# Validate folder path and check for duplicate images
image_files = validate_folder_path(folder_path)

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

# Function to adjust brightness and contrast
def adjust_brightness_contrast(image, brightness_factor, contrast_factor):
    return cv2.convertScaleAbs(image, alpha=contrast_factor, beta=brightness_factor)

# Function to extract date with dynamic threshold adjustment
def extract_date_with_dynamic_threshold(image_path, save_images=False):
    test_img = cv2.imread(image_path)
    if test_img is None:
        print(f"Error: Image not found! {image_path}")
        return "Image not found"

    date_region = test_img[270:300, 200:350]
    result = reader.readtext(date_region, detail=1)

    if result:
        _, extracted_date, confidence = result[0]
        if confidence > 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
            return extracted_date

    date_gray = cv2.cvtColor(date_region, cv2.COLOR_BGR2GRAY)
    date_gray = adjust_brightness_contrast(date_gray, brightness_factor=15, contrast_factor=1.0)

    threshold_value = 150
    iteration = 0
    while True:
        _, date_thresh = cv2.threshold(date_gray, threshold_value, 255, cv2.THRESH_BINARY)
        if save_images:
            image_name = f"processed_image_iter_{iteration}.png"
            cv2.imwrite(image_name, date_thresh)

        result = reader.readtext(date_thresh, detail=1)
        if result:
            _, extracted_date, confidence = result[0]
            if confidence > 0.65 and confidence < 0.90 and len(extracted_date) == 3 and extracted_date.isalpha():
                return extracted_date
            elif confidence < 0.50:
                reduced_brightness = (date_gray * 0.75).astype("uint8")
                _, date_thresh_tesseract = cv2.threshold(reduced_brightness, 45, 255, cv2.THRESH_BINARY)
                tesseract_result = pytesseract.image_to_string(date_thresh_tesseract, config=r'--oem 3 --psm 6').strip()
                if len(tesseract_result) == 3 and tesseract_result.isalpha():
                    return tesseract_result

        threshold_value -= 5
        iteration += 1
        if threshold_value <= 25:
            return "No date found"

# Function to clean and process MICR text
def clean_and_extract(text):
    if text and text[0].isalpha():
        text = text[1:]
    digits_only = re.sub(r'\D', '', text)
    cleaned_text = digits_only[:6] if len(digits_only) >= 6 else None

    if not cleaned_text:
        return None

    if len(cleaned_text) == 6:
        return cleaned_text
    elif len(cleaned_text) == 8:
        return cleaned_text[1:-1]
    elif len(cleaned_text) == 9:
        return cleaned_text[1:-2]
    elif len(cleaned_text) == 10:
        return cleaned_text[2:-2]
    elif len(cleaned_text) == 11:
        return cleaned_text[1:-4]
    else:
        return None

# Function to extract MICR text
def process_image(image_path):
    image = Image.open(image_path)
    check_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    micr_img = check_img[320:380, 180:310]
    micr_text = pytesseract.image_to_string(micr_img, lang='mcr')
    return clean_and_extract(micr_text)

# Output dictionary for results
results = []

# Process all images in the folder
for file_name in image_files:
    file_path = os.path.join(folder_path, file_name)
    try:
        extracted_code = extract_date_with_dynamic_threshold(file_path, save_images=False)
        micr_text = process_image(file_path)

        result = reader.readtext(cv2.imread(file_path)[270:300, 200:350], detail=1)
        if result:
            _, easyocr_raw, confidence = result[0]
            if confidence > 0.80:
                extracted_code = easyocr_raw

        results.append({"File Name": file_name, "Alpha Code": extracted_code, "Cheque No": micr_text})

        # Save processed image path to avoid future duplication
        save_processed_image(file_path)

    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        release_lock()
        sys.exit()

# Generate the current date for the CSV file name
current_date = datetime.now().strftime("%Y-%m-%d")
output_csv_path = f"{current_date}.csv"

# Save results to CSV file
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=["File Name", "Alpha Code", "Cheque No"])
    writer.writeheader()
    writer.writerows(results)

print(f"Results saved to {output_csv_path}")

# Release the lock
release_lock()


Enter the folder path containing images: /content/drive/MyDrive/new_cheque_img




Results saved to 2024-11-28.csv
