# OCR Notebook
This is a Jupyter notebook with code to run OCR on PDF and image files.

# Requirements
1. Download and install Tesseract OCR (tesseract-ocr) and note the path to its executable (#TODO: source)
2. Create a `.env` file with references to the Tesseract executable path and to some sample files:
```
TESSARECT_OCR_PATH=C:\ ... \tesseract.exe
IMAGE_PATH_BAD_RECEIPT=C:\...image_path_bad_receipt.jpg
IMAGE_PATH_GOOD_RECEIPT=C:\...image_path_good_receipt.jpg
IMAGE_PATH_EXCEL_TABLE=C:\..image_path_table.png
```

In [None]:
from PIL import Image #pillow
import cv2 as cv2 #opencv-python
import pytesseract as pyt
from dotenv import load_dotenv
import os
load_dotenv()

# Sample image locations, read from the environment (populated by load_dotenv() above).
image_path_bad_receipt = os.getenv('IMAGE_PATH_BAD_RECEIPT')
image_path_good_receipt = os.getenv('IMAGE_PATH_GOOD_RECEIPT')
image_path_excel_table = os.getenv('IMAGE_PATH_EXCEL_TABLE')

# Set the path to the Tesseract executable.
# Only override the library setting when the variable is actually defined:
# assigning None would clobber pytesseract's built-in default command.
# NOTE: the variable name keeps its original spelling ("TESSARECT") so that
# existing .env files keep working.
tessarect_ocr_path = os.getenv('TESSARECT_OCR_PATH')
if tessarect_ocr_path:
    pyt.pytesseract.tesseract_cmd = tessarect_ocr_path



In [None]:
from tkinter import Tk, filedialog
def get_image_file_dialog():
    """Ask the user to pick a .png/.jpg file and return the selection.

    A temporary Tk root window is created as the dialog's parent and is
    destroyed again before returning, even if the dialog raises.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns: the chosen file path,
        or an empty value when the dialog is cancelled.
    """
    root = Tk()
    try:
        # Pull keyboard focus to the new window so the dialog is not left
        # behind the notebook/browser window.
        root.focus_force()
        FT = [('image files', ('.png', '.jpg'))]
        ttl = 'Select File'
        return filedialog.askopenfilename(parent=root, title=ttl, filetypes=FT)
    finally:
        # Withdrawing only hides the window; destroy it so repeated calls do
        # not accumulate live Tk instances in the kernel.
        root.destroy()

In [None]:
def image_to_text(image_path, lang='eng', config='', preprocessing_method=None):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.
        lang: Language code passed to Tesseract (default ``'eng'``).
        config: Extra Tesseract command-line options. ``' --psm 6'`` is always
            appended (page segmentation mode 6: treat the image as a single
            uniform block of text).
        preprocessing_method: Name of a method understood by
            ``preprocess_image``; when ``None`` the image is passed to
            Tesseract unmodified.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        ValueError: propagated from ``preprocess_image`` for an unknown
            ``preprocessing_method``.
    """
    config = config + ' --psm 6'

    if preprocessing_method is not None:
        # preprocess_image returns an in-memory array, so there is no file
        # handle to manage on this path.
        img = preprocess_image(image_path, preprocessing_method)
        return pyt.image_to_string(img, lang=lang, config=config)

    # Use a context manager so the underlying file is closed once OCR is done.
    with Image.open(image_path) as img:
        return pyt.image_to_string(img, lang=lang, config=config)


def preprocess_image(image_path, preprocessing_method, show_steps=False):
    """Load an image and clean it up for OCR.

    The only supported method is ``"default"``: grayscale conversion, a 3x3
    Gaussian blur, inverted Otsu thresholding, a 3x3 morphological opening to
    remove small noise, and a final inversion.

    Args:
        image_path: Path of the image file to load with ``cv2.imread``.
        preprocessing_method: Name of the pipeline to apply (``"default"``).
        show_steps: When true, display the intermediate images in OpenCV
            windows and block until a key is pressed. Off by default because
            it needs a GUI and an event loop.

    Returns:
        The preprocessed image as produced by the OpenCV pipeline.

    Raises:
        ValueError: if ``preprocessing_method`` is not a known method.
        OSError: if the image cannot be read from ``image_path``.
    """
    if preprocessing_method != "default":
        raise ValueError(f"preprocessing method not defined: {preprocessing_method}")

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail here with a message that names the path.
        raise OSError(f"could not read image file: {image_path}")

    # Grayscale, Gaussian blur, Otsu's threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    if show_steps:
        cv2.imshow('thresh', thresh)
        cv2.imshow('opening', opening)
        cv2.imshow('invert', invert)
        # imshow windows are only drawn while waitKey runs the GUI event loop.
        cv2.waitKey()
        cv2.destroyAllWindows()

    return invert
        






Call the function: run OCR on one of the sample images and print the recognized text.

In [None]:

# Choose the image to process: either one of the sample paths loaded from the
# .env file, or pick a file interactively with get_image_file_dialog().

#image_path=get_image_file_dialog()
image_path=image_path_bad_receipt
print(f"performing ocr on {image_path}\n..\n\n")

# Tesseract variables have to be passed as "-c name=value"; without the "-c"
# the token is not applied as a variable and the whitelist has no effect.
result_text = image_to_text(image_path, config='-c tessedit_char_whitelist=0123456789', preprocessing_method='default')
print(result_text)

