# OCR Notebook
This is a Jupyter notebook with code to run OCR on PDF and image files.

## Requirements
1. Download and install tesseract-ocr (https://sourceforge.net/projects/tesseract-ocr.mirror/)
2. Create a `.env` file that points to the Tesseract executable and to some sample files:
```
TESSARECT_OCR_PATH=C:\ ... \tesseract.exe
IMAGE_PATH_BAD_RECEIPT=C:\...image_path_bad_receipt.jpg
IMAGE_PATH_GOOD_RECEIPT=C:\...image_path_good_receipt.jpg
IMAGE_PATH_EXCEL_TABLE=C:\..image_path_table.png
```

# Code 
## Setup environment and OCR function

In [None]:
from PIL import Image #pillow
import cv2 as cv2 #opencv-python
from matplotlib import pyplot as plt
import pytesseract as pyt
from dotenv import load_dotenv
import os
load_dotenv()

# Sample image paths, read from the environment (populated from .env by load_dotenv()).
# Each value is None when the corresponding variable is not defined.
image_path_bad_receipt = os.getenv('IMAGE_PATH_BAD_RECEIPT')
image_path_good_receipt = os.getenv('IMAGE_PATH_GOOD_RECEIPT')
image_path_excel_table = os.getenv('IMAGE_PATH_EXCEL_TABLE')

# Set the path to the Tesseract executable.
# Only override pytesseract's default command when a path was actually configured;
# assigning None would replace the default and make every OCR call fail.
tessarect_ocr_path = os.getenv('TESSARECT_OCR_PATH')
if tessarect_ocr_path:
    pyt.pytesseract.tesseract_cmd = tessarect_ocr_path



In [None]:
from tkinter import Tk, filedialog
def get_image_file_dialog():
    """Ask the user to pick an image file through a "Select File" dialog.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns: the selected path, or
        an empty value when the user cancels the dialog.
    """
    root = Tk()
    # Hide the empty root window before the dialog opens so only the picker shows.
    root.withdraw()
    # Keep the picker above the window that hosts the notebook.
    root.attributes('-topmost', True)
    FT = [('image files', ('.png', '.jpg'))]
    ttl = 'Select File'
    try:
        file_path = filedialog.askopenfilename(parent=root, title=ttl, filetypes=FT)
    finally:
        # Destroy the root so repeated calls do not pile up hidden Tk instances.
        root.destroy()
    return file_path

In [None]:
def image_to_text(image_path = None, 
                  lang = 'eng', 
                  config = '', 
                  preprocessing_method = None):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        image_path: Path of the image to read. When None, a file dialog is
            opened so the user can pick one.
        lang: Language code passed to ``pyt.image_to_string``.
        config: Extra Tesseract command-line options. ``--psm 6`` is added
            unless the options already contain a ``--psm`` setting.
        preprocessing_method: Name of a method understood by
            ``preprocess_image``, or None to OCR the image as-is.

    Returns:
        The text returned by ``pyt.image_to_string``.

    Raises:
        FileNotFoundError: If no image path is available, e.g. because the
            file dialog was cancelled.
    """
    if image_path is None:
        image_path = get_image_file_dialog()
    if not image_path:
        # A cancelled dialog yields an empty value; fail here with a clear message.
        raise FileNotFoundError("no image file was provided or selected")

    # Default to page segmentation mode 6, but respect an explicit caller choice.
    if '--psm' not in config:
        config = f'{config} --psm 6'.strip()

    if preprocessing_method is None:
        # The context manager closes the file handle once OCR has finished.
        with Image.open(image_path) as img:
            return pyt.image_to_string(img, lang=lang, config=config)

    img = preprocess_image(image_path, preprocessing_method)
    # Use Tesseract to do OCR on the preprocessed image
    return pyt.image_to_string(img, lang=lang, config=config)




## Preprocess image

In [None]:
def preprocess_image(image_path, preprocessing_method, show=True):
    """Load an image with OpenCV and clean it up before OCR.

    The only supported method is ``"default"``: upscale by a factor of 3,
    convert to grayscale, apply a bilateral filter, binarise with an inverted
    Otsu threshold, apply a 3x3 morphological opening, and invert the result.

    Args:
        image_path: Path of the image file to load.
        preprocessing_method: Name of the preprocessing recipe; must be
            ``"default"``.
        show: When True (the default), display the original and the
            preprocessed image with matplotlib.

    Returns:
        The preprocessed single-channel image array.

    Raises:
        ValueError: If ``preprocessing_method`` is not a known method.
        OSError: If OpenCV could not read an image from ``image_path``.
    """
    if preprocessing_method != "default":
        raise ValueError(f"preprocessing method not defined: {preprocessing_method}")

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"could not read image (missing file or unsupported format): {image_path!r}")

    # Grayscale, bilateral filter, Otsu's threshold
    big_image = cv2.resize(image, dsize=(0,0), fx=3, fy=3)
    gray = cv2.cvtColor(big_image, cv2.COLOR_BGR2GRAY)
    blur = cv2.bilateralFilter(gray,10,10,10)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening

    if show:
        # OpenCV loads colour images as BGR while matplotlib expects RGB;
        # convert so the preview is not shown with red and blue swapped.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.xticks([])  # Hides the graph ticks and x / y axis
        plt.yticks([])
        plt.show()

        # Render the single-channel result in grayscale, not the default colour map.
        plt.imshow(invert, cmap='gray')
        plt.xticks([])
        plt.yticks([])
        plt.show()

    return invert



## Use the OCR function

In [None]:
# Set the image path from the environment variable
image_path=image_path_excel_table
# Or omit image path to start a file dialog
# Tesseract config variables must be passed with the -c flag; without it the
# whitelist string is taken as a config file name and has no effect.
result_text = image_to_text(image_path, 
                            config='-c tessedit_char_whitelist=0123456789',
                            preprocessing_method='default')
print(result_text)

