In [None]:
!huggingface-cli login --token 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

# Import libs

In [None]:
# Standard library
import os

# Visualization
import PIL.Image as Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib import colormaps
import numpy as np
import requests
from tqdm import tqdm

# DL
import torch
import torchvision.transforms.functional as tt
import torchvision.transforms as transforms
import cv2

# OCRs
from transformers import pipeline
import easyocr
import pytesseract
from pytesseract import Output

# Preprocess image

In [None]:
# Setting Pillow's MAX_IMAGE_PIXELS to None switches off its pixel-count
# ("decompression bomb") guard so that very large scans can be opened.
# Only appropriate for trusted input files.
Image.MAX_IMAGE_PIXELS = None

In [None]:
def cut_image(image: Image.Image, k = 1.4142) -> Image.Image:
    """Trim the bottom of ``image`` so its height is at most ``k * width``.

    The crop box is anchored at the top-left corner and always keeps the full
    width; an image that is already short enough is cropped to its own size.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and ``crop(box)``.
        k: Maximum allowed height/width ratio. The default is close to sqrt(2)
            - presumably the ISO A-series page ratio; confirm with the author.

    Returns:
        The result of ``image.crop((0, 0, width, min(k * width, height)))``.
    """
    assert len(image.size) == 2, f"image dim has to be equal to 2, but image.shape has {len(image.size)} dim."

    width, height = image.size

    # (left, upper, right, lower); the lower edge may be a float when k * width wins.
    crop_box = (0, 0, width, min(k * width, height))
    return image.crop(crop_box)

def check_exif(image: Image.Image) -> Image.Image:
    """Rotate ``image`` upright according to its EXIF orientation tag.

    Only the three pure-rotation orientation values (3, 6, 8) are handled;
    any other value, missing EXIF data, or an image type without a
    ``_getexif`` method leaves the image untouched.

    Args:
        image: Image object; EXIF is read through ``_getexif()`` when present.

    Returns:
        The rotated image (``rotate(..., expand=True)``) or the original
        object when no rotation applies.
    """
    # Read the EXIF block once; it used to be parsed twice per call.
    read_exif = getattr(image, '_getexif', None)
    exif = read_exif() if read_exif is not None else None
    if not exif:
        return image

    # EXIF tag 274 is "Orientation". Values map to the angle handed to
    # ``image.rotate`` (Pillow rotates counter-clockwise).
    rotation_by_orientation = {3: 180, 6: 270, 8: 90}
    angle = rotation_by_orientation.get(exif.get(274))
    if angle is not None:
        image = image.rotate(angle, expand=True)

    return image

In [None]:
class Preprocessing:
    """Configurable image clean-up steps used before OCR.

    Every step is a plain method taking and returning an image, so the methods
    can be chained (for example inside ``transforms.Compose``). Threshold,
    denoising and blur parameters live on the instance and can be changed with
    the corresponding setter.
    """

    def __init__(self):
        # Default threshold values
        self.threshold_thresh = 195
        self.threshold_maxval = 255

        # Default denoising parameters
        self.denoising_kernel = np.ones((2, 2), np.uint8)
        self.denoising_iterations = 1

        # Default blur parameters
        self.blur_kernel = (3, 3)
        self.blur_sigma = 3

    def PILtoNumpy(self, image: Image.Image) -> np.ndarray:
        """Convert a PIL image to a NumPy array."""
        return np.array(image)

    def grayscale(self, image: Image.Image) -> Image.Image:
        """Convert a PIL image to a grayscale (mode ``'L'``) PIL image."""
        return image.convert('L')

    def contrast_enchancement(self, image: np.ndarray) -> np.ndarray:
        """Linearly stretch the intensities so they span the full 0..255 range.

        The darkest value is mapped to 0 and the brightest to 255 to make the
        text stand out from the background.

        Args:
            image: Single-channel array accepted by ``cv2.minMaxLoc``.

        Returns:
            The stretched image from ``cv2.convertScaleAbs``. A flat image
            (all pixels equal) has no contrast to stretch and is returned as
            an unchanged copy.
        """
        min_val, max_val, _, _ = cv2.minMaxLoc(image)
        value_range = max_val - min_val

        if value_range == 0:
            # A uniform image (e.g. an all-white page after thresholding)
            # would otherwise divide by zero below.
            return image.copy()

        scale = 255.0 / value_range
        return cv2.convertScaleAbs(image, alpha=scale, beta=-min_val * scale)

    def set_threshold(self, thresh: int, maxval: int) -> None:
        """Set custom threshold values."""
        self.threshold_thresh = thresh
        self.threshold_maxval = maxval

    def threshold(self, image: np.ndarray) -> np.ndarray:
        """Binarise the image with ``cv2.THRESH_BINARY`` using the stored values."""
        # cv2.threshold returns (used_threshold, image); only the image is needed.
        _, image = cv2.threshold(image, self.threshold_thresh, self.threshold_maxval, cv2.THRESH_BINARY)
        return image

    def set_denoising(self, kernel: np.ndarray, iterations: int) -> None:
        """Set custom denoising parameters."""
        self.denoising_kernel = kernel
        self.denoising_iterations = iterations

    def denoising(self, image: np.ndarray) -> np.ndarray:
        """Remove small specks: erode, then dilate with the same kernel."""
        image = cv2.erode(image, self.denoising_kernel, iterations=self.denoising_iterations)
        image = cv2.dilate(image, self.denoising_kernel, iterations=self.denoising_iterations)
        return image

    def blur_set(self, kernel: tuple, sigma: int) -> None:
        """Set custom blur parameters."""
        self.blur_kernel = kernel
        self.blur_sigma = sigma

    def blur(self, image: np.ndarray) -> np.ndarray:
        """Apply Gaussian blur to the input image using the stored kernel and sigma."""
        return cv2.GaussianBlur(image, self.blur_kernel, self.blur_sigma)


In [None]:
# Location of the sample scan inside the Kaggle input dataset
# (a local file path, despite the variable being called `url`).
url = '/kaggle/input/examples-for-ocr/18.08.04 200.jpg'

image =  Image.open(url)

# Check the EXIF orientation of the JPG and rotate the image upright if needed
image = check_exif(image)

# PIL reports the size as (width, height).
print(image.size)

In [None]:
# Build the preprocessing pipeline: each entry is a callable that receives the
# output of the previous one (PIL image until `PILtoNumpy`, NumPy array after).
pp = Preprocessing()

transform = transforms.Compose([
    cut_image,
    pp.grayscale,
    pp.PILtoNumpy,
    #pp.blur,
    pp.threshold,
    # NOTE(review): this runs after the binary threshold, so the image holds at
    # most two distinct values by now - confirm the intended order.
    pp.contrast_enchancement,
    #pp.denoising
    
])

# NOTE(review): `image` is rebound from a PIL image to a NumPy array here, so
# this cell cannot be re-run without re-running the loading cell first.
image = transform(image)
# NumPy reports the shape as (rows, columns), i.e. (height, width).
print(image.shape)

# Show preprocessed image

In [None]:
# Render the preprocessed page on one large canvas, without axis decorations.
fig, ax = plt.subplots(figsize=(16, 20))
ax.imshow(image, cmap="gray")
ax.axis('off')
plt.show()

# OCR Class

In [None]:
class OCR:
    """Pair a text-detection callable with a text-recognition callable.

    The two backends are arbitrary callables (for example bound methods of an
    OCR library). ``detect`` and ``recognize`` forward every positional and
    keyword argument to the matching backend unchanged, so backend-specific
    options can be passed straight through.
    """

    def __init__(self, detect, recognize):
        """Initialize the OCR class with detect and recognize functions.

        Args:
            detect: Callable used by :meth:`detect`.
            recognize: Callable used by :meth:`recognize`.
        """
        # Kept under private names: assigning them to ``self.detect`` /
        # ``self.recognize`` would shadow the methods below, and those methods
        # would call themselves forever if they were ever reached.
        self._detect_fn = detect
        self._recognize_fn = recognize

    def detect(self, image, *args, **kwargs):
        """Use the text detection function to detect text boxes in the image."""
        return self._detect_fn(image, *args, **kwargs)

    def recognize(self, image, *args, **kwargs):
        """Use the text recognition function to recognize text in the image."""
        return self._recognize_fn(image, *args, **kwargs)

    def img2text(self, image):
        """Perform OCR: detect text boxes and hand them to the recognizer.

        NOTE(review): only the detector output is passed to the recognizer;
        backends that also need the image will not work through this helper -
        confirm the intended contract.
        """
        boxes = self.detect(image)
        return self.recognize(boxes)

    def plot_results(self, results, confidence_cmap = colormaps['summer'], image = None) -> None:
        """Plot the recognised boxes over the image and, beside it, on a white page.

        Left panel: the image with box outlines. Right panel: a white canvas of
        the same shape with every box filled by a confidence-dependent colour
        and its text written inside.

        Args:
            results: Iterable of ``(box, text, confidence)`` where ``box`` is a
                sequence of ``(x, y)`` corner points.
            confidence_cmap: Colormap called with ``1 - confidence / 2`` to get
                the fill colour of a box.
            image: Image to draw on. When omitted, the notebook-level global
                ``image`` is used (the behaviour of earlier versions).

        Raises:
            ValueError: If no image is passed and no global ``image`` exists.
        """
        if image is None:
            image = globals().get('image')
        if image is None:
            raise ValueError("plot_results needs an image: pass image=... or define a global `image`.")

        # plot image
        fig, ax = plt.subplots(1, 2, figsize=(16, 16))
        ax[0].imshow(image, cmap='gray')
        ax[0].axis('off')

        # plot white image nearby
        white_img = np.full_like(image, 255, dtype=np.uint8)
        ax[1].imshow(white_img, cmap='gray', vmin=0, vmax=255)
        ax[1].axis('off')

        # plot boxes with text (iterate the argument, not a notebook global)
        for box, text, confidence in results:
            x, y = zip(*box)
            corners = list(zip(x, y))

            # Negative confidences are treated as zero.
            confidence = max(0, confidence)
            color = confidence_cmap(1-confidence/2.)

            outline = patches.Polygon(xy=corners, closed=True, fill=False, edgecolor='green', lw=1)
            filled = patches.Polygon(xy=corners, closed=True, fill=True, facecolor=color, edgecolor='green', lw=1)

            ax[0].add_patch(outline)
            ax[1].add_patch(filled)
            # Text starts at the first corner's x and is centred vertically in the box.
            ax[1].text(x[0], np.mean(y), text, color='black', fontsize=7, va="center")

        plt.tight_layout()
        plt.show()

        

In [None]:
# Language codes handed to the EasyOCR reader created below.
LANGUAGE = ['ru']
# Use CUDA when it is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Easy OCR

In [None]:
# Create the EasyOCR reader for the configured languages and wrap its
# detection and recognition entry points in the OCR helper class.
model = easyocr.Reader(LANGUAGE)
easyOCR = OCR(model.detect, model.recognize)

In [None]:
# Run text detection on the preprocessed page. The detector returns two
# collections; the cells below index both with [0] for this single image.
horizontal_boxes, free_boxes = easyOCR.detect(image)

In [None]:
# Sanity check on the first detected box only: show the crop and recognise it.
# `[:1]` yields at most one box, so nothing happens when no box was found.
for x1, x2, y1, y2 in horizontal_boxes[0][:1]:
    # A detector may report a box starting slightly outside the image; a
    # negative start index would wrap around in NumPy slicing and give an
    # empty or wrong crop, so clamp the start coordinates to 0.
    x1, y1 = max(0, x1), max(0, y1)
    crop = image[y1:y2, x1:x2]

    plt.imshow(crop, cmap='gray', vmin=0, vmax=255)
    plt.axis('off')

    result = easyOCR.recognize(crop)
    print(result, "\n")

In [None]:
# Recognise the whole page, reusing the boxes found by the detection step
# instead of detecting again.
result = easyOCR.recognize(image, horizontal_boxes[0], free_boxes[0])

In [None]:
# Draw the EasyOCR boxes and recognised text next to the page.
easyOCR.plot_results(result)

# Pytesseract OCR

In [None]:
# Detection is still done by the EasyOCR `model.detect`; only recognition is
# swapped for Tesseract through `pytesseract.image_to_data`.
ttOCR = OCR(model.detect, pytesseract.image_to_data)

In [None]:
# Detect the text boxes again through the Tesseract wrapper.
# NOTE(review): this uses the same detector and the same `image` as the
# EasyOCR section, so it presumably reproduces the same boxes - confirm
# whether the repeat is needed.
horizontal_boxes, free_boxes = ttOCR.detect(image)

In [None]:
# Fetch the Russian Tesseract language data and build the Tesseract CLI options.
config_url = "https://github.com/tesseract-ocr/tessdata/raw/main/rus.traineddata"
local_config_dir = "/kaggle/working/"
local_config_url = local_config_dir + "rus.traineddata"

if os.path.exists(local_config_url):
    # Re-running the cell must not download the model again.
    print("The config is already downloaded.")
else:
    # A timeout keeps the cell from hanging forever; an HTTP error stops the
    # notebook here, because every Tesseract cell below needs this file.
    response = requests.get(config_url, timeout=60)
    response.raise_for_status()

    # Write to a temporary name first so that an interrupted download never
    # leaves a truncated file that would later be mistaken for a cached one.
    partial_path = local_config_url + ".part"
    with open(partial_path, 'wb') as file:
        file.write(response.content)
    os.replace(partial_path, local_config_url)

    print("The config successfully downloaded.")

# Options passed to Tesseract on every call; --tessdata-dir points at the
# directory that holds the downloaded language file.
config = f'--oem 3 --psm 6 --tessdata-dir {local_config_dir}'

In [None]:
def get_text_and_confidence(raw_data):
    """Join the recognised words and average their confidences.

    Args:
        raw_data: Mapping with parallel sequences under ``'text'`` (words) and
            ``'conf'`` (per-entry confidence on a 0..100 scale, as numbers or
            numeric strings).

    Returns:
        ``(text, confidence)``: the non-blank words joined by single spaces
        (no leading space) and their mean confidence divided by 100.
        ``('', 0)`` when there are no words.
    """
    words = []
    confidences = []

    for word, conf in zip(raw_data['text'], raw_data['conf']):
        word = str(word)
        # Empty and whitespace-only entries carry no text and must not
        # influence the average confidence.
        if not word.strip():
            continue
        words.append(word)
        # Confidences may arrive as strings; normalise them to float.
        confidences.append(float(conf))

    text = ' '.join(words)
    confidence = sum(confidences) / len(confidences) / 100. if confidences else 0

    return text, confidence

In [None]:
# Sanity check on the first detected box only: show the crop and run Tesseract on it.
# `[:1]` yields at most one box, so nothing happens when no box was found.
for x1, x2, y1, y2 in horizontal_boxes[0][:1]:
    # A negative start index would wrap around in NumPy slicing and give an
    # empty or wrong crop, so clamp the start coordinates to 0.
    x1, y1 = max(0, x1), max(0, y1)
    crop = image[y1:y2, x1:x2]

    plt.imshow(crop, cmap='gray', vmin=0, vmax=255)
    plt.axis('off')

    result = ttOCR.recognize(crop, lang='rus', config=config, output_type=Output.DICT)

    text, confidence = get_text_and_confidence(result)
    print(text, confidence)

In [None]:
def clip(thresh, *args):
    """Clamp every value in ``args`` to the closed range ``[0, thresh - 1]``.

    Args:
        thresh: Exclusive upper limit; values above ``thresh - 1`` are lowered to it.
        *args: Numbers to clamp.

    Returns:
        A list with the clamped values, in the order they were given.
    """
    upper = thresh-1
    # Cap at the upper limit first, then lift negatives to zero.
    return [max(0, min(upper, num)) for num in args]

In [None]:
# Recognise every detected box with Tesseract and collect
# (corner points, text, confidence) tuples for plotting.
# NumPy reports the shape as (rows, columns), i.e. (height, width).
img_height, img_width = image.shape
result = []

for x1, x2, y1, y2 in tqdm(horizontal_boxes[0]):
    # Clamp the box to the image. Slice ends are exclusive, so the upper bound
    # is the full size (clamping to size - 1 would drop the last pixel).
    x1, x2 = max(0, x1), min(img_width, x2)
    y1, y2 = max(0, y1), min(img_height, y2)

    # A box that is empty after clamping has nothing to recognise.
    if x2 <= x1 or y2 <= y1:
        continue

    box_result = ttOCR.recognize(image[y1:y2, x1:x2], lang='rus', config=config, output_type=Output.DICT)
    text, confidence = get_text_and_confidence(box_result)

    result.append((
        [[x1, y1], [x2, y1], [x2, y2], [x1, y2]],
        text,
        confidence
    ))

In [None]:
# Draw the Tesseract results box by box next to the page.
ttOCR.plot_results(result)

In [None]:
# For comparison: run Tesseract once on the whole page, without the separate
# box-detection step. The cell displays the returned value.
pytesseract.image_to_string(image, lang='rus', config=config, output_type=Output.DICT)

# TrOCR family OCR from Hugging Face

In [None]:
# Load the TrOCR "large printed" checkpoint as an image-to-text pipeline.
# NOTE(review): no device is passed here although `device` is defined above -
# confirm whether the pipeline is meant to run on the GPU.
pipe = pipeline("image-to-text", model="microsoft/trocr-large-printed")

In [None]:
# Detection still comes from the EasyOCR `model.detect`; recognition is done
# by the Hugging Face pipeline.
pipeOCR = OCR(model.detect, pipe)

In [None]:
# Sanity check on the first detected box only: show the crop and run TrOCR on it.
# `[:1]` yields at most one box, so nothing happens when no box was found.
for x1, x2, y1, y2 in horizontal_boxes[0][:1]:
    # A negative start index would wrap around in NumPy slicing and give an
    # empty or wrong crop, so clamp the start coordinates to 0.
    x1, y1 = max(0, x1), max(0, y1)
    crop = image[y1:y2, x1:x2]

    plt.imshow(crop, cmap='gray', vmin=0, vmax=255)
    plt.axis('off')

    # The pipeline is fed a PIL image, so convert the grayscale crop back.
    pil_image = Image.fromarray(crop, 'L')
    # I haven't found Cyrillic transformers on Hugging Face
    result = pipeOCR.recognize(pil_image, max_new_tokens=40)
    print(result)