## Notebook for docTR dataset preparation

In [1]:
!pip install beautifulsoup4 lxml
!pip install Pillow



In [12]:
# WARNING: destructive. Permanently deletes the word dataset directory, i.e. the
# same path that OUTPUT_DIR is set to in the dependency/config cell below.
!rm -rf ~/datasets/ocr_words/

#### Load dependencies

In [1]:
import cv2
import json
import numpy as np
import os
import re

from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
from random import choice, randint, shuffle, random
from scipy.ndimage import gaussian_filter
from tqdm import tqdm


# Directory holding the provided OCR samples (read as sample_<n>.jpg / sample_<n>.hocr).
DATA_DIR = os.path.join(os.path.expanduser('~'), 'advanced/ocr')
# Root directory under which the word-recognition dataset is written.
OUTPUT_DIR = os.path.join(os.path.expanduser('~'), 'datasets/ocr_words')

# Create the output root up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

#### Preprocessing

In [26]:
def preprocess(img):
    """Whiten the light background of a grayscale image and median-filter it.

    An Otsu threshold is computed on a lightly blurred version of the image;
    every pixel brighter than that threshold is set to 255, and the result is
    passed through a 3x3 median filter.

    Args:
        img: Grayscale image array. Callers in this notebook pass 2-D uint8
            arrays (decoded with IMREAD_GRAYSCALE or built from 'L' images).

    Returns:
        A new array holding the processed image. The input array is not
        modified.
    """
    # Work on a copy so the caller's array is never altered in place.
    img = img.copy()

    # Mask to remove ghost text
    blurred = cv2.GaussianBlur(img, (3, 3), 0)
    # With THRESH_OTSU the first return value is the threshold that was chosen.
    thresh, _ = cv2.threshold(blurred, 0, 255, cv2.THRESH_OTSU)
    img[img > thresh] = 255

    # Make it flavourless: smooth what is left with a 3x3 median filter.
    img = cv2.medianBlur(img, 3)
    return img

def preprocess_bytes(image_bytes: bytes) -> np.ndarray:
    """Decode an encoded image from raw bytes as grayscale and preprocess it.

    Args:
        image_bytes: Contents of an encoded image file (e.g. the bytes of a
            ``.jpg`` read from disk).

    Returns:
        The decoded grayscale image after ``preprocess``.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    nparr = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imdecode reports undecodable data by returning None, not by raising.
        raise ValueError("image_bytes could not be decoded as an image")
    return preprocess(img)

## Recognition

### Crop images from provided OCR dataset

In [14]:
def parse_hocr_words(hocr_path):
    """Collect the text and bounding box of every word element in an hOCR file.

    Elements are selected by an ``id`` attribute starting with ``word``; the
    box is taken from the ``bbox x1 y1 x2 y2`` entry of the ``title`` attribute.
    Elements without such an entry are skipped.

    Args:
        hocr_path: Path of the hOCR file (read as UTF-8).

    Returns:
        A list of ``{'text': str, 'bbox': [x1, y1, x2, y2]}`` dicts in document
        order, with integer coordinates.
    """
    bbox_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')

    with open(hocr_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml-xml')

    entries = []
    for node in soup.find_all(id=re.compile(r'^word')):
        match = bbox_pattern.search(node.get('title', ''))
        if match is None:
            # No bounding box recorded for this element: nothing to crop.
            continue
        entries.append({
            'text': node.get_text(separator=' ', strip=True),
            'bbox': [int(value) for value in match.groups()],
        })

    return entries


def handle_sample(num, json_data, im_dir):
    """Crop every hOCR word of one sample out of its preprocessed page image.

    Reads ``sample_{num}.hocr`` and ``sample_{num}.jpg`` from ``DATA_DIR``,
    preprocesses the page, and saves one crop per word as
    ``sample_{num}_word_{idx}.jpg`` inside ``im_dir``.

    Args:
        num: Index of the sample to process.
        json_data: Dict that is updated in place with
            ``crop file name -> word text`` for every saved crop.
        im_dir: Directory the crops are written to.
    """
    sample_name = f"sample_{num}"

    hocr_path = os.path.join(DATA_DIR, f"{sample_name}.hocr")
    words = parse_hocr_words(hocr_path)

    image_path = os.path.join(DATA_DIR, f"{sample_name}.jpg")
    with open(image_path, 'rb') as f:
        img_b = f.read()

    # Crops are taken from the preprocessed page, not from the raw image.
    img_arr = preprocess_bytes(img_b)
    image = Image.fromarray(img_arr, mode="L")

    for idx, word in enumerate(words):
        img_file_name = f"{sample_name}_word_{idx}.jpg"
        output_path = os.path.join(im_dir, img_file_name)

        image.crop(word['bbox']).save(output_path)
        json_data[img_file_name] = word['text']

def prep_set(images_idxs, use):
    """Build the cropped-word split ``use`` from the given sample indices.

    Crops are written to ``<OUTPUT_DIR>/<use>/images`` and the shuffled
    ``file name -> text`` mapping to ``<OUTPUT_DIR>/<use>/labels_org.json``.

    Args:
        images_idxs: Iterable of sample indices to process.
        use: Name of the split sub-directory (e.g. ``'train'`` or ``'val'``).

    Returns:
        The shuffled label dict that was written to disk.
    """
    set_dir = os.path.join(OUTPUT_DIR, use)
    im_dir = os.path.join(set_dir, 'images')
    for directory in (set_dir, im_dir):
        os.makedirs(directory, exist_ok=True)

    labels = {}
    for sample_idx in tqdm(images_idxs):
        handle_sample(sample_idx, labels, im_dir)

    # Randomise the order of the entries before they are serialised.
    shuffled_items = list(labels.items())
    shuffle(shuffled_items)
    labels = dict(shuffled_items)

    labels_path = os.path.join(set_dir, 'labels_org.json')
    with open(labels_path, 'w', encoding='utf-8') as f:
        json.dump(labels, f, indent=4)

    return labels

In [11]:
# Sample indices listed in the issues file
# (the variable name suggests these are blurry pages; TODO confirm).
with open('../../test/issues.json', 'r') as f:
    blurry_ls = json.load(f)

# Pool of samples to crop: the first 324 listed indices plus samples 4200..4499,
# de-duplicated and put into random order.
dataset = set(blurry_ls[:324]) | set(range(4200, 4500))
datals = list(dataset)
shuffle(datals)

In [27]:
datals[:10]

[738, 4471, 4244, 771, 1007, 4434, 4402, 4363, 4460, 2870]

In [15]:
# The first 500 shuffled samples form the training split; the remainder is validation.
train_labels = prep_set(datals[:500], 'train')
print(len(train_labels))
val_labels = prep_set(datals[500:], 'val')
print(len(val_labels))

100%|██████████| 500/500 [03:41<00:00,  2.26it/s]


245385


100%|██████████| 100/100 [00:43<00:00,  2.28it/s]

50197





### Confirming Crops

In [33]:
train_labels["sample_1007_word_273.jpg"]

'swiftly,'

In [44]:
import shutil 

# Copy one generated validation image into the working directory as ./check.jpg.
fp = os.path.join(os.path.expanduser('~'), 'datasets/ocr_words/val/images/generated_1.jpg')

shutil.copyfile(fp, './check.jpg')

'./check.jpg'

In [8]:
# NOTE(review): the copy cell above writes './check.jpg' but this reads '../check.jpg'; confirm the intended path.
Image.open('../check.jpg').size

(158, 31)

### Dataset Generation (text generated by ChatGPT)

In [45]:
with open("Brainhack_OCR_GPT_data.txt", 'r') as f:
    data = f.read()

# Cleaning up some errors
# A capital 'I' directly after a lowercase letter is treated as a mistaken
# lowercase 'l'. The substitution is applied twice on purpose: within one pass
# the lookbehind sees the text as it was before that pass, so in a run such as
# "aII" only the first 'I' is replaced per pass.
data = re.sub(r'(?<=[a-z])I', 'l', data)
data = re.sub(r'(?<=[a-z])I', 'l', data)
data = data.replace('0ASIS', 'OASIS')
data = data.replace('Ieaming', 'learning')
data = data.replace('0RBIT', 'ORBIT')
data = data.replace('0rbit', 'Orbit')

# Words that start with 'l' but may appear with a leading capital 'I' instead.
# ('limitations' was previously misspelled 'limitaions', so "Iimitations" was
# never corrected.)
l_words = ['leaving', 'lone', 'logistics', 'long', 'landscape', 'location',
           'late', 'layers', 'limitations', 'level', 'latency', 'line', 'less']

for l_word in l_words:
    data = data.replace('I' + l_word[1:], l_word)

with open('cleaned.txt', 'w') as f:
    f.write(data)

In [23]:
def add_salt_and_pepper(arr, amount=0.05):
    """Return a copy of an image array with salt-and-pepper noise added.

    ``int(amount * rows * cols)`` random positions are set to 255 (salt) and
    then the same number of random positions are set to 0 (pepper). Positions
    are drawn independently, so a pixel can be hit more than once; the last
    write wins.

    Args:
        arr: Image array indexed as ``arr[row, col]``.
        amount: Fraction of the pixel count used for each noise type.

    Returns:
        A noisy copy; ``arr`` itself is left unchanged.
    """
    rows, cols = arr.shape[0], arr.shape[1]
    count = int(amount * (rows * cols))
    result = arr.copy()

    # Salt (white) is applied first, pepper (black) second.
    for value in (255, 0):
        for _ in range(count):
            r = randint(0, rows - 1)
            c = randint(0, cols - 1)
            result[r, c] = value

    return result

def lighten_words(arr, lighten_val=100):
    """Return a copy of a grayscale array with pure-black pixels lightened.

    Every pixel equal to 0 is replaced by ``lighten_val``; all other pixels,
    and the input array itself, are left unchanged.
    """
    result = arr.copy()
    black_mask = result == 0
    result[black_mask] = lighten_val
    return result

def load_words(use):
    """Load the whitespace-separated word list for a split.

    Reads ``cleaned_final_{use}.txt`` from the current working directory and
    prints how many words were found.

    Args:
        use: Split name inserted into the file name (e.g. ``'train'``).

    Returns:
        List of non-empty words in file order.
    """
    with open(f"cleaned_final_{use}.txt", 'r') as f:
        text = f.read()

    # str.split() without arguments splits on any run of whitespace and ignores
    # leading/trailing whitespace, so a trailing newline does not produce an
    # empty-string "word".
    text_ls = text.split()
    print(f"{len(text_ls)} words")
    return text_ls

def load_fonts(folder, sizes):
    """Load every ``.ttf`` font found in ``folder`` at each requested size.

    Args:
        folder: Directory to scan for font files (not searched recursively).
        sizes: Iterable of font sizes passed to ``ImageFont.truetype``.

    Returns:
        A list with one loaded font per (font file, size) combination; empty
        when the folder contains no ``.ttf`` files.
    """
    fonts = []
    # Honour the ``folder`` argument instead of a hard-coded 'fonts' directory.
    for f in os.listdir(folder):
        if not f.endswith('.ttf'):
            continue
        font_path = os.path.join(folder, f)
        for size in sizes:
            fonts.append(ImageFont.truetype(font_path, size=size))

    return fonts

In [None]:
# Output directory
# Re-assigns the module-level OUTPUT_DIR to the word dataset root (written here
# with a trailing slash) and makes sure it exists.
OUTPUT_DIR = os.path.join(os.path.expanduser('~'), 'datasets/ocr_words/')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load font
# `fonts` holds one loaded font per (font file, size) combination.
sizes = [25, 30, 35]
fonts = load_fonts('fonts', sizes)

def generate_set(use, num):
    """Render ``num`` synthetic word images for a split and write their labels.

    Words come from ``load_words(use)`` and are cycled through in order. Each
    word is drawn in a randomly chosen font from the module-level ``fonts`` on
    a tightly sized white canvas, then black pixels are lightened, the image is
    Gaussian-blurred and salt-and-pepper noise is added, each with a random
    strength. With probability 0.5 the result is also passed through
    ``preprocess``.

    Images are saved as ``<OUTPUT_DIR>/<use>/images/generated_<i>.jpg`` and the
    ``file name -> word`` mapping as ``<OUTPUT_DIR>/<use>/generated.json``.

    Args:
        use: Split name (sub-directory and word-list selector).
        num: Number of images to generate.

    Raises:
        ValueError: If images are requested but the word list is empty.
    """
    use_dir = os.path.join(OUTPUT_DIR, use)
    img_dir = os.path.join(use_dir, "images")
    os.makedirs(img_dir, exist_ok=True)

    # Load words
    words = load_words(use)
    words_len = len(words)
    if num > 0 and words_len == 0:
        # Fail with a clear message instead of a ZeroDivisionError in the loop.
        raise ValueError(f"no words available for split {use!r}")
    words_dict = {}

    # Dummy image to get tight bounding boxes. Measuring text does not depend
    # on the word being rendered, so one drawing context serves all iterations.
    measurer = ImageDraw.Draw(Image.new("RGB", (1, 1)))

    for i in tqdm(range(num)):
        word = words[i % words_len]
        font = choice(fonts)

        # Tight size based on actual text pixels
        bbox = measurer.textbbox((0, 0), word, font=font)
        text_width = bbox[2] - bbox[0]
        text_height = bbox[3] - bbox[1]

        # Small padding to avoid clipping
        padding = 2
        img_width = text_width + padding * 2
        img_height = text_height + padding * 2

        # Create base image ('L' mode = grayscale), black text on white
        img = Image.new("L", (img_width, img_height), color=255)
        draw = ImageDraw.Draw(img)
        # Offset by the bbox origin so the text starts exactly at the padding.
        draw.text((-bbox[0] + padding, -bbox[1] + padding), word, font=font, fill=0)

        # Augmentations:
        arr = np.array(img)

        lighten_val = randint(0, 180)
        blur_deg = randint(0, 4) / 2
        salt_pepper_amt = randint(0, 6) / 100

        arr = lighten_words(arr, lighten_val)
        arr = gaussian_filter(arr, sigma=blur_deg)
        arr = add_salt_and_pepper(arr, salt_pepper_amt)

        # Half of the samples (on average) also get the real-data preprocessing.
        if random() < 0.5:
            arr = preprocess(arr)

        fn = f"generated_{i}.jpg"
        Image.fromarray(arr, mode='L').save(os.path.join(img_dir, fn))
        words_dict[fn] = word

    with open(os.path.join(use_dir, 'generated.json'), 'w') as f:
        json.dump(words_dict, f, indent=2)
    print(f"GENERATED {num} images...")

In [36]:
# Generate 200,000 synthetic word images for the training split.
generate_set('train', 200000)

5683 words


100%|██████████| 200000/200000 [08:29<00:00, 392.52it/s]


GENERATED 200000 images...


In [37]:
# Generate 40,000 synthetic word images for the validation split.
generate_set('val', 40000)

944 words


100%|██████████| 40000/40000 [01:33<00:00, 427.91it/s]


GENERATED 40000 images...


### Combine Datasets

In [38]:
def shuffle_labels(use):
    """Merge the cropped and generated label files of a split into one file.

    Loads ``labels_org.json`` and ``generated.json`` from
    ``<OUTPUT_DIR>/<use>``, concatenates their entries, shuffles them and
    writes the result to ``labels.json`` in the same directory.

    Args:
        use: Split name (sub-directory of ``OUTPUT_DIR``).
    """
    split_dir = os.path.join(OUTPUT_DIR, use)

    merged = []
    # Original crops first, generated samples second; the order is then randomised.
    for name in ('labels_org.json', 'generated.json'):
        with open(os.path.join(split_dir, name), 'r') as f:
            merged.extend(json.load(f).items())

    shuffle(merged)

    with open(os.path.join(split_dir, 'labels.json'), 'w') as f:
        json.dump(dict(merged), f, indent=4)

In [39]:
# Write the combined, shuffled labels.json for both splits.
shuffle_labels('train')
shuffle_labels('val')

In [71]:
len(os.listdir(os.path.join(OUTPUT_DIR, 'val', 'images')))

42233

In [67]:
# Sanity check: print any sub-directories found inside the train image folder.
for x in os.listdir(os.path.join(OUTPUT_DIR, 'train', 'images')):
    if os.path.isdir(os.path.join(OUTPUT_DIR, 'train', 'images', x)):
        print(x)

In [70]:
# Count the entries in the combined validation label file.
with open(os.path.join(OUTPUT_DIR, 'val', 'labels.json'), 'r') as f:
    print(len(json.load(f)))

42233


## Detection

In [66]:
import hashlib

# NOTE: re-points the module-level OUTPUT_DIR at the detection dataset. Functions
# from earlier cells that read OUTPUT_DIR when called (prep_set, generate_set,
# shuffle_labels) will use this directory if they are re-run after this cell.
OUTPUT_DIR = os.path.join(os.path.expanduser('~'), 'datasets/ocr_detection')


def get_word_polygons(num):
    """Return the word boxes of sample ``num`` as 4-point polygons.

    The boxes are read from ``sample_{num}.hocr`` in ``DATA_DIR`` through
    ``parse_hocr_words``, so both dataset builders share one hOCR parser.

    Args:
        num: Index of the sample.

    Returns:
        A list of polygons, one per word with a bounding box, each given as
        ``[[x1, y1], [x2, y1], [x2, y2], [x1, y2]]`` (the four corners of the
        axis-aligned box).
    """
    hocr_path = os.path.join(DATA_DIR, f"sample_{num}.hocr")

    polygons = []
    # Loop through all words
    for word in parse_hocr_words(hocr_path):
        x1, y1, x2, y2 = word['bbox']
        polygons.append([[x1, y1], [x2, y1], [x2, y2], [x1, y2]])

    return polygons


def create_set(imgs, use):
    """Build the detection split ``use`` from the given sample indices.

    For every sample the page image is preprocessed and saved to
    ``<OUTPUT_DIR>/<use>/images``; its dimensions, the SHA-256 of the saved
    file and its word polygons are recorded. The shuffled records are written
    to ``<OUTPUT_DIR>/<use>/labels.json``.

    Args:
        imgs: Iterable of sample indices.
        use: Split name (sub-directory of ``OUTPUT_DIR``).
    """
    set_dir = os.path.join(OUTPUT_DIR, use)
    im_dir = os.path.join(set_dir, 'images')
    os.makedirs(im_dir, exist_ok=True)

    records = {}

    for sample_idx in tqdm(imgs):
        polygons = get_word_polygons(sample_idx)
        img_fn = f"sample_{sample_idx}.jpg"

        with open(os.path.join(DATA_DIR, img_fn), 'rb') as src:
            raw_bytes = src.read()

        page = Image.fromarray(preprocess_bytes(raw_bytes), mode='L')
        img_save_path = os.path.join(im_dir, img_fn)
        page.save(img_save_path)

        # The hash is taken over the file as written to disk, not the source bytes.
        with open(img_save_path, 'rb') as saved:
            digest = hashlib.sha256(saved.read()).hexdigest()

        records[img_fn] = {
            'img_dimensions': page.size,
            'img_hash': digest,
            'polygons': polygons
        }

    # Randomise the entry order before serialising.
    shuffled = list(records.items())
    shuffle(shuffled)
    records = dict(shuffled)

    with open(os.path.join(set_dir, 'labels.json'), 'w') as f:
        json.dump(records, f, indent=2)

    print(f"Created {len(records)} imgs")

In [67]:
# Detection training split: samples 0..3999.
create_set(list(range(4000)), 'train')

100%|██████████| 4000/4000 [26:08<00:00,  2.55it/s]


Created 4000 imgs


In [70]:
# Detection validation split: samples 4000..4499.
create_set(list(range(4000, 4500)), 'val')

100%|██████████| 500/500 [01:44<00:00,  4.80it/s]


Created 500 imgs


In [74]:
# Copy one page from the detection training images into the local images/ folder.
# Relies on `shutil` having been imported in an earlier cell.
x = "sample_200.jpg"
shutil.copyfile(os.path.join(OUTPUT_DIR, 'train', 'images', x), f"images/{x}")

'images/sample_200.jpg'