In [4]:
!pip install beautifulsoup4 lxml
!pip install Pillow



## Generating the text-line recognition dataset

In [1]:
import json
import os
import re

from bs4 import BeautifulSoup
from PIL import Image


# Source directory: handle_sample() reads sample_<n>.hocr and sample_<n>.jpg from here.
DATA_DIR = os.path.join(os.path.expanduser('~'), 'advanced/ocr')
# Destination root: cropped line images are saved into per-split sub-directories
# of this folder and the rec_gt_*.txt label files are written directly into it.
OUTPUT_DIR = os.path.join(os.path.expanduser('~'), 'datasets/ocr_lines/train_data/rec')


def parse_hocr_lines(hocr_path):
    """Collect text and bounding box for the line elements of an hOCR file.

    An element counts as a line when its ``id`` attribute starts with
    ``line``. Elements whose ``title`` attribute carries no ``bbox`` entry
    are left out of the result.

    Args:
        hocr_path: Path of the hOCR document; it is read as UTF-8.

    Returns:
        A list of dicts with the keys 'text' (the element's text pieces,
        stripped and joined with single spaces) and 'bbox'
        ([x1, y1, x2, y2] as ints).
    """
    with open(hocr_path, 'r', encoding='utf-8') as hocr_file:
        document = BeautifulSoup(hocr_file, 'lxml-xml')

    line_id_pattern = re.compile(r'^line')
    bbox_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')

    records = []
    for element in document.find_all(id=line_id_pattern):
        found = bbox_pattern.search(element.get('title', ''))
        if found is None:
            # No coordinates in the title attribute, so nothing to record.
            continue
        records.append({
            'text': element.get_text(separator=' ', strip=True),
            'bbox': [int(value) for value in found.groups()],
        })

    return records


def handle_sample(num, use):
    """Crop every parsed text line of one sample page and build its labels.

    Reads ``sample_<num>.hocr`` and ``sample_<num>.jpg`` from DATA_DIR, saves
    one JPEG crop per parsed line into ``OUTPUT_DIR/<use>/`` (created if
    missing) and returns the matching label entries.

    Args:
        num: Sample index used to build the ``sample_<num>`` file names.
        use: Name of the split sub-directory (the notebook passes 'train'
            or 'test').

    Returns:
        A list of strings, one per saved crop, each made of
        ``os.path.join(use, <crop file name>)``, a tab, and the line text.
    """
    text_file_lines = []

    sample_name = f"sample_{num}"

    hocr_path = os.path.join(DATA_DIR, f"{sample_name}.hocr")
    lines = parse_hocr_lines(hocr_path)

    image_path = os.path.join(DATA_DIR, f"{sample_name}.jpg")
    # The context manager releases the page image's file handle even when the
    # page has no lines to crop or a crop/save fails part-way through.
    with Image.open(image_path) as image:
        sub_output_dir = os.path.join(OUTPUT_DIR, use)
        os.makedirs(sub_output_dir, exist_ok=True)

        for idx, line in enumerate(lines):
            cropped_image = image.crop(line['bbox'])

            img_file_name = f"{sample_name}_line_{idx}.jpg"
            cropped_image.save(os.path.join(sub_output_dir, img_file_name))

            # Label entry: image path relative to OUTPUT_DIR, tab, transcription.
            text_file_lines.append(
                os.path.join(use, img_file_name) + "\t" + line['text']
            )

    return text_file_lines
    

In [8]:
import os

from tqdm import tqdm


# Number of directory entries in DATA_DIR divided by three
# (presumably three files per sample — TODO confirm).
data_dir_entries = os.listdir(DATA_DIR)
len(data_dir_entries) // 3

4500

In [9]:
def _export_rec_split(sample_ids, use, gt_file_name):
    """Crop the lines of the given samples and write the split's label file.

    Args:
        sample_ids: Iterable of sample indices handed to handle_sample.
        use: Sub-directory of OUTPUT_DIR that receives the crops.
        gt_file_name: Name of the label file created inside OUTPUT_DIR.

    Returns:
        The list of label entries that was written, in processing order.
    """
    label_lines = []
    for sample_id in tqdm(sample_ids):
        label_lines.extend(handle_sample(sample_id, use))

    # One tab-separated "<image path>\t<text>" entry per line, stored as UTF-8.
    with open(os.path.join(OUTPUT_DIR, gt_file_name), 'w', encoding='utf-8') as gt_file:
        gt_file.writelines(entry + '\n' for entry in label_lines)

    print(len(label_lines))
    return label_lines


# Samples 500-4499 go to the 'train' folder, samples 0-499 to the 'test' folder.
label_lines_train = _export_rec_split(range(500, 4500), 'train', "rec_gt_train.txt")
label_lines_val = _export_rec_split(range(500), 'test', "rec_gt_test.txt")

100%|██████████| 4000/4000 [20:03<00:00,  3.32it/s]


223730


100%|██████████| 500/500 [02:33<00:00,  3.26it/s]

28956





In [14]:
# Character count of the second tab-separated field (the transcription)
# of every validation label entry; the cell displays the largest one.
len_list = []
for label_entry in label_lines_val:
    transcription = label_entry.split('\t')[1]
    len_list.append(len(transcription))
max(len_list)

128

In [15]:
# Re-save one cropped training line as PNG for a quick visual check.
check_source_path = os.path.join(OUTPUT_DIR, 'train', 'sample_1002_line_2.jpg')
check_image = Image.open(check_source_path)
check_image.save('check.png')

In [3]:
txt_dir = './dataset/train_data/rec'

# Read both label files explicitly as UTF-8 (the encoding the export cell uses
# when it writes rec_gt_*.txt) instead of relying on the platform default.
lines = []
for gt_name in ('rec_gt_train.txt', 'rec_gt_test.txt'):
    with open(os.path.join(txt_dir, gt_name), 'r', encoding='utf-8') as file:
        # Remove only the newline: a bare rstrip() would also eat the tab
        # separator of an entry whose transcription is empty.
        lines.extend(line.rstrip('\n') for line in file)

# Strip '-' characters from both ends of every entry; since entries start with
# the image path this presumably targets hyphens ending the transcription.
lines = [line.strip('-') for line in lines]
len(lines)

252686

In [4]:
# Entries whose sample number is in [4000, 4100) form the test split;
# every other entry goes to the training split.
train, test = [], []
for line in lines:
    image_rel_path = line.split('\t')[0]
    # The sample number is the second '_'-separated token of the image path.
    sample_idx = int(image_rel_path.split('_')[1])
    bucket = test if 4000 <= sample_idx < 4100 else train
    bucket.append(line)

print(len(train), len(test))

246887 5799


In [5]:
import random

# Seeded generator so a fresh Restart & Run All reproduces the same order.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)
shuffle_rng.shuffle(train)
shuffle_rng.shuffle(test)

# Write explicitly as UTF-8: the entries contain OCR transcriptions, and the
# platform default encoding may be unable to represent them.
for split_file_name, split_lines in (("train_small.txt", train), ("test_small.txt", test)):
    with open(os.path.join(txt_dir, split_file_name), "w", encoding="utf-8") as f:
        f.writelines(entry + "\n" for entry in split_lines)

## Generating the text detection dataset

In [4]:
import json
import os
import re

from bs4 import BeautifulSoup
from PIL import Image
from tqdm import tqdm


# Source directory: handle_sample() reads sample_<n>.hocr from here.
DATA_DIR = os.path.join(os.path.expanduser('~'), 'advanced/ocr')
# NOTE: this rebinds OUTPUT_DIR, which the recognition cells earlier in the
# notebook set to a different directory; any cell run after this one sees
# the detection folder instead.
OUTPUT_DIR = os.path.join('./dataset', 'det')

# Create the folder up front: the det_gt_*.txt files are written straight into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)    


def parse_hocr_lines(hocr_path):
    """Build detection annotations for the line elements of an hOCR file.

    An element counts as a line when its ``id`` attribute starts with
    ``line``; elements without a ``bbox`` entry in their ``title`` attribute
    are skipped.

    Args:
        hocr_path: Path of the hOCR document; it is read as UTF-8.

    Returns:
        A list of dicts with the keys 'transcription' (the element's text
        pieces, stripped and joined with single spaces) and 'points' (four
        [x, y] corner points derived from the bbox values).
    """
    with open(hocr_path, 'r', encoding='utf-8') as hocr_file:
        document = BeautifulSoup(hocr_file, 'lxml-xml')

    annotations = []
    for element in document.find_all(id=re.compile(r'^line')):
        coords = re.search(r'bbox (\d+) (\d+) (\d+) (\d+)', element.get('title', ''))
        if coords is None:
            continue

        xa, ya, xb, yb = (int(value) for value in coords.groups())
        annotations.append({
            'transcription': element.get_text(separator=' ', strip=True),
            # Expand the two bbox corners into four points going around the rectangle.
            'points': [[xa, ya], [xb, ya], [xb, yb], [xa, yb]],
        })

    return annotations


def handle_sample(num):
    """Build the detection label entry for one sample page.

    Args:
        num: Sample index used to build the ``sample_<num>`` file names.

    Returns:
        A one-element list holding the image file name, a tab, and the JSON
        dump of the annotations parsed from the sample's hOCR file.
    """
    stem = f"sample_{num}"
    annotations = parse_hocr_lines(os.path.join(DATA_DIR, f"{stem}.hocr"))

    label_entry = f"{stem}.jpg" + "\t" + json.dumps(annotations)
    return [label_entry]


def _export_det_split(sample_ids, gt_file_name):
    """Collect the detection label entries of the given samples and write them.

    Args:
        sample_ids: Iterable of sample indices handed to handle_sample.
        gt_file_name: Name of the label file created inside OUTPUT_DIR.

    Returns:
        The list of label entries that was written, in processing order.
    """
    entries = []
    for sample_id in tqdm(sample_ids):
        entries.extend(handle_sample(sample_id))

    # One "<image name>\t<JSON annotations>" entry per line, stored as UTF-8.
    with open(os.path.join(OUTPUT_DIR, gt_file_name), 'w', encoding='utf-8') as gt_file:
        gt_file.writelines(entry + '\n' for entry in entries)

    print(len(entries))
    return entries


# Samples 500-4499 are written to the train label file, samples 0-499 to the test one.
anno_train = _export_det_split(range(500, 4500), "det_gt_train.txt")
anno_val = _export_det_split(range(500), "det_gt_test.txt")
    

100%|██████████| 4000/4000 [08:48<00:00,  7.56it/s]


4000


100%|██████████| 500/500 [01:06<00:00,  7.48it/s]

500



