In [1]:
# Install Pillow (PIL), lmdb and headless OpenCV, which later cells import.
# NOTE(review): imgaug is installed here but is not imported by any cell shown in this notebook.
!pip install pillow
!pip install lmdb opencv-python-headless imgaug --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.8/297.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m948.0/948.0 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import cv2
import lmdb
import random
import numpy as np
from tqdm.auto import tqdm
from PIL import Image, ImageDraw, ImageFont
import io
import pickle
import pandas as pd

In [5]:
# Font size typical of receipt text; can be tweaked
FONT_SIZE = 28
PADDING = 10  # pixel padding around the text

In [6]:
import os
import random
from tqdm.auto import tqdm
from PIL import Image, ImageDraw, ImageFont
import numpy as np


# Font folder and the list of font files (.ttf / .otf, case-insensitive) inside it
font_dir = "/kaggle/input/receipt-fonts/fonts"
receipt_fonts = []
for entry in os.listdir(font_dir):
    if entry.lower().endswith((".ttf", ".otf")):
        receipt_fonts.append(os.path.join(font_dir, entry))

# Root folder that receives one sub-folder of generated images per font
output_base_dir = "/kaggle/working/generated_receipt_images"
os.makedirs(output_base_dir, exist_ok=True)

def generate_image(text, font_path, font_size=FONT_SIZE):
    """Render ``text`` as black text on a white RGB image with uniform padding.

    Args:
        text: The string to draw.
        font_path: Path to a TrueType/OpenType font file.
        font_size: Size passed to ``ImageFont.truetype`` (defaults to ``FONT_SIZE``).

    Returns:
        A ``PIL.Image.Image`` in mode ``"RGB"`` whose size is the measured text
        bounding box plus ``PADDING`` pixels on every side.
    """
    font = ImageFont.truetype(font_path, font_size)

    # Measure the text on a throwaway 1x1 canvas.
    probe = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    left, top, right, bottom = probe.textbbox((0, 0), text, font=font)
    text_width = right - left
    text_height = bottom - top

    img_width = text_width + 2 * PADDING
    img_height = text_height + 2 * PADDING
    image = Image.new("RGB", (img_width, img_height), color="white")

    # The bbox of text anchored at (0, 0) generally does not start at (0, 0):
    # `top` (and sometimes `left`) is offset by the font's bearing. Subtracting
    # that offset puts the inked area exactly PADDING pixels from every edge;
    # drawing at (PADDING, PADDING) would push the glyphs down/right and eat
    # into (or overflow) the bottom/right margin.
    draw = ImageDraw.Draw(image)
    draw.text((PADDING - left, PADDING - top), text, font=font, fill="black")

    return image

def generate_images_per_font(font_path, all_texts, num_images=500):
    """Render randomly chosen texts with a single font and save them as PNG files.

    Args:
        font_path: Path to the font file; its base name (without extension)
            names the output sub-folder and prefixes every file name.
        all_texts: Sequence of strings to sample from (with replacement).
        num_images: How many images to generate for this font.

    Side effects:
        Creates ``<output_base_dir>/<font_name>/`` and writes
        ``<font_name>_0000.png`` ... into it; prints a short summary.
    """
    font_name, _ = os.path.splitext(os.path.basename(font_path))
    font_output_dir = os.path.join(output_base_dir, font_name)
    os.makedirs(font_output_dir, exist_ok=True)

    image_count = 0
    progress = tqdm(range(num_images), desc=f"Generating images for {font_name}")
    for index in progress:
        target_path = os.path.join(font_output_dir, f"{font_name}_{index:04d}.png")
        rendered = generate_image(random.choice(all_texts), font_path, FONT_SIZE)
        rendered.save(target_path)
        image_count = index + 1

    print(f"✅ Done! Total images generated: {image_count}")
    print(f"📁 Output per font: saved under {font_output_dir}/")

# Report how many font files were collected into receipt_fonts
print(f"Total fonts found: {len(receipt_fonts)}")


Total fonts found: 7


In [7]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

# === Step 1: Load Data ===
df = pd.read_csv("/kaggle/input/openfoodfacts-export-csv/openfoodfacts_export_csv.csv", 
                 on_bad_lines='skip', sep='\t', low_memory=True)

df = df[["product_name_nb", "generic_name_nb", "brands"]]

# Drop missing cells, then de-duplicate while keeping first-seen order.
# A set() would order the strings differently in every kernel session (str
# hashing is randomised per process), so the `[:MAX_WORDS]` slice below would
# pick a different subset on every run.
all_words = [str(value) for value in df.to_numpy().flatten() if pd.notna(value)]
all_words = list(dict.fromkeys(all_words))

# === Step 2: Generate Price Strings ===
number_strings = []

# ~90 % of the prices: one- or two-digit whole part, two-digit decimals
for _ in range(len(all_words) * 9 // 10):
    whole, cents = np.random.randint(1, 100, 2)
    number_strings.append(f"{whole},{str(cents).zfill(2)}")

# ~10 % of the prices: three-digit whole part, two-digit decimals
for _ in range(len(all_words) * 1 // 10):
    before = np.random.randint(100, 999)
    after = str(np.random.randint(1, 99)).zfill(2)
    number_strings.append(f"{before},{after}")

# === Step 3: Combine Text + Price + Label ===
MAX_WORDS = 1000       # 1000 words is enough
PRICES_PER_WORD = 5    # price variants generated for each word

# Dummy quad bbox (top-left, top-right, bottom-right, bottom-left); size can be adjusted
bbox = [[0, 0], [100, 0], [100, 30], [0, 30]]

# random.sample raises ValueError if asked for more items than exist, which
# happens when the word list (and therefore the price list) is very small.
prices_per_word = min(PRICES_PER_WORD, len(number_strings))

all_combinations = []

for idx, word in enumerate(tqdm(all_words[:MAX_WORDS])):
    for price in random.sample(number_strings, prices_per_word):
        combined_string = f"{word}    {price}"

        # Dummy image path (can be replaced later when the images are generated)
        image_path = f"images/{idx}_{word.replace(' ', '_')}.jpg"

        # Label line format: <image path> TAB <text> TAB <bbox>
        label_line = f"{image_path}\t{combined_string}\t{bbox}"
        all_combinations.append(label_line)

# === Step 4: Save to label.txt ===
with open("label.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in all_combinations)

  df = pd.read_csv("/kaggle/input/openfoodfacts-export-csv/openfoodfacts_export_csv.csv",
100%|██████████| 1000/1000 [00:00<00:00, 65347.11it/s]


In [8]:
# Images rendered per font. The running total below uses the same constant so
# the summary cannot drift from what was actually requested.
IMAGES_PER_FONT = 500

# all_combinations holds label.txt lines of the form "path<TAB>text<TAB>bbox".
# Only the middle field is the receipt text that should be drawn; rendering the
# whole line would paint the dummy path and bbox into every image. Entries that
# are not in the three-field format are used as-is.
render_texts = []
for label_line in all_combinations:
    fields = label_line.split("\t")
    render_texts.append(fields[1] if len(fields) == 3 else label_line)

total_images_generated = 0
for font_path in receipt_fonts:
    generate_images_per_font(font_path, render_texts, num_images=IMAGES_PER_FONT)
    total_images_generated += IMAGES_PER_FONT

print(f"🎉 All fonts processed!")
print(f"📊 Total images generated overall: {total_images_generated}")
print(f"📁 Output base directory: {output_base_dir}/")

Generating images for Monaco: 100%|██████████| 500/500 [00:05<00:00, 87.03it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/Monaco/


Generating images for MerchantCopyDoublesize-jE7R: 100%|██████████| 500/500 [00:12<00:00, 39.90it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/MerchantCopyDoublesize-jE7R/


Generating images for DejaVuSans: 100%|██████████| 500/500 [00:04<00:00, 123.71it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/DejaVuSans/


Generating images for MerchantCopy-GOXq: 100%|██████████| 500/500 [00:07<00:00, 63.82it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/MerchantCopy-GOXq/


Generating images for MerchantCopyWide-z8m0: 100%|██████████| 500/500 [00:11<00:00, 44.15it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/MerchantCopyWide-z8m0/


Generating images for DOTMATRI: 100%|██████████| 500/500 [00:04<00:00, 110.93it/s]


✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/DOTMATRI/


Generating images for Epson Pixeled: 100%|██████████| 500/500 [00:04<00:00, 101.05it/s]

✅ Done! Total images generated: 500
📁 Output per font: saved under /kaggle/working/generated_receipt_images/Epson Pixeled/
🎉 All fonts processed!
📊 Total images generated overall: 3500
📁 Output base directory: /kaggle/working/generated_receipt_images/





In [9]:
import shutil

# Folder to archive and the path of the resulting zip file
folder_to_zip = "/kaggle/working/generated_receipt_images"
output_zip_path = "/kaggle/working/generated_receipt_images.zip"

# Create the zip from the folder; make_archive is given the target path without ".zip"
archive_base = output_zip_path.replace(".zip", "")
shutil.make_archive(archive_base, 'zip', root_dir=folder_to_zip)

print(f"✅ Folder telah di-zip ke: {output_zip_path}")

✅ Folder telah di-zip ke: /kaggle/working/generated_receipt_images.zip


In [10]:
# Clone the clovaai deep-text-recognition-benchmark repository into the current working directory.
# NOTE(review): no commit/tag is pinned, so a later run may fetch different code.
!git clone https://github.com/clovaai/deep-text-recognition-benchmark

Cloning into 'deep-text-recognition-benchmark'...
remote: Enumerating objects: 499, done.[K
remote: Counting objects: 100% (225/225), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 499 (delta 208), reused 200 (delta 200), pack-reused 274 (from 1)[K
Receiving objects: 100% (499/499), 3.05 MiB | 18.62 MiB/s, done.
Resolving deltas: 100% (308/308), done.


In [12]:
# Augmentation
def augment_image(pil_img):
    """Randomly degrade an image with noise, blur and a small affine warp.

    Each of the three augmentations is applied independently with
    probability 0.5.

    Args:
        pil_img: A ``PIL.Image.Image``; it is converted with ``np.array`` and
            is assumed to hold 8-bit pixel data.

    Returns:
        A new ``PIL.Image.Image`` built from the augmented array.
    """
    img = np.array(pil_img)

    # Gaussian noise (mean 0, sigma 15).
    # The noise is added in floating point and clipped afterwards. Casting the
    # signed noise to uint8 first would wrap negative values to ~250, and a
    # saturating add would then turn roughly half of the pixels white.
    if random.random() < 0.5:
        noise = np.random.normal(0, 15, img.shape)
        noisy = img.astype(np.float32) + noise
        img = np.clip(np.rint(noisy), 0, 255).astype(np.uint8)

    # Blur. A 1x1 kernel leaves the image unchanged, so only k == 3 actually blurs.
    if random.random() < 0.5:
        k = random.choice([1, 3])
        img = cv2.GaussianBlur(img, (k, k), 0)

    # Distortion: jitter three anchor points by -5..4 px and warp accordingly;
    # border pixels are replicated so no black wedges appear.
    if random.random() < 0.5:
        rows, cols = img.shape[:2]
        src = np.float32([[5, 5], [cols - 5, 5], [5, rows - 5]])
        dst = src + np.random.randint(-5, 5, src.shape).astype(np.float32)
        M = cv2.getAffineTransform(src, dst)
        img = cv2.warpAffine(img, M, (cols, rows), borderMode=cv2.BORDER_REPLICATE)

    return Image.fromarray(img)

In [13]:
# === SETUP ===
train_lmdb_dir = "/kaggle/working/deep-text-recognition-benchmark/lmdb_train"
val_lmdb_dir = "/kaggle/working/deep-text-recognition-benchmark/lmdb_val"

font_limit = 500
split_ratio = 0.9
train_count = int(font_limit * split_ratio)
val_count = font_limit - train_count

LMDB_MAP_SIZE = 1099511627776                   # 1 TiB upper bound for each environment
COMMIT_INTERVAL = 1000                          # commit the write transaction every N samples
MIN_CANVAS_SIZE = (400, 50)                     # canvas is only enlarged when the text would not fit
TEXT_ORIGIN = (5, 5)                            # top-left drawing position; also used as right/bottom margin
DUMMY_QUAD = "[[0,0],[100,0],[100,30],[0,30]]"  # placeholder bbox written to the label files


def _label_text(line):
    """Return the text field of a "path<TAB>text<TAB>bbox" label line (or the line itself)."""
    fields = line.split("\t")
    return fields[1] if len(fields) == 3 else line


def _render_sample(text, font):
    """Draw ``text`` on a white grayscale canvas large enough to hold it, then augment it."""
    _, _, right, bottom = font.getbbox(text)
    width = max(MIN_CANVAS_SIZE[0], TEXT_ORIGIN[0] + int(right) + TEXT_ORIGIN[0])
    height = max(MIN_CANVAS_SIZE[1], TEXT_ORIGIN[1] + int(bottom) + TEXT_ORIGIN[1])
    canvas = Image.new("L", (width, height), 255)
    ImageDraw.Draw(canvas).text(TEXT_ORIGIN, text, font=font, fill=0)
    return augment_image(canvas)


def _put_sample(txn, sample_id, text, img):
    """Store one PNG-encoded image and its label in ``txn``; return the matching label-file line."""
    png_buffer = io.BytesIO()
    img.save(png_buffer, format='PNG')
    txn.put(f'image-{sample_id:09d}'.encode(), png_buffer.getvalue())
    txn.put(f'label-{sample_id:09d}'.encode(), text.encode())
    return f"image-{sample_id:09d}.png\t{text}\t{DUMMY_QUAD}\n"


env_train = lmdb.open(train_lmdb_dir, map_size=LMDB_MAP_SIZE)
env_val = lmdb.open(val_lmdb_dir, map_size=LMDB_MAP_SIZE)

# Running sample counters. They are incremented *before* each write so keys
# run from 1 to num-samples, which is the range the LMDB reader of
# deep-text-recognition-benchmark looks up (index + 1). Starting at key 0 would
# leave one record unread and the last expected key missing.
img_id_train = 0
img_id_val = 0

label_train_lines = []
label_val_lines = []

try:
    txn_train = env_train.begin(write=True)
    txn_val = env_val.begin(write=True)

    for font_path in tqdm(receipt_fonts):
        font = ImageFont.truetype(font_path, FONT_SIZE)
        samples = []

        for _ in range(font_limit):
            # all_combinations holds label.txt lines; only the text field is
            # rendered and stored as the label.
            text = _label_text(random.choice(all_combinations))
            samples.append((text, _render_sample(text, font)))

        random.shuffle(samples)
        train_samples = samples[:train_count]
        val_samples = samples[train_count:]

        # === Save to TRAIN ===
        for text, img in train_samples:
            img_id_train += 1
            label_train_lines.append(_put_sample(txn_train, img_id_train, text, img))
            if img_id_train % COMMIT_INTERVAL == 0:
                txn_train.commit()
                txn_train = env_train.begin(write=True)

        # === Save to VAL ===
        for text, img in val_samples:
            img_id_val += 1
            label_val_lines.append(_put_sample(txn_val, img_id_val, text, img))
            if img_id_val % COMMIT_INTERVAL == 0:
                txn_val.commit()
                txn_val = env_val.begin(write=True)

    # === Final commit + store the sample counts ===
    txn_train.put('num-samples'.encode(), str(img_id_train).encode())
    txn_val.put('num-samples'.encode(), str(img_id_val).encode())
    txn_train.commit()
    txn_val.commit()
finally:
    # Always release the environments, even if rendering or writing failed.
    env_train.close()
    env_val.close()

print(f"✅ Done. Total: {img_id_train} train images written to LMDB.")
print(f"✅ Done. Total: {img_id_val} val images written to LMDB.")

# === Save the label files ===
with open("label_train.txt", "w", encoding="utf-8") as f:
    f.writelines(label_train_lines)

with open("label_val.txt", "w", encoding="utf-8") as f:
    f.writelines(label_val_lines)


100%|██████████| 7/7 [00:30<00:00,  4.37s/it]

✅ Done. Total: 3150 train images written to LMDB.
✅ Done. Total: 350 val images written to LMDB.





# FINETUNING

In [74]:
import os

# Experiment folder inside the cloned repo and the dataset log file within it
exp_name = "TPS-ResNet-BiLSTM-Attn-Seed1111"
exp_dir = os.path.join("/kaggle/working/deep-text-recognition-benchmark", exp_name)
log_path = os.path.join(exp_dir, "log_dataset.txt")

# Create the folder if it does not exist yet
os.makedirs(exp_dir, exist_ok=True)

# Create an empty log_dataset.txt if there is none yet
# (an initial message could be written here instead)
log_missing = not os.path.exists(log_path)
if log_missing:
    open(log_path, "w").close()


In [153]:
# Fine-tune starting from TPS-ResNet-BiLSTM-Attn.pth, using the lmdb_train / lmdb_val
# folders located inside the cloned repo directory.
# NOTE(review): this cell (In [153]) appears above the cells that rewrite dataset.py (In [151])
# and copy the checkpoint, utils.py and train.py (In [56], [136], [132]); on a top-to-bottom
# run those cells must be executed first.
# NOTE(review): --num_iter 100 with --batch_size 2 is a very short run — presumably a smoke test; confirm.
%cd /kaggle/working/deep-text-recognition-benchmark/

!python train.py \
  --train_data lmdb_train \
  --valid_data lmdb_val \
  --select_data "" \
  --batch_ratio 1.0 \
  --Transformation TPS \
  --FeatureExtraction ResNet \
  --SequenceModeling BiLSTM \
  --Prediction Attn \
  --batch_size 2 \
  --data_filtering_off \
  --workers 0 \
  --batch_max_length 300 \
  --num_iter 100 \
  --valInterval 20 \
  --saved_model TPS-ResNet-BiLSTM-Attn.pth

/kaggle/working/deep-text-recognition-benchmark
------ Use multi-GPU setting ------
if you stuck too long time with multi-GPU setting, try to set --workers 0
 dataset length: 3500
--------------------------------------------------------------------------------
model input parameters 32 100 20 1 512 256 38 300 TPS ResNet BiLSTM Attn
Skip Transformation.LocalizationNetwork.localization_fc2.weight as it is already initialized
Skip Transformation.LocalizationNetwork.localization_fc2.bias as it is already initialized
loading pretrained model from TPS-ResNet-BiLSTM-Attn.pth
Model:
DataParallel(
  (module): Model(
    (Transformation): TPS_SpatialTransformerNetwork(
      (LocalizationNetwork): LocalizationNetwork(
        (conv): Sequential(
          (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): MaxPool2d(kernel_s

In [151]:
# Overwrite the cloned repo's dataset.py with a trimmed-down LMDB-only version.
#
# Fixes inside the generated module compared with the previous text:
#   * Batch_Balanced_Dataset.get_batch flattened the per-loader label tuples with
#     sum(..., []), which raises TypeError (list + tuple); it now flattens explicitly.
#   * LmdbDataset.__getitem__ walks forward with a loop instead of recursing, also
#     tries the last key (n_samples), and skips records whose label is missing
#     instead of crashing on None.decode().
#   * The dataset log file is opened with a context manager so it is always closed.

dataset_py_code = """
import os
import sys
import six
import random
import string
import warnings
from itertools import accumulate

import lmdb
import numpy as np
from PIL import Image
import torch
import torch.utils.data as data
from torchvision import transforms

class ResizeNormalize(object):
    def __init__(self, size, interpolation=Image.BICUBIC):
        self.size = size
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        img = img.resize(self.size, self.interpolation)
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        return img

class AlignCollate:
    def __init__(self, imgH=32, imgW=100, keep_ratio_with_pad=False, min_ratio=1):
        self.imgH = imgH
        self.imgW = imgW
        self.keep_ratio_with_pad = keep_ratio_with_pad
        self.min_ratio = min_ratio

    def __call__(self, batch):
        # Filter None (yang muncul karena missing image)
        batch = [b for b in batch if b is not None]
        if len(batch) == 0:
            return None, None  # Bisa juga raise error atau handle sesuai kebutuhan

        images, labels = zip(*batch)

        if self.keep_ratio_with_pad:
            ratios = []
            for image in images:
                w, h = image.size
                ratios.append(w / float(h))
            ratios.sort()
            max_ratio = ratios[-1]
            imgW = int(np.floor(max_ratio * self.imgH))
            imgW = max(self.imgH * self.min_ratio, imgW)
        else:
            imgW = self.imgW

        transform = ResizeNormalize((imgW, self.imgH))
        images = [transform(image) for image in images]
        images = torch.stack(images, 0)

        return images, labels

class LmdbDataset(data.Dataset):
    def __init__(self, root, transform=None):
        self.env = lmdb.open(
            root,
            max_readers=1,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False
        )

        if not self.env:
            print(f'Cannot open LMDB from {root}')
            sys.exit(0)

        with self.env.begin(write=False) as txn:
            n_samples = int(txn.get('num-samples'.encode()).decode())
            self.n_samples = n_samples

        self.transform = transform

    def __len__(self):
        return self.n_samples

    def __getitem__(self, index):
        assert index < len(self), 'Index range error'  # changed to < len, since index is zero-based

        with self.env.begin(write=False) as txn:
            # LMDB keys start from 1, so the record for `index` lives at index + 1.
            # If that record is missing or unreadable, fall forward to the next
            # key (up to and including key n_samples).
            for lmdb_index in range(index + 1, self.n_samples + 1):
                img_key = f'image-{lmdb_index:09d}'.encode()
                imgbuf = txn.get(img_key)

                if imgbuf is None:
                    print(f'[ERROR] Image buffer is None for key: {img_key.decode()}')
                    continue

                buf = six.BytesIO()
                buf.write(imgbuf)
                buf.seek(0)

                try:
                    img = Image.open(buf).convert('L')
                except IOError:
                    print(f'[ERROR] Corrupted image at key: {img_key.decode()}')
                    continue

                label_key = f'label-{lmdb_index:09d}'.encode()
                label_buf = txn.get(label_key)
                if label_buf is None:
                    print(f'[ERROR] Label is None for key: {label_key.decode()}')
                    continue
                label = label_buf.decode('utf-8')

                if self.transform:
                    img = self.transform(img)

                return (img, label)

        return None  # no readable sample at or after this index

def hierarchical_dataset(root, opt=None):
    dataset = LmdbDataset(root)
    log_msg = f"{root} dataset length: {len(dataset)}"
    return dataset, log_msg

class Batch_Balanced_Dataset:
    def __init__(self, opt):
        os.makedirs(opt.exp_name, exist_ok=True)
        align_collate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD)
        self.data_loader = {}
        self.dataloader_iter = {}
        batch_size = opt.batch_size

        select_data = opt.select_data
        batch_ratio = opt.batch_ratio
        assert len(select_data) == len(batch_ratio)

        # The context manager closes the log even if a dataset fails to open.
        with open(f'{opt.exp_name}/log_dataset.txt', 'a') as log:
            for selected_d, ratio in zip(select_data, batch_ratio):
                _path = os.path.join(opt.train_data, selected_d)
                dataset = LmdbDataset(_path)
                print(f'{selected_d} dataset length: {len(dataset)}')
                log.write(f'{selected_d} dataset length: {len(dataset)}\\n')
                batch_size_ratio = max(round(batch_size * float(ratio)), 1)

                data_loader = torch.utils.data.DataLoader(
                    dataset,
                    batch_size=batch_size_ratio,
                    shuffle=True,
                    num_workers=int(opt.workers),
                    collate_fn=align_collate,
                    pin_memory=True
                )

                self.data_loader[selected_d] = data_loader
                self.dataloader_iter[selected_d] = iter(data_loader)

    def get_batch(self):
        balanced_batch = []
        for selected_d in self.data_loader:
            try:
                data = next(self.dataloader_iter[selected_d])
            except StopIteration:
                self.dataloader_iter[selected_d] = iter(self.data_loader[selected_d])
                data = next(self.dataloader_iter[selected_d])
            balanced_batch.append(data)
        image = torch.cat([x[0] for x in balanced_batch], 0)
        # AlignCollate returns the labels of a batch as a tuple, so they are
        # flattened explicitly into one list.
        label = [text for x in balanced_batch for text in x[1]]
        return image, label
"""

# Save to dataset.py
with open("/kaggle/working/deep-text-recognition-benchmark/dataset.py", "w") as f:
    f.write(dataset_py_code)

print("✅ dataset.py berhasil ditulis ulang dengan modifikasi.")

✅ dataset.py berhasil ditulis ulang dengan modifikasi.


In [56]:
# Copy the TPS-ResNet-BiLSTM-Attn.pth checkpoint from the Kaggle input folder into the cloned repo directory
!cp /kaggle/input/ocr/other/default/1/TPS-ResNet-BiLSTM-Attn.pth /kaggle/working/deep-text-recognition-benchmark/TPS-ResNet-BiLSTM-Attn.pth

In [136]:
# Overwrite the repo's utils.py with the copy from the "fix-bug" input folder.
# NOTE(review): presumably a patched version — what it changes is not visible here; confirm.
!cp /kaggle/input/fix-bug/utils.py /kaggle/working/deep-text-recognition-benchmark/utils.py

In [132]:
# Overwrite the repo's train.py with the copy from the "fix-bug" input folder.
# NOTE(review): presumably a patched version — what it changes is not visible here; confirm.
!cp /kaggle/input/fix-bug/train.py /kaggle/working/deep-text-recognition-benchmark/train.py

In [154]:
import shutil

# Folder to archive and the path of the resulting zip file
folder_to_zip = "/kaggle/working/deep-text-recognition-benchmark"
output_zip_path = "/kaggle/working/deep-text-recognition-benchmark.zip"

# Create the zip from the folder; make_archive is given the target path without ".zip"
archive_base = output_zip_path.replace(".zip", "")
shutil.make_archive(archive_base, 'zip', root_dir=folder_to_zip)

print(f"✅ Folder telah di-zip ke: {output_zip_path}")

✅ Folder telah di-zip ke: /kaggle/working/deep-text-recognition-benchmark.zip
