In [1]:
import random
from trdg.generators import GeneratorFromStrings
import multiprocessing as mp
import os
import uuid
from functools import partial

# python3 run.py -c 10 -w 5 -f 64 -l my

Missing modules for handwritten text generation.


In [2]:
# Sample Myanmar strings rendered by the demo cells of this notebook
strings = [
    "ဂရိဒဏ္ဍာရီ",
    "သည်",
    "ရှေးခေတ်ဂရိလူမျိုးများ",
    "မူလအနေဖြင့်",
    "ပြောဆိုခဲ့ကြသော",
    "ဒဏ္ဍာရီ",
]

# Code Snippet for Generating Images

## Generate Simple Images

In [3]:
# Render each sample string once, with no extra effects requested
generator = GeneratorFromStrings(
    strings=strings,
    language='my',
    count=len(strings)
)

# Create the output folder up front: saving into a missing folder fails
os.makedirs("images", exist_ok=True)

# Save every generated image under a random (UUID4) file name
for img, lbl in generator:
    img.save(f"images/{uuid.uuid4()}.jpg")

## Generate Skewed Images

In [4]:
# Render each sample string once with a random skew
generator = GeneratorFromStrings(
    strings=strings,
    language='my',
    count=len(strings),
    skewing_angle=5,  # skewing angle of the generated text, in positive degrees
    random_skew=True,  # randomize the skew between the angle above and its opposite
)

# Create the output folder up front: saving into a missing folder fails
os.makedirs("images", exist_ok=True)

# Save every generated image under a random (UUID4) file name
for img, lbl in generator:
    img.save(f"images/{uuid.uuid4()}.jpg")

## Generate Distortion Images

In [5]:
# Render each sample string once with a random distortion
generator = GeneratorFromStrings(
    strings=strings,
    language='my',
    count=len(strings),
    distorsion_type=3,  # 0: None (Default), 1: Sine wave, 2: Cosine wave, 3: Random
    # 0: Vertical (up and down), 1: Horizontal (left and right), 2: Both.
    # The orientation is drawn once, when this cell runs, and shared by all images.
    distorsion_orientation=random.randint(0, 2)
)

# Create the output folder up front: saving into a missing folder fails
os.makedirs("images", exist_ok=True)

# Save every generated image under a random (UUID4) file name
for img, lbl in generator:
    img.save(f"images/{uuid.uuid4()}.jpg")

## Generate Blur Images

In [6]:
# Render each sample string once with Gaussian blur
generator = GeneratorFromStrings(
    strings=strings,
    language='my',
    count=len(strings),
    # Gaussian blur value, drawn once from 0..4 inclusive when this cell runs
    blur=random.randint(0, 4),
    # NOTE(review): presumably makes trdg randomize the blur per image between
    # 0 and `blur` -- confirm against the trdg documentation.
    random_blur=True
)

# Create the output folder up front: saving into a missing folder fails
os.makedirs("images", exist_ok=True)

# Save every generated image under a random (UUID4) file name
for img, lbl in generator:
    img.save(f"images/{uuid.uuid4()}.jpg")

## Generate Images with Background

In [7]:
# Render each sample string once on a randomly chosen background type
generator = GeneratorFromStrings(
    strings=strings,
    language='my',
    count=len(strings),
    # gaussian noise (0), plain white (1), quasicrystal (2) or image (3);
    # drawn once, when this cell runs, and shared by all images
    background_type=random.randint(0, 3),
)

# Create the output folder up front: saving into a missing folder fails
os.makedirs("images", exist_ok=True)

# Save every generated image under a random (UUID4) file name
for img, lbl in generator:
    img.save(f"images/{uuid.uuid4()}.jpg")

# Generate Images

**Plan**
- Simple Images (All-2.5M)
- Skewed Images (100K)
- Distortion
    - 10K (distorsion_type: 1, distorsion_orientation: 0)
    - 10K (distorsion_type: 1, distorsion_orientation: 1)
    - 10K (distorsion_type: 1, distorsion_orientation: 2)
    - 10K (distorsion_type: 2, distorsion_orientation: 0)
    - 10K (distorsion_type: 2, distorsion_orientation: 1)
    - 10K (distorsion_type: 2, distorsion_orientation: 2)
    - 10K (distorsion_type: 3, distorsion_orientation: 0)
    - 10K (distorsion_type: 3, distorsion_orientation: 1)
    - 10K (distorsion_type: 3, distorsion_orientation: 2)
- Blur
    - 25K (blur: 0)
    - 25K (blur: 1)
    - 25K (blur: 2)
    - 25K (blur: 4)
- Background
    - 25K (background_type: 0)
    - 25K (background_type: 1)
    - 25K (background_type: 2)
    - 25K (background_type: 3)


## Multiprocessing Functions

In [8]:
import random

In [9]:
# Read the raw corpus, one string per line. UTF-8 is requested explicitly so
# decoding the Myanmar text does not depend on the platform's default encoding.
with open("data/my_corpus.txt", encoding="utf-8") as file:
    # Drop blank lines (e.g. the empty entry a trailing newline would produce)
    # so no empty string is later handed to the image generator.
    my_corpus = [line for line in file.read().split("\n") if line.strip()]

In [10]:
def shuffle_and_split(data, train_ratio=0.9, seed=None):
    """Shuffle a copy of ``data`` and split it into a training and a test part.

    Args:
        data: Items to split. The input is never modified; anything that
            ``list()`` accepts works (list, tuple, ...).
        train_ratio: Fraction of the items placed in the training set. The
            split index is ``int(len(data) * train_ratio)``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            does the shuffling, so the split is reproducible and the
            module-level RNG state is left untouched. When ``None`` the
            module-level ``random.shuffle`` is used.

    Returns:
        A ``(train_set, test_set)`` tuple of lists.
    """
    # list() copies the input so the caller's data keeps its original order
    shuffled_data = list(data)

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(shuffled_data)

    # Everything before the split index is training data, the rest is test data
    split_point = int(len(shuffled_data) * train_ratio)
    return shuffled_data[:split_point], shuffled_data[split_point:]

In [11]:
# Seed the RNG before splitting so that re-running the notebook reproduces the
# same train/test partition instead of a fresh random one each time.
random.seed(42)
my_corpus_train, my_corpus_test = shuffle_and_split(my_corpus)

In [12]:
# Sizes of the full corpus, the training split and the test split, in that order
print(*(len(part) for part in (my_corpus, my_corpus_train, my_corpus_test)))

2737935 2464141 273794


In [13]:
# Persist both splits, one string per line. UTF-8 is requested explicitly:
# the platform default encoding may be unable to encode Myanmar text.
with open("data/my_corpus_train.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(my_corpus_train))

with open("data/my_corpus_test.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(my_corpus_test))

In [13]:
# with open("data/my_corpus_test.txt", "w") as file:
#     file.write("\n".join(my_corpus[:100]))
#
# with open("data/my_corpus_test.txt") as file:
#     my_corpus = file.read().split("\n")

In [14]:
# Root folder of the generated dataset: the per-split image folders and the
# per-split annotation files (<dataset_type>.txt) are created underneath it.
root_dir="dataset"

In [15]:
def process_string(output_dir, dataset_type, param_dict, string):
    """Render one string to a JPEG file and append its annotation line.

    Args:
        output_dir: Directory the image file is written to.
        dataset_type: Name of the split. It selects the annotation file
            ``<root_dir>/<dataset_type>.txt`` and prefixes the image path
            stored in that file.
        param_dict: Extra keyword arguments forwarded to
            ``GeneratorFromStrings``.
        string: The text to render.

    Returns:
        ``(img_path, lbl)`` on success, ``(None, lbl)`` when the generator
        yields no image, or ``(None, string)`` when any step raises.
    """
    try:
        # The generator is built inside the try block so that a failure while
        # constructing it is reported like any other error, instead of
        # propagating out of the worker and aborting the whole pool.map() call.
        generator = GeneratorFromStrings(
            strings=[string],  # the generator expects a list of strings
            language='my',
            count=1,  # one image for this string
            **param_dict
        )

        img, lbl = next(generator)

        # Nothing to save or annotate when no image was produced
        if img is None:
            return None, lbl

        # Random (UUID4) file name so parallel workers do not collide
        img_filename = f"{uuid.uuid4()}.jpg"
        img_path = os.path.join(output_dir, img_filename)
        img.save(img_path)

        # Append "<split>/<file>\t<label>" as UTF-8 so Myanmar labels do not
        # depend on the platform default encoding.
        # NOTE(review): no lock guards this append although many worker
        # processes write to the same file; it relies on each short append
        # being written in one piece -- confirm this holds on the target OS.
        annotations_file = os.path.join(root_dir, f"{dataset_type}.txt")
        with open(annotations_file, 'a', encoding='utf-8') as f:
            f.write(f"{dataset_type}/{img_filename}\t{lbl}\n")

        return img_path, lbl
    except Exception as e:
        # Best effort: report the failure and let the remaining strings proceed
        print(f"Error processing string '{string}': {str(e)}")
        return None, string

def generate_images_parallel(corpus, output_dir, dataset_type, image_params, num_processes=None):
    """Render every string of ``corpus`` in parallel and print a summary.

    Args:
        corpus: Iterable of strings; each one is passed to ``process_string``.
        output_dir: Directory the image files are written to.
        dataset_type: Name of the split, forwarded to ``process_string``.
        image_params: Keyword arguments for the image generator, forwarded to
            ``process_string``.
        num_processes: Number of worker processes; defaults to
            ``mp.cpu_count()`` when ``None`` (or 0).

    Returns:
        None. The number of successfully generated images is printed.
    """
    # Bind the arguments shared by every task; only the string varies
    process_func = partial(process_string, output_dir, dataset_type, image_params)

    # The context manager tears the workers down even when map() raises, so a
    # failed run no longer leaves a pool of idle processes behind.
    with mp.Pool(processes=num_processes or mp.cpu_count()) as pool:
        results = pool.map(process_func, corpus)

        # On success, still shut the pool down gracefully as before
        pool.close()
        pool.join()

    # process_string returns (None, ...) for every string that produced no image
    successful = sum(1 for img_path, _ in results if img_path is not None)
    print(f"Generated {successful} images with annotations in {dataset_type}")



# Generate Train Images

In [17]:
# Everything generated from here on belongs to the training split
dataset_type = "train"
output_dir = root_dir + f"/{dataset_type}"
os.makedirs(root_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Path for annotations file
annotations_file = os.path.join(root_dir, f"{dataset_type}.txt")

# Truncate the annotations file so a re-run starts from an empty file
with open(annotations_file, 'w') as f:
    pass

# Load the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

## Simple Images

In [None]:
# Plain rendering: skew, distortion and blur all disabled, background_type 0
image_params = dict(
    skewing_angle=0, random_skew=False,
    distorsion_type=0, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Render every string of the loaded corpus
generate_images_parallel(my_corpus, output_dir, dataset_type, image_params)

## Skew Images

In [None]:
# Random skew within +/-5 degrees; distortion and blur disabled
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=0, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 100,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:100_000], output_dir, dataset_type, image_params)

## Distortion Images

### 1. (distorsion_type: 1, distorsion_orientation: 0)

In [None]:
# Sine-wave distortion (type 1), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 2. (distorsion_type: 1, distorsion_orientation: 1)

In [None]:
# Sine-wave distortion (type 1), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 3. (distorsion_type: 1, distorsion_orientation: 2)

In [None]:
# Sine-wave distortion (type 1), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 4. (distorsion_type: 2, distorsion_orientation: 0)

In [None]:
# Cosine-wave distortion (type 2), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 5. (distorsion_type: 2, distorsion_orientation: 1)

In [None]:
# Cosine-wave distortion (type 2), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 6. (distorsion_type: 2, distorsion_orientation: 2)

In [None]:
# Cosine-wave distortion (type 2), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 7. (distorsion_type: 3, distorsion_orientation: 0)

In [None]:
# Random distortion (type 3), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 8. (distorsion_type: 3, distorsion_orientation: 1)

In [None]:
# Random distortion (type 3), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

### 9. (distorsion_type: 3, distorsion_orientation: 2)

In [None]:
# Random distortion (type 3), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

## Blur Images

### 1. (blur: 0)

In [None]:
# Blur 0 (no blur); skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 0
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 2. (blur: 1)

In [None]:
# Blur 1; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 1,
    "random_blur": False,
    "background_type": 0
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 3. (blur: 2)

In [None]:
# Blur 2; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 2,
    "random_blur": False,
    "background_type": 0
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 4. (blur: 4)

In [None]:
# Blur 4; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 4,
    "random_blur": False,
    "background_type": 0
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

## Background Images

### 1. (background_type: 0)

In [None]:
# Gaussian-noise background (background_type 0); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 0
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 2. (background_type: 1)

In [None]:
# Plain-white background (background_type 1); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 1
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 3. (background_type: 2)

In [None]:
# Quasicrystal background (background_type 2); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 2
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

### 4. (background_type: 3)

In [None]:
# Image background (background_type 3); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 3
}

# Reload the training strings; UTF-8 is requested explicitly so decoding the
# Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_train.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 25,000 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:25_000], output_dir, dataset_type, image_params)

# Generate Test Images

In [15]:
# Everything generated from here on belongs to the test split
dataset_type = "test"
output_dir = root_dir + f"/{dataset_type}"
os.makedirs(root_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Path for annotations file
annotations_file = os.path.join(root_dir, f"{dataset_type}.txt")

# Truncate the annotations file so a re-run starts from an empty file
with open(annotations_file, 'w') as f:
    pass

# Load the held-out test strings; UTF-8 is requested explicitly so decoding
# the Myanmar text does not depend on the platform default encoding
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

## Simple Images

In [None]:
# Plain rendering: skew, distortion and blur all disabled, background_type 0
image_params = dict(
    skewing_angle=0, random_skew=False,
    distorsion_type=0, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Render every string of the loaded corpus
generate_images_parallel(my_corpus, output_dir, dataset_type, image_params)

## Skew Images

In [None]:
# Random skew within +/-5 degrees; distortion and blur disabled
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=0, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 10,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:10_000], output_dir, dataset_type, image_params)

## Distortion Images

### 1. (distorsion_type: 1, distorsion_orientation: 0)

In [None]:
# Sine-wave distortion (type 1), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 2. (distorsion_type: 1, distorsion_orientation: 1)

In [None]:
# Sine-wave distortion (type 1), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 3. (distorsion_type: 1, distorsion_orientation: 2)

In [None]:
# Sine-wave distortion (type 1), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=1, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 4. (distorsion_type: 2, distorsion_orientation: 0)

In [None]:
# Cosine-wave distortion (type 2), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 5. (distorsion_type: 2, distorsion_orientation: 1)

In [None]:
# Cosine-wave distortion (type 2), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 6. (distorsion_type: 2, distorsion_orientation: 2)

In [None]:
# Cosine-wave distortion (type 2), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=2, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 7. (distorsion_type: 3, distorsion_orientation: 0)

In [None]:
# Random distortion (type 3), vertical orientation (0).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=0,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 8. (distorsion_type: 3, distorsion_orientation: 1)

In [None]:
# Random distortion (type 3), horizontal orientation (1).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=1,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

### 9. (distorsion_type: 3, distorsion_orientation: 2)

In [None]:
# Random distortion (type 3), both orientations (2).
# NOTE(review): random skew stays enabled here although the plan lists only
# the distortion settings -- confirm this is intended.
image_params = dict(
    skewing_angle=5, random_skew=True,
    distorsion_type=3, distorsion_orientation=2,
    blur=0, random_blur=False,
    background_type=0,
)

# Shuffle the corpus in place, then render its first 1,000 strings
random.shuffle(my_corpus)
generate_images_parallel(my_corpus[:1_000], output_dir, dataset_type, image_params)

## Blur Images

### 1. (blur: 0)

In [None]:
# Blur 0 (no blur); skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 0
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 2. (blur: 1)

In [None]:
# Blur 1; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 1,
    "random_blur": False,
    "background_type": 0
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 3. (blur: 2)

In [None]:
# Blur 2; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 2,
    "random_blur": False,
    "background_type": 0
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 4. (blur: 4)

In [None]:
# Blur 4; skew and distortion disabled
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 4,
    "random_blur": False,
    "background_type": 0
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

## Background Images

### 1. (background_type: 0)

In [None]:
# Gaussian-noise background (background_type 0); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 0
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 2. (background_type: 1)

In [None]:
# Plain-white background (background_type 1); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 1
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 3. (background_type: 2)

In [None]:
# Quasicrystal background (background_type 2); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 2
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)

### 4. (background_type: 3)

In [None]:
# Image background (background_type 3); no skew, distortion or blur
image_params = {
    "skewing_angle": 0,
    "random_skew": False,
    "distorsion_type": 0,
    "distorsion_orientation": 0,
    "blur": 0,
    "random_blur": False,
    "background_type": 3
}

# Test images must come from the held-out test corpus. Reading
# data/my_corpus_train.txt here would put training strings into the test set.
with open("data/my_corpus_test.txt", encoding="utf-8") as file:
    my_corpus = file.read().split("\n")

# Shuffle in place, then render the first 2,500 strings
random.shuffle(my_corpus)

generate_images_parallel(my_corpus[:2_500], output_dir, dataset_type, image_params)