In [12]:
# !pip uninstall wikipedia-api
# !pip install wikipedia


Extracting Romanian sentences from Wikipedia and rendering them as synthetic handwriting images using several different fonts.

In [3]:
# import wikipedia
import re
import random

# wikipedia.set_lang("ro")

# Romanian Wikipedia article titles used as the text source.
# Titles carry no surrounding whitespace, so they are looked up and logged
# exactly as written.
topics = [
    "Inteligență artificială",
    "România",
    "Baze de date",
    "Algoritm",
    "Rețea neuronală",
    "Procesare de limbaj natural",
    "Învățare automată",
    "Învățare profundă",
    "Recunoaștere a vorbirii",
    "Recunoaștere a imaginilor",
    "Robotica",
    "Sisteme expert",
    "Securitate cibernetică",
    "Internetul lucrurilor",
    "Blockchain",
    "Big data",
]
len(topics)

16

In [None]:

# Download each topic's article text, split it into sentences, keep the ones
# of a suitable length, and write a random sample to propozitii_ro.txt.

# Imported here because the notebook-level import is commented out; without it
# this cell fails with NameError on a fresh kernel.
import wikipedia

# The topics are Romanian article titles, so query the Romanian edition.
wikipedia.set_lang("ro")

MAX_SENTENCES = 1000
SAMPLE_SEED = 42  # fixed so re-running the cell selects the same sentences

sentences = []

for topic in topics:
    try:
        try:
            content = wikipedia.page(topic).content
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous title: fall back to the first suggested page. This
            # lookup sits inside the outer try so that a failure here (no
            # options, missing page, network error) skips the topic instead
            # of aborting the whole loop.
            content = wikipedia.page(e.options[0]).content
    except Exception as e:
        print(f"Skip {topic}: {e}")
        continue

    # Split after '.', '!' or '?' when followed by whitespace.
    raw = re.split(r'(?<=[.!?])\s+', content)
    for s in raw:
        s = s.strip()
        # Keep medium-length fragments made only of printable characters;
        # anything with an embedded newline or tab is rejected by isprintable().
        if 30 < len(s) < 150 and all(c.isprintable() for c in s):
            sentences.append(s)

# A dedicated Random instance keeps the selection reproducible without
# touching the global random state.
selected = random.Random(SAMPLE_SEED).sample(
    sentences, k=min(MAX_SENTENCES, len(sentences))
)

with open("propozitii_ro.txt", "w", encoding="utf-8") as f:
    for s in selected:
        f.write(s + "\n")

print(f"I have extracted {len(selected)} sentences from Wikipedia.")


Generating handwriting images using different fonts.

In [3]:
from PIL import Image, ImageDraw, ImageFont
import pandas as pd
import os
import textwrap
import glob


# Render every sentence with every available font and save, per rendering,
# a PNG, a .txt file with the sentence, and one row in dataset_info.csv.

input_file = "propozitii_ro.txt"
font_dir = "fonts" # fonts that will be used to generate handwriting
output_dir = "handwriting_dataset"
os.makedirs(output_dir, exist_ok=True)

FONT_SIZE = 32           # point size passed to ImageFont.truetype
WRAP_WIDTH = 40          # maximum characters per rendered line
MARGIN = 10              # padding (px) around the text block
MIN_CANVAS = (800, 120)  # canvas never gets smaller than the original fixed size

# load sentences
with open(input_file, "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

# load fonts (sorted so the record order does not depend on the filesystem)
font_files = sorted(glob.glob(os.path.join(font_dir, "*.ttf")))
if not font_files:
    raise ValueError(" Did not find fonts in  'fonts/'")

# Load each font once instead of once per sentence; unreadable fonts are
# reported a single time and skipped.
loaded_fonts = []
for font_path in font_files:
    try:
        font = ImageFont.truetype(font_path, FONT_SIZE)
    except Exception as e:
        print(f"[WARN] Couldn't load font {font_path}: {e}")
        continue
    # splitext removes only the extension, so font files whose names contain
    # extra dots keep distinct output names.
    font_name = os.path.splitext(os.path.basename(font_path))[0]
    loaded_fonts.append((font_name, font))

# Throwaway drawing context used only to measure text.
measure = ImageDraw.Draw(Image.new("RGB", (1, 1)))

# generate images
records = []
for i, sentence in enumerate(sentences):
    wrapped = textwrap.fill(sentence, width=WRAP_WIDTH)
    for font_name, font in loaded_fonts:
        # Measure the wrapped text and grow the canvas when it would not fit.
        # A fixed 800x120 canvas clips sentences that wrap onto several lines,
        # leaving an image that no longer matches its label.
        _, _, right, bottom = measure.textbbox((MARGIN, MARGIN), wrapped, font=font)
        canvas_size = (
            max(MIN_CANVAS[0], int(right) + 1 + MARGIN),
            max(MIN_CANVAS[1], int(bottom) + 1 + MARGIN),
        )

        img = Image.new("RGB", canvas_size, "white")
        draw = ImageDraw.Draw(img)
        draw.text((MARGIN, MARGIN), wrapped, font=font, fill="black")

        base_name = f"line_{i}_{font_name}"
        img_path = os.path.join(output_dir, f"{base_name}.png")
        txt_path = os.path.join(output_dir, f"{base_name}.txt")

        img.save(img_path)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(sentence)

        records.append({"image": img_path, "text": sentence})

# Export a CSV pairing each image with its text. The explicit column list
# keeps the header even when no image was produced.
csv_path = os.path.join(output_dir, "dataset_info.csv")
pd.DataFrame(records, columns=["image", "text"]).to_csv(csv_path, index=False)

print(f"I have generated {len(records)} images.")
print(f"CSV: {csv_path}")


I have generated 11304 images.
CSV: handwriting_dataset\dataset_info.csv


In [4]:
from PIL import Image, ImageEnhance, ImageFilter
import os
import glob
import random

# Folders for the augmentation step: rendered images are read from input_dir
# and the augmented copies are written to output_dir.
input_dir, output_dir = "handwriting_dataset", "handwriting_dataset_augmented"
os.makedirs(output_dir, exist_ok=True)

# Every PNG directly inside input_dir gets augmented.
image_paths = glob.glob(os.path.join(input_dir, "*.png"))

def augment_image(img):
    """Return a randomly distorted version of ``img``.

    The distortions are applied in this order, all driven by the global
    ``random`` module:

    1. rotation by an angle drawn from [-5, 5], with the canvas expanded and
       the uncovered area filled with white;
    2. brightness scaling by a factor drawn from [0.85, 1.15];
    3. Gaussian blur with a radius drawn from [0.3, 1.0], applied when a
       uniform draw falls below 0.4;
    4. between 150 and 400 pixel writes at random positions, each black or
       white with equal chance.

    The noise colours are 3-tuples, so an RGB image is assumed (the caller in
    this notebook converts with ``.convert("RGB")`` first).
    """
    BLACK, WHITE = (0, 0, 0), (255, 255, 255)

    # Slight tilt.
    tilt = random.uniform(-5, 5)
    result = img.rotate(tilt, expand=True, fillcolor="white")

    # Brightness jitter.
    gain = random.uniform(0.85, 1.15)
    result = ImageEnhance.Brightness(result).enhance(gain)

    # Occasional soft blur.
    blur_wanted = random.random() < 0.4
    if blur_wanted:
        sigma = random.uniform(0.3, 1.0)
        result = result.filter(ImageFilter.GaussianBlur(radius=sigma))

    # Salt-and-pepper speckles written straight into the pixel buffer.
    width, height = result.size
    canvas = result.load()
    speckle_count = random.randint(150, 400)
    for _ in range(speckle_count):
        col = random.randint(0, width - 1)
        row = random.randint(0, height - 1)
        canvas[col, row] = BLACK if random.random() < 0.5 else WHITE

    return result

# Augment every rendered image and save it next to the others as
# "<original name>_aug.png" in output_dir.

# Fixed seed so that a fresh "Restart & Run All" reproduces the same
# augmented dataset instead of a different random one each time.
AUGMENT_SEED = 42
random.seed(AUGMENT_SEED)

# glob order depends on the filesystem; sorting pairs the seeded random
# stream with the same file on every run.
for path in sorted(image_paths):
    # Open inside a context manager so the source file is released as soon as
    # the pixels have been copied by convert().
    with Image.open(path) as source:
        img = source.convert("RGB")
    aug_img = augment_image(img)

    base_name = os.path.basename(path).replace(".png", "_aug.png")
    aug_img.save(os.path.join(output_dir, base_name))

print(f"Augmented images saved in: {output_dir}")


Augmented images saved in: handwriting_dataset_augmented


In [5]:
import shutil
import pandas as pd
import os

# Merge the rendered and the augmented images into one index (CSV) and one
# folder.
original_dir = "handwriting_dataset"
augmented_dir = "handwriting_dataset_augmented"
original_csv = os.path.join(original_dir, "dataset_info.csv")
output_csv = "handwriting_dataset_full.csv"

# Index of the rendered images, flagged as not augmented.
df_original = pd.read_csv(original_csv)
df_original["augmented"] = False

# The augmented index mirrors the original one: same text, with the image
# path redirected to the "_aug.png" file in the augmented folder.
df_augmented = df_original.assign(
    image=df_original["image"].map(
        lambda path: os.path.join(
            augmented_dir, os.path.basename(path).replace(".png", "_aug.png")
        )
    ),
    augmented=True,
)

# Stack both indexes and persist the result.
df_full = pd.concat([df_original, df_augmented], ignore_index=True)
df_full.to_csv(output_csv, index=False)

# Gather every referenced image into a single folder.
# NOTE(review): files already present in the combined folder are left as they
# are, so regenerated images are not picked up unless the folder is cleared.
combined_dir = "handwriting_dataset_ro"
os.makedirs(combined_dir, exist_ok=True)
for src in df_full["image"]:
    dst = os.path.join(combined_dir, os.path.basename(src))
    if os.path.exists(dst):
        continue
    shutil.copy2(src, dst)

print(f"Combined dataset saved in: {combined_dir}")
print(f"Combined dataset saved in: {output_csv}")
print(f"Total images: {len(df_full)}")


Combined dataset saved in: handwriting_dataset_ro
Combined dataset saved in: handwriting_dataset_full.csv
Total images: 22608
