In [1]:
import xml.etree.ElementTree as ET
from PIL import Image, ImageFont, ImageDraw
import os
import json
import shutil

# Config
# The input paths default to the original author's machine. Set the
# KANJIDIC2_PATH / KANJI_FONT_PATH environment variables to run the notebook
# elsewhere without editing this cell.
## Get this from: https://www.edrdg.org/kanjidic/kanjidic2.xml.gz
kanjidic2_path = os.environ.get(
    "KANJIDIC2_PATH",
    "/Users/danyelkoca/Desktop/projects/kanji_png/kanjidic2.xml",
)

## Get this from: https://fonts.google.com/noto/specimen/Noto+Serif+JP
font_path = os.environ.get(
    "KANJI_FONT_PATH",
    "/Users/danyelkoca/Desktop/projects/kanji_png/Noto_Serif_JP/NotoSerifJP-VariableFont_wght.ttf",
)
output_root = "kanji_dataset"
image_dir = os.path.join(output_root, "images")
image_size = 128  # side length of the square output images, in pixels
font_size = int(image_size * 0.8)  # font size is 80% of the image side

# Load both inputs before touching the filesystem, so a missing font or
# dictionary file does not leave empty output folders behind.
font = ImageFont.truetype(font_path, font_size)

# Parse KANJIDIC2
tree = ET.parse(kanjidic2_path)
root = tree.getroot()

# Prepare folders
os.makedirs(image_dir, exist_ok=True)

# Extract kanji with frequency, stroke count, JLPT level, and meanings.
# Each kanji_list entry is a tuple: (freq, literal, meanings, stroke_count, jlpt).
def _optional_int(parent, path):
    """Return the integer text of the child element at `path`, or None if it is absent."""
    elem = parent.find(path)
    return int(elem.text) if elem is not None else None


kanji_list = []
for character in root.findall('character'):
    literal = character.findtext('literal')
    # An entry without a literal has nothing to render. The previous guard
    # (`meanings is not None`) was always true because `meanings` is a list.
    if not literal:
        continue

    # These <misc> children are optional; they stay None when missing.
    freq = _optional_int(character, 'misc/freq')
    stroke_count = _optional_int(character, 'misc/stroke_count')
    # Get JLPT level if available
    jlpt = _optional_int(character, 'misc/jlpt')

    # Keep only <meaning> elements that carry no m_lang attribute
    # (presumably the English glosses -- confirm against the KANJIDIC2 DTD),
    # and skip elements with no text so no None/empty entries are recorded.
    meanings = [
        m.text
        for m in character.findall('reading_meaning/rmgroup/meaning')
        if m.get('m_lang') is None and m.text
    ]

    kanji_list.append((freq, literal, meanings, stroke_count, jlpt))


# Render each kanji to its own PNG and collect one metadata record per image.
# Files are named after the entry's position in kanji_list (zero-padded to at
# least four digits).
metadata = []
for index, entry in enumerate(kanji_list):
    freq, kanji, meanings, stroke_count, jlpt = entry
    file_name = f"{index:04d}.png"

    # White 8-bit grayscale canvas with the glyph drawn in black.
    canvas = Image.new("L", (image_size, image_size), color=255)
    pen = ImageDraw.Draw(canvas)

    # Measure the glyph at the origin, then offset it so that its bounding
    # box ends up centered on the canvas.
    left, top, right, bottom = pen.textbbox((0, 0), kanji, font=font)
    x = (image_size - (right - left)) / 2 - left
    y = (image_size - (bottom - top)) / 2 - top
    pen.text((x, y), kanji, fill=0, font=font)
    canvas.save(os.path.join(image_dir, file_name))

    # The "file" path is relative to the dataset root, not to the notebook.
    record = {
        "file": f"images/{file_name}",
        "kanji": kanji,
        "meanings": meanings,
        "stroke_count": stroke_count,
        "frequency": freq,
        "jlpt": jlpt,
    }
    metadata.append(record)

# Write metadata.json at the dataset root, alongside the images/ folder.
# ensure_ascii=False keeps the kanji readable in the file instead of \u escapes.
json_path = os.path.join(output_root, "metadata.json")
with open(json_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(metadata, ensure_ascii=False, indent=2))

# Pack the contents of the dataset folder into kanji_dataset.zip in the
# current working directory ...
shutil.make_archive(base_name='kanji_dataset', format='zip', root_dir=output_root)

# ... then delete the unpacked folder; only the archive is kept.
shutil.rmtree(output_root)

# Report how many kanji were rendered
print(f"Processed {len(metadata)} kanji.")

Processed 13108 kanji.


### Hugging Face Datasets
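- Added on 2025-06-11: re-exports the rendered kanji as a flat folder of PNGs plus a `metadata.csv` (`file_name`, `text` columns), intended for loading with Hugging Face Datasets. This cell reuses `root`, `font_path`, `font_size`, and `image_size` from the cell above, so run that cell first.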

In [7]:
import csv

# Output folder for this export. `font_path`, `font_size` and `root` are
# expected to exist already from the first cell of the notebook.
output_root = "kanji_png_128"
os.makedirs(output_root, exist_ok=True)
font = ImageFont.truetype(font_path, font_size)

# Extract kanji and meanings as (literal, [meaning, ...]) pairs
kanji_meanings_list = []
for character in root.findall("character"):
    literal = character.findtext("literal")
    if not literal:
        # Nothing to render for an entry without a literal.
        continue

    # Keep only <meaning> elements without an m_lang attribute. Elements with
    # no text are skipped: their `.text` is None, which would make the later
    # ", ".join(meanings) raise TypeError.
    meanings = [
        m.text
        for m in character.findall("reading_meaning/rmgroup/meaning")
        if m.get("m_lang") is None and m.text
    ]
    kanji_meanings_list.append((literal, meanings))

# Render each kanji to a PNG directly inside output_root and build the
# matching CSV rows: [file_name, comma-separated meanings].
csv_rows = []
for index, (kanji, meanings) in enumerate(kanji_meanings_list):
    file_name = f"{index:04d}.png"

    # White 8-bit grayscale canvas with the glyph drawn in black.
    canvas = Image.new("L", (image_size, image_size), color=255)
    pen = ImageDraw.Draw(canvas)

    # Measure the glyph at the origin, then offset it so that its bounding
    # box ends up centered on the canvas.
    left, top, right, bottom = pen.textbbox((0, 0), kanji, font=font)
    x = (image_size - (right - left)) / 2 - left
    y = (image_size - (bottom - top)) / 2 - top
    pen.text((x, y), kanji, fill=0, font=font)
    canvas.save(os.path.join(output_root, file_name))

    # The CSV lives in the same folder as the images, so the bare file name
    # is already the correct relative path.
    csv_rows.append([file_name, ", ".join(meanings)])

# Write metadata.csv next to the images: a header row followed by one row per
# image. newline="" lets the csv module control line endings itself.
csv_path = os.path.join(output_root, "metadata.csv")
with open(csv_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerows([["file_name", "text"], *csv_rows])

# Pack the folder contents into kanji_png_128.zip in the current working
# directory, then delete the unpacked folder so only the archive remains.
shutil.make_archive(base_name="kanji_png_128", format="zip", root_dir=output_root)
shutil.rmtree(output_root)

print(f"Processed {len(csv_rows)} kanji.")

Processed 13108 kanji.
