In [None]:
!pip uninstall cairosvg

### Extract data


In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Location of the combined KanjiVG XML file.
input_file = "data/kanjivg-20240807.xml"

# Load the document and keep a handle on its root element.
tree = ET.parse(input_file)
root = tree.getroot()

# Prefix -> URI map passed to the ElementTree find()/findall() calls below.
namespaces = dict(kvg="http://kanjivg.tagaini.net")

# Accumulator: extract_paths() appends one dict per <path> element to it.
data = list()

def extract_paths(kanji_id, element, g_element):
    """Collect every <path> beneath ``g_element`` into the module-level ``data`` list.

    The direct <path> children of ``g_element`` are recorded first; afterwards
    each direct <g> child is processed the same way, in document order.

    Args:
        kanji_id: Value stored in the "kanji_id" field of every record.
        element: Value stored in the "element" field of every record.
        g_element: The <g> element whose subtree is scanned.

    Returns:
        None. Records are appended to ``data`` as a side effect.
    """
    # ElementTree exposes the kvg:type attribute under its expanded {uri}name form.
    kvg_type_key = "{http://kanjivg.tagaini.net}type"

    data.extend(
        {
            "kanji_id": kanji_id,
            "element": element,
            "path_id": stroke.attrib.get("id", ""),
            "type": stroke.attrib.get(kvg_type_key, ""),
            "svg_path": stroke.attrib.get("d", ""),
        }
        for stroke in g_element.findall("path", namespaces)
    )

    # Descend into the nested groups.
    for child_group in g_element.findall("g", namespaces):
        extract_paths(kanji_id, element, child_group)

# Walk all <kanji> entries; each one contributes its paths via extract_paths().
for kanji in root.findall("kanji"):
    kanji_id = kanji.get("id", "")

    first_g = kanji.find("g", namespaces)
    if first_g is None:
        # No <g> child: nothing to extract for this entry.
        continue

    # The kvg:element attribute of the first group labels every record of this kanji.
    element = first_g.get("{http://kanjivg.tagaini.net}element", "")
    extract_paths(kanji_id, element, first_g)

# One row per collected <path> record.
df1 = pd.DataFrame(data)

# Number of distinct kanji ids, followed by a preview of the table.
print(len(df1["kanji_id"].unique()))
df1.head()


In [None]:
# Inspect the extracted path records of a single kanji.
# `df1` is queried because the merged `df` is only created in a later cell, so
# `df` is still undefined here after a fresh Restart & Run All.
df1[df1["kanji_id"] == "kvg:kanji_07f6b"]

In [None]:
from IPython.display import SVG, display

# Row label (in the per-path table `df1`) of the record used as a sample.
SAMPLE_ROW_LABEL = 29568

# `df1` is used because the merged `df` does not exist yet at this point of the
# notebook. The value gets its own name instead of `id` so the builtin id() is
# not shadowed for the rest of the kernel session.
sample_kanji_id = df1["kanji_id"].loc[SAMPLE_ROW_LABEL]

# Path of the SVG exported for that kanji (the files are written by the
# "Save to svg file" cell further down).
svg_file_path = f"data/kanji_svgs/{sample_kanji_id}.svg"

# Display the SVG
# display(SVG(svg_file_path), metadata={"width": "20px", "height": "20px"})


In [None]:
# Save to svg file

import os
import xml.etree.ElementTree as ET

# Export every <kanji> entry of the KanjiVG file as a standalone SVG document.
xml_file = "data/kanjivg-20240807.xml"  # source KanjiVG file
output_dir = "data/kanji_svgs"  # one "<kanji id>.svg" per entry is written here
os.makedirs(output_dir, exist_ok=True)

# Namespace URIs that go into the <svg> root tag.
svg_namespace = "http://www.w3.org/2000/svg"
xlink_namespace = "http://www.w3.org/1999/xlink"

# Tell ElementTree which prefixes to use for these URIs when serialising.
ET.register_namespace("", svg_namespace)
ET.register_namespace("xlink", xlink_namespace)

tree = ET.parse(xml_file)
root = tree.getroot()

# NOTE(review): ids look like "kvg:kanji_07f6b" (see the lookup cell above), so
# the generated file names contain ':' - confirm the target filesystem allows it.
for kanji in root.findall("kanji"):
    kanji_id = kanji.attrib['id']

    # Opening tag plus an inline stylesheet that draws paths as black outlines.
    opening_lines = [
        f'<svg xmlns="{svg_namespace}" xmlns:xlink="{xlink_namespace}" viewBox="0 0 109 109">',
        '<style>',
        'path { fill: none; stroke: black; stroke-width: 3; stroke-linecap: round; stroke-linejoin: round; }',
        '</style>',
    ]
    # Every direct child of <kanji> is serialised with all of its attributes.
    child_markup = [ET.tostring(child, encoding='unicode') for child in kanji]
    svg_content = opening_lines + child_markup + ["</svg>"]

    output_path = os.path.join(output_dir, f"{kanji_id}.svg")
    with open(output_path, "w", encoding="utf-8") as svg_file:
        svg_file.write("\n".join(svg_content))

print(f"SVG files saved in {output_dir}")


In [None]:
# Build a one-row-per-character table (df2) from the KANJIDIC2 XML file.

# Path to the input XML file
input_file = "data/kanjidic2.xml"

# Parse the XML file
tree = ET.parse(input_file)
root = tree.getroot()

# List to store extracted data
kanji_data = []

# Iterate through each <character> element
for character in root.findall("character"):
    # Meanings: only <meaning> elements that carry no attributes are kept.
    # Elements without text are skipped so the join below cannot hit None.
    meanings = [
        meaning.text
        for meaning in character.findall("reading_meaning/rmgroup/meaning")
        if not meaning.attrib and meaning.text
    ]

    # Readings, split by r_type.
    readings_on = [
        reading.text
        for reading in character.findall("reading_meaning/rmgroup/reading[@r_type='ja_on']")
        if reading.text
    ]
    readings_kun = [
        reading.text
        for reading in character.findall("reading_meaning/rmgroup/reading[@r_type='ja_kun']")
        if reading.text
    ]

    # findtext() returns the matched element's text, or the default when no
    # element matches. It must be used instead of `x.text if character.find(...)`:
    # an Element without children is falsy, so that test always chose "" for
    # these leaf elements and the columns came out empty.
    kanji_info = {
        "literal": character.findtext("literal", default=""),
        "ucs": character.findtext("codepoint/cp_value[@cp_type='ucs']", default=""),
        "jis208": character.findtext("codepoint/cp_value[@cp_type='jis208']", default=""),
        "grade": character.findtext("misc/grade", default=""),
        "stroke_count": character.findtext("misc/stroke_count", default=""),
        "frequency": character.findtext("misc/freq", default=""),
        "jlpt": character.findtext("misc/jlpt", default=""),
        "meanings": ", ".join(meanings),
        "readings_on": ", ".join(readings_on),
        "readings_kun": ", ".join(readings_kun),
    }
    kanji_data.append(kanji_info)

# Convert the list of dictionaries to a Pandas DataFrame
df2 = pd.DataFrame(kanji_data)

# info() prints its summary; head(20) comes last so the notebook actually
# renders it (only the final expression of a cell is displayed).
df2.info()
df2.head(20)

In [None]:
# Keep only the two identifying columns of the per-path table.
df1 = df1.loc[:, ["kanji_id", "element"]]

# Unique (kanji_id, element) pairs; "element" is renamed to the join key "literal".
df1_cleaned = df1.drop_duplicates().rename(columns={"element": "literal"})

# Inner join: only literals present in both tables survive.
df = df1_cleaned.merge(df2, how="inner", on="literal")
df.loc[:, ["kanji_id", "literal", "meanings"]]

In [None]:
# Convert svg into png
import os
import subprocess

# Rasterise every exported SVG to a 128x128 PNG with the external
# `rsvg-convert` command-line tool.

# Input and output directories
input_dir = "data/kanji_svgs"  # Directory containing SVG files
output_dir = "data/kanji_pngs"  # Directory for converted images
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): the later cells look for the PNGs under "data/dataset/images/",
# not under `output_dir` - confirm how the files are meant to get there.

# Names of the SVG files whose conversion exited with a non-zero status.
failed_conversions = []

# Sorted so the processing (and log) order is stable between runs.
for svg_file in sorted(os.listdir(input_dir)):
    if not svg_file.endswith(".svg"):
        continue

    input_path = os.path.join(input_dir, svg_file)
    output_path = os.path.join(output_dir, os.path.splitext(svg_file)[0] + ".png")

    # Argument list (no shell). stderr is captured so a failure can be reported
    # next to the file that caused it.
    result = subprocess.run(
        ["rsvg-convert", "-w", "128", "-h", "128", input_path, "-o", output_path],
        stderr=subprocess.PIPE,
    )

    # Only claim success when the tool actually exited cleanly; a failing file
    # is recorded and the batch continues with the next one.
    if result.returncode == 0:
        print(f"Converted: {svg_file} -> {output_path}")
    else:
        failed_conversions.append(svg_file)
        reason = result.stderr.decode(errors="replace").strip()
        print(f"FAILED (exit {result.returncode}): {svg_file}: {reason}")

if failed_conversions:
    print(f"{len(failed_conversions)} file(s) could not be converted")

In [None]:
# Derive each image's relative path from its kanji id and expose the meanings
# column under the name "caption".
df = (
    df
    .assign(file_path="images/" + df["kanji_id"] + ".png")
    .rename(columns={"meanings": "caption"})
)
df.loc[:, ["file_path", "caption"]]


In [None]:
# Flag every row whose image is present under data/dataset/, then list the
# rows whose image is missing.
def _image_exists(relative_path):
    """Return True when ``relative_path`` exists below the data/dataset/ folder."""
    return os.path.exists("data/dataset/" + relative_path)


df["file_exists"] = df["file_path"].apply(_image_exists)
df.loc[~df["file_exists"]]

In [None]:
# Write the (file_path, caption) pairs as JSON Lines: one JSON object per row.
output_file = "data/dataset/captions.jsonl"

caption_records = df.loc[:, ["file_path", "caption"]]
caption_records.to_json(output_file, lines=True, orient="records")

print(f"DataFrame exported to {output_file}")

In [None]:
# Read the JSONL file back into a DataFrame as a round-trip check.
df_loaded = pd.read_json(output_file, orient="records", lines=True)

# Report the size, then let the notebook render a preview as a rich table
# (the last expression of a cell) instead of print()-ing the frame as text.
print(f"{len(df_loaded)} rows loaded from {output_file}")
df_loaded.head()


In [None]:
from IPython.display import Image, display

# Render one PNG from the dataset folder inline.
png_file_path = "data/dataset/images/kvg:kanji_09fa0.png"

sample_image = Image(filename=png_file_path)
display(sample_image)
