In [22]:
import os
from IPython import get_ipython

def configure_environment_paths():
    """Detect the hosting environment and configure data/output paths.

    Detection looks at which platform modules are already loaded in
    ``sys.modules``: ``google.colab`` selects Google Colab,
    ``kaggle_secrets`` selects Kaggle, and anything else is treated as a
    local run. The selected output directory is created if it is missing.

    Returns:
        tuple[str, str, str]: ``(base_data_path, base_output_path,
        environment_name)``, where ``environment_name`` is ``"colab"``,
        ``"kaggle"`` or ``"local"``.
    """
    # Imported locally because this cell does not import ``sys`` itself.
    # Previously a missing ``sys`` raised NameError, which was caught and
    # silently reported as a local session even on Colab/Kaggle.
    import sys

    if "google.colab" in sys.modules:
        print("✅ Environment: Google Colab")
        base_data_path = "/content/"
        base_output_path = "/content/"
        environment_name = "colab"
    elif 'kaggle_secrets' in sys.modules:
        print("✅ Environment: Kaggle")
        base_data_path = "/kaggle/input/"
        base_output_path = "/kaggle/working/"
        environment_name = "kaggle"
    else:
        print("⚠️ Environment: Local/Unknown")
        base_data_path = "./data/"
        base_output_path = "./output/"
        environment_name = "local"

    # Only the output location is created; the data location is expected
    # to be provided by the platform or the user.
    os.makedirs(base_output_path, exist_ok=True)
    print(f"📂 Data Path: {base_data_path}")
    print(f"📦 Output Path: {base_output_path}")
    return base_data_path, base_output_path, environment_name


INPUT_PATH, OUTPUT_PATH, ENV_NAME = configure_environment_paths()

✅ Environment: Google Colab
📂 Data Path: /content/
📦 Output Path: /content/


In [23]:
# @title Environment Setup
import os
import sys
# 1. Remove any MPLBACKEND override from this process's environment
#    (presumably so matplotlib chooses its own backend — TODO confirm).
if 'MPLBACKEND' in os.environ:
    del os.environ['MPLBACKEND']
    print("MPLBACKEND environment variable cleared.")

# 2. Clone the repository into OUTPUT_PATH.
#    `rm -rf` deletes any existing FontDiffusion checkout first, so the clone
#    always starts clean and any local edits inside that folder are lost.
%cd {OUTPUT_PATH}
!rm -rf FontDiffusion
!git clone https://github.com/dzungphieuluuky/FontDiffusion.git

/content
Cloning into 'FontDiffusion'...
remote: Enumerating objects: 15166, done.[K
remote: Counting objects: 100% (2979/2979), done.[K
remote: Compressing objects: 100% (2883/2883), done.[K
remote: Total 15166 (delta 150), reused 2905 (delta 94), pack-reused 12187 (from 3)[K
Receiving objects: 100% (15166/15166), 247.44 MiB | 26.63 MiB/s, done.
Resolving deltas: 100% (538/538), done.
Updating files: 100% (128/128), done.


In [2]:
# 1. Install the repository requirements and gdown.
#    The relative path FontDiffusion/requirements.txt is resolved against the
#    current working directory, so this must run after the clone cell.
# NOTE(review): the recorded output of this cell shows the requirements install
# failing to build `tokenizers==0.13.3` under Python 3.12.12 — the pins below
# may not be installable on that interpreter; confirm before relying on them.
!uv pip install --upgrade pip
!uv pip install -r FontDiffusion/requirements.txt
!uv pip install gdown
# 2. Install PyTorch 1.13
%cd {OUTPUT_PATH}
print("\n⬇️ Installing PyTorch 1.13 (Required for this model)...")
# Force reinstall torch 1.13 to match the model's training environment:
# remove whatever torch/torchvision is present, then install the cu117 builds
# from the PyTorch wheel index.
!uv pip uninstall torch torchvision
!uv pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117

# 3. Install other dependencies
print("\n⬇️ Installing Dependencies (Manually fixed)...")
# Install xformers compatible with Torch 1.13
!uv pip install xformers==0.0.16 -q

# Install original dependencies (pinned versions)
!uv pip install transformers==4.33.1 accelerate==0.23.0 diffusers==0.22.0
!uv pip install gradio==4.8.0 pyyaml pygame opencv-python info-nce-pytorch kornia
# -----------------------------------------------------------------
# Additional packages (lpips, scikit-image, pytorch-fid) and the dos2unix
# system tool.
!uv pip install lpips scikit-image pytorch-fid
!sudo apt-get update && sudo apt-get install dos2unix
print("\n✅ Environment setup complete. You can now proceed to Block 2 (Inference).")

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m1 package[0m [2min 40ms[0m[0m
[2mAudited [1m1 package[0m [2min 0.15ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m116 packages[0m [2min 70ms[0m[0m
[2K  [31m×[0m Failed to build `tokenizers==0.13.3`
[31m  ├─▶ [0mThe build backend returned an error
[31m  ╰─▶ [0mCall to `setuptools.build_meta.build_wheel` failed (exit status: 1)

[31m      [0m[31m[stdout][39m
[31m      [0mrunning bdist_wheel
[31m      [0mrunning build
[31m      [0mrunning build_py
[31m      [0mcopying py_src/tokenizers/__init__.py ->
[31m      [0mbuild/lib.linux-x86_64-cpython-312/tokenizers
[31m      [0mcopying py_src/tokenizers/models/__init__.py ->
[31m      [0mbuild/lib.linux-x86_64-cpython-312/tokenizers/models
[31m      [0mcopying py_src/tokenizers/decoders/__init__.py ->
[31m      [0mbuild/lib.linux-x86_64-cpython-312/tokenizers/decoders
[31m      [0mcopying py_src/tok

In [4]:
# ``Optional`` is evaluated when the ``def`` line runs, so it has to be
# imported before the function is defined (it was previously missing, which
# raised NameError on a fresh kernel).
from typing import Optional


def load_secret(key_name: str) -> Optional[str]:
    """
    Loads a secret key from the appropriate environment (Colab, Kaggle, or local env vars).

    The environment is taken from the module-level ``ENV_NAME`` set by the
    path-configuration cell: ``"colab"`` reads Colab user secrets,
    ``"kaggle"`` reads Kaggle user secrets, and any other value reads the
    process environment variables.

    Args:
        key_name (str): The name of the secret key to load (e.g., "WANDB_API_KEY", "HF_TOKEN").

    Returns:
        Optional[str]: The secret key value if found, otherwise None. None is
        also returned (after printing the error) if the lookup raises.
    """
    env = ENV_NAME
    secret_value = None

    print(f"Attempting to load secret '{key_name}' from '{env}' environment...")

    try:
        if env == "colab":
            # Imported lazily: the module only exists on Colab.
            from google.colab import userdata
            secret_value = userdata.get(key_name)
        elif env == "kaggle":
            # Imported lazily: the module only exists on Kaggle.
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            secret_value = user_secrets.get_secret(key_name)
        else:  # Local environment
            secret_value = os.getenv(key_name)

        # Empty strings are treated the same as a missing secret.
        if not secret_value:
            print(f"⚠️ Secret '{key_name}' not found in the {env} environment.")
            return None

        print(f"✅ Successfully loaded secret '{key_name}'.")
        return secret_value

    except Exception as e:
        # Deliberately best-effort: a failed lookup is reported, not raised,
        # so callers can decide how to proceed without the secret.
        print(f"❌ An error occurred while loading secret '{key_name}': {e}")
        return None

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdungngocpham171[0m ([33mdungngocpham171-university-of-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
import gdown

# Download the pretrained weights only when they are not already present.
# The target is anchored to OUTPUT_PATH instead of the current working
# directory, so the check and the download agree no matter which `%cd` ran
# last; the inference cell reads the weights from "../ckpt/" relative to
# OUTPUT_PATH/FontDiffusion, i.e. this same folder.
url = "https://drive.google.com/drive/folders/12hfuZ9MQvXqcteNuz7JQ2B_mUcTr-5jZ"
ckpt_download_dir = os.path.join(OUTPUT_PATH, "ckpt")

if not os.path.exists(ckpt_download_dir):
    downloaded_files = gdown.download_folder(
        url, output=ckpt_download_dir, quiet=True, use_cookies=False
    )
    # quiet=True hides gdown's own messages, so report an empty result here.
    if not downloaded_files:
        print(
            f"⚠️ Nothing was downloaded from {url}. "
            f"Upload the checkpoint files to '{ckpt_download_dir}' manually."
        )

In [6]:
# @title Unzipping all archived files
import os
import glob
from zipfile import ZipFile

from zipfile import BadZipFile

# Collect every .zip directly inside INPUT_PATH; sorted so the extraction
# order is the same on every run.
zip_file_paths = sorted(glob.glob(os.path.join(INPUT_PATH, '*.zip')))

if not zip_file_paths:
    print(f'No .zip files found in {INPUT_PATH}.')
else:
    for zip_file_path in zip_file_paths:
        print(f'Unzipping {zip_file_path}...')
        try:
            # Extract with the standard library instead of shelling out to
            # `unzip`: the path is passed as-is (spaces and shell characters
            # are safe) and existing files are overwritten, matching the
            # previous `-o` behaviour. Files land in the current directory.
            with ZipFile(zip_file_path) as archive:
                archive.extractall("./")
        except (OSError, BadZipFile) as exc:
            # Covers a file that vanished after the glob as well as a
            # corrupt archive; keep going with the remaining archives.
            print(f'Error: could not unzip {zip_file_path}: {exc}')
            continue
        print(f'Unzipping of {zip_file_path} complete.')

Unzipping /content/my_dataset.zip...
Unzipping of /content/my_dataset.zip complete.


In [7]:
# @title Checking checkpoint files (.pth)
import os
import time

# The weights live next to the FontDiffusion checkout under OUTPUT_PATH: the
# inference cell passes --ckpt_dir "../ckpt/" from OUTPUT_PATH/FontDiffusion.
# Using OUTPUT_PATH (rather than INPUT_PATH) also keeps makedirs away from the
# data-input location, which differs from the output location on Kaggle/local.
CHECKPOINT_DIR = os.path.join(OUTPUT_PATH, "ckpt")
print(CHECKPOINT_DIR)
# Create the checkpoint directory so files can be uploaded into it
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

required_files = ["unet.pth", "content_encoder.pth", "style_encoder.pth"]

POLL_INTERVAL_SECONDS = 10
# None keeps the original behaviour (wait until the files appear). Set a
# number of seconds to make an unattended "Run All" fail instead of hanging.
MAX_WAIT_SECONDS = None

wait_started = time.monotonic()
while True:
    missing = [
        name for name in required_files
        if not os.path.exists(os.path.join(CHECKPOINT_DIR, name))
    ]

    if not missing:
        print("\n✅ All weights found! You can proceed to the next step.")
        break

    if MAX_WAIT_SECONDS is not None and time.monotonic() - wait_started > MAX_WAIT_SECONDS:
        raise TimeoutError(
            f"Checkpoint files still missing after {MAX_WAIT_SECONDS}s: {missing}"
        )

    print(f"Waiting for files... Missing: {missing}")
    print("Upload them to the 'ckpt' folder now.")
    time.sleep(POLL_INTERVAL_SECONDS)  # Poll again after a short pause

/content/ckpt

✅ All weights found! You can proceed to the next step.


In [8]:
import pandas as pd
import os

def convert_csv_to_chars_txt(input_csv_path: str, output_txt_path: str, column_name: str = 'word'):
    """
    Reads a CSV file, extracts text from a specified column, and writes each character
    to a new line in a plain text file.

    Missing cells in the column are skipped. Problems (missing file,
    unreadable CSV, missing column) are reported with a printed message and
    the function returns without writing anything.

    Args:
        input_csv_path (str): The full path to the input CSV file.
        output_txt_path (str): The full path for the output text file. Its
            parent directory is created if needed; a bare filename writes to
            the current directory.
        column_name (str): The name of the column in the CSV file containing the text.

    Returns:
        None
    """
    if not os.path.exists(input_csv_path):
        print(f"Error: Input CSV file not found at '{input_csv_path}'. Please ensure the file is uploaded.")
        return

    try:
        df = pd.read_csv(input_csv_path)
    except Exception as e:
        print(f"Error reading CSV file '{input_csv_path}': {e}")
        return

    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found in the CSV file '{input_csv_path}'.")
        return

    # Drop missing cells BEFORE casting to str: casting first turns NaN into
    # the literal text "nan", which would then be emitted as 'n', 'a', 'n'.
    texts = df[column_name].dropna().astype(str)
    all_characters = [char for text in texts for char in text]

    # Ensure output directory exists. dirname is "" for a bare filename, and
    # os.makedirs("") raises, so only create it when there is one.
    output_dir = os.path.dirname(output_txt_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(all_characters))
    print(f"Successfully converted '{input_csv_path}' to '{output_txt_path}', with one character per line.")

# --- Example usage (demonstration with a dummy file) ---
# A small throw-away CSV is generated so the cell runs even when the real
# character list has not been uploaded yet.
print("\n--- Demonstrating function with a dummy CSV file ---")
# The dummy file is written under OUTPUT_PATH: INPUT_PATH is where source data
# is read from and is not created by this notebook, so it may not be writable.
dummy_csv_path = os.path.join(OUTPUT_PATH, "dummy_data.csv")
dummy_output_txt_path = os.path.join(OUTPUT_PATH, "dummy_chars.txt")

# Create a dummy CSV file with the default 'word' column
dummy_data = {'word': ['hello', 'world', 'python']}
pd.DataFrame(dummy_data).to_csv(dummy_csv_path, index=False)
print(f"Created a dummy CSV file at: {dummy_csv_path}")

convert_csv_to_chars_txt(dummy_csv_path, dummy_output_txt_path)

# --- How to use with your actual file ---
# Uncomment the lines below and adjust the file names to your own CSV and
# desired output location.
#
# original_csv_file = os.path.join(INPUT_PATH, "Ds_300_ChuNom_TuTao.csv") # Or the full path to your CSV
# original_output_txt = os.path.join(OUTPUT_PATH, "nom_tu_tao.txt") # Or your desired output path
# convert_csv_to_chars_txt(original_csv_file, original_output_txt)



--- Demonstrating function with a dummy CSV file ---
Created a dummy CSV file at: /content/dummy_data.csv
Successfully converted '/content/dummy_data.csv' to '/content/dummy_chars.txt', with one character per line.


In [None]:
from datasets import load_dataset

# Replace with your Hugging Face username and the repo name you chose.
# NOTE(review): "YourUsername" is a placeholder — this cell cannot succeed
# until it is replaced with a real "<namespace>/<dataset>" repo id.
repo_name = "YourUsername/font-diffusion-generated-data"

print(f"Downloading dataset from {repo_name}...")

# This downloads the data to a local cache directory (e.g., /root/.cache/huggingface/datasets)
# and returns a Dataset object for the "train" split.
# `my_dataset` is read again by the following cell, which saves it to disk.
my_dataset = load_dataset(repo_name, split="train")

print("✅ Dataset loaded.")
print(my_dataset) # Shows the dataset summary so it can be inspected

In [None]:
# The dataset downloaded in the previous cell lives in the Hugging Face cache.
# `my_dataset` gives in-memory access, but a script that takes a folder path
# needs the data at a known location, so it is written out explicitly here.
# This cell requires `my_dataset` and `OUTPUT_PATH` from earlier cells.
from pathlib import Path
# Option A: Save the dataset to a predictable location if your script needs a folder path
output_data_dir = Path(OUTPUT_PATH) / "my_dataset"
my_dataset.save_to_disk(output_data_dir)
print(f"Dataset saved to {output_data_dir} for script access.")

# Option B: Modify your script to accept a `Dataset` object (more advanced)
# This is better but might require code changes.

In [None]:
%cd {OUTPUT_PATH}/FontDiffusion
!python sample_batch.py \
    --characters "NomTuTao/Ds_10k_ChuNom_TuTao.txt" \
    --start_line 101 \
    --end_line 200 \
    --style_images "/content/FontDiffusion/styles_images" \
    --ckpt_dir "../ckpt/" \
    --ttf_path "fonts" \
    --output_dir "../my_dataset" \
    --resume_from "../my_dataset/results_checkpoint.json" \
    --batch_size 24 \
    --save_interval 5 \
    --channels_last \
    --num_inference_steps 20 \
    --guidance_scale 7.5 \
    --seed 42 \
    --compile \
    --enable_xformers

/content/FontDiffusion
pygame 2.6.1 (SDL 2.28.4, Python 3.12.12)
Hello from the pygame community. https://www.pygame.org/contribute.html

FONTDIFFUSER STANDARD FORMAT GENERATION
Loading characters from lines 101 to 200 (total: 10174 lines)
Successfully loaded 100 single characters.

Initializing font manager...

Loading 15 fonts from directory...
error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or dir

In [18]:
import os
import glob
import zipfile
from typing import List
def find_result_folders(base_path: str, pattern: str = "*dataset") -> List[str]:
    """
    Return the directories directly under `base_path` whose names match
    `pattern`, sorted so the order is the same on every run.

    Args:
        base_path: Directory to search (not searched recursively).
        pattern: Glob pattern applied to entry names. The default
            "*dataset" matches names ending in "dataset", e.g. "my_dataset".

    Returns:
        Matching directory paths, each prefixed with `base_path` as given
        (they are absolute only if `base_path` is). Empty if nothing matches
        or `base_path` does not exist.
    """
    candidates = glob.glob(os.path.join(base_path, pattern))
    # glob returns both files and directories; keep directories only
    return sorted(path for path in candidates if os.path.isdir(path))
def zip_folder(folder_path: str, output_base_path: str) -> bool:
    """
    Zip the contents of `folder_path` into a file named
    `<folder_name>.zip` inside `output_base_path`.

    Entries are stored with the folder name as their top-level directory
    (e.g. "my_dataset/sub/file.png"). Only files are added; empty
    directories are not recorded.

    Args:
        folder_path: Directory to archive. A trailing path separator is
            tolerated.
        output_base_path: Existing directory that receives the zip file.

    Returns True on success, False otherwise (the reason is printed).
    """
    # Normalise first: with a trailing separator, basename() would be empty,
    # giving an archive called ".zip" and entries without the folder prefix.
    folder_path = os.path.abspath(folder_path)
    folder_name = os.path.basename(folder_path)
    zip_path = os.path.join(output_base_path, f"{folder_name}.zip")

    # os.walk yields nothing for a missing path, which would otherwise
    # produce an empty archive and report success.
    if not os.path.isdir(folder_path):
        print(f"   ❌ Failed to zip {folder_name}: '{folder_path}' is not a directory")
        return False

    parent_dir = os.path.dirname(folder_path)
    zip_abs_path = os.path.abspath(zip_path)
    try:
        print(f"   -> Zipping folder: {folder_name}...")
        with zipfile.ZipFile(
            zip_path, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    full_path = os.path.join(root, file)
                    # Never add the archive to itself (possible when the
                    # output directory is inside the folder being zipped).
                    if os.path.abspath(full_path) == zip_abs_path:
                        continue
                    # Preserve relative path inside the zip
                    arcname = os.path.relpath(full_path, parent_dir)
                    zipf.write(full_path, arcname)
        print(f"   ✅ Created ZIP: {os.path.basename(zip_path)}")
        return True
    except Exception as exc:
        print(f"   ❌ Failed to zip {folder_name}: {exc}")
        # Do not leave a truncated archive behind that looks like a result.
        try:
            os.remove(zip_path)
        except OSError:
            pass
        return False
def zip_stats_results_folders(output_base_path: str) -> None:
    """
    Main driver: locate the dataset folders under `output_base_path` and zip
    each one into that same directory.

    Folders are discovered with `find_result_folders` (names matching
    "*dataset") and archived with `zip_folder`. Progress and a final
    success count are printed; nothing is returned.

    Args:
        output_base_path: Directory that is searched and that receives the
            zip files. It is created if it does not exist.
    """
    # Ensure the output directory exists
    os.makedirs(output_base_path, exist_ok=True)
    result_folders = find_result_folders(output_base_path)
    if not result_folders:
        # The message names the pattern that is actually searched; it used to
        # mention 'results_', which is not what find_result_folders looks for.
        print(f"⚠️ No folders matching '*dataset' found in '{output_base_path}'.")
        return
    print(f"🔍 Found {len(result_folders)} result folder(s) to zip.")
    # zip_folder returns True/False, so summing counts the successes.
    successful = sum(
        1 for folder in result_folders if zip_folder(folder, output_base_path)
    )
    print(
        f"\n✅ DONE! Successfully zipped {successful} out of {len(result_folders)} folder(s)."
    )
# Entry point for the zipping step: resolve OUTPUT_PATH and zip every dataset
# folder found directly inside it. Any failure is printed rather than raised.
if __name__ == "__main__":
    try:
        # Prefer an environment variable; fall back to a global if defined
        output_root = os.getenv("OUTPUT_PATH") or globals().get("OUTPUT_PATH")
        if not output_root:
            raise ValueError("OUTPUT_PATH not defined")
        # Joining with "" only appends a trailing separator: the search runs
        # directly in OUTPUT_PATH, no sub-folder is added.
        target_path = os.path.join(output_root, "")
        zip_stats_results_folders(target_path)
    except Exception as e:
        print(f"❌ An error occurred: {e}")


🔍 Found 1 result folder(s) to zip.
   -> Zipping folder: dataset...
   ✅ Created ZIP: dataset.zip

✅ DONE! Successfully zipped 1 out of 1 folder(s).


In [25]:
# Install the necessary libraries
!UV pip install huggingface_hub datasets
%cd {OUTPUT_PATH}
import os
from huggingface_hub import HfApi, notebook_login
from datasets import load_dataset

# --- Login to Hugging Face ---
# This will prompt you to enter your token.
notebook_login()

# Or, if you have the token as a string:
# from huggingface_hub import login
# login("hf_YOUR_TOKEN_HERE")

# --- Load your local images into a Dataset object ---
# "imagefolder" is a special builder that understands the "class/image.png" structure.
# Here, each style folder ('style0', 'style1') will be treated as a label.
dataset = load_dataset("imagefolder", data_dir="my_dataset")

# The dataset object will look something like this:
# DatasetDict({
#     train: Dataset({
#         features: ['image', 'label'],
#         num_rows: 1234
#     })
# })
print(dataset)

# --- Push the dataset to the Hub ---
# This will create a new repository under your username.
# Use private=True if you don't want it to be public.
repo_name = "font-diffusion-generated-data"
dataset.push_to_hub(repo_name, private=True)

print(f"✅ Dataset successfully uploaded to: https://huggingface.co/datasets/YourUsername/{repo_name}")

/bin/bash: line 1: UV: command not found
/content


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Resolving data files:   0%|          | 0/4801 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/4799 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 4799
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/4799 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/48 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|1         |  524kB / 39.6MB            

✅ Dataset successfully uploaded to: https://huggingface.co/datasets/YourUsername/font-diffusion-generated-data
