In [None]:
# KONFIGURASI EKSTRAKSI DATASET DARI DRIVE

print("🚀 SEL 0: Ekstraksi Dataset dari Google Drive...")

from google.colab import drive
import zipfile
import os
import shutil

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Path file zip di Drive dan tujuan ekstraksi
DRIVE_ZIP_PATH = "/content/drive/MyDrive/datasetfix2.zip"  # ⚠️ Ganti dengan path zip Anda
EXTRACT_TO = "/content/datasetfix2"  # Path ekstraksi (sama seperti di Kaggle)

# 3. Hapus folder lama jika ada
if os.path.exists(EXTRACT_TO):
    shutil.rmtree(EXTRACT_TO)
    print(f"ℹ️ Folder lama '{EXTRACT_TO}' dihapus")

# 4. Ekstrak zip
print(f"⏳ Mengekstrak {DRIVE_ZIP_PATH} ke {EXTRACT_TO}...")
with zipfile.ZipFile(DRIVE_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_TO)

# 5. Verifikasi
print("✅ Ekstraksi selesai. Struktur folder:")
!ls -lh "{EXTRACT_TO}" | head -n 10

# 6. Set variabel path asli program (PERBAIKAN ERROR DI SINI)
KAGGLE_COCO_INPUT_DIR_SEL2 = EXTRACT_TO  # Variabel sesuai program asli
print(f"\n🎉 SEL 0: Dataset siap di path: {KAGGLE_COCO_INPUT_DIR_SEL2}")  # Perbaikan nama variabel

In [None]:
# PADDLEOCR SETUP AND CHECKPOINT
# Sets OMP_NUM_THREADS and installs the pinned Python packages via pip.

print("🚀 SEL 1: Memulai Pengaturan Lingkungan di Colab...")
import os
import shutil
# ... (other imports relevant to SEL 1) ...

# Set OMP_NUM_THREADS=1 for this kernel process and its child processes.
os.environ['OMP_NUM_THREADS'] = '1'
print("✅ Environment variable OMP_NUM_THREADS=1 telah di-set.")

print("\n🔄 Menginstal dependensi (jika diperlukan)...")
# (Your !pip install lines stay here)
# Example:
# protobuf, paddlepaddle and paddleocr are pinned to exact versions; the
# remaining packages (lmdb, rapidfuzz, Pillow, opencv-python-headless) are not.
!pip install --quiet "protobuf==3.20.3"
!python -m pip install --quiet paddlepaddle==2.6.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install --quiet "paddleocr==2.9.1" lmdb rapidfuzz Pillow opencv-python-headless
# NOTE(review): the exit status of the pip commands is not checked here, so
# this message is printed even if an install failed.
print("✅ Dependensi (semoga) terinstal.")
# (Verify your installation)

# --- Modifikasi pada bagian Clone Repository ---
base_working_dir_sel1_safe = "/content/"
paddle_ocr_official_dir_sel1_safe = os.path.join(base_working_dir_sel1_safe, "PaddleOCR_Official")

if not os.path.exists(paddle_ocr_official_dir_sel1_safe):
    print(f"\nDirektori '{paddle_ocr_official_dir_sel1_safe}' tidak ditemukan. Melakukan clone...")
    current_dir_sel1_safe = os.getcwd()
    if current_dir_sel1_safe != base_working_dir_sel1_safe:
        # Pastikan base_working_dir_sel1_safe ada jika kita chdir ke sana
        os.makedirs(base_working_dir_sel1_safe, exist_ok=True)
        os.chdir(base_working_dir_sel1_safe)
    !git clone https://github.com/PaddlePaddle/PaddleOCR.git "{paddle_ocr_official_dir_sel1_safe}"
    if current_dir_sel1_safe != base_working_dir_sel1_safe and os.path.exists(current_dir_sel1_safe):
        os.chdir(current_dir_sel1_safe) # Kembali jika perlu dan path masih valid
    print(f"✅ Repository PaddleOCR berhasil di-clone ke {paddle_ocr_official_dir_sel1_safe}.")
else:
    print(f"✅ Direktori '{paddle_ocr_official_dir_sel1_safe}' sudah ada. Clone dilewati.")

# Pindah ke direktori PaddleOCR Resmi
try:
    if os.getcwd() == base_working_dir_sel1_safe: # Jika kita masih di /content/
        os.chdir(paddle_ocr_official_dir_sel1_safe)
    elif not os.getcwd().endswith("PaddleOCR_Official"): # Jika kita di tempat lain, coba chdir
        os.chdir(paddle_ocr_official_dir_sel1_safe)

    print(f"✅ Direktori kerja saat ini: {os.getcwd()}")
    if not os.getcwd().endswith("PaddleOCR_Official"):
        print(f"⚠️ Peringatan: Direktori kerja mungkin salah. Seharusnya diakhiri 'PaddleOCR_Official'.")
except FileNotFoundError:
    print(f"❌ ERROR: Direktori '{paddle_ocr_official_dir_sel1_safe}' tidak ditemukan untuk diubah.")
    raise
print("\n🎉 SEL 1: Pengaturan Lingkungan Selesai!")

In [None]:
# USER CONFIGURATION & ACCESS TO THE COCO JSON DATASET
# Defines the dataset/processing paths, checks that the extracted dataset and
# both COCO annotation files exist, and recreates the empty output folders.

print("\n\n🚀 SEL 2: Konfigurasi Pengguna & Akses Dataset COCO JSON Asli...")

# --- USER SETTINGS ---
# Use the path that was extracted in SEL 0
COLAB_COCO_INPUT_DIR_SEL2 = '/content/datasetfix2'  # Extraction result of SEL 0
TRAIN_COCO_JSON_RELPATH_SEL2 = "train/_annotations.coco.json"
VALID_COCO_JSON_RELPATH_SEL2 = "valid/_annotations.coco.json"

# Processing directory (unchanged)
COLAB_PROCESSED_RECO_DATA_DIR_SEL2 = '/content/processed_recognition_data'

# --- Path validation ---
print(f"ℹ️ Path dataset (hasil ekstraksi): {COLAB_COCO_INPUT_DIR_SEL2}")
print(f"ℹ️ Target processed data: {COLAB_PROCESSED_RECO_DATA_DIR_SEL2}")

# Verify the folder extracted by SEL 0; stop the cell if it is missing
if not os.path.isdir(COLAB_COCO_INPUT_DIR_SEL2):
    print(f"\n❌ ERROR: Folder hasil ekstraksi tidak ditemukan di: {COLAB_COCO_INPUT_DIR_SEL2}")
    print("Pastikan SEL 0 sudah dijalankan dan ekstraksi berhasil!")
    print("Isi /content/:")
    !ls -lh /content/
    raise SystemExit("Dataset belum diekstrak")

# Verify the COCO annotation files (train and valid are both required here)
required_files = [
    os.path.join(COLAB_COCO_INPUT_DIR_SEL2, TRAIN_COCO_JSON_RELPATH_SEL2),
    os.path.join(COLAB_COCO_INPUT_DIR_SEL2, VALID_COCO_JSON_RELPATH_SEL2)
]

missing_files = [f for f in required_files if not os.path.exists(f)]
if missing_files:
    print(f"\n❌ File COCO JSON tidak ditemukan:")
    for f in missing_files:
        print(f"- {f}")
    print("\nStruktur folder yang ada:")
    # NOTE(review): relies on the `tree` binary being available in the runtime — confirm.
    !tree -L 3 "{COLAB_COCO_INPUT_DIR_SEL2}"
    raise SystemExit("File annotasi COCO tidak lengkap")

# Prepare the output folder: any previous processed data is deleted first
if os.path.exists(COLAB_PROCESSED_RECO_DATA_DIR_SEL2):
    shutil.rmtree(COLAB_PROCESSED_RECO_DATA_DIR_SEL2)

os.makedirs(os.path.join(COLAB_PROCESSED_RECO_DATA_DIR_SEL2, "train_images_cropped"), exist_ok=True)
os.makedirs(os.path.join(COLAB_PROCESSED_RECO_DATA_DIR_SEL2, "val_images_cropped"), exist_ok=True)

print("\n✅ Struktur dataset valid:")
!tree -L 2 "{COLAB_COCO_INPUT_DIR_SEL2}" | head -n 10
print(f"\n🎉 SEL 2: Konfigurasi Dataset COCO JSON Selesai!")

In [None]:
# PRE-PROCESSING COCO JSON (CHARACTER ANNOTATIONS) INTO RECOGNITION FORMAT

print("\n\n🚀 SEL 3: Memulai Pra-Pemrosesan COCO JSON (Anotasi Karakter)...")

import json
import numpy as np
from PIL import Image
import shutil

# Output locations of this cell, all directly under the processed-data
# directory: two label files followed by the two cropped-image folders.
(
    output_train_label_reco_file_sel3,
    output_val_label_reco_file_sel3,
    output_train_imgs_cropped_dir_sel3,
    output_val_imgs_cropped_dir_sel3,
) = (
    os.path.join(COLAB_PROCESSED_RECO_DATA_DIR_SEL2, leaf_name)
    for leaf_name in (
        "train_label_recognition.txt",
        "val_label_recognition.txt",
        "train_images_cropped",
        "val_images_cropped",
    )
)

# --- Definisi fungsi coco_char_annot_to_line_recognition ---
def coco_char_annot_to_line_recognition(
    coco_json_relative_path, base_dir_of_coco_input,
    output_dir_for_cropped_lines, output_path_for_rec_label,
    cropped_img_subfolder_name_for_label
):
    """Convert character-level COCO annotations into line-recognition samples.

    For every image that has annotations, the annotations are sorted by the
    x coordinate of their bbox, their category names are concatenated into one
    text string, the union of all character boxes is cropped from the image
    and saved as a PNG, and a ``<relative image path>\\t<text>`` line is
    appended to the label file. All annotations of an image are treated as a
    single text line (ordering uses x only).

    Args:
        coco_json_relative_path: Path of the COCO JSON, relative to
            ``base_dir_of_coco_input``.
        base_dir_of_coco_input: Dataset root; also used as a fallback location
            for images not found next to the JSON file.
        output_dir_for_cropped_lines: Directory the cropped PNGs are saved to
            (not created by this function).
        output_path_for_rec_label: Label file to write; opened in ``'w'`` mode,
            so existing content is overwritten.
        cropped_img_subfolder_name_for_label: Prefix joined with the crop file
            name to form the image path written in the label file.

    Returns:
        Number of label lines written. ``0`` is also returned when the JSON
        file does not exist or contains no annotations (in those two cases the
        label file is not touched).
    """
    full_coco_json_path = os.path.join(base_dir_of_coco_input, coco_json_relative_path)
    if not os.path.exists(full_coco_json_path):
        print(f"ℹ️ Info: File COCO JSON '{full_coco_json_path}' tidak ditemukan. Dilewati.")
        return 0
    print(f"\n⚙️ Memproses COCO JSON (anotasi karakter): {full_coco_json_path}")
    with open(full_coco_json_path, 'r', encoding='utf-8') as f_c_char:
        coco_data_char = json.load(f_c_char)

    # Lookup tables: image id -> image record, category id -> category name.
    image_id_to_info = {img['id']: img for img in coco_data_char.get('images', [])}
    category_id_to_char = {cat['id']: cat['name'] for cat in coco_data_char.get('categories', [])}
    annotations_by_image = {}

    # Group the annotations by the image they belong to.
    for ann_char in coco_data_char.get('annotations', []):
        img_id_char = ann_char.get('image_id')
        if img_id_char not in annotations_by_image:
            annotations_by_image[img_id_char] = []
        annotations_by_image[img_id_char].append(ann_char)

    if not annotations_by_image:
        print(f"  ⚠️ Tidak ada anotasi di {full_coco_json_path}.")
        return 0

    rec_labels_written_count = 0

    with open(output_path_for_rec_label, 'w', encoding='utf-8') as f_rec_out_char:
        for img_idx, (img_id, char_ann_list) in enumerate(annotations_by_image.items()):
            # Annotations pointing at an image id missing from 'images' are skipped silently.
            if img_id not in image_id_to_info:
                continue

            image_details_char = image_id_to_info[img_id]
            original_img_fname = image_details_char['file_name']
            coco_json_dir_path = os.path.dirname(full_coco_json_path)
            path_to_original_img_char = os.path.join(coco_json_dir_path, original_img_fname)

            # Look next to the JSON file first, then fall back to the dataset root.
            if not os.path.exists(path_to_original_img_char):
                path_to_original_img_char = os.path.join(base_dir_of_coco_input, original_img_fname)
                if not os.path.exists(path_to_original_img_char):
                    print(f"  ❌ Gbr asli '{original_img_fname}' TDK ditemukan (Path: {path_to_original_img_char}). Dilewati.")
                    continue

            if not char_ann_list:
                continue

            # Order by bbox x (in place); a missing/malformed bbox makes the
            # key fail and the image is skipped.
            try:
                char_ann_list.sort(key=lambda ann: ann['bbox'][0])
            except Exception as e_sort_bbox:
                print(f"  ⚠️ Error sorting '{original_img_fname}': {e_sort_bbox}. Dilewati.")
                continue

            # Unknown category ids contribute an empty string to the text.
            assembled_text_line = "".join([category_id_to_char.get(ann.get('category_id'),'') for ann in char_ann_list])
            if not assembled_text_line.strip():
                continue

            all_char_x_coords, all_char_y_coords = [], []
            valid_points_found = True

            # Collect both corners of every character box; bbox is read as (x, y, w, h).
            for char_ann_item in char_ann_list:
                bbox_char = char_ann_item.get('bbox')
                x_c, y_c, w_c, h_c = bbox_char if bbox_char and len(bbox_char) == 4 else (None,None,None,None)
                if x_c is None:
                    valid_points_found = False
                    break
                all_char_x_coords.extend([x_c, x_c + w_c])
                all_char_y_coords.extend([y_c, y_c + h_c])

            if not valid_points_found or not all_char_x_coords:
                print(f"  ⚠️ Bbox karakter tdk valid u/ '{original_img_fname}'. Dilewati.")
                continue

            # Union box of all characters; int() truncates fractional coordinates.
            line_bbox_min_x, line_bbox_min_y = int(min(all_char_x_coords)), int(min(all_char_y_coords))
            line_bbox_max_x, line_bbox_max_y = int(max(all_char_x_coords)), int(max(all_char_y_coords))

            try:
                pil_orig_img = Image.open(path_to_original_img_char).convert("RGB")
                img_w_o, img_h_o = pil_orig_img.size

                # Clamp the union box to the image bounds.
                crop_x1, crop_y1 = max(0, line_bbox_min_x), max(0, line_bbox_min_y)
                crop_x2, crop_y2 = min(img_w_o, line_bbox_max_x), min(img_h_o, line_bbox_max_y)

                if crop_x2 <= crop_x1 or crop_y2 <= crop_y1:
                    print(f"  ⚠️ Bbox baris tdk valid u/ '{original_img_fname}'. Dilewati.")
                    continue

                pil_cropped_line = pil_orig_img.crop((crop_x1, crop_y1, crop_x2, crop_y2))

                if pil_cropped_line.width < 2 or pil_cropped_line.height < 2:
                    print(f"  ⚠️ Crop terlalu kecil u/ '{original_img_fname}'. Dilewati.")
                    continue

                # File name uses the image id plus the id of the left-most
                # annotation (or the loop index when that annotation has no id).
                cropped_line_fname = f"linecrop_{img_id}_{char_ann_list[0].get('id', img_idx)}.png"
                path_to_save_cropped_line_img = os.path.join(output_dir_for_cropped_lines, cropped_line_fname)
                pil_cropped_line.save(path_to_save_cropped_line_img)

                # Label paths always use forward slashes.
                relative_path_label = os.path.join(cropped_img_subfolder_name_for_label, cropped_line_fname).replace("\\","/")
                f_rec_out_char.write(f"{relative_path_label}\t{assembled_text_line}\n")
                rec_labels_written_count += 1

                # Progress message every 50 written labels.
                if rec_labels_written_count > 0 and rec_labels_written_count % 50 == 0:
                    print(f"    ... {rec_labels_written_count} label recognition dibuat...")

            except Exception as e_img_proc:
                # Best effort: a failure on one image is reported and the loop goes on.
                print(f"  ❌ Error cropping/saving '{original_img_fname}': {e_img_proc}")

    print(f"  ✅ Selesai {full_coco_json_path}. Total {rec_labels_written_count} label dibuat.")
    return rec_labels_written_count
# --- Akhir Definisi Fungsi ---

# Build the training recognition set from the training COCO file.
num_train_labels_final_sel3 = coco_char_annot_to_line_recognition(
    coco_json_relative_path=TRAIN_COCO_JSON_RELPATH_SEL2,
    base_dir_of_coco_input=COLAB_COCO_INPUT_DIR_SEL2,
    output_dir_for_cropped_lines=output_train_imgs_cropped_dir_sel3,
    output_path_for_rec_label=output_train_label_reco_file_sel3,
    cropped_img_subfolder_name_for_label="train_images_cropped",
)

# The validation split is optional: it is skipped when its relative path is blank.
if not (VALID_COCO_JSON_RELPATH_SEL2 and VALID_COCO_JSON_RELPATH_SEL2.strip()):
    num_val_labels_final_sel3 = 0
    print("ℹ️ Tdk ada set validasi COCO, pemrosesan validasi dilewati.")
else:
    num_val_labels_final_sel3 = coco_char_annot_to_line_recognition(
        coco_json_relative_path=VALID_COCO_JSON_RELPATH_SEL2,
        base_dir_of_coco_input=COLAB_COCO_INPUT_DIR_SEL2,
        output_dir_for_cropped_lines=output_val_imgs_cropped_dir_sel3,
        output_path_for_rec_label=output_val_label_reco_file_sel3,
        cropped_img_subfolder_name_for_label="val_images_cropped",
    )

# Without any training sample the rest of the pipeline cannot run.
if num_train_labels_final_sel3 == 0:
    print("❌ ERROR: Tdk ada data training dibuat.")
    raise SystemExit("Transformasi training gagal.")

# Preview the first five lines of the generated training label file.
print(f"\n🔍 Verifikasi '{output_train_label_reco_file_sel3}' (5 baris):")
if os.path.exists(output_train_label_reco_file_sel3):
    with open(output_train_label_reco_file_sel3, 'r', encoding='utf-8') as f_v_r_sel3:
        for i_vr_sel3, lvr_sel3 in enumerate(f_v_r_sel3):
            if i_vr_sel3 >= 5:
                break
            print(f"  {lvr_sel3.strip()}")

print("\n🎉 SEL 3: Pra-Pemrosesan COCO JSON (Karakter) Selesai!")

In [None]:
# CREATE custom_char_dict.txt (CHARACTER DICTIONARY)

print("\n\n🚀 SEL 4: Memulai Pembuatan Kamus Karakter...")
# Destination of the dictionary file, inside the processed-data directory.
path_char_dict_output_sel4 = os.path.join(COLAB_PROCESSED_RECO_DATA_DIR_SEL2, "custom_char_dict.txt")
SETTING_USE_SPACE_CHAR_SEL4 = True  # Set True to include the space character
# Accumulates the unique characters found in the label files.
collected_chars_final_sel4 = set()

# --- Implementasi Fungsi yang Hilang ---
def generate_char_dict_final_sel4(label_file, char_set):
    """Add every character of the text column of ``label_file`` to ``char_set``.

    Each line is expected to look like ``image_path<TAB>text``. Lines are
    stripped of surrounding whitespace before splitting, lines without a tab
    are ignored, and only the second column is read. ``char_set`` is updated
    in place; nothing is returned. A missing file only produces a message.
    """
    if os.path.exists(label_file):
        print(f"🔍 Memproses karakter dari: {label_file}")
        with open(label_file, 'r', encoding='utf-8') as label_fh:
            for raw_line in label_fh:
                columns = raw_line.strip().split('\t')
                if len(columns) < 2:
                    continue
                # A string is an iterable of characters, so update() adds each one.
                char_set.update(columns[1])

        print(f"  Ditemukan {len(char_set)} karakter unik")
    else:
        print(f"ℹ️ File label '{label_file}' tidak ditemukan")

# --- Collect characters from the training and validation label files ---
print("⏳ Mengumpulkan karakter dari file label...")
generate_char_dict_final_sel4(output_train_label_reco_file_sel3, collected_chars_final_sel4)

if os.path.exists(output_val_label_reco_file_sel3):
    generate_char_dict_final_sel4(output_val_label_reco_file_sel3, collected_chars_final_sel4)

# --- Optionally add the space character ---
if SETTING_USE_SPACE_CHAR_SEL4:
    collected_chars_final_sel4.add(' ')
    print("ℹ️ Menambahkan karakter spasi")

# --- Validate: an empty set, or a set holding only the space, is unusable ---
if collected_chars_final_sel4 <= {' '}:
    print("❌ ERROR: Tidak ada karakter yang valid untuk kamus")
    print("Kemungkinan penyebab:")
    print("1. File label training/validasi kosong")
    print("2. Path file label tidak benar")
    print(f"   - Training: {output_train_label_reco_file_sel3} (ada: {os.path.exists(output_train_label_reco_file_sel3)})")
    print(f"   - Validasi: {output_val_label_reco_file_sel3} (ada: {os.path.exists(output_val_label_reco_file_sel3)})")
    raise SystemExit("Pembuatan kamus karakter gagal")

# --- Sort and save: one character per line ---
sorted_chars_final_sel4 = sorted(collected_chars_final_sel4)
with open(path_char_dict_output_sel4, 'w', encoding='utf-8') as f:
    f.writelines(f"{dict_char}\n" for dict_char in sorted_chars_final_sel4)

# --- Final report ---
print(f"\n✅ Kamus karakter disimpan di: {path_char_dict_output_sel4}")
print(f"   Total karakter unik: {len(sorted_chars_final_sel4)}")
print("   Contoh karakter (max 100):", ''.join(sorted_chars_final_sel4[:100]))

print("\n🎉 SEL 4: Pembuatan Kamus Karakter Selesai!")

In [None]:
# MODIFY THE YAML CONFIGURATION FILE

print("\n\n🚀 SEL 6: Modifikasi File Konfigurasi YAML untuk Arsitektur PP-OCRv4 Mobile...")
import yaml
import os

# --- Verify the variables that earlier cells must have defined (unchanged) ---
required_vars_sel6_cpu_chkpt = [
    'COLAB_PROCESSED_RECO_DATA_DIR_SEL2',
    'output_train_label_reco_file_sel3',
    'output_val_label_reco_file_sel3',
    'num_val_labels_final_sel3',
    'path_char_dict_output_sel4',
    'SETTING_USE_SPACE_CHAR_SEL4',
    'sorted_chars_final_sel4'
]
print("🔍 Memeriksa ketersediaan variabel yang dibutuhkan...")
all_vars_present_sel6_cpu_chkpt = True
for var_name_sel6_chkpt in required_vars_sel6_cpu_chkpt:
    # At cell (module) level locals() is the notebook's global namespace.
    if var_name_sel6_chkpt not in locals():
        print(f"❌ ERROR FATAL: Variabel '{var_name_sel6_chkpt}' tidak terdefinisi!")
        all_vars_present_sel6_cpu_chkpt = False
# Every missing name is reported before a single NameError is raised.
if not all_vars_present_sel6_cpu_chkpt:
    raise NameError("Satu atau lebih variabel penting tidak terdefinisi untuk SEL 6.")
else:
    print("✅ Semua variabel yang dibutuhkan untuk SEL 6 tersedia.")

# --- [CHANGED] Select the base configuration file (PP-OCRv4 Mobile) ---
base_repo_path_sel6_chkpt = "/content/PaddleOCR_Official" # Make sure this path is correct
# We use a light, fast config that suits a Raspberry Pi.
# The path as originally written combines the PP-OCRv4 directory with a
# PP-OCRv3 file name. It is still tried first; if it does not exist, the two
# self-consistent combinations are tried (v4 first, matching this cell's
# stated target), so the cell does not fail on that mismatch alone.
_base_cfg_candidates_sel6 = [
    'configs/rec/PP-OCRv4/PP-OCRv3_mobile_rec.yml',
    'configs/rec/PP-OCRv4/PP-OCRv4_mobile_rec.yml',
    'configs/rec/PP-OCRv3/PP-OCRv3_mobile_rec.yml',
]
_base_cfg_abs_candidates_sel6 = [
    os.path.join(base_repo_path_sel6_chkpt, _rel_cfg) for _rel_cfg in _base_cfg_candidates_sel6
]
BASE_CFG_PATH_SEL6_chkpt = next(
    (_cfg for _cfg in _base_cfg_abs_candidates_sel6 if os.path.exists(_cfg)),
    _base_cfg_abs_candidates_sel6[0],  # keep the original path for the error message
)
if BASE_CFG_PATH_SEL6_chkpt != _base_cfg_abs_candidates_sel6[0]:
    print(f"⚠️ '{_base_cfg_abs_candidates_sel6[0]}' tidak ada; memakai alternatif yang tersedia.")
print(f"ℹ️ Menggunakan file konfigurasi dasar baru: {BASE_CFG_PATH_SEL6_chkpt}")

if not os.path.exists(BASE_CFG_PATH_SEL6_chkpt):
    print(f"❌ Error: File config dasar PP-OCRv4 tidak ditemukan di: {BASE_CFG_PATH_SEL6_chkpt}")
    # List what the cloned repository actually offers to make the fix obvious.
    for _cfg_subdir_sel6 in ('configs/rec/PP-OCRv4', 'configs/rec/PP-OCRv3'):
        _cfg_dir_abs_sel6 = os.path.join(base_repo_path_sel6_chkpt, _cfg_subdir_sel6)
        if os.path.isdir(_cfg_dir_abs_sel6):
            print(f"   Config tersedia di {_cfg_subdir_sel6}: {sorted(os.listdir(_cfg_dir_abs_sel6))}")
    raise FileNotFoundError("File config dasar PP-OCRv4 tidak ditemukan. Pastikan repo PaddleOCR ter-clone dengan benar.")

# --- [CHANGED] Name of the new custom configuration file ---
# The custom config is written into the current working directory.
CUSTOM_CFG_PATH_SEL6_chkpt = os.path.join(os.getcwd(), 'rec_ppocrv4_pi5.yml')
os.makedirs(os.path.dirname(CUSTOM_CFG_PATH_SEL6_chkpt), exist_ok=True)
print(f"ℹ️ Menyalin '{BASE_CFG_PATH_SEL6_chkpt}' ke '{CUSTOM_CFG_PATH_SEL6_chkpt}'...")
!cp -f "{BASE_CFG_PATH_SEL6_chkpt}" "{CUSTOM_CFG_PATH_SEL6_chkpt}"
print("✅ File konfigurasi disalin.")

# --- Read and modify the YAML ---
with open(CUSTOM_CFG_PATH_SEL6_chkpt, 'r', encoding='utf-8') as f_y6_chkpt:
    yaml_cfg_6_chkpt = yaml.safe_load(f_y6_chkpt)

# --- Global configuration adjustments ---
yaml_cfg_6_chkpt['Global']['use_gpu'] = False
print("ℹ️ Global.use_gpu diatur ke False untuk training CPU.")

yaml_cfg_6_chkpt['Global']['epoch_num'] = 100 # Author's note: PP-OCRv4 needs more epochs to converge
SAVED_MODEL_DIR_SEL6 = "./output/rec_ppocrv4_mobile_checkpoint" # New output directory (relative to the working directory)

os.makedirs(SAVED_MODEL_DIR_SEL6, exist_ok=True)

yaml_cfg_6_chkpt['Global']['save_model_dir'] = SAVED_MODEL_DIR_SEL6
yaml_cfg_6_chkpt['Global']['eval_batch_step'] = [0, 2000] # Evaluate every 2000 steps
yaml_cfg_6_chkpt['Global']['use_space_char'] = SETTING_USE_SPACE_CHAR_SEL4
yaml_cfg_6_chkpt['Global']['max_text_length'] = 50

# --- [CRITICAL FIX] Use RELATIVE PATHS for portability ---
# Assumes the dictionary and label files live inside `COLAB_PROCESSED_RECO_DATA_DIR_SEL2`.
# The relative paths are computed from the directory that holds the yml file.
relative_data_path = os.path.relpath(COLAB_PROCESSED_RECO_DATA_DIR_SEL2, os.path.dirname(CUSTOM_CFG_PATH_SEL6_chkpt))
relative_char_dict_path = os.path.relpath(path_char_dict_output_sel4, os.path.dirname(CUSTOM_CFG_PATH_SEL6_chkpt))

print(f"ℹ️ Mengkonversi path absolut ke path relatif untuk portabilitas...")
yaml_cfg_6_chkpt['Global']['character_dict_path'] = relative_char_dict_path

# --- CHECKPOINT LOGIC (unchanged) ---
# Resuming requires both the parameter file and the optimizer-state file.
checkpoint_files = {
    'params': os.path.join(SAVED_MODEL_DIR_SEL6, "latest.pdparams"),
    'opt': os.path.join(SAVED_MODEL_DIR_SEL6, "latest.pdopt"),
}
all_checkpoint_files_exist = all(os.path.exists(f) for f in checkpoint_files.values())

if all_checkpoint_files_exist:
    print("\n✅ Checkpoint lengkap ditemukan.")
    # The checkpoint is referenced by its prefix, without file extension.
    yaml_cfg_6_chkpt['Global']['checkpoints'] = os.path.join(SAVED_MODEL_DIR_SEL6, "latest")
    if 'pretrained_model' in yaml_cfg_6_chkpt['Global']:
        del yaml_cfg_6_chkpt['Global']['pretrained_model']
    print("\n⚠️ Akan melanjutkan training dari checkpoint...")
else:
    print("\n❌ Checkpoint tidak ditemukan. Training akan dimulai dari awal.")
    yaml_cfg_6_chkpt['Global'].pop('checkpoints', None)
    # Drop the pretrained_model shipped in the base config so training uses only our data
    yaml_cfg_6_chkpt['Global'].pop('pretrained_model', None)

# --- [CHANGED] Architecture & optimizer adjustments for PP-OCRv4 ---
# The architecture comes from the base file; only the number of output classes is adjusted
num_classes = len(sorted_chars_final_sel4)
if SETTING_USE_SPACE_CHAR_SEL4:
    num_classes += 1
# +1 for the CTC blank character
# NOTE(review): sorted_chars_final_sel4 may already contain ' ' (SEL 4 adds it
# when the space setting is on), so the space may be counted twice — confirm.
yaml_cfg_6_chkpt['Architecture']['Head']['out_channels'] = num_classes + 1
print(f"✅ Arsitektur Head diatur ke {num_classes + 1} kelas output.")

# Author's choice of optimizer for the SVTR-based architecture (PP-OCRv4);
# this replaces the whole Optimizer section of the base config.
yaml_cfg_6_chkpt['Optimizer'] = {
    'name': 'Adam',
    'beta1': 0.9,
    'beta2': 0.999,
    'lr': {
        'name': 'Cosine',
        'learning_rate': 0.0005,
        'warmup_epoch': 2
    },
    'regularizer': {'name': 'L2', 'factor': 0.00001}
}
print("✅ Optimizer diatur ke Adam dengan Cosine LR Scheduler.")

# --- [CHANGED] Image transform for PP-OCRv3 ---
# Author's note: PP-OCRv4 (SVTR) uses 'SVTRRecResizeImg' — NOTE(review): verify against the chosen base config
img_shape_sel6_chkpt = [3, 48, 320]
print(f"ℹ️ Mengatur image_shape ke: {img_shape_sel6_chkpt}")

def update_svtr_resize_cfg(transforms, shape, op_names=('SVTRRecResizeImg', 'RecResizeImg')):
    """Set ``image_shape`` on the first resize transform found in ``transforms``.

    Args:
        transforms: List of single-key dicts as loaded from the ``transforms``
            section of a PaddleOCR YAML config, or ``None``.
        shape: Target image shape, e.g. ``[3, 48, 320]``.
        op_names: Transform names treated as the resize step. Both the SVTR
            name and ``RecResizeImg`` (the name used by the PP-OCRv3/v4
            recognition configs) are accepted, so the update is not silently
            skipped for those configs.

    Returns:
        ``True`` if a transform was updated in place, otherwise ``False``.
    """
    if not transforms:
        return False
    for t_cfg in transforms:
        if not isinstance(t_cfg, dict):
            continue
        for op_name in op_names:
            if op_name not in t_cfg:
                continue
            op_params = t_cfg[op_name]
            if not isinstance(op_params, dict):
                # e.g. "- RecResizeImg:" with no parameters loads as None
                op_params = {}
                t_cfg[op_name] = op_params
            # Store a copy: sharing one list object between Train and Eval
            # makes yaml.dump emit &id/*id anchors in the saved config.
            op_params['image_shape'] = list(shape)
            return True
    return False

if not update_svtr_resize_cfg(yaml_cfg_6_chkpt['Train']['dataset']['transforms'], img_shape_sel6_chkpt):
    print("⚠️ ResizeImg TDK diupdate di Train.")
if 'Eval' in yaml_cfg_6_chkpt and yaml_cfg_6_chkpt['Eval'].get('dataset') and \
   not update_svtr_resize_cfg(yaml_cfg_6_chkpt['Eval']['dataset']['transforms'], img_shape_sel6_chkpt):
    print("⚠️ RecResizeImg TDK diupdate di Eval.")


# --- DataLoader Training (menggunakan path relatif) ---
yaml_cfg_6_chkpt['Train']['dataset']['data_dir'] = relative_data_path
yaml_cfg_6_chkpt['Train']['dataset']['label_file_list'] = [os.path.basename(output_train_label_reco_file_sel3)]
yaml_cfg_6_chkpt['Train']['loader']['batch_size_per_card'] = 64 # Bisa dinaikkan untuk CPU
yaml_cfg_6_chkpt['Train']['loader']['num_workers'] = 2 # Di Colab bisa pakai 2
yaml_cfg_6_chkpt['Train']['loader']['shuffle'] = True

# --- DataLoader Evaluasi (menggunakan path relatif) ---
if 'Eval' in yaml_cfg_6_chkpt:
    if os.path.exists(output_val_label_reco_file_sel3) and num_val_labels_final_sel3 > 0:
        yaml_cfg_6_chkpt['Eval']['dataset']['data_dir'] = relative_data_path
        yaml_cfg_6_chkpt['Eval']['dataset']['label_file_list'] = [os.path.basename(output_val_label_reco_file_sel3)]
        yaml_cfg_6_chkpt['Eval']['loader']['batch_size_per_card'] = 64
        yaml_cfg_6_chkpt['Eval']['loader']['num_workers'] = 2
        print("✅ Konfigurasi Eval disesuaikan dengan path relatif.")
    else:
        if 'Eval' in yaml_cfg_6_chkpt:
            del yaml_cfg_6_chkpt['Eval']
        print("ℹ️ Konfigurasi Eval dihapus karena data validasi tidak ditemukan.")

# Hapus section yang tidak perlu
if 'Test' in yaml_cfg_6_chkpt:
    del yaml_cfg_6_chkpt['Test']
    print("ℹ️ Konfigurasi Test dihapus.")

# --- Simpan Konfigurasi ---
with open(CUSTOM_CFG_PATH_SEL6_chkpt, 'w', encoding='utf-8') as f_y_out_6_chkpt:
    yaml.dump(yaml_cfg_6_chkpt, f_y_out_6_chkpt, sort_keys=False, allow_unicode=True)

print(f"\n✅ Konfigurasi baru berhasil disimpan di: {CUSTOM_CFG_PATH_SEL6_chkpt}")
print("="*60)
print("Isi Konfigurasi Akhir:")
!cat "{CUSTOM_CFG_PATH_SEL6_chkpt}"
print("="*60)

print("\n🎉 SEL 6: Modifikasi File Konfigurasi YAML Selesai!")

In [None]:
# DOWNLOAD THE PP-OCRv3 FILES FOR FINE-TUNING
# NOTE(review): the configuration cell removes Global.pretrained_model and the
# training command does not pass it, so nothing visible in this notebook
# points training at the weights downloaded here — confirm whether they
# should be referenced.

%cd /content/PaddleOCR_Official

# Download the pre-trained model archive
print("Downloading pre-trained model...")
!wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar

# Extract the .tar archive
print("\nExtracting model...")
!tar -xf en_PP-OCRv3_rec_train.tar

# Verify the result; a new folder named 'en_PP-OCRv3_rec_train' is expected
print("\nExtraction complete. Directory contents:")
!ls -l

In [None]:
# CHECKPOINT BACKUP CONFIGURATION

from google.colab import drive
import os
import shutil
import time
from threading import Thread

# 1. Mount Drive
drive.mount('/content/drive')

# 2. Dedicated backup folder
BACKUP_DIR = "/content/drive/MyDrive/OCR_Training_Backup"
os.makedirs(BACKUP_DIR, exist_ok=True)

# 3. Files that must be backed up.
#    The label, dictionary and config entries use the paths produced by the
#    earlier cells when those variables exist, and otherwise fall back to the
#    default locations those cells write to. The previous hard-coded entries
#    ("/content/processed_data/train_label.txt", "/content/config.yml") are not
#    created anywhere in this notebook, so they were always skipped.
ESSENTIAL_FILES = [
    "/content/PaddleOCR_Official/output/",  # Model folder
    "/content/PaddleOCR_Official/train.log",  # Training log (skipped if absent)
    globals().get(
        'output_train_label_reco_file_sel3',
        "/content/processed_recognition_data/train_label_recognition.txt",
    ),  # Training labels
    globals().get(
        'path_char_dict_output_sel4',
        "/content/processed_recognition_data/custom_char_dict.txt",
    ),  # Character dictionary (needed to use the trained model)
    globals().get(
        'CUSTOM_CFG_PATH_SEL6_chkpt',
        "/content/PaddleOCR_Official/rec_ppocrv4_pi5.yml",
    ),  # Training configuration
]

def _backup_once(files, backup_dir):
    """Write one ``backup_<unix time>.tar.gz`` containing the existing ``files``.

    Returns the archive path, or ``None`` when none of the paths exist.
    """
    import tarfile  # local import: this block must not depend on the cell's import list

    os.makedirs(backup_dir, exist_ok=True)
    archive_path = os.path.join(backup_dir, f"backup_{int(time.time())}.tar.gz")
    # Write under a temporary name so an interrupted run never leaves a
    # truncated file that looks like a valid backup.
    partial_path = archive_path + ".partial"
    added_count = 0
    try:
        with tarfile.open(partial_path, "w:gz") as archive:
            for path in files:
                if not os.path.exists(path):
                    continue
                # normpath drops a trailing "/" so a directory keeps its own
                # name inside the archive (basename("dir/") would be "").
                archive.add(path, arcname=os.path.basename(os.path.normpath(path)))
                added_count += 1
        if added_count == 0:
            return None
        os.replace(partial_path, archive_path)
        return archive_path
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)


def selective_backup(interval_seconds=1800, max_runs=None, files=None, backup_dir=None):
    """Periodically archive the essential training files into the backup folder.

    Each run packs the source files straight into a new tarball. Nothing is
    staged inside the backup folder, so an archive never contains earlier
    archives or itself. Pure-Python archiving is used because this function
    runs in a background thread.

    Args:
        interval_seconds: Pause between two backups (default: 30 minutes).
        max_runs: Stop after this many runs; ``None`` (default) loops forever.
        files: Paths to back up; defaults to the module-level ``ESSENTIAL_FILES``.
        backup_dir: Destination folder; defaults to the module-level ``BACKUP_DIR``.
    """
    completed_runs = 0
    while max_runs is None or completed_runs < max_runs:
        try:
            archive_path = _backup_once(
                ESSENTIAL_FILES if files is None else files,
                BACKUP_DIR if backup_dir is None else backup_dir,
            )
            if archive_path is None:
                print("ℹ️ Belum ada file untuk dibackup.")
            else:
                print(f"✅ Backup sukses: {time.ctime()}")
        except Exception as e:
            # Best effort: a failed backup must never stop the loop.
            print(f"❌ Gagal backup: {e}")

        completed_runs += 1
        time.sleep(interval_seconds)  # Wait before the next backup

# Run the backup loop in the background. daemon=True means the thread does
# not keep the Python process alive on its own.
backup_thread = Thread(target=selective_backup, daemon=True)
backup_thread.start()

In [None]:
# START TRAINING WITH CHECKPOINT RESUME
# Builds the `tools/train.py` command for the custom config and runs it; the
# script path is relative, so the working directory must be the PaddleOCR repo.

print("\n\n🚀 SEL 7: Memulai Pelatihan Model...")

# 1. Verify the environment
print("🔍 Verifikasi Environment:")
print(f"Working Directory: {os.getcwd()}")
print("Isi output folder:")
!ls -lh "./output"

# 2. Prepare the training command
train_cmd = f'python tools/train.py -c "{CUSTOM_CFG_PATH_SEL6_chkpt}"'  # Uses the config path from SEL 6

# 3. Add the resume parameter when a checkpoint exists
#    (only the .pdparams file is checked here)
checkpoint_path = os.path.join(SAVED_MODEL_DIR_SEL6, "latest.pdparams")  # Uses the model directory from SEL 6
if os.path.exists(checkpoint_path):
    # The checkpoint is passed by prefix ("latest"), without file extension.
    train_cmd += f' -o Global.checkpoints="{os.path.join(SAVED_MODEL_DIR_SEL6, "latest")}"'
    print(f"✅ Akan melanjutkan dari checkpoint: {checkpoint_path}")
else:
    print("ℹ️ Tidak ditemukan checkpoint, training dari awal")

# 4. Run the training
print(f"\n🚀 Menjalankan perintah:\n{train_cmd}")
!{train_cmd}

# NOTE(review): this message is printed regardless of the command's exit status.
print("\n🎉 SEL 7: Proses Training Selesai!")