In [3]:
import kagglehub
import os
import shutil
import glob
import cv2
from tqdm import tqdm
import yaml
from sklearn.model_selection import train_test_split
import kagglehub

In [None]:

NUMBER_TO_CLASS = {
    '1': 'punching',
    '2': 'welding_line',
    '3': 'crescent_gap',
    '4': 'water_spot',
    '5': 'oil_spot',
    '6': 'silk_spot',
    '7': 'inclusion',
    '8': 'rolled_pit',
    '9': 'crease',
    '10': 'waist_folding'
}

base_dir = os.path.join(os.getcwd(), 'gc10_yolo_dataset')
raw_dataset_path = r"C:\Users\desly\.cache\kagglehub\datasets\alex000kim\gc10det\versions\1"

# Clean up previous failed attempt
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
    print(f"Cleaned up old directory: {base_dir}")

images_dir = os.path.join(base_dir, 'images')
labels_dir = os.path.join(base_dir, 'labels')

for split in ['train', 'val']:
    os.makedirs(os.path.join(images_dir, split), exist_ok=True)
    os.makedirs(os.path.join(labels_dir, split), exist_ok=True)


print(">>> Processing images from numbered folders...")


found_images = {} 

for folder_name in os.listdir(raw_dataset_path):
    # Only process folders '1' to '10'
    if folder_name not in NUMBER_TO_CLASS:
        continue
        
    class_name = NUMBER_TO_CLASS[folder_name]
    folder_path = os.path.join(raw_dataset_path, folder_name)
    
    # Collect all images in this folder
    images = []
    for ext in ['*.jpg', '*.png', '*.jpeg', '*.bmp']:
        images.extend(glob.glob(os.path.join(folder_path, ext)))
    
    if images:
        found_images[class_name] = images

# Verify we found classes
print(f"Found {len(found_images)} classes: {list(found_images.keys())}")
if len(found_images) == 0:
    raise ValueError("Still no images found. Check if the '1', '2'... folders inside the path actually contain .jpg files.")

# Create Class ID Mapping (0 to 9)
class_names_ordered = list(found_images.keys())
class_to_id = {name: idx for idx, name in enumerate(class_names_ordered)}

#Split and save
for class_name, file_list in found_images.items():
    current_id = class_to_id[class_name]
    
    # Split 90/10
    train_files, val_files = train_test_split(file_list, test_size=0.1, random_state=42)
    
    for split, files in [('train', train_files), ('val', val_files)]:
        for img_path in tqdm(files, desc=f"Processing {class_name} ({split})"):
            
            # 1. Create Fake Label (Center of image)
            # Format: class_id x_center y_center width height
            label_data = f"{current_id} 0.5 0.5 1.0 1.0\n"
            
            # 2. Define filenames
            file_name = os.path.basename(img_path)
            unique_name = f"{class_name}_{file_name}"
            
            # 3. Copy Image
            shutil.copy(img_path, os.path.join(images_dir, split, unique_name))
            
            # 4. Save Label
            txt_name = os.path.splitext(unique_name)[0] + ".txt"
            with open(os.path.join(labels_dir, split, txt_name), 'w') as f:
                f.write(label_data)

# ==========================================
# 4. Create data.yaml
# ==========================================
yaml_content = {
    'path': base_dir,
    'train': 'images/train',
    'val': 'images/val',
    'nc': len(class_names_ordered),
    'names': {i: name for i, name in enumerate(class_names_ordered)}
}

yaml_path = os.path.join(base_dir, 'data.yaml')
with open(yaml_path, 'w') as f:
    yaml.dump(yaml_content, f)

print(f"\n>>> SUCCESS! Dataset prepared at: {base_dir}")
print(f"Use this path in your training: {yaml_path}")

Cleaned up old directory: d:\Backup\WORK\MACH-3D\ModelTraining\gc10_yolo_dataset
>>> Processing images from numbered folders...
Found 10 classes: ['punching', 'waist_folding', 'welding_line', 'crescent_gap', 'water_spot', 'oil_spot', 'silk_spot', 'inclusion', 'rolled_pit', 'crease']


Processing punching (train): 100%|██████████| 197/197 [00:01<00:00, 103.46it/s]
Processing punching (val): 100%|██████████| 22/22 [00:00<00:00, 89.24it/s]
Processing waist_folding (train): 100%|██████████| 135/135 [00:01<00:00, 110.04it/s]
Processing waist_folding (val): 100%|██████████| 15/15 [00:00<00:00, 107.19it/s]
Processing welding_line (train): 100%|██████████| 245/245 [00:02<00:00, 100.78it/s]
Processing welding_line (val): 100%|██████████| 28/28 [00:00<00:00, 89.01it/s] 
Processing crescent_gap (train): 100%|██████████| 203/203 [00:01<00:00, 106.48it/s]
Processing crescent_gap (val): 100%|██████████| 23/23 [00:00<00:00, 109.87it/s]
Processing water_spot (train): 100%|██████████| 260/260 [00:02<00:00, 93.54it/s]
Processing water_spot (val): 100%|██████████| 29/29 [00:00<00:00, 108.25it/s]
Processing oil_spot (train): 100%|██████████| 183/183 [00:01<00:00, 109.35it/s]
Processing oil_spot (val): 100%|██████████| 21/21 [00:00<00:00, 112.03it/s]
Processing silk_spot (train): 100%|█


>>> SUCCESS! Dataset prepared at: d:\Backup\WORK\MACH-3D\ModelTraining\gc10_yolo_dataset
Use this path in your training: d:\Backup\WORK\MACH-3D\ModelTraining\gc10_yolo_dataset\data.yaml



