In [None]:
import shutil
from pathlib import Path
from ultralytics import YOLO

def organize_dataset(source_dir='./graph_dataset', train_ratio=0.8, val_ratio=0.2, seed=None):
    """Split per-class image folders into train/val/test sub-folders, in place.

    Expects ``source_dir/<class_name>/`` folders for the classes listed in
    ``classes`` below. Images (``.jpg``, ``.jpeg``, ``.png``; extension matched
    case-insensitively) are shuffled and *moved* into
    ``source_dir/<split>/<class_name>/``. Class folders that are missing or
    contain no images are skipped; a class folder left empty after the move is
    removed.

    Args:
        source_dir: Root folder that holds one sub-folder per class.
        train_ratio: Fraction of each class assigned to ``train``.
        val_ratio: Fraction of each class assigned to ``val``. Whatever is not
            covered by the two ratios goes to ``test``. When the two ratios
            sum to 1 (or more), no image is sent to ``test``: the rounding
            remainder is given to ``val`` instead.
        seed: Optional seed for a reproducible shuffle. ``None`` (default)
            uses the global ``random`` module state, as before.

    Raises:
        ValueError: If either ratio is negative.
    """
    import random

    # Negative ratios would produce overlapping slices and fail half-way
    # through the move, leaving the dataset partially relocated.
    if train_ratio < 0 or val_ratio < 0:
        raise ValueError("train_ratio and val_ratio must be non-negative")

    source_dir = Path(source_dir)
    classes = ['bar', 'line_1', 'line_2', 'line_3']
    splits = ('train', 'val', 'test')
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    # int() truncation can leave a few images unassigned; if the caller asked
    # for train + val to cover everything, those must not leak into test.
    no_test_split = train_ratio + val_ratio >= 1.0 - 1e-9

    # A dedicated generator keeps a seeded run independent of global state.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Create train/val/test structure
    for split in splits:
        for class_name in classes:
            (source_dir / split / class_name).mkdir(parents=True, exist_ok=True)

    # Move images from class folders to train/val/test splits
    for class_name in classes:
        class_folder = source_dir / class_name
        if not class_folder.is_dir():
            continue

        # Sorted first so that a fixed seed yields the same split regardless
        # of the order in which the filesystem lists the directory.
        images = sorted(
            entry for entry in class_folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )
        if not images:
            continue

        shuffle(images)

        n_train = int(len(images) * train_ratio)
        n_val = int(len(images) * val_ratio)
        val_end = len(images) if no_test_split else n_train + n_val

        train_images = images[:n_train]
        val_images = images[n_train:val_end]
        test_images = images[val_end:]

        assignments = {'train': train_images, 'val': val_images, 'test': test_images}
        for split, members in assignments.items():
            for img in members:
                shutil.move(str(img), str(source_dir / split / class_name / img.name))

        print(f"{class_name}: {len(train_images)} train, {len(val_images)} val, {len(test_images)} test")

        # Remove empty original folder
        if class_folder.exists() and not any(class_folder.iterdir()):
            class_folder.rmdir()

    print(f"\nDataset organized into train/val/test splits in {source_dir}")

def train(data_dir='./graph_dataset', model_dir='./models', epochs=100, imgsz=640, batch_size=16, model_size='n', device='cuda'):
    """Fine-tune a YOLOv8 classification model and export its best weights.

    Loads ``yolov8{model_size}-cls.pt``, trains it on ``data_dir`` and copies
    the best checkpoint of this run to ``model_dir/CE_graph_classifier.pt``.

    Args:
        data_dir: Dataset root passed to ``model.train(data=...)``.
        model_dir: Folder that receives the exported checkpoint; created
            (including parents) if missing.
        epochs: Number of training epochs.
        imgsz: Training image size.
        batch_size: Forwarded as the ``batch`` training argument.
        model_size: YOLOv8 size suffix used to build the weights file name.
        device: Forwarded as the ``device`` training argument. Defaults to
            ``'cuda'`` as before; pass e.g. ``'cpu'`` on machines without a
            GPU.

    Returns:
        Tuple ``(model, results)`` where ``results`` is whatever
        ``model.train`` returned.
    """
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)

    model = YOLO(f'yolov8{model_size}-cls.pt')

    results = model.train(
        data=str(data_dir),
        epochs=epochs,
        imgsz=imgsz,
        batch=batch_size,
        name='graph_classifier',
        patience=20,
        device=device,
        workers=4,
        plots=True,
        val=True,
    )

    # Ask the trainer where this run stored its best checkpoint: when a
    # 'graph_classifier' run directory already exists Ultralytics picks a new
    # directory name, so a hard-coded path could point at stale weights from
    # an earlier run. The fixed path is kept only as a fallback.
    trainer = getattr(model, 'trainer', None)
    best_model_path = getattr(trainer, 'best', None)
    if best_model_path is None:
        best_model_path = 'runs/classify/graph_classifier/weights/best.pt'
    best_model_path = Path(best_model_path)

    exported_path = model_dir / 'CE_graph_classifier.pt'
    if best_model_path.exists():
        shutil.copy(best_model_path, exported_path)
        print(f"\nModel saved to {exported_path}")
    else:
        # Make the missing export visible instead of failing later in validate().
        print(f"\nWarning: best checkpoint not found at {best_model_path}; nothing copied to {exported_path}")

    return model, results

def validate(data_dir='./graph_dataset', model_path='./models/CE_graph_classifier.pt'):
    """Run validation for a saved classifier and print its top-1 accuracy.

    Args:
        data_dir: Dataset root passed to ``model.val(data=...)``.
        model_path: Path of the checkpoint to load.

    Returns:
        The metrics object returned by ``model.val``.
    """
    model = YOLO(model_path)
    metrics = model.val(data=str(data_dir))

    # The 'N/A' fallback must not go through the ':.3f' format spec: applying
    # a float format to a string raises ValueError exactly when the metric is
    # missing.
    top1 = metrics.results_dict.get('metrics/accuracy_top1')
    try:
        accuracy_text = f"{top1:.3f}"
    except (TypeError, ValueError):
        accuracy_text = 'N/A'
    print(f"\nValidation Accuracy: {accuracy_text}")
    return metrics

if __name__ == "__main__":
    # Single place for the dataset location shared by the first two stages.
    dataset_root = './graph_dataset'

    # Stage 1: distribute the labeled class folders over train/val/test.
    # Images not covered by the train and val slices end up in test/.
    print("Organizing dataset...")
    organize_dataset(source_dir=dataset_root, train_ratio=0.8, val_ratio=0.2)

    # Stage 2: fit the classifier.
    # model_size is the YOLOv8 variant suffix: 'n', 's', 'm', 'l' or 'x'.
    print("\nStarting training...")
    training_options = dict(epochs=100, imgsz=640, batch_size=16, model_size='n')
    model, results = train(data_dir=dataset_root, **training_options)

    # Stage 3: evaluate the exported checkpoint using validate()'s defaults.
    print("\nValidating model...")
    validate()

Found 21 image files to process...
[1/21] Processing: Lia_1_p3_fig1.jpeg - ✗ Skipped ([std, dab])
[2/21] Processing: Lia_1_p5_fig1.jpeg - ✗ Skipped (9)
[3/21] Processing: Lia_2_p2_fig1.png