# YOLO-OBB Mixed Data Training (From Scratch)

This notebook trains YOLO-OBB models **from scratch** on varying mixtures of Synthetic and Real-world data.

**Experiments:**
1.  **Mixed Data Training**: Train on 100% Synthetic Data + [5%, 10%, 20%, 30%, 40%] of Real Data.
2.  **Real Data Baselines**: Train on ONLY [5%, 10%, 20%, 30%, 40%] of Real Data.

**Goal**: Evaluate if adding synthetic data improves performance compared to using only limited real data.


## 1. Environment Setup

In [1]:
# 1. Setup & Imports
import os
import random
import shutil
import numpy as np
from pathlib import Path
from tqdm import tqdm
import yaml
from ultralytics import YOLO
import wandb



## 2. Configuration

In [2]:
# 2. Configuration
# Single place for every tunable used below: dataset locations, the model
# checkpoint, the Ultralytics training arguments and the random seed.

# --- DATASET PATHS ---
# Relative paths; they are resolved against the notebook's working directory
# when the dataset files are written (see the create_*_yaml functions).
REAL_DATASET_PATH = Path("Real_Data/train(80_ REAL DATA)")
SYNTHETIC_DATASET_PATH = Path("Synthetized_Data")
# Used for BOTH the `val` and `test` entries of the generated dataset YAMLs.
TEST_DATASET_PATH = Path("Real_Data/test(20_ REAL DATA)")
# Destination folder for the best checkpoint of each run.
DRIVE_UPLOAD_FOLDER = Path("YOLO_OBB_Mixed_Training_Results")

# Ensure output folder exists
DRIVE_UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

# --- MODEL CONFIG ---
# NOTE(review): this notebook is titled "from scratch", but BASE_MODEL is a
# `.pt` checkpoint and the training log prints
# "Overriding model.yaml nc=15 with nc=1", which suggests weights of a
# 15-class model are loaded. Confirm whether a `.yaml` architecture file was
# intended instead.
BASE_MODEL = "yolo26l-obb.pt"  # Passed to YOLO(...) by the run_* wrappers
WANDB_PROJECT = "YOUR_WANDB_PROJECT"  # Passed as `project=` to model.train()
img_exts = [".jpg", ".jpeg", ".png"]  # Default endings accepted by get_image_files

# --- TRAINING PARAMETERS ---
# Unpacked as keyword arguments into model.train() by the run_* wrappers.
# NOTE(review): the original comment called lr0 a "high" learning rate; the
# Ultralytics documentation lists 0.01 as the default lr0, so 0.001 is lower
# than that default — confirm the intended value.
TRAIN_CONFIG = {
    "epochs": 100,
    "imgsz": 640,
    "batch": 16,
    "workers": 8,
    "optimizer": "AdamW",
    "lr0": 0.001,       # Initial learning rate
    "lrf": 0.01,        # Final LR fraction
    "momentum": 0.937,
    "weight_decay": 0.0005,
    "patience": 25,     # Early stopping
    "save_period": 10,
    "val": True,
    "device": 0,
    "exist_ok": True,
    # Augmentations (Standard YOLO settings or custom)
    "hsv_h": 0.015, "hsv_s": 0.7, "hsv_v": 0.4,
    "degrees": 10.0,
    "translate": 0.1,
    "scale": 0.5,
    "fliplr": 0.5,
    "mosaic": 1.0,
    "mixup": 0.0,
}

# Seeds Python's global `random` module only. numpy is not seeded here and no
# `seed` entry is placed in TRAIN_CONFIG.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print("Configuration loaded.")

Configuration loaded.


## 3. Data Processing & Training Functions

In [3]:
# --- DATASET SUBFOLDERS ---
# Names of the capture folders that get_all_synthetic_images() scans.
# For each name it looks for the directory <base_path>/<name>/images and
# prints a warning when that directory is missing.
SYNTHETIC_SUBFOLDERS = [
    "Capture_as_our_dataset",
    "Capture_scene_8",
    "Capture_zone_with_human_occ2",
    "scene_10"
]

def get_image_files(directory, extensions=img_exts):
    """Recursively collect image files below ``directory``.

    Args:
        directory: Folder to search, given as ``str`` or ``Path``.
        extensions: Filename endings to accept (for example ``".jpg"``).
            Matching ignores case, so ``photo.JPG`` is found as well.

    Returns:
        A sorted list of unique ``Path`` objects. The list is empty when
        ``directory`` does not exist or ``extensions`` is empty.
    """
    directory = Path(directory)
    if not directory.exists():
        return []
    # Lower-case the endings once; str.endswith accepts a tuple of candidates.
    endings = tuple(ext.lower() for ext in extensions)
    # One walk over the tree instead of one rglob per extension. A single walk
    # yields each path once, so no de-duplication step is needed.
    return sorted(
        path
        for path in directory.rglob("*")
        if path.is_file() and path.name.lower().endswith(endings)
    )

def get_all_synthetic_images(base_path, subfolders):
    """Gather the images of every synthetic capture folder.

    For each name in ``subfolders`` the directory ``base_path/<name>/images``
    is scanned with ``get_image_files``. A directory that does not exist is
    reported with a warning and skipped.

    Args:
        base_path: ``Path`` of the folder containing the capture sub-folders.
        subfolders: Iterable of sub-folder names.

    Returns:
        Sorted list of unique image paths found across all sub-folders.
    """
    collected = set()
    for folder_name in subfolders:
        # Layout: <base_path>/<folder_name>/images
        images_dir = base_path / folder_name / "images"
        if not images_dir.exists():
            print(f"Warning: {images_dir} not found!")
            continue
        print(f"Scanning {images_dir}...")
        collected.update(get_image_files(images_dir))
    return sorted(collected)

def create_mixed_dataset_yaml(synthetic_path, real_path, real_percentage, run_name):
    """Write the dataset files for a run on 100% synthetic + X% real data.

    Two files are created in the current working directory:
    ``<run_name>_train.txt`` (one absolute image path per line) and
    ``<run_name>_dataset.yaml`` (the dataset description handed to training).

    Args:
        synthetic_path: Folder holding the ``SYNTHETIC_SUBFOLDERS``.
        real_path: Folder whose ``images`` sub-folder holds the real images.
        real_percentage: Fraction (0.0-1.0) of the real images to include.
        run_name: Prefix for the two generated files.

    Returns:
        ``Path`` of the generated YAML file.

    Raises:
        ValueError: If no training image was found at all.
    """
    print(f"\n--- Preparing Mixed Dataset for {run_name} ---")

    # 1. All synthetic images from the configured sub-folders.
    synth_images = get_all_synthetic_images(synthetic_path, SYNTHETIC_SUBFOLDERS)
    print(f"Found {len(synth_images)} Synthetic images.")

    # 2. Real images, shuffled and truncated to the requested fraction.
    real_images = get_image_files(real_path / "images")
    # A private RNG seeded with RANDOM_SEED keeps the selection independent of
    # the global `random` state, i.e. of which cells ran before this one.
    # The shuffled order is therefore the same on every call, so a smaller
    # fraction is always a prefix (subset) of a larger one.
    random.Random(RANDOM_SEED).shuffle(real_images)

    subset_size = int(len(real_images) * real_percentage)
    real_subset = real_images[:subset_size]
    print(f"Found {len(real_images)} Real images. Using {real_percentage*100}% -> {len(real_subset)} images.")

    # 3. Combine both sources.
    mixed_train_images = synth_images + real_subset
    print(f"Total Mixed Training Images: {len(mixed_train_images)}")
    if not mixed_train_images:
        # Fail here with a clear message instead of writing an empty list file.
        raise ValueError(
            f"No training images found under {synthetic_path} or {real_path / 'images'}."
        )

    # 4. Write train.txt (explicit UTF-8 so non-ASCII paths are written
    #    the same way regardless of the platform's default encoding).
    train_txt_path = Path(f"{run_name}_train.txt")
    with open(train_txt_path, 'w', encoding="utf-8") as f:
        f.write('\n'.join([str(p.resolve()) for p in mixed_train_images]))

    # 5. Create YAML.
    # NOTE(review): `val` and `test` both point at TEST_DATASET_PATH, and
    # TRAIN_CONFIG enables validation with early stopping — confirm that
    # validating on the test split is intended.
    val_path = TEST_DATASET_PATH

    yaml_content = f"""
path: {Path.cwd()}
train: {train_txt_path.resolve()}
val: {val_path.resolve()}
test: {val_path.resolve()}

nc: 1
names: ['object']
"""
    yaml_path = Path(f"{run_name}_dataset.yaml")
    with open(yaml_path, 'w', encoding="utf-8") as f:
        f.write(yaml_content)

    return yaml_path

def create_real_only_yaml(real_path, percentage, run_name):
    """Write the dataset files for a run on ONLY X% of the real data.

    Two files are created in the current working directory:
    ``<run_name>_train.txt`` (one absolute image path per line) and
    ``<run_name>_dataset.yaml`` (the dataset description handed to training).

    Args:
        real_path: Folder whose ``images`` sub-folder holds the real images.
        percentage: Fraction (0.0-1.0) of the real images to use.
        run_name: Prefix for the two generated files.

    Returns:
        ``Path`` of the generated YAML file.

    Raises:
        ValueError: If the selected subset contains no image.
    """
    print(f"\n--- Preparing Real-Only Dataset for {run_name} ---")

    # 1. Real images, shuffled and truncated to the requested fraction.
    real_images = get_image_files(real_path / "images")
    # A private RNG seeded with RANDOM_SEED keeps the selection independent of
    # the global `random` state, i.e. of which cells ran before this one.
    # The shuffled order is therefore the same on every call, so a smaller
    # fraction is always a prefix (subset) of a larger one.
    random.Random(RANDOM_SEED).shuffle(real_images)

    subset_size = int(len(real_images) * percentage)
    real_subset = real_images[:subset_size]
    print(f"Found {len(real_images)} Real images. Using {percentage*100}% -> {len(real_subset)} images.")
    if not real_subset:
        # Fail here with a clear message instead of writing an empty list file.
        raise ValueError(
            f"No real training images selected from {real_path / 'images'} "
            f"(found {len(real_images)}, fraction {percentage})."
        )

    # 2. Write train.txt (explicit UTF-8 so non-ASCII paths are written
    #    the same way regardless of the platform's default encoding).
    train_txt_path = Path(f"{run_name}_train.txt")
    with open(train_txt_path, 'w', encoding="utf-8") as f:
        f.write('\n'.join([str(p.resolve()) for p in real_subset]))

    # 3. Create YAML.
    # NOTE(review): `val` and `test` both point at TEST_DATASET_PATH, and
    # TRAIN_CONFIG enables validation with early stopping — confirm that
    # validating on the test split is intended.
    val_path = TEST_DATASET_PATH

    yaml_content = f"""
path: {Path.cwd()}
train: {train_txt_path.resolve()}
val: {val_path.resolve()}
test: {val_path.resolve()}

nc: 1
names: ['object']
"""
    yaml_path = Path(f"{run_name}_dataset.yaml")
    with open(yaml_path, 'w', encoding="utf-8") as f:
        f.write(yaml_content)

    return yaml_path

# --- TRAIN EXECUTION WRAPPERS ---

def run_mixed_experiment(pct):
    """Train one model on 100% synthetic + ``pct`` of the real data.

    Builds the dataset files, loads ``BASE_MODEL``, trains with
    ``TRAIN_CONFIG`` and copies the resulting ``best.pt`` to
    ``DRIVE_UPLOAD_FOLDER/<run_name>.pt``.

    Args:
        pct: Fraction (0.0-1.0) of the real training images to add.
    """
    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 and mislabel the run.
    pct_str = f"{round(pct * 100)}pct"
    run_name = f"yolo26l_mixed_{pct_str}"
    print(f"\n>>> STARTING MIXED EXPERIMENT: {run_name} <<<")

    # Data
    dataset_yaml = create_mixed_dataset_yaml(SYNTHETIC_DATASET_PATH, REAL_DATASET_PATH, pct, run_name)

    # Model: keep the best-effort fallback, but never switch architecture
    # silently and never swallow KeyboardInterrupt/SystemExit.
    try:
        model = YOLO(BASE_MODEL)
    except Exception as exc:
        fallback_model = "yolo11l-obb.pt"
        print(
            f"Warning: could not load {BASE_MODEL} ({exc!r}); "
            f"falling back to {fallback_model}. "
            f"The run name '{run_name}' will NOT reflect this."
        )
        model = YOLO(fallback_model)

    # Train
    model.train(data=str(dataset_yaml), project=WANDB_PROJECT, name=run_name, **TRAIN_CONFIG)

    # Save the best checkpoint next to the other experiment results.
    best = Path(model.trainer.save_dir) / "weights" / "best.pt"
    target = DRIVE_UPLOAD_FOLDER / f"{run_name}.pt"
    if best.exists():
        shutil.copy(best, target)
        print(f"Saved to {target}")
    else:
        print(f"Warning: {best} not found; nothing was copied to {target}")

def run_real_only_experiment(pct):
    """Train one baseline model on ``pct`` of the real data only.

    Builds the dataset files, loads ``BASE_MODEL``, trains with
    ``TRAIN_CONFIG`` and copies the resulting ``best.pt`` to
    ``DRIVE_UPLOAD_FOLDER/<run_name>.pt``.

    Args:
        pct: Fraction (0.0-1.0) of the real training images to use.
    """
    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 and mislabel the run.
    pct_str = f"{round(pct * 100)}pct"
    run_name = f"yolo26l_real_only_{pct_str}"
    print(f"\n>>> STARTING REAL-ONLY EXPERIMENT: {run_name} <<<")

    # Data
    dataset_yaml = create_real_only_yaml(REAL_DATASET_PATH, pct, run_name)

    # Model: keep the best-effort fallback, but never switch architecture
    # silently and never swallow KeyboardInterrupt/SystemExit.
    try:
        model = YOLO(BASE_MODEL)
    except Exception as exc:
        fallback_model = "yolo11l-obb.pt"
        print(
            f"Warning: could not load {BASE_MODEL} ({exc!r}); "
            f"falling back to {fallback_model}. "
            f"The run name '{run_name}' will NOT reflect this."
        )
        model = YOLO(fallback_model)

    # Train
    model.train(data=str(dataset_yaml), project=WANDB_PROJECT, name=run_name, **TRAIN_CONFIG)

    # Save the best checkpoint next to the other experiment results.
    best = Path(model.trainer.save_dir) / "weights" / "best.pt"
    target = DRIVE_UPLOAD_FOLDER / f"{run_name}.pt"
    if best.exists():
        shutil.copy(best, target)
        print(f"Saved to {target}")
    else:
        print(f"Warning: {best} not found; nothing was copied to {target}")


## 4. Mixed Training Experiments
Each run below trains on 100% of the synthetic data plus X% of the real training data.

### 5% Data Experiment

In [None]:
# Fraction of the real training images added to the full synthetic set (5%).
percentage = 0.05
run_mixed_experiment(percentage)


### 10% Data Experiment

In [5]:
# Fraction of the real training images added to the full synthetic set (10%).
percentage = 0.10
run_mixed_experiment(percentage)



>>> STARTING MIXED EXPERIMENT: yolo26l_mixed_10pct <<<

--- Preparing Mixed Dataset for yolo26l_mixed_10pct ---
Scanning Synthetized_Data/Capture_as_our_dataset/images...
Scanning Synthetized_Data/Capture_scene_8/images...
Scanning Synthetized_Data/Capture_zone_with_human_occ2/images...
Scanning Synthetized_Data/scene_10/images...
Found 817 Synthetic images.
Found 1044 Real images. Using 10.0% -> 104 images.
Total Mixed Training Images: 921
New https://pypi.org/project/ultralytics/8.4.10 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.4.9 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (NVIDIA H100 NVL MIG 1g.24gb, 22144MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo26l_mixed_10pct_dataset.yaml, degrees=10.0, determin

Overriding model.yaml nc=15 with nc=1

                   from  n    params  module                                       arguments                     
  0                  -1  1      1856  ultralytics.nn.modules.conv.Conv             [3, 64, 3, 2]                 
  1                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  2                  -1  2    173824  ultralytics.nn.modules.block.C3k2            [128, 256, 2, True, 0.25]     
  3                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
  4                  -1  2    691712  ultralytics.nn.modules.block.C3k2            [256, 512, 2, True, 0.25]     
  5                  -1  1   2360320  ultralytics.nn.modules.conv.Conv             [512, 512, 3, 2]              
  6                  -1  2   2234368  ultralytics.nn.modules.block.C3k2            [512, 512, 2, True]           
  7                  -1  1   2360320  ultralytics

0,1
lr/pg0,▃████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁
lr/pg1,▃███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁
lr/pg2,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
metrics/mAP50(B),▁▂▅▄▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇█████████████████
metrics/mAP50-95(B),▁▃▄▅▅▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇██████████████████
metrics/precision(B),▁▂▃▅▅▆▅▇▇▇█▇▇▇▇▇▇▇█▇▇▇█▇▇▇▇▇█▇██████████
metrics/recall(B),▁▂▂▄▄▅▅▆▆▆▆▆▇▇▆▆▇▆▆▇▇▇█▇▇▇▇██▇██████████
model/GFLOPs,▁
model/parameters,▁
model/speed_PyTorch(ms),▁

0,1
lr/pg0,2e-05
lr/pg1,2e-05
lr/pg2,2e-05
metrics/mAP50(B),0.89346
metrics/mAP50-95(B),0.70215
metrics/precision(B),0.87792
metrics/recall(B),0.8218
model/GFLOPs,100.51
model/parameters,27875556
model/speed_PyTorch(ms),9.709


Saved to YOLO_OBB_Mixed_Training_Results/yolo26l_mixed_10pct.pt


### 20% Data Experiment

In [6]:
# Fraction of the real training images added to the full synthetic set (20%).
percentage = 0.20
run_mixed_experiment(percentage)



>>> STARTING MIXED EXPERIMENT: yolo26l_mixed_20pct <<<

--- Preparing Mixed Dataset for yolo26l_mixed_20pct ---
Scanning Synthetized_Data/Capture_as_our_dataset/images...
Scanning Synthetized_Data/Capture_scene_8/images...
Scanning Synthetized_Data/Capture_zone_with_human_occ2/images...
Scanning Synthetized_Data/scene_10/images...
Found 817 Synthetic images.
Found 1044 Real images. Using 20.0% -> 208 images.
Total Mixed Training Images: 1025
New https://pypi.org/project/ultralytics/8.4.10 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.4.9 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (NVIDIA H100 NVL MIG 1g.24gb, 22144MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo26l_mixed_20pct_dataset.yaml, degrees=10.0, determi

Overriding model.yaml nc=15 with nc=1

                   from  n    params  module                                       arguments                     
  0                  -1  1      1856  ultralytics.nn.modules.conv.Conv             [3, 64, 3, 2]                 
  1                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  2                  -1  2    173824  ultralytics.nn.modules.block.C3k2            [128, 256, 2, True, 0.25]     
  3                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
  4                  -1  2    691712  ultralytics.nn.modules.block.C3k2            [256, 512, 2, True, 0.25]     
  5                  -1  1   2360320  ultralytics.nn.modules.conv.Conv             [512, 512, 3, 2]              
  6                  -1  2   2234368  ultralytics.nn.modules.block.C3k2            [512, 512, 2, True]           
  7                  -1  1   2360320  ultralytics



0,1
lr/pg0,▃████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁
lr/pg1,▆███▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁
lr/pg2,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
metrics/mAP50(B),▁▃▄▄▄▅▆▅▆▆▇▇▇▇▇▇▇█▇███▇█▇███████████████
metrics/mAP50-95(B),▁▂▄▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
metrics/precision(B),▁▄▆▅▅▆▆▆▇▇▇▆▇▇▇▇▇▆▇▇█▇▇█▇▇▇▇██▇████▇▇▇▇█
metrics/recall(B),▁▄▅▃▅▅▅▆▆▆▆▆▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████▇█
model/GFLOPs,▁
model/parameters,▁
model/speed_PyTorch(ms),▁

0,1
lr/pg0,2e-05
lr/pg1,2e-05
lr/pg2,2e-05
metrics/mAP50(B),0.89473
metrics/mAP50-95(B),0.73478
metrics/precision(B),0.87409
metrics/recall(B),0.82685
model/GFLOPs,100.51
model/parameters,27875556
model/speed_PyTorch(ms),9.675


Saved to YOLO_OBB_Mixed_Training_Results/yolo26l_mixed_20pct.pt


### 30% Data Experiment

In [7]:
# Fraction of the real training images added to the full synthetic set (30%).
percentage = 0.30
run_mixed_experiment(percentage)



>>> STARTING MIXED EXPERIMENT: yolo26l_mixed_30pct <<<

--- Preparing Mixed Dataset for yolo26l_mixed_30pct ---
Scanning Synthetized_Data/Capture_as_our_dataset/images...
Scanning Synthetized_Data/Capture_scene_8/images...
Scanning Synthetized_Data/Capture_zone_with_human_occ2/images...
Scanning Synthetized_Data/scene_10/images...
Found 817 Synthetic images.
Found 1044 Real images. Using 30.0% -> 313 images.
Total Mixed Training Images: 1130
New https://pypi.org/project/ultralytics/8.4.10 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.4.9 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (NVIDIA H100 NVL MIG 1g.24gb, 22144MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo26l_mixed_30pct_dataset.yaml, degrees=10.0, determi

Overriding model.yaml nc=15 with nc=1

                   from  n    params  module                                       arguments                     
  0                  -1  1      1856  ultralytics.nn.modules.conv.Conv             [3, 64, 3, 2]                 
  1                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  2                  -1  2    173824  ultralytics.nn.modules.block.C3k2            [128, 256, 2, True, 0.25]     
  3                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
  4                  -1  2    691712  ultralytics.nn.modules.block.C3k2            [256, 512, 2, True, 0.25]     
  5                  -1  1   2360320  ultralytics.nn.modules.conv.Conv             [512, 512, 3, 2]              
  6                  -1  2   2234368  ultralytics.nn.modules.block.C3k2            [512, 512, 2, True]           
  7                  -1  1   2360320  ultralytics

0,1
lr/pg0,▃████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▁▁▁
lr/pg1,▃███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁
lr/pg2,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
metrics/mAP50(B),▁▃▃▅▅▅▇▇▇▇▇▇▇███▇████▇██████████████████
metrics/mAP50-95(B),▁▃▄▄▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
metrics/precision(B),▁▄▄▅▄▆▆▆▇▇▇▇▇▇██▇▇▇██▇█████████▇█▇██████
metrics/recall(B),▁▃▄▄▅▅▅▅▆▅▆▆▆▇▇▆▆▇▆▇▆▆▇▇▇█▇▇▇██▇█▇█▇███▇
model/GFLOPs,▁
model/parameters,▁
model/speed_PyTorch(ms),▁

0,1
lr/pg0,2e-05
lr/pg1,2e-05
lr/pg2,2e-05
metrics/mAP50(B),0.91534
metrics/mAP50-95(B),0.76719
metrics/precision(B),0.89447
metrics/recall(B),0.83742
model/GFLOPs,100.51
model/parameters,27875556
model/speed_PyTorch(ms),9.711


Saved to YOLO_OBB_Mixed_Training_Results/yolo26l_mixed_30pct.pt


### 40% Data Experiment

In [None]:
# Fraction of the real training images added to the full synthetic set (40%).
percentage = 0.40
run_mixed_experiment(percentage)



>>> STARTING MIXED EXPERIMENT: yolo26l_mixed_40pct <<<

--- Preparing Mixed Dataset for yolo26l_mixed_40pct ---
Scanning Synthetized_Data/Capture_as_our_dataset/images...
Scanning Synthetized_Data/Capture_scene_8/images...
Scanning Synthetized_Data/Capture_zone_with_human_occ2/images...
Scanning Synthetized_Data/scene_10/images...
Found 817 Synthetic images.
Found 1044 Real images. Using 40.0% -> 417 images.
Total Mixed Training Images: 1234
New https://pypi.org/project/ultralytics/8.4.10 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.4.9 🚀 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (NVIDIA H100 NVL MIG 1g.24gb, 22144MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo26l_mixed_40pct_dataset.yaml, degrees=10.0, determi

Overriding model.yaml nc=15 with nc=1

                   from  n    params  module                                       arguments                     
  0                  -1  1      1856  ultralytics.nn.modules.conv.Conv             [3, 64, 3, 2]                 
  1                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  2                  -1  2    173824  ultralytics.nn.modules.block.C3k2            [128, 256, 2, True, 0.25]     
  3                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
  4                  -1  2    691712  ultralytics.nn.modules.block.C3k2            [256, 512, 2, True, 0.25]     
  5                  -1  1   2360320  ultralytics.nn.modules.conv.Conv             [512, 512, 3, 2]              
  6                  -1  2   2234368  ultralytics.nn.modules.block.C3k2            [512, 512, 2, True]           
  7                  -1  1   2360320  ultralytics