In [None]:
!pip install ultralytics

# Download and merge multiple datasets

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

import shutil
import yaml
from pathlib import Path
from collections import defaultdict
from roboflow import Roboflow

class DatasetCombiner:
    """Download several Roboflow datasets and merge them into one YOLO dataset.

    Every source dataset is read from its ``data.yaml`` plus the
    ``<split>/images`` and ``<split>/labels`` folders. Annotations whose class
    name matches one of ``target_classes`` are kept, renamed through
    ``class_mapping`` and re-indexed against ``final_classes``; everything else
    is dropped. Images left without any annotation are not copied.

    Attributes:
        rf: Roboflow client created from the API key.
        output_dir: ``Path`` of the combined dataset.
        target_classes: names (or name fragments) that make a class eligible.
        class_mapping: source class name -> canonical class name.
        final_classes: canonical names; the list index is the output class id.
    """

    def __init__(self, api_key=None, output_dir="combined_dataset"):
        """Create the Roboflow client and the class filter/mapping tables.

        Args:
            api_key: Roboflow API key. When empty or None, the
                ``ROBOFLOW_API_KEY`` environment variable is used instead.
            output_dir: directory the combined dataset is written to.

        Raises:
            ValueError: if no API key is given and the environment variable
                is not set.
        """
        if not api_key:
            api_key = os.getenv('ROBOFLOW_API_KEY')
            if not api_key:
                raise ValueError("API key must be provided either as parameter or ROBOFLOW_API_KEY environment variable")

        self.rf = Roboflow(api_key=api_key)
        self.output_dir = Path(output_dir)

        # A source class is only kept when it matches one of these names
        # (see should_keep_class).
        # NOTE(review): some class_mapping keys below ('swine', 'dove', 'bird',
        # 'hare', 'stag', 'doe', 'buck') match nothing in this list, so such
        # classes are filtered out before the mapping is ever consulted.
        # Confirm whether that is intended.
        self.target_classes = ['hog', 'boar', 'wild boar', 'pig', 'pigeon', 'rabbit', 'bunny', 'deer', 'squirrel']
        self.class_mapping = {
            # Common variations for wild boar/hog
            'hog': 'hog',
            'boar': 'hog',
            'wild boar': 'hog',
            'wild_boar': 'hog',
            'pig': 'hog',
            'swine': 'hog',

            # Pigeon variations
            'pigeon': 'pigeon',
            'dove': 'pigeon',
            'bird': 'pigeon',  # applied unconditionally if 'bird' ever passes the filter

            # Rabbit variations
            'rabbit': 'rabbit',
            'bunny': 'rabbit',
            'hare': 'rabbit',

            # Deer variations
            'deer': 'deer',
            'roe deer': 'deer',
            'red deer': 'deer',
            'stag': 'deer',
            'doe': 'deer',
            'buck': 'deer',

            # Squirrel variations
            'squirrel': 'squirrel',
            'red squirrel': 'squirrel',
            'grey squirrel': 'squirrel',
            'gray squirrel': 'squirrel'
        }
        # Index in this list == class id written to the combined label files.
        self.final_classes = ['hog', 'pigeon', 'rabbit', 'deer', 'squirrel']

    def download_datasets(self, dataset_configs=None):
        """Download datasets from Roboflow in ``yolov11`` format.

        Args:
            dataset_configs: optional iterable of
                ``(workspace, project_name, version_number)`` tuples. Defaults
                to the five projects this notebook was written for.

        Returns:
            List of local directories (``temp_dataset_<i>``) of the downloads
            that succeeded. A failed download is reported and skipped.
        """
        if dataset_configs is None:
            dataset_configs = [
                ("aflevering1", "my-first-project-xub7r", 1),
                ("my-game-pics", "my-game-pics", 7),
                ("animaldetection-rvmi9", "animal_detection-7wsk6", 5),
                ("uncocos", "pigeon-v8l3q", 6),
                ("trail-camera-training", "trailcam-detection", 2)
            ]
        dataset_configs = list(dataset_configs)
        total = len(dataset_configs)
        datasets = []

        print("Downloading datasets...")
        for i, (workspace, project_name, version_num) in enumerate(dataset_configs, 1):
            location = f"temp_dataset_{i}"
            try:
                print(f"Downloading dataset {i}/{total}: {project_name}")
                project = self.rf.workspace(workspace).project(project_name)
                project.version(version_num).download("yolov11", location=location)
                datasets.append(location)
            except Exception as e:
                # Best effort: one unavailable dataset must not abort the rest.
                print(f"Error downloading dataset {i}: {e}")

        return datasets

    def load_yaml_config(self, dataset_path):
        """Return the parsed ``data.yaml`` of a dataset, or None if it is missing."""
        yaml_path = Path(dataset_path) / "data.yaml"
        if yaml_path.exists():
            with open(yaml_path, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        return None

    @staticmethod
    def _normalize_name(name):
        """Lower-case and strip a class name (non-string names are stringified)."""
        return str(name).lower().strip()

    @staticmethod
    def _best_match(name, candidates):
        """Return the candidate that best matches ``name``, or None.

        Order of preference:
          1. exact match;
          2. the longest candidate contained in ``name`` -- longest first so
             that 'pigeon' beats 'pig' for a class called 'pigeons';
          3. a candidate of which ``name`` is one whole word ('wild' matches
             'wild boar'). Plain substring matching is not used here because
             it made '' and single letters such as 'e' match.

        NOTE: step 2 is still a substring test, so e.g. 'hedgehog' matches
        'hog'.
        """
        if not name:
            return None
        candidates = list(candidates)
        if name in candidates:
            return name
        contained = [c for c in candidates if c and c in name]
        if contained:
            return max(contained, key=len)
        for candidate in candidates:
            if name in candidate.replace('_', ' ').split():
                return candidate
        return None

    def should_keep_class(self, class_name):
        """Return True when ``class_name`` matches one of ``target_classes``."""
        targets = [target.lower() for target in self.target_classes]
        return self._best_match(self._normalize_name(class_name), targets) is not None

    def map_class_name(self, original_name):
        """Map a source class name to its canonical name via ``class_mapping``.

        Names that match no mapping key are returned normalised (lower-cased,
        stripped, spaces replaced by underscores).
        """
        name = self._normalize_name(original_name)
        key = self._best_match(name, self.class_mapping)
        if key is not None:
            return self.class_mapping[key]
        return name.replace(' ', '_')

    def process_annotations(self, annotation_file, class_names, new_class_mapping):
        """Filter and re-index the lines of one YOLO label file.

        Args:
            annotation_file: path of the label file (one object per line,
                first field is the class id).
            class_names: source class names, indexed by source class id.
            new_class_mapping: canonical class name -> output class id.

        Returns:
            List of rewritten annotation lines (without trailing newline).
            Empty when the file does not exist or nothing is kept. Lines with
            fewer than five fields, a non-integer class id, or an id outside
            ``class_names`` are skipped.
        """
        if not os.path.exists(annotation_file):
            return []

        new_annotations = []
        with open(annotation_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue
                try:
                    class_id = int(parts[0])
                except ValueError:
                    continue
                # Reject negative ids explicitly: they would otherwise index
                # class_names from the end.
                if not 0 <= class_id < len(class_names):
                    continue

                original_class = class_names[class_id]
                if not self.should_keep_class(original_class):
                    continue

                mapped_class = self.map_class_name(original_class)
                if mapped_class in new_class_mapping:
                    new_class_id = new_class_mapping[mapped_class]
                    new_annotations.append(f"{new_class_id} {' '.join(parts[1:])}")

        return new_annotations

    @staticmethod
    def _class_names_from_config(names):
        """Return the class names of a ``data.yaml`` as a list indexed by class id."""
        if isinstance(names, dict):
            try:
                ordered = sorted(names.items(), key=lambda item: int(item[0]))
            except (TypeError, ValueError):
                # Keys are not class ids; fall back to file order.
                ordered = list(names.items())
            return [name for _, name in ordered]
        return list(names)

    def _reassign_stats_to_valid(self, stats, moved_images):
        """Shift the counters of images moved from train to valid.

        Reads the label files already moved to ``valid/labels`` so the
        per-class counters stay correct as well.
        """
        for img_path in moved_images:
            stats['train'] -= 1
            stats['valid'] += 1
            label_path = self.output_dir / 'valid' / 'labels' / f"{img_path.stem}.txt"
            if not label_path.exists():
                continue
            with open(label_path, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.split()
                    if not parts:
                        continue
                    class_name = self.final_classes[int(parts[0])]
                    stats[f"train_{class_name}"] -= 1
                    stats[f"valid_{class_name}"] += 1

    def combine_datasets(self, dataset_paths):
        """Merge the given datasets into ``output_dir`` and write its ``data.yaml``.

        Images are renamed ``img_<counter>`` with one counter shared by all
        datasets and splits, so names never collide. Existing files in
        ``output_dir`` are not removed first.

        Args:
            dataset_paths: directories of the source datasets.

        Returns:
            ``output_dir`` as a string.
        """
        print("Combining datasets...")
        splits = ('train', 'valid', 'test')
        image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp'}

        # Create output directory structure
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for split in splits:
            (self.output_dir / split / 'images').mkdir(parents=True, exist_ok=True)
            (self.output_dir / split / 'labels').mkdir(parents=True, exist_ok=True)

        # Canonical class name -> output class id
        new_class_mapping = {class_name: i for i, class_name in enumerate(self.final_classes)}

        image_counter = 0
        stats = defaultdict(int)

        for dataset_path in dataset_paths:
            if not os.path.exists(dataset_path):
                print(f"Dataset path {dataset_path} not found, skipping...")
                continue

            print(f"Processing dataset: {dataset_path}")

            # Load dataset configuration
            config = self.load_yaml_config(dataset_path)
            if not isinstance(config, dict) or not config.get('names'):
                print(f"No valid config found for {dataset_path}, skipping...")
                continue

            original_classes = self._class_names_from_config(config['names'])

            for split in splits:
                images_path = Path(dataset_path) / split / 'images'
                labels_path = Path(dataset_path) / split / 'labels'
                if not images_path.exists():
                    continue

                # Sorted so the numbering of the output files is reproducible.
                for img_file in sorted(images_path.glob('*')):
                    if img_file.suffix.lower() not in image_suffixes:
                        continue

                    label_file = labels_path / f"{img_file.stem}.txt"
                    new_annotations = self.process_annotations(
                        str(label_file), original_classes, new_class_mapping
                    )

                    # Only keep images that have valid annotations
                    if not new_annotations:
                        continue

                    base_name = f"img_{image_counter:06d}"
                    shutil.copy2(img_file, self.output_dir / split / 'images' / f"{base_name}{img_file.suffix}")
                    with open(self.output_dir / split / 'labels' / f"{base_name}.txt", 'w', encoding='utf-8') as f:
                        f.write('\n'.join(new_annotations) + '\n')

                    image_counter += 1
                    stats[split] += 1

                    # Count instances per class
                    for ann in new_annotations:
                        class_name = self.final_classes[int(ann.split()[0])]
                        stats[f"{split}_{class_name}"] += 1

        # Create new data.yaml
        new_config = {
            'path': str(self.output_dir.absolute()),
            'train': 'train/images',
            'val': 'valid/images',
            'test': 'test/images',
            'nc': len(self.final_classes),
            'names': {i: name for i, name in enumerate(self.final_classes)}
        }

        # With neither a validation nor a test split, carve a validation split
        # out of train and move the statistics along with the files.
        # NOTE(review): when only 'valid' is empty but 'test' has images,
        # 'val' still points at an empty folder -- confirm the desired handling.
        if stats.get('test', 0) == 0 and stats.get('valid', 0) == 0:
            print("No validation or test set found, creating validation split from train...")
            moved_images = self.create_validation_split()
            self._reassign_stats_to_valid(stats, moved_images)

        with open(self.output_dir / 'data.yaml', 'w', encoding='utf-8') as f:
            yaml.dump(new_config, f, default_flow_style=False)

        # Print statistics
        self.print_statistics(stats)

        return str(self.output_dir)

    def create_validation_split(self, val_ratio=0.2, seed=None):
        """Move a random share of the train images (and labels) to valid.

        Args:
            val_ratio: fraction of train images to move; the count is
                ``int(n * val_ratio)``.
            seed: optional seed for a reproducible selection. When None the
                module-level ``random`` generator is used.

        Returns:
            List with the new paths of the moved images.
        """
        import random

        train_images_dir = self.output_dir / 'train' / 'images'
        train_labels_dir = self.output_dir / 'train' / 'labels'
        valid_images_dir = self.output_dir / 'valid' / 'images'
        valid_labels_dir = self.output_dir / 'valid' / 'labels'
        valid_images_dir.mkdir(parents=True, exist_ok=True)
        valid_labels_dir.mkdir(parents=True, exist_ok=True)

        # Sort first: glob order is unspecified, which would defeat the seed.
        train_images = sorted(train_images_dir.glob('*'))
        rng = random.Random(seed) if seed is not None else random
        rng.shuffle(train_images)

        val_count = int(len(train_images) * val_ratio)
        moved_images = []

        for img_path in train_images[:val_count]:
            # Move image
            val_img_path = valid_images_dir / img_path.name
            shutil.move(str(img_path), str(val_img_path))
            moved_images.append(val_img_path)

            # Move corresponding label
            label_path = train_labels_dir / f"{img_path.stem}.txt"
            if label_path.exists():
                shutil.move(str(label_path), str(valid_labels_dir / label_path.name))

        return moved_images

    def print_statistics(self, stats):
        """Print image counts per split and instance counts per class.

        Args:
            stats: mapping with keys ``<split>`` (image count) and
                ``<split>_<class>`` (instance count).
        """
        splits = ['train', 'valid', 'test']

        print("\n" + "="*50)
        print("DATASET STATISTICS")
        print("="*50)

        total_images = sum(stats[split] for split in splits if split in stats)
        print(f"Total images: {total_images}")

        for split in splits:
            if split in stats:
                print(f"{split.capitalize()} images: {stats[split]}")

        print("\nClass distribution:")
        for class_name in self.final_classes:
            total_instances = sum(stats.get(f"{split}_{class_name}", 0) for split in splits)
            print(f"  {class_name}: {total_instances} instances")

            for split in splits:
                split_count = stats.get(f"{split}_{class_name}", 0)
                if split_count > 0:
                    print(f"    {split}: {split_count}")

        print("="*50)

    def cleanup_temp_datasets(self, dataset_paths):
        """Delete the given downloaded dataset directories, if they exist."""
        print("Cleaning up temporary files...")
        for dataset_path in dataset_paths:
            if os.path.exists(dataset_path):
                shutil.rmtree(dataset_path)
        print("Cleanup completed!")

    def run(self, cleanup=True):
        """Download, combine and (optionally) clean up.

        Args:
            cleanup: when True, the downloaded source datasets are deleted
                after a successful combination. They are kept when an error
                occurs.

        Returns:
            Path of the combined dataset as a string, or None when nothing
            could be downloaded or an error occurred (the error is printed).
        """
        try:
            # Download datasets
            dataset_paths = self.download_datasets()

            if not dataset_paths:
                print("No datasets were downloaded successfully!")
                return None

            # Combine datasets
            combined_path = self.combine_datasets(dataset_paths)

            # Cleanup temporary files
            if cleanup:
                self.cleanup_temp_datasets(dataset_paths)

            print("\nDataset combination completed!")
            print(f"Combined dataset saved to: {combined_path}")
            print("Use the data.yaml file for YOLO training")

            return combined_path

        except Exception as e:
            print(f"Error during dataset combination: {e}")
            return None

# Usage
# Initialize the combiner.
# The API key is read from the environment (load_dotenv() above loads a local
# .env file) so that no secret is stored in the notebook. A key that was ever
# committed in this file should be revoked and replaced.
combiner = DatasetCombiner(
    api_key=os.getenv("ROBOFLOW_API_KEY"),
    output_dir="wildlife_dataset"
)

# Run the combination process (downloads, merges, then removes the downloads)
result_path = combiner.run(cleanup=True)

if result_path:
    print(f"\n🎉 Success! Your dataset is ready at: {result_path}")
    print("\nTo train with YOLO:")
    print(f"yolo train data={result_path}/data.yaml model=yolo11n.pt epochs=100 imgsz=640")
else:
    print("❌ Dataset combination failed!")



Downloading datasets...
Downloading dataset 1/5: my-first-project-xub7r
loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in temp_dataset_1 to yolov11:: 100%|██████████| 550910/550910 [03:33<00:00, 2582.52it/s]





Extracting Dataset Version Zip to temp_dataset_1 in yolov11:: 100%|██████████| 23060/23060 [00:02<00:00, 8503.47it/s] 

Downloading dataset 2/5: my-game-pics
loading Roboflow workspace...





loading Roboflow project...


Downloading Dataset Version Zip in temp_dataset_2 to yolov11:: 100%|██████████| 511120/511120 [03:40<00:00, 2322.64it/s]





Extracting Dataset Version Zip to temp_dataset_2 in yolov11:: 100%|██████████| 24270/24270 [00:05<00:00, 4249.66it/s]

Downloading dataset 3/5: animal_detection-7wsk6
loading Roboflow workspace...





loading Roboflow project...


Downloading Dataset Version Zip in temp_dataset_3 to yolov11:: 100%|██████████| 2128460/2128460 [16:30<00:00, 2148.20it/s]





Extracting Dataset Version Zip to temp_dataset_3 in yolov11:: 100%|██████████| 5206/5206 [00:03<00:00, 1378.10it/s]


Downloading dataset 4/5: pigeon-v8l3q
loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in temp_dataset_4 to yolov11:: 100%|██████████| 182867/182867 [01:16<00:00, 2399.58it/s]





Extracting Dataset Version Zip to temp_dataset_4 in yolov11:: 100%|██████████| 5480/5480 [00:00<00:00, 7985.09it/s]

Downloading dataset 5/5: trailcam-detection
loading Roboflow workspace...





loading Roboflow project...


Downloading Dataset Version Zip in temp_dataset_5 to yolov11:: 100%|██████████| 1157698/1157698 [07:52<00:00, 2451.10it/s]





Extracting Dataset Version Zip to temp_dataset_5 in yolov11:: 100%|██████████| 31382/31382 [00:03<00:00, 9015.97it/s] 


Combining datasets...
Processing dataset: temp_dataset_1
Processing dataset: temp_dataset_2
Processing dataset: temp_dataset_3
Processing dataset: temp_dataset_4
Processing dataset: temp_dataset_5

DATASET STATISTICS
Total images: 33862
Train images: 30283
Valid images: 2018
Test images: 1561

Class distribution:
  hog: 21641 instances
    train: 19665
    valid: 1408
    test: 568
  pigeon: 4821 instances
    train: 3535
    valid: 542
    test: 744
  rabbit: 955 instances
    train: 885
    valid: 45
    test: 25
  deer: 25691 instances
    train: 23317
    valid: 1405
    test: 969
  squirrel: 0 instances
Cleaning up temporary files...
Cleanup completed!

Dataset combination completed!
Combined dataset saved to: wildlife_dataset
Use the data.yaml file for YOLO training

🎉 Success! Your dataset is ready at: wildlife_dataset

To train with YOLO:
yolo train data=wildlife_dataset/data.yaml model=yolo11n.pt epochs=100 imgsz=640


# Train model

In [None]:
!yolo detect train data=/teamspace/studios/this_studio/wildlife_dataset/data.yaml model=yolo11n.pt epochs=100 imgsz=640