# 📥 Download COCO Dataset for ControlNet Testing

This notebook downloads the COCO val2017 validation images, extracts them, and verifies that they are ready for ControlNet training.

**What we'll download:**
- COCO val2017 images (~1GB, 5000 images)
- We'll use a subset (500 images) for RTX 3050Ti testing

In [1]:
import os
import requests
import zipfile
from pathlib import Path
from tqdm.notebook import tqdm
import shutil

def download_file(url, output_path, chunk_size=8192, timeout=30):
    """Stream ``url`` to ``output_path`` while showing a progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``output_path`` once the transfer has finished. A failed or
    interrupted download therefore never leaves a truncated file at
    ``output_path`` that a later ``exists()`` check would mistake for a
    complete one.

    Args:
        url: Address of the file to fetch.
        output_path: Destination file (``str`` or ``Path``).
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled connection can block forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Exception: Any other error raised while connecting, creating the
            progress bar or writing the file is propagated after the
            ``.part`` file has been removed.
    """
    output_path = Path(output_path)
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        # Using the response as a context manager releases the connection
        # even when the loop below is aborted by an exception.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # A missing Content-Length header must become None (unknown
            # total) rather than 0, which would render a meaningless bar.
            total_size = int(response.headers.get('content-length', 0)) or None

            with tqdm(
                desc=output_path.name,
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar, open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the finished file in one step.
        partial_path.replace(output_path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): never leave a
        # half-written file behind.
        if partial_path.exists():
            partial_path.unlink()
        raise

# Status line printed once the imports and the helper above have been defined.
print("📁 Setting up COCO dataset download...")

📁 Setting up COCO dataset download...


In [2]:
# Create the dataset directory (exist_ok makes this cell safe to re-run)
dataset_dir = Path("./datasets/coco_controlnet")
dataset_dir.mkdir(parents=True, exist_ok=True)

# Minimum free space required before downloading. The zip and its extracted
# copy coexist on disk until the zip is deleted; 3GB leaves some headroom.
MIN_FREE_BYTES = 3 * (1024**3)

# Query free space once, on the directory that will actually receive the
# data, so the value printed and the value tested are the same number.
free_bytes = shutil.disk_usage(dataset_dir).free

print(f"📁 Dataset directory: {dataset_dir.absolute()}")
print(f"💾 Available disk space: {free_bytes / (1024**3):.1f} GB")

if free_bytes < MIN_FREE_BYTES:
    print("⚠️  Warning: Low disk space. Ensure you have at least 3GB free.")
else:
    print("✅ Sufficient disk space available")

📁 Dataset directory: C:\Users\tobio\PycharmProjects\ControlNet-Implementation\notebooks\datasets\coco_controlnet
💾 Available disk space: 24.8 GB
✅ Sufficient disk space available


In [3]:
# Download COCO val2017 images
coco_url = "http://images.cocodataset.org/zips/val2017.zip"
coco_zip = dataset_dir / "val2017.zip"

# A file left behind by an interrupted or failed download is not a usable
# archive. Discard it so this cell retries instead of reporting success.
if coco_zip.exists() and not zipfile.is_zipfile(coco_zip):
    print(f"⚠️  Removing incomplete or corrupt archive: {coco_zip}")
    coco_zip.unlink()

if not coco_zip.exists():
    print("📥 Downloading COCO val2017 images (~1GB)...")
    print("This may take 10-20 minutes depending on your internet speed.")

    try:
        download_file(coco_url, coco_zip)
        print(f"✅ Downloaded: {coco_zip}")
        print(f"📊 File size: {coco_zip.stat().st_size / (1024**2):.1f} MB")
    except Exception as e:
        # Drop whatever was written so the next run starts from scratch.
        if coco_zip.exists():
            coco_zip.unlink()
        # Include the exception type: a missing dependency (e.g. ImportError)
        # must not be mistaken for a network problem.
        print(f"❌ Download failed: {type(e).__name__}: {e}")
        print("\n🔧 Alternative solutions:")
        print("1. Check your internet connection")
        print(f"2. Try downloading manually from: {coco_url}")
        print("3. Use a different dataset (see next cell)")
else:
    print(f"✅ COCO dataset already downloaded: {coco_zip}")
    print(f"📊 File size: {coco_zip.stat().st_size / (1024**2):.1f} MB")

📥 Downloading COCO val2017 images (~1GB)...
This may take 10-20 minutes depending on your internet speed.


Exception ignored in: <function tqdm.__del__ at 0x00000216FD6D3240>
Traceback (most recent call last):
  File "C:\Users\tobio\PycharmProjects\ControlNet-Implementation\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "C:\Users\tobio\PycharmProjects\ControlNet-Implementation\.venv\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


❌ Download failed: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

🔧 Alternative solutions:
1. Check your internet connection
2. Try downloading manually from: http://images.cocodataset.org/zips/val2017.zip
3. Use a different dataset (see next cell)


In [None]:
# Extract COCO images
val2017_dir = dataset_dir / "val2017"

if not val2017_dir.exists() and coco_zip.exists():
    print("📦 Extracting COCO images...")
    print("This may take 5-10 minutes...")

    # Only the extraction itself is guarded here, so that a problem in the
    # optional clean-up below is never reported as "Extraction failed".
    extracted = False
    try:
        with zipfile.ZipFile(coco_zip, 'r') as zip_ref:
            zip_ref.extractall(dataset_dir)
        extracted = True
    except Exception as e:
        print(f"❌ Extraction failed: {e}")

    if extracted:
        print(f"✅ Extracted to: {val2017_dir}")

        # Count extracted images
        image_count = sum(1 for _ in val2017_dir.glob("*.jpg"))
        print(f"📸 Found {image_count} images")

        # Clean up zip file to save space. input() raises when the notebook
        # is executed without an interactive front end (e.g. a headless
        # "Run All"); in that case keep the zip rather than failing.
        try:
            answer = input("🗑️  Delete zip file to save space? (y/n): ")
        except Exception:
            answer = "n"
            print("ℹ️  No interactive input available - keeping zip file")

        if answer.strip().lower() == 'y':
            try:
                coco_zip.unlink()
                print("✅ Zip file deleted")
            except OSError as e:
                print(f"⚠️  Could not delete zip file: {e}")

elif val2017_dir.exists():
    image_count = sum(1 for _ in val2017_dir.glob("*.jpg"))
    print(f"✅ COCO images already extracted: {image_count} images")
    print(f"📁 Location: {val2017_dir}")
else:
    print("❌ Cannot extract - zip file not found")

In [None]:
# Verify dataset is ready
val2017_dir = dataset_dir / "val2017"

# Sorted so the listing below is stable across runs. An existing but empty
# folder (e.g. after an interrupted extraction) must not count as "ready".
image_files = sorted(val2017_dir.glob("*.jpg")) if val2017_dir.exists() else []

if image_files:
    # Estimate the total size from at most 100 files to avoid stat-ing all of
    # them. Scale by the number of files actually sampled, not a fixed 100,
    # so the figure is also right when fewer than 100 images are present.
    size_sample = image_files[:100]
    sample_mb = sum(f.stat().st_size for f in size_sample) / (1024**2)
    estimated_mb = sample_mb * len(image_files) / len(size_sample)

    print("🎉 COCO Dataset Ready!")
    print("=" * 40)
    print(f"📁 Location: {val2017_dir}")
    print(f"📸 Total images: {len(image_files)}")
    print(f"💾 Dataset size: {estimated_mb:.0f} MB")

    # Show sample image names
    print("\n📋 Sample images:")
    for img in image_files[:5]:
        print(f"  - {img.name}")

    print("\n✅ Ready for ControlNet training!")
    print("\n🚀 Next step: Run the main training notebook")
    print("   notebooks/rtx3050ti_coco_training_test.ipynb")

else:
    print("❌ Dataset setup failed")
    print("\n🔧 Manual setup instructions:")
    print("1. Download: http://images.cocodataset.org/zips/val2017.zip")
    print(f"2. Extract to: {dataset_dir}")
    print("3. Ensure val2017/ folder contains .jpg files")

In [None]:
# Optional: Show sample images
import matplotlib.pyplot as plt
from PIL import Image
import random

# Collect candidate images; a missing folder is treated like an empty one so
# the cell always reports something instead of finishing silently.
image_files = list(val2017_dir.glob("*.jpg")) if val2017_dir.exists() else []

if len(image_files) >= 4:
    print("🖼️  Sample COCO images:")

    # Show 4 random sample images in a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    sample_images = random.sample(image_files, 4)

    failed = 0
    for ax, img_path in zip(axes.flatten(), sample_images):
        try:
            # The context manager closes the underlying file handle once the
            # pixels have been handed to matplotlib.
            with Image.open(img_path) as img:
                ax.imshow(img)
                ax.set_title(f"{img_path.name}\n{img.size[0]}x{img.size[1]}")
        except Exception as e:
            failed += 1
            ax.text(0.5, 0.5, f"Error loading\n{img_path.name}",
                    ha='center', va='center', transform=ax.transAxes)
            print(f"⚠️  Could not load {img_path.name}: {e}")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Only claim success when every panel actually shows an image.
    if failed == 0:
        print("✅ Sample images loaded successfully")
    else:
        print(f"⚠️  {failed} of {len(sample_images)} sample images failed to load")
    print("These will be converted to Canny edges for ControlNet training")
else:
    print("⚠️  Not enough images found for preview")