In [4]:
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time

def download_images(query, num_images, output_dir, timeout=10):
    """Scrape image thumbnails from Google Image search result pages.

    Result pages are requested 20 results at a time (``start=page*20``) and
    every ``<img>`` after the first one on each page is treated as a
    candidate. Candidates are saved as ``{encoded_query}_{n}.jpg`` inside
    ``output_dir``, where ``encoded_query`` is the URL-quoted query (so a
    space shows up as ``%20`` in the file name) and ``n`` counts successful
    downloads from 0.

    Args:
        query: Search phrase, e.g. ``"black bear"``.
        num_images: Maximum number of images to save for this query.
        output_dir: Directory to write into; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and failures are reported with ``print``.

    The function stops early (possibly with fewer than ``num_images`` files)
    when a results page cannot be fetched, contains no images, or contains
    no image URL that has not already been tried.
    """
    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 Chrome/91.0.4472.124"}

    # Format the query for URL encoding (also used, as-is, in file names)
    query = urllib.parse.quote(query)

    downloaded = 0
    page = 0
    # URLs already attempted; prevents saving the same thumbnail twice when
    # consecutive result pages repeat images.
    seen_urls = set()

    while downloaded < num_images:
        # Construct the Google Image search URL with pagination
        search_url = f"https://www.google.com/search?q={query}&tbm=isch&start={page*20}"

        # Get the HTML content of the search page; a timeout keeps a stalled
        # connection from hanging the notebook cell indefinitely.
        try:
            response = requests.get(search_url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Could not fetch results page {page}: {e}")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all image elements
        img_tags = soup.find_all('img')[1:]  # Skip the first image which is usually the Google logo

        if not img_tags:
            break

        new_candidates = 0
        for img in img_tags:
            if downloaded >= num_images:
                break
            img_url = img.get('src') or img.get('data-src')
            # Inline (data:) and relative sources are not fetchable as-is.
            if not img_url or not img_url.startswith('http'):
                continue
            if img_url in seen_urls:
                continue
            seen_urls.add(img_url)
            new_candidates += 1
            try:
                img_response = requests.get(img_url, headers=headers, timeout=timeout)
                # Do not write an HTTP error body to disk as if it were an image.
                img_response.raise_for_status()
                file_name = f"{query}_{downloaded}.jpg"
                with open(os.path.join(output_dir, file_name), 'wb') as handler:
                    handler.write(img_response.content)
                print(f"Downloaded {file_name}")
                downloaded += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"Could not download image {downloaded}: {e}")

        # A page that offered nothing new means further pages will not help;
        # without this check the loop could spin forever.
        if new_candidates == 0:
            break

        # Increase page count to move to the next set of images
        page += 1
        if downloaded < num_images:
            time.sleep(2)  # Pause to avoid overwhelming the server

# Example usage: each species folder is filled from three search phrasings
# (common name, cub, scientific name), up to 560 images per phrasing.
images_per_query = 560
species_queries = {
    "data/black_bear": ["black bear", "black bear cub", "Ursus americanus"],
    "data/grizzly_bear": ["grizzly bear", "grizzly bear cub", "Ursus arctos horribilis"],
    "data/polar_bear": ["polar bear", "polar bear cub", "Ursus maritimus"],
    "data/panda": ["panda", "panda cub", "Ailuropoda melanoleuca"],
    "data/sloth_bear": ["sloth bear", "sloth bear cub", "Melursus ursinus"],
    "data/sun_bear": ["sun bear", "sun bear cub", "Helarctos malayanus"],
}

for species_dir, search_terms in species_queries.items():
    for search_term in search_terms:
        download_images(search_term, images_per_query, species_dir)


Downloaded black%20bear_0.jpg
Downloaded black%20bear_1.jpg
Downloaded black%20bear_2.jpg
Downloaded black%20bear_3.jpg
Downloaded black%20bear_4.jpg
Downloaded black%20bear_5.jpg
Downloaded black%20bear_6.jpg
Downloaded black%20bear_7.jpg
Downloaded black%20bear_8.jpg
Downloaded black%20bear_9.jpg
Downloaded black%20bear_10.jpg
Downloaded black%20bear_11.jpg
Downloaded black%20bear_12.jpg
Downloaded black%20bear_13.jpg
Downloaded black%20bear_14.jpg
Downloaded black%20bear_15.jpg
Downloaded black%20bear_16.jpg
Downloaded black%20bear_17.jpg
Downloaded black%20bear_18.jpg
Downloaded black%20bear_19.jpg
Downloaded black%20bear_20.jpg
Downloaded black%20bear_21.jpg
Downloaded black%20bear_22.jpg
Downloaded black%20bear_23.jpg
Downloaded black%20bear_24.jpg
Downloaded black%20bear_25.jpg
Downloaded black%20bear_26.jpg
Downloaded black%20bear_27.jpg
Downloaded black%20bear_28.jpg
Downloaded black%20bear_29.jpg
Downloaded black%20bear_30.jpg
Downloaded black%20bear_31.jpg
Downloaded black%2

In [5]:
import os
import shutil
import random

def split_dataset(data_dir, output_dir, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=42):
    """Copy a folder-per-class image dataset into train/val/test folders.

    Each sub-directory of ``data_dir`` is treated as one category. Its files
    are shuffled and copied to ``output_dir/<split>/<category>/`` where the
    first ``int(train_ratio * n)`` files go to ``train``, the next
    ``int(val_ratio * n)`` go to ``val`` and every remaining file goes to
    ``test``.

    Args:
        data_dir: Directory containing one sub-directory per category.
        output_dir: Destination root; ``train``, ``val`` and ``test`` are
            created beneath it as needed.
        train_ratio: Fraction of each category copied to ``train``.
        val_ratio: Fraction of each category copied to ``val``.
        test_ratio: Accepted for API compatibility but not used in the
            computation; ``test`` always receives whatever is left over.
        seed: Seed for the shuffle. With a fixed seed the same input yields
            the same split on every run, so re-running into an existing
            ``output_dir`` does not scatter one image across several splits.
            Pass ``None`` for a different shuffle each run.

    Returns:
        None. Files are copied; the source directory is left untouched.
    """
    # Private generator: reproducible without reseeding the global `random` state.
    rng = random.Random(seed)

    # Only real, non-hidden directories are categories. Stray files would make
    # os.listdir() fail below, and hidden folders such as notebook checkpoint
    # directories would otherwise become bogus classes.
    categories = sorted(
        entry for entry in os.listdir(data_dir)
        if not entry.startswith('.') and os.path.isdir(os.path.join(data_dir, entry))
    )

    # Create directories if they do not exist
    for split in ['train', 'val', 'test']:
        os.makedirs(os.path.join(output_dir, split), exist_ok=True)
        for category in categories:
            os.makedirs(os.path.join(output_dir, split, category), exist_ok=True)

    for category in categories:
        category_path = os.path.join(data_dir, category)
        # Sort first so the shuffle starts from a platform-independent order;
        # keep only regular, non-hidden files because shutil.copy cannot copy
        # a directory.
        images = sorted(
            name for name in os.listdir(category_path)
            if not name.startswith('.') and os.path.isfile(os.path.join(category_path, name))
        )
        rng.shuffle(images)

        train_end = int(train_ratio * len(images))
        val_end = train_end + int(val_ratio * len(images))

        for i, image in enumerate(images):
            if i < train_end:
                split = 'train'
            elif i < val_end:
                split = 'val'
            else:
                split = 'test'
            shutil.copy(os.path.join(category_path, image), os.path.join(output_dir, split, category))

# Example usage
split_dataset('data', 'split_data')
