<a href="https://colab.research.google.com/github/MiroPol21/dspracticum2025/blob/main/lesson02/Own_images_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's build a simple neural network to classify images.

**1. Install ddgs package**

In [1]:
# run only once
!pip install ddgs



**2. Import libraries**

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from ddgs import DDGS

import os
import requests
from urllib.parse import urlparse
import warnings

import matplotlib.pyplot as plt

**3. Import images**

In [3]:
def search_images(keyword, max_results=10):
    with DDGS() as ddgs:
        images = ddgs.images(
            keyword,
            max_results=max_results
        )
        return [img['image'] for img in images]

def search_more_images(base_keyword, target_results=200):
    """
    Search for more images using multiple keyword variations
    """
    import time
    
    all_urls = []
    
    # Different search term variations to get more results
    search_terms = [
        base_keyword,
        f"{base_keyword} photo",
        f"{base_keyword} picture", 
        f"{base_keyword} image",
        f"cute {base_keyword}",
        f"{base_keyword} animal",
    ]
    
    print(f"Searching for {base_keyword} images...")
    
    for i, term in enumerate(search_terms):
        if len(all_urls) >= target_results:
            break
            
        try:
            # Search for images with this term
            urls = search_images(term, max_results=100)
            
            # Remove duplicates
            new_urls = [url for url in urls if url not in all_urls]
            all_urls.extend(new_urls)
            
            print(f"  Search {i+1}/8: '{term}' -> {len(urls)} found, {len(new_urls)} new (total: {len(all_urls)})")
            
            # Add small delay to avoid rate limiting
            if i < len(search_terms) - 1:  # Don't sleep after the last search
                time.sleep(1)
                
        except Exception as e:
            print(f"  Error searching for '{term}': {e}")
            continue
    
    print(f"Final result for '{base_keyword}': {len(all_urls)} images")
    return all_urls[:target_results]  # Return only the requested amount

In [4]:
keywords = ['cat', 'dog', 'rabbit', 'capybara', 'owl']

# Use the enhanced search function to get 200 images per category
image_urls_list = []
for keyword in keywords:
    urls = search_more_images(keyword, 200)
    image_urls_list.append(urls)

Searching for cat images...
  Search 1/8: 'cat' -> 100 found, 100 new (total: 100)
  Search 2/8: 'cat photo' -> 100 found, 99 new (total: 199)
  Search 3/8: 'cat picture' -> 100 found, 93 new (total: 292)
Final result for 'cat': 292 images
Searching for dog images...
  Search 1/8: 'dog' -> 100 found, 100 new (total: 100)
  Search 2/8: 'dog photo' -> 100 found, 98 new (total: 198)
  Search 3/8: 'dog picture' -> 100 found, 88 new (total: 286)
Final result for 'dog': 286 images
Searching for rabbit images...
  Search 1/8: 'rabbit' -> 100 found, 100 new (total: 100)
  Search 2/8: 'rabbit photo' -> 100 found, 95 new (total: 195)
  Search 3/8: 'rabbit picture' -> 100 found, 86 new (total: 281)
Final result for 'rabbit': 281 images
Searching for capybara images...
  Search 1/8: 'capybara' -> 100 found, 100 new (total: 100)
  Search 2/8: 'capybara photo' -> 100 found, 86 new (total: 186)
  Search 3/8: 'capybara picture' -> 100 found, 72 new (total: 258)
Final result for 'capybara': 258 images


In [5]:
print("Images found per category:")
for i, keyword in enumerate(keywords):
    print(f"  {keyword}: {len(image_urls_list[i])} images")

Images found per category:
  cat: 200 images
  dog: 200 images
  rabbit: 200 images
  capybara: 200 images
  owl: 200 images


**4. Download Images**

In [6]:
def download_image(url, folder, custom_name=None, verbose=True):
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Get the filename from the URL or use the custom name
    if custom_name:
        filename = custom_name
    else:
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            filename = 'image.jpg'  # Default filename if none is found in the URL

    # Ensure the filename has an extension
    if not os.path.splitext(filename)[1]:
        filename += '.jpg'

    filepath = os.path.join(folder, filename)

    # If the file already exists, append a number to make it unique
    base, extension = os.path.splitext(filepath)
    counter = 1
    while os.path.exists(filepath):
        filepath = f"{base}_{counter}{extension}"
        counter += 1

    try:
        # Send a GET request to the URL with a timeout of 10 seconds
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Check if the content type is an image
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image'):
            if verbose:
                warnings.warn(f"The URL does not point to an image. Content-Type: {content_type}")
            return False

        # Write the image content to the file
        with open(filepath, 'wb') as f:
            f.write(response.content)

        if verbose:
            print(f"Image successfully downloaded: {filepath}")
        return True

    except requests.exceptions.Timeout:
        if verbose:
            warnings.warn(f"Download timed out for URL: {url}")
    except requests.exceptions.HTTPError as e:
        if verbose:
            warnings.warn(f"HTTP error occurred: {e}")
    except requests.exceptions.RequestException as e:
        if verbose:
            warnings.warn(f"An error occurred while downloading the image: {e}")
    except IOError as e:
        if verbose:
            warnings.warn(f"An error occurred while writing the file: {e}")


In [7]:
from tqdm.notebook import tqdm

for i, image_urls in enumerate(tqdm(image_urls_list)):
    for j, url in enumerate(tqdm(image_urls)):
        download_image(url, f"./dataset/{keywords[i]}/", f'image{j}.jpg', verbose=False)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [8]:
import os
from PIL import Image
import warnings

def remove_invalid_images(directory):
    """
    Removes files from a directory that are not valid images.
    """
    print(f"Checking directory: {directory}")
    for subdir, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(subdir, file)
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", Image.DecompressionBombWarning)
                    img = Image.open(filepath)
                    img.verify() # Verify that it is an image
            except Exception:
                # If verification fails, delete the file
                if os.path.exists(filepath):
                    print(f"Removing invalid image: {filepath}")
                    os.remove(filepath)

remove_invalid_images('./dataset')

Checking directory: ./dataset


**5. Split dataset into train/test folders**

According to the homework requirements, we need to organize our dataset into the proper folder structure with 75% train and 25% test split.

In [9]:
import shutil
from sklearn.model_selection import train_test_split
import glob

def split_dataset_to_folders(source_dir='./dataset', target_dir='./animals_dataset', test_size=0.25, random_state=42):
    """
    Split images from source_dir into train/test folders with the required structure.
    
    Args:
        source_dir: Current dataset directory with class folders
        target_dir: New directory to create with train/test split
        test_size: Proportion of images to use for testing (0.25 = 25%)
        random_state: Random seed for reproducible splits
    """
    
    # Create target directory structure
    train_dir = os.path.join(target_dir, 'train')
    test_dir = os.path.join(target_dir, 'test')
    
    # Get all class folders (cat, dog, rabbit, etc.)
    class_folders = [d for d in os.listdir(source_dir) 
                    if os.path.isdir(os.path.join(source_dir, d))]
    
    print(f"Found classes: {class_folders}")
    
    for class_name in class_folders:
        # Create class directories in train and test
        os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
        os.makedirs(os.path.join(test_dir, class_name), exist_ok=True)
        
        # Get all image files for this class
        class_path = os.path.join(source_dir, class_name)
        image_files = glob.glob(os.path.join(class_path, '*.jpg')) + \
                     glob.glob(os.path.join(class_path, '*.png')) + \
                     glob.glob(os.path.join(class_path, '*.jpeg'))
        
        print(f"Class '{class_name}': {len(image_files)} images")
        
        if len(image_files) == 0:
            print(f"Warning: No images found for class {class_name}")
            continue
            
        # Split the file paths
        train_files, test_files = train_test_split(
            image_files, 
            test_size=test_size, 
            random_state=random_state
        )
        
        # Copy files to train directory
        for file_path in train_files:
            filename = os.path.basename(file_path)
            target_path = os.path.join(train_dir, class_name, filename)
            shutil.copy2(file_path, target_path)
        
        # Copy files to test directory  
        for file_path in test_files:
            filename = os.path.basename(file_path)
            target_path = os.path.join(test_dir, class_name, filename)
            shutil.copy2(file_path, target_path)
            
        print(f"  - Train: {len(train_files)} images")
        print(f"  - Test: {len(test_files)} images")

# Run the split
split_dataset_to_folders()

Found classes: ['capybara', 'cat', 'dog', 'owl', 'rabbit']
Class 'capybara': 175 images
  - Train: 131 images
  - Test: 44 images
Class 'cat': 168 images
  - Train: 126 images
  - Test: 42 images
Class 'dog': 174 images
  - Train: 130 images
  - Test: 44 images
Class 'owl': 157 images
  - Train: 117 images
  - Test: 40 images
Class 'rabbit': 177 images
  - Train: 132 images
  - Test: 45 images
