# Mount Google Drive and set up environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries
import requests
import matplotlib.pyplot as plt
import pathlib
from PIL import Image
from fastai.vision.all import *
import time

# Bing Image Search API key
key = "443d109f968544768bf87e4986832c53"

In [3]:
path = Path('../content/drive/MyDrive/Nebulae') # Define folder name where subfolders will be stored in

if not path.exists(): # Check if folder exists.
    path.mkdir() # If it doesn't exist, create the folder

# Define subfolders, search terms and functions to pass through Bing Image API

In [None]:
# Define subfolder names / categories
# categories = ['emission', 'planetary', 'dark', 'reflection', 'supernova']

# # Define search terms. Run this step twice with different search terms to get as many images as possible
# search_terms = { # 200 images
#     'emission': ['emission nebula', 'Orion Nebula', 'M42 Nebula', 'Eagle Nebula', 'M16 Nebula', 'North American Nebula', 'California Nebula', 'Bubble Nebula', 'Lagoon Nebula', 'Heart Nebula', 'Tarantula Nebula', 'Carina Nebula'],
#     'planetary': ['planetary nebula', 'Ring Nebula', 'Dumbbell Nebula', 'Helix Nebula', 'NGC 6751', 'NGC 6543', 'Butterfly Nebula', 'NGC 3132'],
#     'dark': ['dark nebula', 'Coalsack Nebula', 'Dark Horse Nebula', 'Pipe Nebula', 'Cone Nebula', 'Horsehead Nebula'],
#     'reflection': ['reflection nebula', 'Iris Nebula', 'NGC 7023 Nebula', 'Ghost Nebula', 'IC 63 Nebula', 'Angel Nebula', 'NGC 2170 Nebula', 'Witch Head Nebula', 'IC 2118 Nebula', 'Running Man Nebula', 'NGC 1977 Nebula'],
#     'supernova': ['supernova remnant', 'Vela Supernova Remnant', 'Cygnus Loop', 'Crab Nebula', 'SN 1054', 'SN 1604', 'SN 1572', 'SN 1885A', 'SN 1006', 'IC 443 Nebula']}

# # search_terms = { # 100 images as there are probably less images for these and we want to minimise data cleaning later on
#     'emission': ['Flame Nebula', 'Gum Nebula', 'Omega Nebula', 'Pelican Nebula', 'Flame Nebula'],
#     'planetary': ['NGC 6210', 'NGC 40', 'NGC 3242', 'NGC 6826', 'NGC 2392', 'NGC 6369', 'NGC 7027', 'M76 Nebula', 'M97 Nebula', 'NGC 7662', 'NGC 1501', 'NGC 6818', 'NGC 6563', '	NGC 6326'],
#     'dark': ['Barnard 68 Nebula', 'Barnard 59 Nebula', 'Barnard 78 Nebula', 'Barnard 33 Nebula' 'B33 Nebula'],
#     'reflection': ['NGC 1555 Nebula', 'M78 Nebula', 'NGC 2261 Nebula', 'NGC 1999 Nebula', 'NGC 1432 Nebula', 'NGC 1435 Nebula'],
#     'supernova': ['Sh2-264 Nebula', 'Sagittarius A East Nebula', 'Spaghetti Nebula', 'Jellyfish Nebula']}

In [None]:
# Scrape for individual nebulae to build a model to identify specific ones
categories = ['orion-nebula', 'eagle-nebula', 'horsehead-nebula', 'crab-nebula', 'trifid-nebula', 'carina-nebula', 'ring-nebula', 'tarantula-nebula', 'witchhead-nebula', 'dolphin-nebula']
search_terms = {
    'orion-nebula': ['orion nebula', 'M42 nebula'],
    'eagle-nebula': ['eagle nebula', 'M16 nebula'],
    'horsehead-nebula':['horsehead nebula', 'IC 434 nebula'],
    'crab-nebula': ['crab nebula','M1 nebula'],
    'trifid-nebula': ['trifid nebula', 'M20 nebula'],
    'carina-nebula': ['carina nebula','NGC 3372 nebula'],
    'ring-nebula': ['ring-nebula', 'M57 nebula'],
    'tarantula-nebula': ['tarantula nebula', 'NGC 2070 nebula'],
    'witchhead-nebula': ['witch head nebula', 'NGC 1909 nebula'],
    'dolphin-nebula': ['dolphin nebula', 'Sh 2-308 nebula']}


In [None]:
# Create a function to download images from Bing Images API

def search_images_bing(key, term, total_count=500):
    search_url = "https://api.bing.microsoft.com/v7.0/images/search"
    headers = {"Ocp-Apim-Subscription-Key": key}
    all_images = []
    count = 50
    offset = 0

    # Bypass the 150 image search limit
    while len(all_images) < total_count:
        remaining_images = total_count - len(all_images)
        current_count = min(count, remaining_images)

        params = {
            'q': term,
            'count': current_count,
            'offset': offset
        }

        try:
            response = requests.get(search_url, headers=headers, params=params)
            response.raise_for_status()
            search_results = response.json()

            # Add images to the all_images list
            all_images.extend(search_results.get('value', []))

            # Sleep for 0.05 of a second to respect rate limit
            time.sleep(0.05)

            offset += current_count  # Increment the offset for the next batch

        except requests.exceptions.HTTPError as err:
            print(f"HTTP error occurred: {err}")
            break  # Exit on HTTP error
        except Exception as err:
            print(f"An unexpected error occurred: {err}")
            time.sleep(1)  # Optional: wait a bit before retrying

    return all_images[:total_count]


In [None]:
# Loop through each of the subfolder and search terms within the previously defined list
for term, category in zip(search_terms, categories):
    dest = path /f'{category}'  # Specify destination (subfolder within the 'Nebula' parent folder)
    dest.mkdir(exist_ok=True)  # Create subfolder if it doesn't exist
    results = search_images_bing(key, term, 180)  # Search for images using the function created earlier
    imgs = [img_data['contentUrl'] for img_data in results]
    download_images(dest=dest, urls=imgs)  # Download images to the destination subfolder

In [None]:
# Loop through each of the subfolder and search terms within the previously defined list
for category in categories:
    dest = path /f'{category}'  # Specify destination (subfolder within the 'Nebula' parent folder)
    dest.mkdir(exist_ok=True)  # Create subfolder if it doesn't exist

    # Get the terms for the current category
    terms = search_terms.get(category, [])

    for term in terms:
        results = search_images_bing(key, term, 250)  # Search for images using the function created earlier
        imgs = [img_data['contentUrl'] for img_data in results]
        download_images(dest=dest, urls=imgs)  # Download images to the destination subfolder

# Clean Data - Remove corrupted images, unsupported formats, duplicated images

## Remove corrupted images

In [None]:
# Use fastai.vision.utils functions to verify if there are corrupted images
fnames = get_image_files(path)
failed = verify_images(fnames)
failed

(#5) [Path('../content/drive/MyDrive/Nebulae/supernova/6d982fb0-07d7-4388-a60e-a5355c65458d.jpg'),Path('../content/drive/MyDrive/Nebulae/supernova/24ec9ce9-bf6b-48a4-a629-a0c9f5d6a248.jpg'),Path('../content/drive/MyDrive/Nebulae/supernova/b4d89f0a-2cd4-4d99-9041-a619d1e4ed2f.jpg'),Path('../content/drive/MyDrive/Nebulae/supernova/44215d5d-92b4-47fa-a312-fa5906dfd78a.png'),Path('../content/drive/MyDrive/Nebulae/supernova/c21431aa-9ac9-475f-b8de-9d5f9cd3ea71.png')]

In [None]:
# Remove corrupted images
failed.map(Path.unlink)

(#5) [None,None,None,None,None]

## Convert and remove unsupported formats - Part 1

In [None]:
def convert_images(data_dir, target_format='JPEG', supported_formats=['.jpg', '.jpeg', '.png',]):
    # Define the target format for saving
    target_format = target_format.upper()

    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            # Check if the file extension is not in the supported formats
            if not file_path.lower().endswith(tuple(supported_formats)):
                try:
                    img = Image.open(file_path)
                    # Create new file name with the target format
                    new_file_path = os.path.splitext(file_path)[0] + '.' + target_format.lower()
                    img.save(new_file_path, target_format)
                    print(f"Converted {file_path} to {new_file_path}")
                except Exception as e:
                    print(f"Error converting {file_path}: {e}")


In [None]:
convert_images(path)

Error converting ../content/drive/MyDrive/New/bok globules/8f59f73e-d805-4e9f-b579-f0ac19d1d8d8.gif: cannot write mode P as JPEG
Error converting ../content/drive/MyDrive/New/bok globules/60de3bff-606e-4a01-abd1-77adb8e61d14.gif: cannot write mode P as JPEG
Converted ../content/drive/MyDrive/New/bok globules/13aa0351-f4a6-4be9-8bdb-23002f106cbe.webp to ../content/drive/MyDrive/New/bok globules/13aa0351-f4a6-4be9-8bdb-23002f106cbe.jpeg
Converted ../content/drive/MyDrive/New/bok globules/d66d20e6-c3e7-4f21-9205-fd65fa49dc80.gif to ../content/drive/MyDrive/New/bok globules/d66d20e6-c3e7-4f21-9205-fd65fa49dc80.jpeg
Converted ../content/drive/MyDrive/New/bok globules/8faf9ff6-8eb9-4699-ba99-0125b7e8f3ec.gif to ../content/drive/MyDrive/New/bok globules/8faf9ff6-8eb9-4699-ba99-0125b7e8f3ec.jpeg
Error converting ../content/drive/MyDrive/New/bok globules/5295d789-ac0c-4404-9169-5d7406854d53.webp: cannot write mode RGBA as JPEG
Converted ../content/drive/MyDrive/New/Cone Nebula/9eec0265-6c89-46c

In [None]:
def remove_unsupported_formats(data_dir, supported_formats=['.jpg', '.jpeg', '.png']):
    for root, _, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if not file_path.lower().endswith(tuple(supported_formats)):
                try:
                    os.remove(file_path)
                    print(f"Removed unsupported file: {file_path}")
                except Exception as e:
                    print(f"Error removing {file_path}: {e}")


remove_unsupported_formats(path)


Removed unsupported file: ../content/drive/MyDrive/New/bok globules/8f59f73e-d805-4e9f-b579-f0ac19d1d8d8.gif
Removed unsupported file: ../content/drive/MyDrive/New/bok globules/60de3bff-606e-4a01-abd1-77adb8e61d14.gif
Removed unsupported file: ../content/drive/MyDrive/New/bok globules/13aa0351-f4a6-4be9-8bdb-23002f106cbe.webp
Removed unsupported file: ../content/drive/MyDrive/New/bok globules/d66d20e6-c3e7-4f21-9205-fd65fa49dc80.gif
Removed unsupported file: ../content/drive/MyDrive/New/bok globules/8faf9ff6-8eb9-4699-ba99-0125b7e8f3ec.gif
Removed unsupported file: ../content/drive/MyDrive/New/bok globules/5295d789-ac0c-4404-9169-5d7406854d53.webp
Removed unsupported file: ../content/drive/MyDrive/New/Cone Nebula/9eec0265-6c89-46c5-83f9-cded223ddf4b.jpg!d
Removed unsupported file: ../content/drive/MyDrive/New/Coalsack Nebula/d87da08a-8238-4e54-a561-b733d0be7c8d.webp
Removed unsupported file: ../content/drive/MyDrive/New/Coalsack Nebula/2bce56c1-9025-431e-9ab6-f9a7a308267e.webp
Removed 

## Remove duplications

In [None]:
def find_duplicate_images(folder_path):
  image_hashes = {}
  for root, _, files in os.walk(folder_path):
    for file in files:
      file_path = os.path.join(root, file)
      if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
        try:
          with Image.open(file_path) as img:
            image_hash = hashlib.md5(img.tobytes()).hexdigest()
            if image_hash in image_hashes:
              image_hashes[image_hash].append(file_path)
            else:
              image_hashes[image_hash] = [file_path]
        except Exception as e:
          print(f"Error processing {file_path}: {e}")

  duplicate_images = {
      hash_value: file_paths for hash_value, file_paths in image_hashes.items()
      if len(file_paths) > 1
  }

  return duplicate_images

In [None]:
duplicate_images = find_duplicate_images(path)
len(duplicate_images)



Error processing ../content/drive/MyDrive/Nebulae/supernova/6d982fb0-07d7-4388-a60e-a5355c65458d.jpg: cannot identify image file '/content/drive/MyDrive/Nebulae/supernova/6d982fb0-07d7-4388-a60e-a5355c65458d.jpg'
Error processing ../content/drive/MyDrive/Nebulae/supernova/24ec9ce9-bf6b-48a4-a629-a0c9f5d6a248.jpg: cannot identify image file '/content/drive/MyDrive/Nebulae/supernova/24ec9ce9-bf6b-48a4-a629-a0c9f5d6a248.jpg'
Error processing ../content/drive/MyDrive/Nebulae/supernova/b4d89f0a-2cd4-4d99-9041-a619d1e4ed2f.jpg: cannot identify image file '/content/drive/MyDrive/Nebulae/supernova/b4d89f0a-2cd4-4d99-9041-a619d1e4ed2f.jpg'
Error processing ../content/drive/MyDrive/Nebulae/supernova/44215d5d-92b4-47fa-a312-fa5906dfd78a.png: cannot identify image file '/content/drive/MyDrive/Nebulae/supernova/44215d5d-92b4-47fa-a312-fa5906dfd78a.png'
Error processing ../content/drive/MyDrive/Nebulae/supernova/c21431aa-9ac9-475f-b8de-9d5f9cd3ea71.png: cannot identify image file '/content/drive/MyD

543

In [None]:
def remove_duplicate_images(duplicate_images, keep_first=True):
  for _, file_paths in duplicate_images.items():
    if keep_first:
      files_to_remove = file_paths[1:]
    else:
      files_to_remove = file_paths[:-1]

    for file_path in files_to_remove:
      try:
        os.remove(file_path)
        print(f"Removed: {file_path}")
      except Exception as e:
        print(f"Error removing {file_path}: {e}")

In [None]:
if duplicate_images:
  print("Found duplicate images:")
  for hash_value, file_paths in duplicate_images.items():
    print(f"  Hash: {hash_value}")
    for file_path in file_paths:
      print(f"    - {file_path}")

  remove_duplicate_images(duplicate_images)
else:
  print("No duplicate images found.")

Found duplicate images:
  Hash: 8e131b19f03be84c9816a1dc493e2635
    - ../content/drive/MyDrive/Nebulae/emission/118b767a-30c6-4256-b738-8da636834df8.jpg
    - ../content/drive/MyDrive/Nebulae/dark/648e4110-d062-4144-9aba-cf3c260ae52e.jpg
  Hash: fe4b4ca3532a43c93b033637f65722b1
    - ../content/drive/MyDrive/Nebulae/emission/6cd06b23-ba97-4962-ab3c-5cc148befbae.jpg
    - ../content/drive/MyDrive/Nebulae/supernova/0608b3f8-86b2-4173-b97a-9f61874ce88a.jpg
  Hash: e43ce0f1d7dfd79b9e8ba679d56202b6
    - ../content/drive/MyDrive/Nebulae/emission/16078535-4636-42ba-aa16-cd0435176b33.jpg
    - ../content/drive/MyDrive/Nebulae/supernova/26c48197-80ba-4870-8b70-290321c77b69.jpg
  Hash: e46681c60f1a27c09c3bab088c071c68
    - ../content/drive/MyDrive/Nebulae/emission/d945dd20-5891-499c-bb77-131f831120c1.jpg
    - ../content/drive/MyDrive/Nebulae/supernova/4a2bdef8-a8cb-4f8f-b647-11ce93676ccc.jpg
  Hash: 9869613d980f1c8d45649bfd73bacae5
    - ../content/drive/MyDrive/Nebulae/emission/81aa9138-2bf

## Convert and remove unsupported formats - Part 2

In [4]:
# Not sure why but even though the file extension is jpeg / jpg, some of them are still not supported when passing through tf.keras.utils.image_dataset_from_directory
# Hence this check will look for those images and remove them
def check_image_format(image_path):
  """
  Checks if an image is in a supported format (JPEG, PNG, GIF, BMP).
  Returns True if the format is supported, False otherwise.
  """
  try:
    img = Image.open(image_path)
    return img.format in ['JPEG', 'PNG', 'GIF', 'BMP']
  except Exception as e:
    print(f"Error opening image {image_path}: {e}")
    return False

# Assuming your datasets are stored in a directory structure
def find_incorrect_images(dataset_path):
  """
  Finds images with incorrect formats in a dataset directory.
  Returns a list of image paths that are not in supported formats.
  """
  incorrect_images = []
  for root, _, files in os.walk(dataset_path):
    for file in files:
      image_path = os.path.join(root, file)
      if not check_image_format(image_path):
        incorrect_images.append(image_path)
  return incorrect_images

In [5]:
incorrect_images = find_incorrect_images(path)

if incorrect_images:
  print("Incorrect images in training dataset:")
  for image_path in incorrect_images:
    print(image_path)

len(incorrect_images)



Incorrect images in training dataset:
../content/drive/MyDrive/Nebulae/dark/cddef0d8-773a-4f58-8d29-4e42b91c05a6.jpg
../content/drive/MyDrive/Nebulae/dark/97e53789-b73b-40c3-a282-ba27ce157b7c.jpg
../content/drive/MyDrive/Nebulae/dark/99615f6e-4571-4727-90be-99c773a313ff.jpg
../content/drive/MyDrive/Nebulae/dark/b6e732b2-91a6-4bfa-a55e-00722cfa20e3.jpg
../content/drive/MyDrive/Nebulae/dark/497016ce-358a-47d4-a320-7819b9c41787.jpg
../content/drive/MyDrive/Nebulae/dark/d7d77550-6fa4-4e29-b748-dd90f506b6ea.jpg
../content/drive/MyDrive/Nebulae/dark/9aee43e1-c523-430a-9398-d27f5c126e1a.jpg
../content/drive/MyDrive/Nebulae/dark/be20720f-84a9-4324-9b78-7c18d2411412.jpg
../content/drive/MyDrive/Nebulae/dark/52326566-459c-421f-a662-c04699c99a6a.jpg
../content/drive/MyDrive/Nebulae/dark/34be2f8c-363d-44b3-a292-92c8f40c136c.jpg
../content/drive/MyDrive/Nebulae/dark/6743ee63-d439-42d5-ba27-537c444858bf.jpg


11

In [6]:
def remove_incorrect_images(image_paths):
  for image_path in image_paths:
    try:
      os.remove(image_path)
      print(f"Removed incorrect image: {image_path}")
    except Exception as e:
      print(f"Error removing {image_path}: {e}")

In [7]:
# Remove incorrect images from the training and validation datasets
remove_incorrect_images(incorrect_images)

Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/cddef0d8-773a-4f58-8d29-4e42b91c05a6.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/97e53789-b73b-40c3-a282-ba27ce157b7c.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/99615f6e-4571-4727-90be-99c773a313ff.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/b6e732b2-91a6-4bfa-a55e-00722cfa20e3.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/497016ce-358a-47d4-a320-7819b9c41787.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/d7d77550-6fa4-4e29-b748-dd90f506b6ea.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/9aee43e1-c523-430a-9398-d27f5c126e1a.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/be20720f-84a9-4324-9b78-7c18d2411412.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/52326566-459c-421f-a662-c04699c99a6a.jpg
Removed incorrect image: ../content/drive/MyDrive/Nebulae/dark/3