# Set up Environment - Mount Google Drive and import libraries


In [1]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# through the local filesystem for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Import libraries
import numpy as np
import pandas as pd
import os
import pathlib
import hashlib
import PIL
import PIL.Image as Image
from fastai.vision.all import *
import matplotlib.pyplot as plt

In [3]:
# Specify directory to store images
# base_dir is kept as a plain string (used with os.listdir / os.path.join),
# data_dir is the same location wrapped in a pathlib.Path.
base_dir = '/content/drive/MyDrive/Nebulae'
data_dir = pathlib.Path(base_dir)

# EDA

In [4]:
# Create function to get image size for EDA
def get_image_size(image_path):
    """Return the pixel dimensions of the image stored at ``image_path``.

    The file is opened inside a ``with`` block so that its handle is released
    as soon as the size has been read. Without it the handle stays open until
    the image object is garbage collected, which adds up when the function is
    called once per file over thousands of images.

    Args:
        image_path: Location of the image file, in any form accepted by
            ``Image.open``.

    Returns:
        The ``size`` attribute of the opened image, a ``(width, height)`` tuple.
    """
    with Image.open(image_path) as img:
        return img.size

In [5]:
# Walk every class subfolder of base_dir and record, for each image file, its
# path, pixel size, colour mode and label (the name of the subfolder it is in).
# The four lists are index-aligned: entry i of each list describes the same file.
image_paths = []
image_sizes = []
labels = []
image_modes = []

for class_folder in os.listdir(base_dir):  # each entry of the parent 'Nebulae' folder
    class_folder_path = os.path.join(base_dir, class_folder)
    if not os.path.isdir(class_folder_path):
        continue  # ignore stray files sitting next to the class folders

    for image_file in os.listdir(class_folder_path):
        image_path = os.path.join(class_folder_path, image_file)

        # Open each file a single time and read both size and mode from it;
        # the with-block closes the file handle before the next iteration.
        with Image.open(image_path) as img:
            size = img.size
            mode = img.mode  # e.g. 'RGB', 'RGBA', 'P', 'CMYK', 'L'

        # Append only after the file has been read successfully, so the four
        # lists always grow together and stay the same length.
        image_paths.append(image_path)
        image_sizes.append(size)
        labels.append(class_folder)  # the folder name is the label
        image_modes.append(mode)



In [6]:
# Sanity check: the four lists are filled in step, so all lengths should match
print(len(image_paths), len(image_sizes), len(labels), len(image_modes))

10119 10119 10119 10119


In [7]:
# Assemble the per-image lists into one table: one row per image file,
# one column per collected attribute.
df = pd.DataFrame(
    dict(
        image_path=image_paths,
        image_size=image_sizes,
        image_mode=image_modes,
        label=labels,
    )
)

In [8]:
# Number of images per class label
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
emission,2342
planetary,2165
reflection,2099
supernova,1832
dark,1681


In [9]:
# The model input needs a known number of channels, so first count how many
# images there are per colour mode (RGB, RGBA, etc.).
df['image_mode'].value_counts()

Unnamed: 0_level_0,count
image_mode,Unnamed: 1_level_1
RGB,9765
RGBA,183
L,149
P,17
CMYK,3
I;16,1
LA,1


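Most images use the standard three-channel RGB mode. A number are RGBA, which adds a fourth alpha channel that encodes transparency. Images in mode P are palette-based: each pixel is an 8-bit index into a table of specific colours. Mode L is 8-bit single-channel greyscale (luminance), and LA is greyscale with an alpha channel. The few remaining images are CMYK (four-channel colour separation) and I;16 (16-bit greyscale). The non-RGB images have a different number of channels, which needs to be accounted for before modelling.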

In [10]:
# Count how often each (width, height) combination occurs
df['image_size'].value_counts()

Unnamed: 0_level_0,count
image_size,Unnamed: 1_level_1
"(1200, 630)",140
"(1920, 1080)",124
"(1280, 720)",121
"(980, 980)",109
"(1024, 1024)",98
...,...
"(640, 757)",1
"(728, 960)",1
"(696, 750)",1
"(900, 625)",1


We can see that the image sizes vary greatly, so the images will need to be resized to a common size later on.

From the label counts shown earlier, we have between roughly 1,700 and 2,300 images per category after cleaning, across a total of 5 categories. This is a significant imbalance, so half of the images in the supernova and dark categories will be augmented later to balance the classes.

In [11]:
# Create a new folder to store files to be used later.
# The path is absolute (like base_dir) so it does not depend on the notebook's
# current working directory; the original '../content/...' form only resolved
# to the Drive mount when the working directory was a direct child of '/'.
data_folder = Path('/content/drive/MyDrive/NewData')
# parents=True creates any missing parent folders; exist_ok=True makes the
# cell safe to re-run when the folder is already there.
data_folder.mkdir(parents=True, exist_ok=True)

In [12]:
# Save the dataframe as a csv
# index=False leaves the row index out of the file, so only the data columns are written.
df.to_csv(data_folder / 'image_data.csv', index=False)

In [13]:
# Check that csv is saved and can load properly
# Note: this rebinds df to the copy loaded from disk, replacing the in-memory frame.
# NOTE(review): after the CSV round trip the image_size values appear to come
# back as strings such as "(1079, 1101)" rather than tuples - confirm before
# using them numerically.
df = pd.read_csv(data_folder / 'image_data.csv')
df

Unnamed: 0,image_path,image_size,image_mode,label
0,/content/drive/MyDrive/Nebulae/emission/f0cfb706-45de-4c34-93e5-67f8151e8913.jpg,"(1079, 1101)",RGB,emission
1,/content/drive/MyDrive/Nebulae/emission/11b50af0-cf8b-4012-9dca-183eba67292b.jpg,"(1280, 720)",RGB,emission
2,/content/drive/MyDrive/Nebulae/emission/3031461d-8b28-434b-9067-d0ccbbc183d6.jpg,"(1600, 1067)",RGB,emission
3,/content/drive/MyDrive/Nebulae/emission/69503b88-b9fe-43d7-b889-7b1131585459.jpg,"(1500, 1143)",RGB,emission
4,/content/drive/MyDrive/Nebulae/emission/990a61b4-cc3c-4c5f-aaec-128302bc483d.jpg,"(600, 610)",RGB,emission
...,...,...,...,...
10114,/content/drive/MyDrive/Nebulae/supernova/855f1340-e8c6-4415-9177-2de5486da2ed.jpg,"(1500, 630)",RGB,supernova
10115,/content/drive/MyDrive/Nebulae/supernova/ad7d2118-d3f9-4c12-b2d0-216c19917d4a.jpg,"(980, 704)",RGB,supernova
10116,/content/drive/MyDrive/Nebulae/supernova/b9390c3b-25dd-4252-b533-6aa9d67a13c1.jpg,"(3301, 2480)",RGB,supernova
10117,/content/drive/MyDrive/Nebulae/supernova/b131cf89-1181-48e4-9afa-357f749e3412.jpg,"(519, 650)",RGB,supernova


In [14]:
# Split the rows by label so each class can be previewed separately
label_groups = df.groupby('label')

# For every label, display its first two images (fewer if the class is smaller)
for label, group in label_groups:
    print(f"Label: {label}")
    first_paths = group['image_path'].head(2)
    for number, image_path in enumerate(first_paths, start=1):
        plt.imshow(plt.imread(image_path))
        plt.title(f"Image {number} for label {label}")
        plt.show()


Output hidden; open in https://colab.research.google.com to view.