In [None]:
# Notebook banner: title line, blank line, then the stated objective.
print(
    'Good vs No Good Classification - Data Exploration',
    '\nObjective: Explore and understand the dataset for our binary image classification project.',
    sep='\n',
)

In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

In [None]:
# Locate the dataset splits relative to the notebook's parent directory.
PROJECT_ROOT = os.path.abspath('../')
TRAIN_DIR, VALIDATION_DIR = (
    os.path.join(PROJECT_ROOT, 'data', split_name)
    for split_name in ('train', 'validation')
)

# Function to count images in each class
def count_images_in_directory(directory):
    """Count the files stored in each class sub-directory of ``directory``.

    Parameters
    ----------
    directory : str
        Folder whose immediate sub-directories are treated as classes.

    Returns
    -------
    dict
        Maps each class folder name to the number of regular, non-hidden
        files directly inside it. Keys are inserted in sorted order so the
        result is stable across platforms and across dataset splits.

    Raises
    ------
    OSError
        If ``directory`` (or a class folder) cannot be listed.
    """
    class_counts = {}
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        # Hidden folders (e.g. .ipynb_checkpoints) are tooling artefacts, not classes.
        if class_name.startswith('.') or not os.path.isdir(class_path):
            continue
        with os.scandir(class_path) as entries:
            # Only regular, non-hidden files count: nested folders and
            # dot-files such as .DS_Store would otherwise inflate the total.
            class_counts[class_name] = sum(
                1
                for entry in entries
                if entry.is_file() and not entry.name.startswith('.')
            )
    return class_counts

In [None]:
# Tally the per-class file counts of both dataset splits and report them.
train_counts = count_images_in_directory(TRAIN_DIR)
validation_counts = count_images_in_directory(VALIDATION_DIR)

print("Training Set Image Counts:", train_counts, sep="\n")
print("\nValidation Set Image Counts:", validation_counts, sep="\n")

In [None]:
# Visualize class distribution: one pie chart per dataset split, side by side.
fig, pie_axes = plt.subplots(1, 2, figsize=(12, 5))
split_distributions = (
    ('Training Set Class Distribution', train_counts),
    ('Validation Set Class Distribution', validation_counts),
)

for pie_ax, (pie_title, split_counts) in zip(pie_axes, split_distributions):
    pie_ax.set_title(pie_title)
    if sum(split_counts.values()) > 0:
        # pie() expects array-like input; dict views are not sequences,
        # so materialise them as lists before plotting.
        pie_ax.pie(
            list(split_counts.values()),
            labels=list(split_counts.keys()),
            autopct='%1.1f%%',
        )
    else:
        # An empty or all-zero split has no meaningful percentages to draw.
        pie_ax.text(0.5, 0.5, 'No images found', ha='center', va='center')
        pie_ax.axis('off')

fig.tight_layout()
plt.show()

In [None]:
# Image size and format analysis
def analyze_image_properties(directory):
    """Collect the size and file format of every image below ``directory``.

    Each immediate sub-directory of ``directory`` is walked and every entry
    in it is opened with ``Image.open``. Entries that cannot be opened are
    reported on stdout and skipped.

    Returns a tuple ``(image_sizes, image_formats)``: a list with the
    ``img.size`` of each opened image, and a dict mapping ``img.format`` to
    the number of images that reported it.
    """
    image_sizes, image_formats = [], {}

    class_dirs = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for class_dir in class_dirs:
        if not os.path.isdir(class_dir):
            continue
        for file_name in os.listdir(class_dir):
            image_path = os.path.join(class_dir, file_name)
            try:
                with Image.open(image_path) as img:
                    image_sizes.append(img.size)
                    fmt = img.format
                    image_formats[fmt] = image_formats.get(fmt, 0) + 1
            except Exception as e:
                # Best effort: report the unreadable entry and keep scanning.
                print(f"Error processing {image_path}: {e}")

    return image_sizes, image_formats

# Scan the training split once and summarise the formats and size extremes.
train_sizes, train_formats = analyze_image_properties(TRAIN_DIR)

print("Image Formats:", train_formats)
print("\nImage Size Statistics:")
if train_sizes:
    # Rank by pixel area. Plain min()/max() on (width, height) tuples compare
    # lexicographically (width first, height only as a tie-breaker), which
    # does not identify the smallest or largest image.
    print("Minimum Size:", min(train_sizes, key=lambda size: size[0] * size[1]))
    print("Maximum Size:", max(train_sizes, key=lambda size: size[0] * size[1]))
else:
    # min()/max() raise ValueError on an empty list; report the situation instead.
    print("No readable images found in", TRAIN_DIR)

In [None]:
# Histogram the widths and heights of the training images side by side.
widths, heights = zip(*train_sizes)
fig, (width_ax, height_ax) = plt.subplots(1, 2, figsize=(12, 5))

width_ax.set_title('Image Width Distribution')
width_ax.hist(widths, bins=20)
width_ax.set_xlabel('Width (pixels)')
width_ax.set_ylabel('Frequency')

height_ax.set_title('Image Height Distribution')
height_ax.hist(heights, bins=20)
height_ax.set_xlabel('Height (pixels)')
height_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Visualize sample images from each class
def plot_sample_images(directory, num_samples=5, seed=None):
    """Show a grid of randomly chosen images, one row per class folder.

    Parameters
    ----------
    directory : str
        Folder whose immediate, non-hidden sub-directories are treated as
        classes.
    num_samples : int, default 5
        Maximum number of images shown per class. A class with fewer files
        shows all of them; the remaining cells of its row stay blank.
    seed : int or None, default None
        Seed for the random selection. Pass an integer to get the same
        images on every run; ``None`` keeps the selection random.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Resolve the class folders first so that stray files or hidden folders
    # neither reserve an empty subplot row nor shift the row index.
    class_names = sorted(
        entry
        for entry in os.listdir(directory)
        if not entry.startswith('.') and os.path.isdir(os.path.join(directory, entry))
    )
    if not class_names or num_samples < 1:
        print(f"Nothing to plot for {directory}")
        return

    rng = np.random.default_rng(seed)
    # Scale the figure with the grid so every cell keeps a usable size.
    fig, axes = plt.subplots(
        len(class_names),
        num_samples,
        figsize=(3 * num_samples, 3 * len(class_names)),
        squeeze=False,  # always a 2-D array, even for one row or one column
    )
    for ax in axes.ravel():
        ax.axis('off')

    for row, class_name in enumerate(class_names):
        class_path = os.path.join(directory, class_name)
        # Sorted listing keeps a seeded selection stable across platforms.
        candidates = sorted(
            name
            for name in os.listdir(class_path)
            if not name.startswith('.') and os.path.isfile(os.path.join(class_path, name))
        )
        if candidates:
            # replace=False: never show the same image twice in one row.
            picked = rng.choice(
                candidates,
                size=min(num_samples, len(candidates)),
                replace=False,
            )
        else:
            picked = []
        for col, sample in enumerate(picked):
            axes[row, col].imshow(plt.imread(os.path.join(class_path, sample)))
        axes[row, 0].set_title(class_name)

    fig.tight_layout()
    plt.show()

# Preview a random selection of training images for every class.
plot_sample_images(directory=TRAIN_DIR)

In [None]:
# Closing summary: one entry per printed line, emitted in order.
insight_lines = (
    'Data Exploration Insights:',
    '\n1. Class Distribution',
    '   - Describes the number of images in \'good\' and \'no good\' categories',
    '\n2. Image Formats',
    '   - Identifies types of image files (JPEG, PNG, etc.)',
    '\n3. Image Size Analysis',
    '   - Shows variability in image dimensions',
    '   - Helps determine preprocessing requirements',
    '\n4. Sample Visualization',
    '   - Provides a quick visual overview of the dataset',
)
print(*insight_lines, sep='\n')