# Data Exploration

This notebook explores the dataset, visualizes data distributions, and examines anomalies present in the data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image

# Location of the image folder; replace this placeholder with the real path.
DATASET_PATH = 'path/to/your/dataset'

# Fix NumPy's global RNG seed so NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)

# Load the dataset
def load_images_from_folder(folder):
    """Load every readable image file found directly inside ``folder``.

    Entries are visited in sorted filename order so the result is the same on
    every run (``os.listdir`` makes no ordering guarantee). Sub-directories are
    skipped, and files that Pillow cannot open or decode are skipped with a
    warning instead of aborting the whole load.

    Args:
        folder: Path to the directory that contains the image files.

    Returns:
        list: Fully decoded PIL image objects, one per readable image file,
        ordered by filename. Empty if the folder holds no readable images.

    Raises:
        OSError: If ``folder`` itself cannot be listed (e.g. it does not exist).
    """
    import warnings  # local import: only used to report skipped files

    images = []
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        # Directories cannot be opened as images; skip them up front.
        if not os.path.isfile(path):
            continue
        try:
            # Image.open is lazy: load() forces the pixel data to be decoded
            # while the file is open, and the context manager then releases
            # the file handle. The image object stays usable afterwards.
            with Image.open(path) as img:
                img.load()
        except OSError as exc:
            # Covers non-image files (UnidentifiedImageError is an OSError)
            # as well as truncated or otherwise unreadable files.
            warnings.warn(f'Skipping unreadable image file {path!r}: {exc}')
            continue
        images.append(img)
    return images

# Load the images from DATASET_PATH into memory and report how many were loaded.
images = load_images_from_folder(DATASET_PATH)
print(f'Loaded {len(images)} images from {DATASET_PATH}')

In [2]:
# Convert the first loaded image to a NumPy array and display its dimensions.
# NOTE(review): images[0] raises IndexError if no images were loaded.
# Presumably the shape is (height, width, channels) for a colour image; confirm
# against the dataset.
first_image = np.array(images[0])
first_image.shape

(100, 100, 3)

In [3]:
# Visualize some sample images
def plot_sample_images(images, n=5):
    """Display up to ``n`` images from the start of ``images`` in one row.

    The number of panels is clamped to the number of available images, so
    asking for more samples than exist shows what is available instead of
    raising ``IndexError``. If there is nothing to show, no figure is created.

    Args:
        images: Sized sequence of images in any form ``imshow`` accepts
            (e.g. PIL images or NumPy arrays).
        n: Maximum number of images to display. Defaults to 5.

    Returns:
        None. The figure is rendered via ``plt.show()``.
    """
    count = min(n, len(images))
    if count <= 0:
        # Nothing to draw; avoid opening an empty figure.
        return

    # squeeze=False keeps `axes` two-dimensional even when count == 1, so the
    # same loop works for a single image.
    fig, axes = plt.subplots(1, count, figsize=(15, 5), squeeze=False)
    for ax, image in zip(axes[0], images):
        ax.imshow(image)
        ax.axis('off')
    plt.show()

# Preview the first five loaded images in a single row.
plot_sample_images(images, n=5)

In [4]:
# Class balance: one bar per distinct value of the 'label' column.
# Assumes a CSV file with a 'label' column exists at the path below.
labels_df = pd.read_csv('path/to/your/labels.csv')
ax = sns.countplot(data=labels_df, x='label')
ax.set_title('Distribution of Image Classes')
# Tilt the class names so long labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [5]:
# Further analysis can be added here
# For example, visualizing anomalies, checking for missing values, etc.