In [6]:
import kagglehub

# Download the latest version of the dataset. The value returned by
# dataset_download is kept in `path` and used as the dataset location below.
path = kagglehub.dataset_download("aditmagotra/gameplay-images")

# Echo the location so the reader can see where the files ended up.
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/aditmagotra/gameplay-images/versions/1


In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from collections import defaultdict


In [12]:
import kagglehub
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from collections import defaultdict
import seaborn as sns

# Download latest version. This repeats the call from the first cell so that
# this cell defines `path` on its own and can be run in isolation.
path = kagglehub.dataset_download("aditmagotra/gameplay-images")
print("Path to dataset files:", path)

# Function to load images from a directory (recursively)
def load_images_from_folder(folder):
    """Walk ``folder`` recursively and open every image file found in it.

    A file is treated as an image when its lower-cased name ends with one of
    the known image extensions. A file that cannot be opened is reported on
    stdout and skipped, so one bad file does not abort the scan.

    Args:
        folder: Directory to scan (sub-directories included).

    Returns:
        A list of ``(image, directory)`` tuples, where ``directory`` is the
        folder the file was found in.

    NOTE(review): the images are kept as returned by ``Image.open``; PIL
    documents that call as lazy, so each entry presumably keeps its file
    open until the pixel data is read -- confirm this is acceptable for
    large datasets.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')  # Add more if needed
    loaded = []

    for directory, _subdirs, filenames in os.walk(folder):
        for name in filenames:
            # Guard clause: ignore anything that does not look like an image.
            if not name.lower().endswith(image_suffixes):
                continue

            img_path = os.path.join(directory, name)
            try:
                opened = Image.open(img_path)
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
            else:
                # Keep the directory next to the image; it is used as the class later.
                loaded.append((opened, directory))

    return loaded

# Function to display images
def display_images(images, title="Images"):
    """Show at most the first nine entries of ``images`` in a 3x3 grid.

    Args:
        images: Sequence of tuples whose first element is the image to draw
            and whose second element is the directory it came from. Any
            further tuple elements are ignored.
        title: Heading placed above the whole grid.
    """
    plt.figure(figsize=(10, 10))
    for slot, entry in enumerate(images[:9], start=1):  # Display up to 9 images
        picture, source_dir = entry[0], entry[1]
        plt.subplot(3, 3, slot)
        plt.imshow(picture)
        # Only the last path component is shown, i.e. the sub-directory name.
        plt.title(f"From: {os.path.basename(source_dir)}")
        plt.axis('off')
    plt.suptitle(title)
    plt.show()

# Function to get image statistics
def get_image_statistics(images):
    """Collect the width and height of every image in ``images``.

    Args:
        images: Iterable of ``(image, directory)`` tuples; only the image's
            ``size`` attribute (a ``(width, height)`` pair) is read.

    Returns:
        A ``(widths, heights)`` tuple of two lists, in input order.
    """
    # The directory half of each tuple is irrelevant for the statistics.
    sizes = [img.size for img, _ in images]
    widths = [width for width, _ in sizes]
    heights = [height for _, height in sizes]
    return widths, heights

# Function to plot resolution distribution
def plot_resolution_distribution(class_images):
    """Plot dataset-wide histograms of image widths and image heights.

    The sizes of all classes are pooled, then two separate figures are
    drawn: one for widths (blue) and one for heights (green).

    Args:
        class_images: Mapping of class name to a list of
            ``(image, directory)`` tuples.
    """
    all_widths, all_heights = [], []
    for images in class_images.values():
        widths, heights = get_image_statistics(images)
        all_widths += widths
        all_heights += heights

    # One entry per figure: (values, bar colour, dimension name). Widths first.
    panels = (
        (all_widths, 'blue', "Width"),
        (all_heights, 'green', "Height"),
    )
    for values, color, dimension in panels:
        plt.figure(figsize=(12, 6))
        sns.histplot(values, bins=50, kde=True, color=color)
        plt.title(f"Distribution of Image {dimension}s")
        plt.xlabel(f"{dimension} (pixels)")
        plt.ylabel("Frequency")
        plt.show()

# Function to plot aspect ratio distribution
def plot_aspect_ratio_distribution(class_images):
    """Plot a histogram of width/height ratios over the whole dataset.

    A dashed red line marks ratio 1 (square images) for reference.

    Args:
        class_images: Mapping of class name to a list of
            ``(image, directory)`` tuples.
    """
    sizes = (
        img.size
        for images in class_images.values()
        for img, _ in images
    )
    aspect_ratios = [width / height for width, height in sizes]

    plt.figure(figsize=(12, 6))
    sns.histplot(aspect_ratios, bins=50, kde=True, color='purple')
    plt.title("Distribution of Aspect Ratios (Width/Height)")
    plt.xlabel("Aspect Ratio")
    plt.ylabel("Frequency")
    # Reference marker: everything right of the line is wider than tall.
    plt.axvline(1, color='red', linestyle='--', label="Square (1:1)")
    plt.legend()
    plt.show()

# Function to plot class-wise resolution distribution
def plot_classwise_resolution(class_images):
    """Draw one figure per class with its width and height histograms overlaid.

    Args:
        class_images: Mapping of class name to a list of
            ``(image, directory)`` tuples.
    """
    for class_name, images in class_images.items():
        widths, heights = get_image_statistics(images)

        plt.figure(figsize=(12, 6))
        # Both dimensions share the same axes; widths are drawn first.
        series = ((widths, 'blue', "Width"), (heights, 'green', "Height"))
        for values, color, label in series:
            sns.histplot(values, bins=30, kde=True, color=color, label=label)
        plt.title(f"Resolution Distribution for Class: {class_name}")
        plt.xlabel("Pixels")
        plt.ylabel("Frequency")
        plt.legend()
        plt.show()

# Function to identify resolution outliers
def identify_resolution_outliers(class_images, min_resolution=(64, 64), max_resolution=(2048, 2048), show=True):
    """Find images whose width or height falls outside the accepted range.

    An image is an outlier when its width or height is strictly below the
    corresponding minimum or strictly above the corresponding maximum. The
    findings are printed, optionally displayed, and returned so that later
    cells can act on them (for example to filter the dataset).

    Args:
        class_images: Mapping of class name to a list of
            ``(image, directory)`` tuples.
        min_resolution: ``(min_width, min_height)``; both bounds are inclusive.
        max_resolution: ``(max_width, max_height)``; both bounds are inclusive.
        show: When true (the default) and outliers exist, display them with
            ``display_images``. Pass ``False`` to only report and return them.

    Returns:
        A list of ``(image, directory, (width, height))`` tuples, in the
        iteration order of ``class_images``.
    """
    min_width, min_height = min_resolution[0], min_resolution[1]
    max_width, max_height = max_resolution[0], max_resolution[1]

    outliers = []
    for images in class_images.values():
        for img, root in images:
            width, height = img.size
            within_bounds = (
                min_width <= width <= max_width
                and min_height <= height <= max_height
            )
            if not within_bounds:
                outliers.append((img, root, (width, height)))

    # Print outliers
    print(f"Found {len(outliers)} resolution outliers:")
    for _, root, resolution in outliers:
        print(f"Image from {root}: Resolution = {resolution}")

    # Display outlier images (display_images only reads the first two tuple items)
    if outliers and show:
        print("Displaying outlier images:")
        display_images(outliers, title="Resolution Outliers")

    return outliers

# Function to plot resolution vs. class distribution
def plot_resolution_vs_class(class_images):
    """Draw a box plot of image area (width * height) for each class.

    Args:
        class_images: Mapping of class name to a list of
            ``(image, directory)`` tuples.
    """
    # One (class, area) record per image; area stands in for "resolution".
    records = []
    for class_name, images in class_images.items():
        for img, _ in images:
            width, height = img.size
            records.append((class_name, width * height))

    df = pd.DataFrame({
        "Class": [name for name, _ in records],
        "Resolution": [area for _, area in records],
    })

    plt.figure(figsize=(12, 6))
    sns.boxplot(x="Class", y="Resolution", data=df)
    plt.title("Resolution Distribution by Class")
    plt.xlabel("Class")
    plt.ylabel("Resolution (Width * Height)")
    plt.xticks(rotation=45)  # Tilt the class labels on the x axis
    plt.show()

# Main EDA function
def perform_eda(path):
    """Load the dataset, preview each class and print per-class size statistics.

    Images are grouped by the name of the directory they were found in
    (the last path component), which is used as the class label.

    Args:
        path: Root directory of the dataset; it is scanned recursively.

    Returns:
        A ``defaultdict(list)`` mapping class name to a list of
        ``(image, directory)`` tuples. It is empty when no image was loaded.
    """
    # Dictionary to hold images for each class
    class_images = defaultdict(list)

    # Load images (recursively) and organize them by their subdirectory (class)
    for img, root in load_images_from_folder(path):
        class_images[os.path.basename(root)].append((img, root))

    # A class only exists once an image has been appended to it, so per-class
    # emptiness checks can never fire; the one real empty case is the dataset.
    if not class_images:
        print(f"No images found under {path}")
        return class_images

    # Display some images from each class
    for class_name, images in class_images.items():
        print(f"Displaying images for class: {class_name}")
        display_images(images, title=f"Class: {class_name}")

    # Get image statistics
    for class_name, images in class_images.items():
        widths, heights = get_image_statistics(images)
        print(f"Class: {class_name}")
        print(f"Average width: {np.mean(widths)}, Average height: {np.mean(heights)}")
        print(f"Min width: {np.min(widths)}, Min height: {np.min(heights)}")
        print(f"Max width: {np.max(widths)}, Max height: {np.max(heights)}")
        print("-" * 40)

    # Return the class_images dictionary
    return class_images

# Perform EDA and store class_images: the mapping returned here (class name ->
# list of (image, directory) tuples) is the input of every analysis below.
class_images = perform_eda(path)

# Perform additional analyses. Each call draws its own figure(s);
# identify_resolution_outliers also prints the outliers it finds.
plot_resolution_distribution(class_images)
plot_aspect_ratio_distribution(class_images)
plot_classwise_resolution(class_images)
identify_resolution_outliers(class_images)
plot_resolution_vs_class(class_images)

Output hidden; open in https://colab.research.google.com to view.