# ModularyNN Dataset Examples for Classification

This notebook demonstrates the various dataset classes available in the ModularyNN framework.

In [None]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Append the parent of the current working directory to sys.path so that the
# `Data` package imported below can be resolved.
# NOTE(review): this depends on the kernel's working directory (os.getcwd()),
# not on the notebook file's location — confirm when launching from elsewhere.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

# Import dataset classes
from Data.DatasetClasses import (
    RandomData, 
    IrisData, 
    DigitData, 
    MNISTData, 
    CaltechData, 
    CifarData
)

## 1. RandomData

RandomData generates random inputs and one-hot encoded labels for testing purposes.

In [None]:
# Synthetic dataset: input_size=100, categories=5, batch_size=32
random_data = RandomData(input_size=100, batch_size=32, categories=5)

# Report the dataset details under the heading "Random Data"
random_data.print_dataset_info("Random Data")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = random_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
random_data.show_sample(random_index)

## 2. IrisData

The classic Iris dataset with 4 features and 3 classes.

In [None]:
# Iris dataset configured with batch_size=16
iris_data = IrisData(batch_size=16)

# Report the dataset details under the heading "Iris Data"
iris_data.print_dataset_info("Iris Data")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = iris_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
iris_data.show_sample(random_index)

## 3. DigitData

The scikit-learn digits dataset with 8x8 images of handwritten digits.

In [None]:
# Digits dataset configured with batch_size=16
digit_data = DigitData(batch_size=16)

# Report the dataset details under the heading "Digit Data"
digit_data.print_dataset_info("Digit Data")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = digit_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
digit_data.show_sample(random_index)

## 4. MNISTData

The MNIST dataset with 28x28 grayscale images of handwritten digits.

In [None]:
# MNIST dataset configured with batch_size=64
mnist_data = MNISTData(batch_size=64)

# Report the dataset details under the heading "MNIST Data"
mnist_data.print_dataset_info("MNIST Data")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = mnist_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
mnist_data.show_sample(random_index)

## 5. CaltechData

The Caltech101 dataset with images of objects from 101 categories.

In [None]:
# Caltech101 configured with batch_size=32 and image_size=(64, 64)
caltech_data = CaltechData(batch_size=32, image_size=(64, 64))

# Report the dataset details under the heading "Caltech101"
caltech_data.print_dataset_info("Caltech101")

# List at most the first ten category names held in label_mapping
category_names = list(caltech_data.label_mapping.keys())
print(f"\nSample categories (out of {len(category_names)}):")
for category_name in category_names[:10]:
    print(f"  - {category_name}")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = caltech_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
caltech_data.show_sample(random_index)

## 6. CifarData

The CIFAR-10 dataset with 32x32 RGB images in 10 classes.

In [None]:
# CIFAR dataset configured with batch_size=64
cifar_data = CifarData(batch_size=64)

# Report the dataset details under the heading "Cifar Data"
cifar_data.print_dataset_info("Cifar Data")

# Print the `classes` attribute exposed by the dataset object
print(f"\nClass names: {cifar_data.classes}")

# Draw an index below the number of training rows and hand it to show_sample
print("\nShowing a random sample:")
n_train_samples = cifar_data._input_tensor_train.shape[0]
random_index = np.random.randint(n_train_samples)
cifar_data.show_sample(random_index)

## Dataset Size Comparison

Let's compare the training/test set sizes, class counts, and per-sample shapes of all datasets.

In [None]:
# Reuse the dataset objects built in the sections above instead of constructing
# each one a second time: this cell only reads tensor shapes, so re-running the
# MNIST, CIFAR-10 and Caltech101 constructors would repeat their loading work
# and hold a second copy of every dataset's tensors.
datasets = {
    "Random": random_data,
    "Iris": iris_data,
    "Digit": digit_data,
    "MNIST": mnist_data,
    "CIFAR-10": cifar_data,
    "Caltech101": caltech_data,
}

# Collect per-dataset statistics; all lists share the order of `datasets`.
names = list(datasets)
train_sizes = [ds._input_tensor_train.shape[0] for ds in datasets.values()]
test_sizes = [ds._input_tensor_test.shape[0] for ds in datasets.values()]
# Shape of a single sample, i.e. everything after the leading sample axis
sample_shapes = [str(ds._input_tensor_train.shape[1:]) for ds in datasets.values()]
# Second axis of the label tensor is used as the class count
# (assumes one-hot encoded labels — confirm for new dataset classes)
class_counts = [ds._label_tensor_train.shape[1] for ds in datasets.values()]

# Two stacked panels: sample counts on top, class counts below
fig, (ax_sizes, ax_classes) = plt.subplots(2, 1, figsize=(12, 8))

# Grouped bars: training and test counts side by side for each dataset.
# A log scale keeps the small datasets visible next to the large ones.
x = np.arange(len(names))
width = 0.35
ax_sizes.bar(x - width / 2, train_sizes, width, label='Training')
ax_sizes.bar(x + width / 2, test_sizes, width, label='Test')
ax_sizes.set_yscale('log')
ax_sizes.set_ylabel('Number of Samples (log scale)')
ax_sizes.set_title('Dataset Sizes')
ax_sizes.set_xticks(x)
ax_sizes.set_xticklabels(names)
ax_sizes.legend()

# Number of classes per dataset
ax_classes.bar(names, class_counts)
ax_classes.set_ylabel('Number of Classes')
ax_classes.set_title('Number of Classes per Dataset')
ax_classes.tick_params(axis='x', labelrotation=45)
# Extra vertical headroom so the label above the tallest bar stays inside the axes
ax_classes.margins(y=0.15)

# Annotate every bar with the shape of one sample; categorical bars sit at
# integer x positions 0..n-1, so the enumerate index doubles as the x coordinate.
for i, shape in enumerate(sample_shapes):
    ax_classes.text(i, class_counts[i] + 2, f"Shape: {shape}", ha='center')

fig.tight_layout()
plt.show()

# Print summary table
print("\nDataset Summary:")
print("-" * 80)
print(f"{'Dataset':<15} {'Train Samples':<15} {'Test Samples':<15} {'Sample Shape':<20} {'Classes':<10}")
print("-" * 80)
for name, n_train, n_test, shape, n_classes in zip(
    names, train_sizes, test_sizes, sample_shapes, class_counts
):
    print(f"{name:<15} {n_train:<15} {n_test:<15} {shape:<20} {n_classes:<10}")

## Batch Iteration Example

This section demonstrates how to pull successive batches with `next()`, as a training loop would.

In [None]:
# New MNIST instance with batch_size=32, used only to demonstrate batching
mnist_data = MNISTData(batch_size=32)

# Request three batches in a row and describe each one
print("Getting batches from MNIST dataset:")
for i in range(3):
    inputs, labels = mnist_data.next()
    print(f"Batch {i+1}:")
    print(f"  Input shape: {inputs.shape}")
    print(f"  Label shape: {labels.shape}")

    # Tally the digits in the batch: the argmax of each label row is taken as
    # its digit, and bincount with minlength=10 yields one slot per digit 0-9.
    digit_counts = np.bincount(
        [np.argmax(one_hot) for one_hot in labels], minlength=10
    )

    print(f"  Digit distribution: {digit_counts}")

## Multi-Dataset Sample Visualization

Let's visualize samples from all datasets side by side.

In [None]:
# One randomly chosen training sample from each image dataset, in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_digit, ax_mnist, ax_cifar, ax_caltech = axes.ravel()

# Digit dataset: index [sample, 0] picks a single 2-D plane to show in grayscale
# (assumes a (N, 1, H, W) layout — confirm against DigitData)
random_index = np.random.randint(0, digit_data._input_tensor_train.shape[0])
img = digit_data._input_tensor_train[random_index, 0]
label = np.argmax(digit_data._label_tensor_train[random_index])
ax_digit.imshow(img, cmap='gray')
ax_digit.set_title(f"Digit Dataset - Class: {label}")
ax_digit.axis('off')

# MNIST dataset: same indexing as the digit panel
random_index = np.random.randint(0, mnist_data._input_tensor_train.shape[0])
img = mnist_data._input_tensor_train[random_index, 0]
label = np.argmax(mnist_data._label_tensor_train[random_index])
ax_mnist.imshow(img, cmap='gray')
ax_mnist.set_title(f"MNIST Dataset - Class: {label}")
ax_mnist.axis('off')

# CIFAR-10: move the first axis last before display (imshow expects the
# colour channels on the last axis)
random_index = np.random.randint(0, cifar_data._input_tensor_train.shape[0])
img = cifar_data._input_tensor_train[random_index].transpose(1, 2, 0)
label = np.argmax(cifar_data._label_tensor_train[random_index])
ax_cifar.imshow(img)
ax_cifar.set_title(f"CIFAR-10 Dataset - Class: {cifar_data.classes[label]}")
ax_cifar.axis('off')

# Caltech101: best-effort panel that degrades to a notice instead of failing.
try:
    random_index = np.random.randint(0, caltech_data._input_tensor_train.shape[0])
    img = caltech_data._input_tensor_train[random_index].transpose(1, 2, 0)
    label = np.argmax(caltech_data._label_tensor_train[random_index])
    # Reverse lookup in label_mapping (name -> index); fall back to the numeric
    # label so a missing entry cannot raise an uncaught StopIteration.
    category = next(
        (cat for cat, idx in caltech_data.label_mapping.items() if idx == label),
        label,
    )
    ax_caltech.imshow(img)
    ax_caltech.set_title(f"Caltech101 Dataset - Class: {category}")
except (FileNotFoundError, AttributeError, NameError):
    # NameError covers the case where the Caltech cell above failed and
    # `caltech_data` was therefore never bound in this kernel.
    ax_caltech.text(0.5, 0.5, "Caltech101 dataset not available\nor not extracted",
                    ha='center', va='center', fontsize=12,
                    transform=ax_caltech.transAxes)
ax_caltech.axis('off')

fig.tight_layout()
# y > 1 places the figure title above the top row of panels
fig.suptitle("Image Dataset Samples", fontsize=16, y=1.05)
plt.show()

# Second figure: the two non-image datasets side by side
plt.figure(figsize=(15, 5))

# Left panel: the feature vector of one RandomData training sample as a line plot
ax_random = plt.subplot(1, 2, 1)
random_index = np.random.randint(random_data._input_tensor_train.shape[0])
features = random_data._input_tensor_train[random_index]
label = np.argmax(random_data._label_tensor_train[random_index])
ax_random.plot(features)
ax_random.set_title(f"Random Dataset - Class: {label}")
ax_random.set_xlabel("Feature Index")
ax_random.set_ylabel("Value")

# Right panel: one IrisData training sample as a bar per feature
ax_iris = plt.subplot(1, 2, 2)
random_index = np.random.randint(iris_data._input_tensor_train.shape[0])
features = iris_data._input_tensor_train[random_index]
label = np.argmax(iris_data._label_tensor_train[random_index])
class_names = ['setosa', 'versicolor', 'virginica']
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
ax_iris.bar(feature_names, features)
ax_iris.set_title(f"Iris Dataset - Class: {class_names[label]}")
# ax_iris is the current axes here, so plt.xticks styles its tick labels
plt.xticks(rotation=45, ha='right')
# y-range fixed to [0, 1]
# NOTE(review): assumes IrisData scales features into this range — confirm
ax_iris.set_ylim([0, 1])

plt.tight_layout()
plt.suptitle("Non-Image Dataset Samples", fontsize=16, y=1.05)
plt.show()