In [4]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import os

# === File locations: the source dataset and the two files derived from it ===
base_path = r"C:\Users\Gurukumar\Desktop"
original_path, train_path, test_path = (
    os.path.join(base_path, file_name)
    for file_name in ("Accepted_dataset.h5", "train_dataset.h5", "test_dataset.h5")
)

# === Read the full feature and label arrays into memory ===
with h5py.File(original_path, 'r') as source_file:
    X = source_file['InputFeatures/SoundPressure'][:]
    y = source_file['TargetLabels/target'][:]

# === Stratified 50/50 split ===
# stratify=y keeps the per-class proportions of y in both halves;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.5,
    random_state=42,
)

# === Write each half to its own file, using the same dataset paths that
# === were read from the original file ===
for output_path, features, labels in (
    (train_path, X_train, y_train),
    (test_path, X_test, y_test),
):
    # Mode 'w' creates the file, truncating it if it already exists.
    with h5py.File(output_path, 'w') as output_file:
        output_file.create_dataset('InputFeatures/SoundPressure', data=features)
        output_file.create_dataset('TargetLabels/target', data=labels)

# === Helper function to extract and print class info ===
def print_class_info(file_path):
    """Print the class distribution stored in an HDF5 dataset file.

    Reads the ``TargetLabels/target`` dataset from ``file_path``, counts how
    often each distinct label value occurs, and prints the file's base name,
    the number of distinct labels, and one line per label (in sorted label
    order) with its sample count.

    Args:
        file_path: Path of the HDF5 file to inspect.

    Returns:
        None. The report is written to standard output.
    """
    # Copy the labels into memory so the file can be closed before counting.
    with h5py.File(file_path, 'r') as h5_file:
        labels = h5_file['TargetLabels/target'][:]

    tally = Counter(labels)

    print(f"\nFile: {os.path.basename(file_path)}")
    print(f"Number of classes: {len(tally)}")
    print("Elements per class:")
    # Counter keys are unique, so sorting the keys gives the same order as
    # sorting the (label, count) pairs.
    for label in sorted(tally):
        print(f"  Class {label}: {tally[label]} samples")

# === Report the class distribution of the original file and of both splits ===
for dataset_path in (original_path, train_path, test_path):
    print_class_info(dataset_path)



File: Accepted_dataset.h5
Number of classes: 5
Elements per class:
  Class b'New': 500 samples
  Class b'Severe': 500 samples
  Class b'Slightly': 500 samples
  Class b'Working': 500 samples
  Class b'Wornout': 500 samples

File: train_dataset.h5
Number of classes: 5
Elements per class:
  Class b'New': 250 samples
  Class b'Severe': 250 samples
  Class b'Slightly': 250 samples
  Class b'Working': 250 samples
  Class b'Wornout': 250 samples

File: test_dataset.h5
Number of classes: 5
Elements per class:
  Class b'New': 250 samples
  Class b'Severe': 250 samples
  Class b'Slightly': 250 samples
  Class b'Working': 250 samples
  Class b'Wornout': 250 samples
