In [19]:
import os
import random
import shutil
from random import sample

import hdf5storage

# Destination directory for the class-balanced subset, followed by the
# directory that is scanned for the original .mat files.
balanced_dataset_path = '/Users/elizabethnemeti/Desktop/balanced_data'
imbalanced_dataset_path = '/Users/elizabethnemeti/Desktop/imbalanced_data'

# Create the destination (and any missing parents) up front; exist_ok=True
# makes this a no-op when the directory is already there.
os.makedirs(name=balanced_dataset_path, exist_ok=True)

# Function to load .mat file data
def load_mat_file(file_path):
    """Load a .mat file with ``hdf5storage.loadmat``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever ``hdf5storage.loadmat`` returns, or ``None`` if loading
        raised. Failures are reported on stdout rather than propagated, so
        a single unreadable file does not abort the caller.
    """
    contents = None
    try:
        contents = hdf5storage.loadmat(file_path)
    except Exception as e:
        # Best-effort load: report the problem and fall through to None.
        print(f"Error loading {file_path}: {e}")
    return contents

# ---------------------------------------------------------------------------
# Scan the imbalanced dataset, group the .mat files by class label, then copy
# at most MAX_FILES_PER_CLASS files of each class into the balanced directory.
# ---------------------------------------------------------------------------

# Upper bound on files copied per class.
# NOTE(review): 708 is presumably the size of the smallest class in this
# dataset -- confirm before reusing this cell on different data.
MAX_FILES_PER_CLASS = 708

# Fixed seed so that re-running the cell selects the same files instead of
# adding a different random subset to the destination directory.
RANDOM_SEED = 42

# Initialize dictionaries for organizing and counting files
file_paths = {'1': [], '2': [], '3': []}
class_counts = {'1': 0, '2': 0, '3': 0}

# Number of .mat files that loaded correctly and carried a known label.
total_files = 0

# Read .mat files and organize by class label
for root, dirs, files in os.walk(imbalanced_dataset_path):
    # Sorting in place fixes the traversal order, which os.walk otherwise
    # inherits from the filesystem; needed for a reproducible sample.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.mat'):
            continue  # Skip non-MAT files (this also covers .DS_Store)
        file_path = os.path.join(root, file)
        mat_data = load_mat_file(file_path)
        if not mat_data or 'cjdata' not in mat_data:
            print(f"Invalid or missing 'cjdata' in file: {file_path}")
            continue
        try:
            label = str(int(mat_data['cjdata']['label'][0][0]))
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # A malformed record should not abort the whole scan.
            print(f"Could not read label from {file_path}: {e}")
            continue
        if label not in file_paths:
            print(f"Unexpected label {label!r} in file: {file_path}")
            continue
        # Count only files that were actually accepted, so the total matches
        # the "valid" wording of the summary printed below.
        total_files += 1
        file_paths[label].append(file_path)

# Files left over from an earlier run are not removed here; warn because they
# can leave the destination with more files per class than intended.
stale_files = [
    name for name in os.listdir(balanced_dataset_path)
    if name.endswith('.mat')
]
if stale_files:
    print(
        f"Warning: {balanced_dataset_path} already contains "
        f"{len(stale_files)} .mat file(s); they are left in place and may "
        f"skew the class balance."
    )

# Balance and copy files to the new directory
rng = random.Random(RANDOM_SEED)
used_names = set()  # destination file names written during this run
for label, paths in file_paths.items():
    if len(paths) > MAX_FILES_PER_CLASS:
        selected_paths = rng.sample(paths, MAX_FILES_PER_CLASS)
    else:
        selected_paths = paths
    class_counts[label] = len(selected_paths)
    for file_path in selected_paths:
        dest_name = os.path.basename(file_path)
        if dest_name in used_names:
            # The destination is flat, so two sources with the same base name
            # would overwrite each other; give the later one a numeric suffix.
            stem, ext = os.path.splitext(dest_name)
            counter = 1
            while dest_name in used_names:
                dest_name = f"{stem}_{counter}{ext}"
                counter += 1
        used_names.add(dest_name)
        shutil.copy(file_path, os.path.join(balanced_dataset_path, dest_name))

print("Balancing complete.")
print(f"Total valid .mat files processed: {total_files}")

# Verification step: print how many files are in each class
for label, count in class_counts.items():
    print(f"Class {label} contains {count} files.")

# A class with fewer than MAX_FILES_PER_CLASS files is copied in full, which
# leaves the output unbalanced; make that visible instead of silent.
if len(set(class_counts.values())) > 1:
    print(
        f"Warning: classes are not balanced; at least one class has fewer "
        f"than {MAX_FILES_PER_CLASS} files."
    )


Balancing complete.
Total valid .mat files processed: 3064
Class 1 contains 708 files.
Class 2 contains 708 files.
Class 3 contains 708 files.
