<a href="https://colab.research.google.com/github/gladysvalerie/nejm-brain-to-text-competition/blob/gladys/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import h5py
import os
import pandas as pd
import numpy as np

# Per-split accumulators: each is a list that will hold one record (dict) per
# HDF5 dataset found. other_data catches files whose name matches no split.
train_data, test_data, validation_data, other_data = [], [], [], []

# Walk hdf5_data_final/<subfolder>/<file>, label every .h5/.hdf5 file with a
# split derived from its name, and append one record per HDF5 dataset to the
# matching list (train_data / test_data / validation_data / other_data).


def _sorted_entries(dir_path):
    """Return the os.DirEntry objects of *dir_path*, sorted by name.

    os.scandir yields entries in arbitrary, filesystem-dependent order;
    sorting keeps the order of the collected records reproducible.
    """
    with os.scandir(dir_path) as entries:
        return sorted(entries, key=lambda entry: entry.name)


def _categorize(file_name):
    """Return 'test', 'train', 'validation' or 'other' for a file name.

    Matching is a case-insensitive substring test, tried in that order.
    A name containing 'testing' is never labelled 'test'; it falls through
    to the remaining checks.
    """
    lowered = file_name.lower()
    if 'test' in lowered and 'testing' not in lowered:
        return 'test'
    if 'train' in lowered:
        return 'train'
    # 'val' also covers 'valid', so a single substring test is enough.
    if 'val' in lowered:
        return 'validation'
    return 'other'


def _dataset_paths(h5_file):
    """Return the path of every h5py.Dataset in *h5_file*, in visit order."""
    found = []

    def _remember(name, obj):
        # Groups are visited too; only datasets are kept.
        if isinstance(obj, h5py.Dataset):
            found.append(name)

    h5_file.visititems(_remember)
    return found


def _lookup_id(h5_file, dataset_paths, key):
    """Return the contents of the dataset whose path contains *key*.

    Only dataset paths are searched (HDF5 attributes are not inspected).
    The value is converted to plain Python objects via ndarray.tolist().
    If several paths match, the last one wins; if none match, returns None.
    """
    value = None
    for ds_path in dataset_paths:
        if key in ds_path.lower():
            value = np.array(h5_file[ds_path]).tolist()
    return value


def _append_records(file_path, folder_name, file_name, category, target_list):
    """Append one record per dataset in the HDF5 file to *target_list*.

    Each record holds the full dataset contents as a NumPy array ("Data")
    together with its shape, dtype, location and the file-level
    Trial_ID / Parent_ID values (None when no such dataset exists).

    A dataset that cannot be read is reported and skipped. Errors raised
    while opening the file, listing its datasets, or reading the ID datasets
    propagate to the caller.
    """
    with h5py.File(file_path, "r") as h5_file:
        dataset_paths = _dataset_paths(h5_file)

        # Resolve the IDs first so every record of this file carries them.
        trial_id = _lookup_id(h5_file, dataset_paths, 'trial_id')
        parent_id = _lookup_id(h5_file, dataset_paths, 'parent_id')

        for ds_path in dataset_paths:
            try:
                # Materialise the dataset in memory; the handle is closed
                # when the with-block exits.
                data_array = np.array(h5_file[ds_path])
                target_list.append({
                    "Category": category,
                    "Folder": folder_name,
                    "File": file_name,
                    "Dataset": ds_path,
                    "Data": data_array,
                    "Shape": data_array.shape,
                    "DType": data_array.dtype,
                    "Trial_ID": trial_id,
                    "Parent_ID": parent_id,
                })
            except Exception as exc:
                print(f"    Error extracting dataset {ds_path}: {exc}")


# Category label -> list that receives that category's records.
_lists_by_category = {
    'test': test_data,
    'train': train_data,
    'validation': validation_data,
    'other': other_data,
}

for folder in _sorted_entries('hdf5_data_final'):
    if not folder.is_dir():
        continue
    print(f"Processing folder: {folder.name}")

    for file in _sorted_entries(folder.path):
        if not file.name.endswith((".h5", ".hdf5")):
            continue
        print(f"  Processing file: {file.name}")

        # Determine category based on file name
        category = _categorize(file.name)
        target_list = _lists_by_category[category]
        print(f"    Categorized as: {category}")

        try:
            _append_records(file.path, folder.name, file.name, category, target_list)
        except Exception as exc:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"    Error opening file {file.name}: {exc}")

# Turn each list of records into its own DataFrame (one row per dataset).
df_train, df_test, df_validation, df_other = (
    pd.DataFrame(records)
    for records in (train_data, test_data, validation_data, other_data)
)

# Report how many datasets ended up in each category.
summary_lines = [
    "\n=== Summary of Data ===",
    f"Train data: {len(df_train)} datasets",
    f"Test data: {len(df_test)} datasets",
    f"Validation data: {len(df_validation)} datasets",
    f"Other data: {len(df_other)} datasets",
]
print("\n".join(summary_lines))

# Write one metadata-only CSV per non-empty category; the raw "Data" arrays
# and the "Category" column are left out of the export.
summary_columns = ['Folder', 'File', 'Dataset', 'Shape', 'DType', 'Trial_ID', 'Parent_ID']
summary_exports = (
    (df_train, "train_data_summary.csv", "Saved train data summary"),
    (df_test, "test_data_summary.csv", "Saved test data summary"),
    (df_validation, "validation_data_summary.csv", "Saved validation data summary"),
    (df_other, "other_data_summary.csv", "Saved other data summary"),
)
for frame, csv_name, saved_message in summary_exports:
    if frame.empty:
        # Nothing collected for this category: no file is written.
        continue
    frame[summary_columns].to_csv(csv_name, index=False)
    print(saved_message)

# Print the first five metadata rows of the train, test and validation
# frames, skipping any that are empty.
preview_columns = ['Folder', 'File', 'Dataset', 'Shape', 'Trial_ID', 'Parent_ID']
preview_targets = (
    ("\nTrain data preview:", df_train),
    ("\nTest data preview:", df_test),
    ("\nValidation data preview:", df_validation),
)
for heading, frame in preview_targets:
    if frame.empty:
        continue
    print(heading)
    print(frame[preview_columns].head(5))