### Final Project Data Exploration Phase

##### Dataset Loading

In [None]:
# Imports and root path - adjust the ROOT_DIR if your data is elsewhere
import os
import h5py
import numpy as np
import pandas as pd
from pathlib import Path


# Folder that holds one sub-folder per recording session, each with HDF5 files.
# Set the T15_DATA_DIR environment variable to point at another copy of the
# data without editing the notebook; the original local path is the fallback.
ROOT_DIR = os.environ.get(
    'T15_DATA_DIR',
    r'D:\Data Mining Final Proj\Data\t15_copyTask_neuralData\hdf5_data_final',
)
print('Using ROOT_DIR:', ROOT_DIR)

# Surface a wrong path here rather than as a FileNotFoundError in the loading cell.
if not os.path.isdir(ROOT_DIR):
    print('WARNING: ROOT_DIR does not exist or is not a directory:', ROOT_DIR)

Using ROOT_DIR: D:\Data Mining Final Proj\Data\t15_copyTask_neuralData\hdf5_data_final


In [None]:
import h5py
import os
import pandas as pd
import numpy as np

# Initialize lists to store data for each category
train_records = []
test_records = []
validation_records = []

# Category label -> list that collects the records of that category.
_RECORDS_BY_CATEGORY = {
    'train': train_records,
    'test': test_records,
    'validation': validation_records,
}

# Record keys for the three arrays, in the order _read_trial_arrays() returns them.
_DATA_COLUMNS = ("Input_Features", "Seq_Class_IDs", "Transcription")


def _categorize_file(file_name):
    """Return 'test', 'train' or 'validation' for a file name, or None.

    Matching is done on case-insensitive substrings of the file name. A name
    containing 'testing' is deliberately not treated as a test file.
    """
    name = file_name.lower()
    if 'test' in name and 'testing' not in name:
        return 'test'
    if 'train' in name:
        return 'train'
    if 'val' in name:  # also matches 'valid' / 'validation'
        return 'validation'
    return None


def _read_trial_arrays(trial_group):
    """Load the arrays of one trial group, matched by substrings of their names.

    Returns a tuple (input_features, seq_class_ids, transcription); an entry is
    None when no member of the group matched. If several members match the
    same slot, the last one visited wins.
    """
    input_features = None
    seq_class_ids = None
    transcription = None

    for dataset_name in trial_group.keys():
        dataset_lower = dataset_name.lower()
        if 'input' in dataset_lower or 'feature' in dataset_lower:
            print(f"        Found input features: {dataset_name}")
            input_features = np.array(trial_group[dataset_name])
        elif 'class' in dataset_lower or 'seq' in dataset_lower:
            print(f"        Found sequence class IDs: {dataset_name}")
            seq_class_ids = np.array(trial_group[dataset_name])
        elif 'transcript' in dataset_lower:
            print(f"        Found transcription: {dataset_name}")
            transcription = np.array(trial_group[dataset_name])

    return input_features, seq_class_ids, transcription


def _sample_at(values, index):
    """Return values[index], or None if `values` is 0-d or too short.

    Using one guarded lookup for every array keeps a short or scalar array
    from raising, which would abort the remaining trials of the whole file.
    """
    if values.ndim == 0 or index >= values.shape[0]:
        return None
    return values[index]


def _build_trial_records(base_record, arrays):
    """Build one record per index along the first axis of the first array found.

    `base_record` holds the identifying keys shared by all records of the
    trial; `arrays` is the tuple returned by _read_trial_arrays(). A data
    column is only added for arrays that were found, so a file without labels
    yields records without the label keys. Returns an empty list when no array
    was found or the first one has no rows.

    NOTE(review): every array is indexed with the same Sample_Index, which only
    pairs values meaningfully if all arrays share their first axis. The preview
    captured in this notebook shows Transcription values 66, 114, 105, 110, 103
    on consecutive rows - the ASCII codes of "Bring" - so `transcription` (and
    presumably `seq_class_ids`) looks like a per-trial label sequence rather
    than a per-row label. Confirm against the dataset documentation before
    treating these columns as row-level targets.
    """
    present = [
        (column, values)
        for column, values in zip(_DATA_COLUMNS, arrays)
        if values is not None
    ]
    if not present:
        return []

    # The first array that was found decides how many records are produced.
    first_values = present[0][1]
    num_samples = first_values.shape[0] if first_values.ndim else 0

    records = []
    for i in range(num_samples):
        record = dict(base_record, Sample_Index=i)
        for column, values in present:
            record[column] = _sample_at(values, i)
        records.append(record)
    return records


# os.scandir() yields entries in arbitrary order; sort by name so the record
# order (and therefore the DataFrame index) is the same on every run/machine.
with os.scandir(ROOT_DIR) as root_entries:
    folders = sorted(
        (entry for entry in root_entries if entry.is_dir()),
        key=lambda entry: entry.name,
    )

for folder in folders:
    print(f"Processing folder: {folder.name}")

    with os.scandir(folder.path) as folder_entries:
        files = sorted(
            (entry for entry in folder_entries if entry.name.endswith((".h5", ".hdf5"))),
            key=lambda entry: entry.name,
        )

    for file in files:
        print(f"  Processing file: {file.name}")

        # Determine category based on file name
        category = _categorize_file(file.name)
        if category is None:
            print(f"    Skipping file: {file.name} - not a train/test/validation file")
            continue
        target_list = _RECORDS_BY_CATEGORY[category]

        print(f"    Categorized as: {category}")

        # Best effort: a file that cannot be read is reported and skipped so
        # the remaining files are still loaded.
        try:
            with h5py.File(file.path, "r") as f:
                # List all trial folders (assuming they are groups)
                trial_groups = [key for key in f.keys() if 'trial' in key.lower()]

                print(f"    Found {len(trial_groups)} trial groups")

                for trial in trial_groups:
                    print(f"      Processing {trial}")
                    base_record = {
                        "Category": category,
                        "Folder": folder.name,
                        "File": file.name,
                        "Trial": trial,
                    }
                    new_records = _build_trial_records(base_record, _read_trial_arrays(f[trial]))

                    if not new_records:
                        print(f"        No samples found in {trial}")
                        continue

                    target_list.extend(new_records)
                    print(f"        Added {len(new_records)} records from {trial}")

        except Exception as e:
            print(f"    Error processing file {file.name}: {e}")

# Turn each list of record dicts into its own DataFrame (one row per record).
print("\nCreating DataFrames...")
df_train, df_test, df_validation = map(
    pd.DataFrame, (train_records, test_records, validation_records)
)

# Report how many records ended up in each split.
summary_lines = [
    "\n=== Summary of Data ===",
    f"Train data: {len(df_train)} records",
    f"Test data: {len(df_test)} records",
    f"Validation data: {len(df_validation)} records",
]
print("\n".join(summary_lines))

# Write one metadata CSV per non-empty split.
def _export_metadata(frame, csv_name):
    """Write the identifying columns of `frame` to `csv_name` and return them.

    Only the metadata columns are selected, so the potentially large data
    arrays are left out of the CSV export.
    """
    metadata = frame[['Category', 'Folder', 'File', 'Trial', 'Sample_Index']]
    metadata.to_csv(csv_name, index=False)
    return metadata


if not df_train.empty:
    df_train_meta = _export_metadata(df_train, "train_data_summary.csv")
    print("Saved train data summary")

if not df_test.empty:
    df_test_meta = _export_metadata(df_test, "test_data_summary.csv")
    print("Saved test data summary")

if not df_validation.empty:
    df_validation_meta = _export_metadata(df_validation, "validation_data_summary.csv")
    print("Saved validation data summary")

# Preview every non-empty split: data-column shapes, then the first rows.
def _print_preview(heading, frame, columns):
    """Print `heading`, first-row shapes of the data columns, and five rows.

    For each data column present in `frame`, the shape of its first value is
    printed; values without a `shape` attribute are reported as 'scalar'.
    Only `columns` are shown in the row preview.
    """
    print(heading)
    shape_labels = (
        ('Input_Features', "Input_Features shape:"),
        ('Seq_Class_IDs', "Seq_Class_IDs shape:"),
        ('Transcription', "Transcription shape:"),
    )
    for column, label in shape_labels:
        if column not in frame.columns:
            continue
        first_value = frame[column].iloc[0]
        print(label, getattr(first_value, 'shape', 'scalar'))
    print(frame[columns].head(5))


for preview_heading, preview_frame in (
    ("\nTrain data preview:", df_train),
    ("\nTest data preview:", df_test),
    ("\nValidation data preview:", df_validation),
):
    if preview_frame.empty:
        continue
    preview_cols = ['Folder', 'File', 'Trial', 'Sample_Index']
    _print_preview(preview_heading, preview_frame, preview_cols)

# Optionally, save the full DataFrames with data (might be large)
# df_train.to_pickle("train_data_full.pkl")
# df_test.to_pickle("test_data_full.pkl")
# df_validation.to_pickle("validation_data_full.pkl")

Processing folder: t15.2023.08.11
  Processing file: data_train.hdf5
    Categorized as: train
    Found 288 trial groups
      Processing trial_0000
        Found input features: input_features
        Found sequence class IDs: seq_class_ids
        Found transcription: transcription
        Added 321 records from trial_0000
      Processing trial_0001
        Found input features: input_features
        Found sequence class IDs: seq_class_ids
        Found transcription: transcription
        Added 481 records from trial_0001
      Processing trial_0002
        Found input features: input_features
        Found sequence class IDs: seq_class_ids
        Found transcription: transcription
        Added 480 records from trial_0002
      Processing trial_0003
        Found input features: input_features
        Found sequence class IDs: seq_class_ids
        Found transcription: transcription
        Added 502 records from trial_0003
      Processing trial_0004
        Found input featur

In [None]:
# Peek at the first five rows of the training records (all columns).
print(df_train.head(n=5))

  Category          Folder             File       Trial  Sample_Index  \
0    train  t15.2023.08.11  data_train.hdf5  trial_0000             0   
1    train  t15.2023.08.11  data_train.hdf5  trial_0000             1   
2    train  t15.2023.08.11  data_train.hdf5  trial_0000             2   
3    train  t15.2023.08.11  data_train.hdf5  trial_0000             3   
4    train  t15.2023.08.11  data_train.hdf5  trial_0000             4   

                                      Input_Features  Seq_Class_IDs  \
0  [2.3076649, -0.78699756, -0.64687246, -0.54658...            7.0   
1  [-0.5859305, -0.78699756, -0.64687246, -0.5465...           28.0   
2  [-0.5859305, -0.78699756, -0.64687246, -0.5465...           17.0   
3  [0.8608672, 1.2421287, -0.64687246, -0.5465877...           24.0   
4  [-0.5859305, -0.78699756, -0.64687246, -0.5465...           40.0   

   Transcription  
0           66.0  
1          114.0  
2          105.0  
3          110.0  
4          103.0  


In [None]:
# Peek at the first five rows of the test records (all columns).
print(df_test.head(n=5))

  Category          Folder            File       Trial  Sample_Index  \
0     test  t15.2023.08.13  data_test.hdf5  trial_0000             0   
1     test  t15.2023.08.13  data_test.hdf5  trial_0000             1   
2     test  t15.2023.08.13  data_test.hdf5  trial_0000             2   
3     test  t15.2023.08.13  data_test.hdf5  trial_0000             3   
4     test  t15.2023.08.13  data_test.hdf5  trial_0000             4   

                                      Input_Features  
0  [-0.46759456, -0.27936023, -0.40907395, -0.459...  
1  [-0.46759456, -0.27936023, -0.40907395, -0.459...  
2  [-0.46759456, -0.27936023, -0.40907395, -0.459...  
3  [-0.46759456, -0.27936023, -0.40907395, -0.459...  
4  [-0.46759456, -0.27936023, -0.40907395, -0.459...  
