In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install -U jupyter ipywidgets tqdm
!pip install h5py



In [2]:
# IMPORT AND GLOBAL VARIABLES

import os
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# --- Filesystem locations (relative to the notebook's working directory) ---
# Root folder handed to load_metadata(), which scans its sub-directories for
# data_<split>.hdf5 files.
BASE_DIR = "data/hdf5_data_final"
# Path of the copy-task description CSV read with pd.read_csv().
# NOTE: despite the "_DIR" suffix this points at a file, not a directory.
DESC_DIR = "data/t15_copyTaskData_description.csv"

# --- HDF5 dataset names ---
# These match the dataset names that extract_hdf5() reads from each trial
# group; the visible cells use the literal strings rather than these constants.
NEURAL_DATA_KEY = "input_features"
TRANSCRIPTION_KEY = "transcription"

In [3]:
# Load the copy-task description CSV from the path stored in DESC_DIR.
# Its columns include Date, Block number, Corpus and Split; extract_hdf5()
# later uses Date + Block number to look up the Corpus of each trial.

b2txt_csv_df = pd.read_csv(DESC_DIR)

# Preview the first rows to confirm the columns parsed as expected.
b2txt_csv_df.head()

Unnamed: 0,Date,Post-implant day,Block number,Number of sentences,Corpus,Split
0,2023-08-11,25,2,20,50-Word,Train
1,2023-08-11,25,3,30,50-Word,Train
2,2023-08-11,25,4,40,50-Word,Train
3,2023-08-11,25,5,50,50-Word,Train
4,2023-08-11,25,6,50,50-Word,Train


In [11]:
# LOAD AND MAKE METADATA

def _session_to_date(session):
    """Convert a session id such as ``t15.2023.08.11`` into ``2023-08-11``.

    Returns None when the id is not text or does not consist of exactly four
    dot-separated parts (prefix, year, month, day).
    """
    if isinstance(session, bytes):
        # Byte-string attributes used to make the corpus lookup fail silently.
        session = session.decode("utf-8", errors="replace")
    if not isinstance(session, str):
        return None
    parts = session.split('.')
    if len(parts) != 4:
        return None
    _, year, month, day = parts
    return f"{year}-{month}-{day}"


def _to_python_scalar(value):
    """Unwrap a numpy scalar or size-1 array into a plain Python scalar."""
    item = getattr(value, "item", None)
    if callable(item):
        try:
            return item()
        except (TypeError, ValueError):
            # More than one element: leave it untouched, the lookup will miss.
            return value
    return value


def _build_corpus_lookup(desc_df):
    """Build a ``{(date, block number): corpus}`` mapping from the description table.

    When several rows share the same date and block number the first one wins.
    An empty mapping is returned (with a warning) if a required column is
    missing, so callers fall back to ``corpus=None`` instead of crashing.
    """
    if desc_df is None:
        return {}

    required = ("Date", "Block number", "Corpus")
    columns = getattr(desc_df, "columns", ())
    missing = [name for name in required if name not in columns]
    if missing:
        import warnings

        warnings.warn(
            f"Description table is missing column(s) {missing}; "
            "corpus will be None for every trial.",
            stacklevel=3,
        )
        return {}

    lookup = {}
    for date, block, corpus in zip(
        desc_df["Date"], desc_df["Block number"], desc_df["Corpus"]
    ):
        if hasattr(date, "strftime"):
            # Tolerate a Date column that was parsed into datetimes.
            date = date.strftime("%Y-%m-%d")
        lookup.setdefault((date, _to_python_scalar(block)), corpus)
    return lookup


def extract_hdf5(file_path, b2txt_csv_df):
    """Return a list of dictionaries, one per trial stored in an HDF5 file.

    Parameters
    ----------
    file_path : str
        Path of the HDF5 file; every top-level key is treated as one trial group.
    b2txt_csv_df : pandas.DataFrame
        Description table with "Date", "Block number" and "Corpus" columns,
        used to attach a corpus name to each trial.

    Returns
    -------
    list of dict
        One record per trial with the keys ``neural_features``,
        ``n_time_steps``, ``seq_class_ids``, ``seq_len``, ``transcription``,
        ``sentence_label``, ``session``, ``block_num``, ``trial_num`` and
        ``corpus``. Optional datasets/attributes that are absent are None, and
        ``corpus`` is None when no matching description row exists.

    Raises
    ------
    KeyError
        If a trial group lacks the ``input_features`` dataset or the
        ``session`` / ``block_num`` / ``trial_num`` attributes.
    """
    # Build the (date, block) -> corpus table once per file instead of
    # filtering the whole description frame for every trial.
    corpus_by_date_block = _build_corpus_lookup(b2txt_csv_df)
    records = []

    with h5py.File(file_path, 'r') as f:
        for key in f.keys():
            g = f[key]

            neural_features = g['input_features'][:]
            n_time_steps = g.attrs.get('n_time_steps')
            # Label-related entries are optional: some trials do not carry them.
            seq_class_ids = g['seq_class_ids'][:] if 'seq_class_ids' in g else None
            seq_len = g.attrs.get('seq_len')
            transcription = g['transcription'][:] if 'transcription' in g else None
            sentence_label = g.attrs.get('sentence_label')
            session = g.attrs['session']
            block_num = g.attrs['block_num']
            trial_num = g.attrs['trial_num']

            # Look up the corpus name from the description table.
            corpus_name = None
            date = _session_to_date(session)
            if date is not None:
                try:
                    corpus_name = corpus_by_date_block.get(
                        (date, _to_python_scalar(block_num))
                    )
                except TypeError:
                    # Unhashable block number (e.g. a multi-element array).
                    corpus_name = None

            records.append({
                "neural_features": neural_features,
                "n_time_steps": n_time_steps,
                "seq_class_ids": seq_class_ids,
                "seq_len": seq_len,
                "transcription": transcription,
                "sentence_label": sentence_label,
                "session": session,
                "block_num": block_num,
                "trial_num": trial_num,
                "corpus": corpus_name,
            })

    return records


def load_metadata(base_dir, desc_df=None):
    """Collect the trials of every session under ``base_dir`` into one DataFrame.

    Each sub-directory of ``base_dir`` is treated as a session. For every
    session the files ``data_train.hdf5``, ``data_test.hdf5`` and
    ``data_val.hdf5`` are read (when present) with ``extract_hdf5``.

    Parameters
    ----------
    base_dir : str
        Folder containing one sub-directory per session.
    desc_df : pandas.DataFrame, optional
        Description table forwarded to ``extract_hdf5``. When omitted, the
        notebook-level ``b2txt_csv_df`` is used, which keeps existing
        ``load_metadata(BASE_DIR)`` calls working.

    Returns
    -------
    pandas.DataFrame
        One row per trial. ``session`` holds the session directory name
        (replacing the value stored in the file) and ``split`` holds
        ``'train'``, ``'test'`` or ``'val'``.
    """
    if desc_df is None:
        # Backward-compatible fallback to the table loaded earlier in the
        # notebook; pass desc_df explicitly to avoid relying on kernel state.
        desc_df = b2txt_csv_df

    session_dirs = sorted(
        entry for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )

    all_metadata = []
    for session in tqdm(session_dirs, desc="Processing outcome..."):
        session_path = os.path.join(base_dir, session)

        for split in ('train', 'test', 'val'):
            file_path = os.path.join(session_path, f"data_{split}.hdf5")
            if not os.path.exists(file_path):
                # Not every session provides all three splits.
                continue

            session_metadata = extract_hdf5(file_path, desc_df)
            for item in session_metadata:
                item['session'] = session
                item['split'] = split

            all_metadata.extend(session_metadata)

    return pd.DataFrame(all_metadata)

# Build the combined per-trial frame for every session found under BASE_DIR.
df = load_metadata(BASE_DIR)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" to the output).
df.info()
      

Processing outcome...:   0%|          | 0/45 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10948 entries, 0 to 10947
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   neural_features  10948 non-null  object 
 1   n_time_steps     10948 non-null  int64  
 2   seq_class_ids    9498 non-null   object 
 3   seq_len          9498 non-null   float64
 4   transcription    9498 non-null   object 
 5   sentence_label   9498 non-null   object 
 6   session          10948 non-null  object 
 7   block_num        10948 non-null  int64  
 8   trial_num        10948 non-null  int64  
 9   corpus           10948 non-null  object 
 10  split            10948 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 941.0+ KB
None


In [12]:
# Preview the first rows of the combined per-trial frame.
df.head()

Unnamed: 0,neural_features,n_time_steps,seq_class_ids,seq_len,transcription,sentence_label,session,block_num,trial_num,corpus,split
0,"[[2.3076649, -0.78699756, -0.64687246, -0.5465...",321,"[7, 28, 17, 24, 40, 17, 31, 40, 20, 21, 25, 29...",14.0,"[66, 114, 105, 110, 103, 32, 105, 116, 32, 99,...",Bring it closer.,t15.2023.08.11,2,0,50-Word,train
1,"[[-0.51709145, -0.70207363, -0.64330804, -0.48...",481,"[22, 6, 40, 14, 2, 22, 3, 21, 18, 40, 17, 38, ...",19.0,"[77, 121, 32, 102, 97, 109, 105, 108, 121, 32,...",My family is closer.,t15.2023.08.11,2,1,50-Word,train
2,"[[0.95464545, -0.6912571, 2.5334082, -0.459320...",480,"[36, 3, 31, 40, 9, 34, 40, 10, 13, 40, 21, 6, ...",14.0,"[87, 104, 97, 116, 32, 100, 111, 32, 116, 104,...",What do they like?,t15.2023.08.11,2,2,50-Word,train
3,"[[-0.4997814, -0.6836047, -0.6305947, 1.260037...",502,"[16, 5, 40, 17, 38, 40, 10, 2, 31, 40, 15, 33,...",14.0,"[72, 111, 119, 32, 105, 115, 32, 116, 104, 97,...",How is that good?,t15.2023.08.11,2,3,50-Word,train
4,"[[-0.4850082, -0.66607094, -0.62398034, -0.433...",402,"[23, 18, 9, 40, 16, 11, 21, 27, 40, 16, 18, 28...",13.0,"[78, 101, 101, 100, 32, 104, 101, 108, 112, 32...",Need help here?,t15.2023.08.11,2,4,50-Word,train


Exploratory data analysis (EDA)

1. Dataset sanity and structure
   1. How many trials exist in total?
   2. How are the trials distributed across sessions and splits?
   3. Which corpora and blocks have the most samples?
2. Neural feature structure
   1. TODO: decide which neural-feature checks to run.

In [None]:
# Count how many trials fall into each split (train / test / val).
df['split'].value_counts()

split
train    8072
test     1450
val      1426
Name: count, dtype: int64

In [None]:
"""Splitting the data as intended"""

# One frame per split. The validation frame must come from the 'val' rows and
# the test frame from the 'test' rows (the two filters were previously swapped).
df_train = df[df['split'] == 'train'].reset_index(drop=True)
df_val = df[df['split'] == 'val'].reset_index(drop=True)
df_test = df[df['split'] == 'test'].reset_index(drop=True)

# Sanity check: the three frames must partition the full set of trials.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

"""From now onwards--for the sake of sampling--we are going to explore df_train only"""

In [17]:
# Number of trials in the training split.
len(df_train)

8072