In [26]:
import pandas as pd
import os

In [9]:
b2t_24 = pd.read_pickle("/data2/brain2text/b2t_24/brain2text24.pkl")

In [29]:
import numpy as np
import pickle
from pathlib import Path

# --- Configuration ---
data_paths = ['/data2/brain2text/b2t_24/brain2text24.pkl', '/data2/brain2text/b2t_25/brain2text25.pkl']
output_dirs = ['/data2/brain2text/b2t_24/trial_level_data/', '/data2/brain2text/b2t_25/trial_level_data/']
# ---------------------

print("Starting preprocessing...")
file_manifest = {'train': [], 'val': [], 'test': []} # To save paths

for p_id, pkl_path in enumerate(data_paths):
    print(f"Processing participant {p_id} from {pkl_path}...")
    
    output_dir = output_dirs[p_id]
    
    with open(pkl_path, "rb") as handle:
        participant_data = pickle.load(handle)

    for split in ['train', 'val', 'test']:
        if split not in participant_data:
            continue
            
        split_data = participant_data[split]
        
        for day in range(len(split_data)):
            if split_data[day] is None:
                continue
            
            # Use 'sentenceDat' to find the number of trials
            n_trials = len(split_data[day]["sentenceDat"])
            
            for trial in range(n_trials):
                # Define where to save this single trial
                trial_dir = f"{output_dir}/{split}"
                os.makedirs(trial_dir, exist_ok=True)
                trial_path = f"{trial_dir}/day_{day}_trial_{trial}.npz"
                
                # --- Get the data for this trial ---
                sentenceDat = split_data[day]["sentenceDat"][trial]
                
                # Handle test mode (missing keys)
                is_test = "text" not in split_data[day]
                
                transcript = (split_data[day]['transcriptions'][trial] 
                              if not is_test else "FILLER")
                
                text = (split_data[day]["text"][trial] 
                        if not is_test else np.array([0], dtype=np.int32))

                # Save all trial data to a single compressed .npz file
                # We also save 'pid', 'day', and 'trial' as metadata
                np.savez_compressed(
                    trial_path,
                    sentenceDat=sentenceDat,
                    transcription=np.array(transcript, dtype=object), # Save string as 0-D object array
                    text=text,
                    pid=p_id,
                    day=day
                )
                
                # Add the new file path to our manifest
                file_manifest[split].append(str(trial_path))

print("Done preprocessing.")

# Optionally, save the manifest of all file paths for easy loading
with open(output_dir / "manifest.json", "w") as f:
    import json
    json.dump(file_manifest, f)

Starting preprocessing...
Processing participant 0 from /data2/brain2text/b2t_24/brain2text24.pkl...
Processing participant 1 from /data2/brain2text/b2t_25/brain2text25.pkl...


TypeError: 'NoneType' object is not subscriptable

In [31]:
text

array([10, 11, 28, 40, 35, 11, 28, 18, 40,  3, 14, 11, 20, 30,  3, 23,  3,
       31, 40, 36, 17, 10, 40, 36,  3, 23, 40,  3, 23,  3, 10, 12, 40,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [30]:
transcript

"They're very affectionate with one another."

In [23]:
print(split_data[0])

None


In [21]:
print(len(split_data))


45


In [20]:
trial_path

PosixPath('preprocessed_dataset_npz/participant_1/test/day_1_trial_0.npz')

In [19]:
print(split_data[0])

None
