In [None]:
# import os
# import pandas as pd
# import glob


# def load_session_data(rec_path):
#     """
#     Load a single session's HDF5 file by dynamically searching the MIR_Aligned folder.

#     Selection logic:
#       1) If exactly one file matches aligned_predictions_with_ca_and_dF_F*.h5, use it.
#       2) Otherwise, pick the file with the longest stem (most extra info).
#     """
#     rec_path = Path(rec_path)
#     aligned_dir = rec_path / "MIR_Aligned"

#     # find all matching .h5 files
#     h5_paths = list(aligned_dir.glob("aligned_predictions_with_ca_and_dF_F*.h5"))
#     if not h5_paths:
#         raise FileNotFoundError(f"No .h5 files found in {aligned_dir}")

#     # selection rule
#     if len(h5_paths) == 1:
#         hdf5_file_path = h5_paths[0]
#     else:
#         # choose the file whose stem has the most characters
#         hdf5_file_path = max(h5_paths, key=lambda p: len(p.stem))

#     print("Using:", hdf5_file_path)

#     # load dataframe
#     df = pd.read_hdf(hdf5_file_path, key='df')

#     # extract metadata
#     session_id      = rec_path.name
#     recording_date  = rec_path.parent.name
#     experiment_name = rec_path.parent.parent.name

#     df['session_id']     = session_id
#     df['recording_date'] = recording_date
#     df['experiment']     = experiment_name
#     df['session_path']   = str(rec_path)
#     df['file_path']      = str(hdf5_file_path)

#     return df

# def load_sessions_from_csv(csv_filepath, base_path, verbose=True):
#     """
#     Load session data from a CSV file containing relative session paths.
    
#     This function assumes that the CSV file contains a column (by default, it will look for 
#     a column named 'relative_path'; if not found, it will use the first column) that lists the 
#     relative paths for each session. The provided base path is then prepended to each relative path 
#     to form the full session directory. Finally, it uses your preexisting function load_session_data 
#     to load each session's data.
    
#     Parameters:
#       - csv_filepath (str): The path to the CSV file containing the relative session paths.
#       - base_path (str): The base path to be prepended to each relative path.
#       - verbose (bool): If True, print messages when a session fails to load.
    
#     Returns:
#       - sessions (list): A list of DataFrames, one for each successfully loaded session.
#     """
#     try:
#         df_paths = pd.read_csv(csv_filepath)
#     except Exception as e:
#         print(f"Error reading CSV file at {csv_filepath}: {e}")
#         return []
    
#     # Determine which column contains the relative paths. Look for a column named 'relative_path'
#     # otherwise default to the first column.
#     if 'relative_path' in df_paths.columns:
#         relative_paths = df_paths['relative_path'].tolist()
#     else:
#         relative_paths = df_paths.iloc[:, 0].tolist()
    
#     sessions = []
#     for rel_path in relative_paths:
#         # Build the full session path using the base path
#         session_path = os.path.join(base_path, rel_path)
#         try:
#             df_session = load_session_data(session_path)
#             sessions.append(df_session)
#         except Exception as e:
#             if verbose:
#                 print(f"Could not load session at {session_path}: {e}")
#     return sessions

# # =============================================================================
# # Example Usage:
# # =============================================================================
# # Set the base path that will be prepended to each relative path.
# base_path = "/data/big_rim/rsync_dcc_sum/Oct3V1" #"/hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1"

# # CSV file that contains the relative session paths.
# csv_file = "/home/lq53/mir_repos/BBOP/random_tests/25mar_minibbop_integration/250331_sum_aligned_good_path_relative.csv" #"/hpc/group/tdunn/Bryan_Rigs/BigOpenField/2504_mir_loader/250331_sum_aligned_good_path_relative.csv"
# # Load all sessions
# all_sessions = load_sessions_from_csv(csv_file, base_path)
# print(f"Loaded {len(all_sessions)} sessions.")

Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r1_16_37/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r1_16_53/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r2_14_30/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r2_15_58/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_17/20240819V1r1_13_41/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1500_stp700_max25_diff3.5_pnr1.1.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_17/20240819V1r1_14_25/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1000_stp700_max25_diff3.5_pnr1.1.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_24/20241001PMCr2_16_19/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1500_stp700_max15_diff3.5_pnrauto.h5
Using: /data/big_rim/r

In [15]:
import os
import pandas as pd
from pathlib import Path

def load_session_data(rec_path):
    """
    Load a single session's aligned HDF5 table from its MIR_Aligned folder.

    Selection logic:
      1) Candidates are the files in <rec_path>/MIR_Aligned that match
         aligned_predictions_with_ca_and_dF_F*.h5.
      2) The candidate with the longest stem (most extra info) is used; with a
         single candidate this is simply that file.
      3) Ties on stem length are broken by file name (the lexicographically
         greatest name wins), so the choice never depends on the order in
         which the filesystem happens to list the files.

    Params:
      - rec_path (str or Path): session directory; its name, its parent's name
        and its grandparent's name are recorded as session_id, recording_date
        and experiment respectively.

    Returns:
      - the table stored under key 'df' in the selected file, with the columns
        session_id, recording_date, experiment, session_path and file_path
        added.

    Raises:
      - FileNotFoundError: if no matching .h5 file exists in MIR_Aligned.
    """
    rec_path = Path(rec_path)
    aligned_dir = rec_path / "MIR_Aligned"

    # find all matching .h5 files
    h5_paths = list(aligned_dir.glob("aligned_predictions_with_ca_and_dF_F*.h5"))
    if not h5_paths:
        raise FileNotFoundError(f"No .h5 files found in {aligned_dir}")

    # Longest stem wins. The file name is the secondary key because variants
    # such as "..._wnd1000_..." and "..._wnd1500_..." have equal length and
    # glob() returns them in arbitrary order.
    hdf5_file_path = max(h5_paths, key=lambda p: (len(p.stem), p.name))

    print("Using:", hdf5_file_path)

    # load dataframe
    df = pd.read_hdf(hdf5_file_path, key='df')

    # attach metadata derived from the folder layout
    # (<experiment>/<recording_date>/<session_id>)
    df['session_id']     = rec_path.name
    df['recording_date'] = rec_path.parent.name
    df['experiment']     = rec_path.parent.parent.name
    df['session_path']   = str(rec_path)
    df['file_path']      = str(hdf5_file_path)

    return df


def load_sessions_from_csv(
    csv_filepath,
    base_path,
    filter_fn=None,
    verbose=True
):
    """
    Load session data from a CSV file of relative paths, with optional dynamic filtering.

    Params:
      - csv_filepath (str): path to the CSV file (must contain a column 'relative_path' or use first column)
      - base_path (str): prefix for each relative path
      - filter_fn (callable, optional): function taking a relative_path string and returning True if
        that session should be loaded, False to skip it. If None, all are loaded.
      - verbose (bool): print a message for each session that fails to load

    Returns:
      - List[pd.DataFrame]: one DataFrame per session that passed filter_fn and loaded successfully.
        An empty list is returned (after printing the error) if the CSV itself cannot be read.

    Notes:
      - Entries are always handed to filter_fn as stripped strings; missing or
        blank cells are skipped instead of reaching filter_fn / the path join
        as NaN.
    """
    try:
        df_paths = pd.read_csv(csv_filepath)
    except Exception as e:
        print(f"Error reading CSV file at {csv_filepath}: {e}")
        return []

    # determine which column contains relative paths
    if 'relative_path' in df_paths.columns:
        path_column = df_paths['relative_path']
    else:
        path_column = df_paths.iloc[:, 0]

    # Normalise both branches the same way: drop missing cells, coerce to str
    # (numeric-looking paths are parsed as numbers by read_csv), strip stray
    # whitespace and discard anything left empty.
    cleaned = (str(value).strip() for value in path_column.dropna())
    relative_paths = [rp for rp in cleaned if rp]

    # apply dynamic filter if provided
    if filter_fn is not None:
        relative_paths = [rp for rp in relative_paths if filter_fn(rp)]

    sessions = []
    for rel_path in relative_paths:
        session_path = Path(base_path) / rel_path
        try:
            sessions.append(load_session_data(session_path))
        except Exception as e:
            # best-effort: one bad session must not abort the whole batch
            if verbose:
                print(f"Could not load session at {session_path}: {e}")
    return sessions



# === Example usage ===
# Root directory against which every relative session path in the CSV is resolved.
base_path = "/data/big_rim/rsync_dcc_sum/Oct3V1"
# CSV listing the relative session paths (the trailing comment keeps the alternate location).
csv_file = "/home/lq53/mir_repos/BBOP/random_tests/25mar_minibbop_integration/250331_sum_aligned_good_path_relative.csv" #"/hpc/group/tdunn/Bryan_Rigs/BigOpenField/2504_mir_loader/250331_sum_aligned_good_path_relative.csv"


# Keep only the sessions whose relative path mentions 'v1' (case-insensitive).
v1_sessions = load_sessions_from_csv(
    csv_filepath=csv_file,
    base_path=base_path,
    filter_fn=lambda rel: 'v1' in rel.lower(),
)
print(f"Loaded {len(v1_sessions)} v1 sessions.")


Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r1_16_37/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r1_16_53/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r2_14_30/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_14/20240916v1r2_15_58/MIR_Aligned/aligned_predictions_with_ca_and_dF_F.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_17/20240819V1r1_13_41/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1500_stp700_max25_diff3.5_pnr1.1.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_17/20240819V1r1_14_25/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1000_stp700_max25_diff3.5_pnr1.1.h5
Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_11_01/20240819V1r1_AO_14_56/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1500_stp700_max25_diff5.0_pnrauto.h5
Using: /data/big_rim

In [6]:
import pandas as pd
from pathlib import Path



# --- Locate the suffixed prediction H5 file(s) of one session and list the stored keys ---
base_path = "/data/big_rim/rsync_dcc_sum/Oct3V1/2024_11_06/20241015pmcr2_17_13"
aligned_dir = Path(base_path) / 'MIR_Aligned'
# The pattern requires an underscore after "dF_F", so only suffixed variants match.
# Sorted so the printed list and the file picked below are the same on every run
# (glob() yields entries in arbitrary order).
h5_paths = sorted(aligned_dir.glob('aligned_predictions_with_ca_and_dF_F_*.h5'))
if not h5_paths:
    raise FileNotFoundError(f"No .h5 files found in {aligned_dir}")
print("H5 files:", h5_paths)

h5 = h5_paths[0]
# Open read-only: HDFStore defaults to append mode ('a'), which opens the data
# file for writing even though only the key listing is needed here.
with pd.HDFStore(h5, mode="r") as store:
    print(store.keys())


H5 files: [PosixPath('/data/big_rim/rsync_dcc_sum/Oct3V1/2024_11_06/20241015pmcr2_17_13/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1000_stp700_max15_diff3.5_pnrauto.h5')]
['/df']


In [8]:
from pathlib import Path

base_path = Path("/data/big_rim/rsync_dcc_sum/Oct3V1/2024_11_06/20241015pmcr2_17_13")
aligned_dir = base_path / "MIR_Aligned"

# find all .h5 files matching the base pattern (with or without suffix)
h5_paths = list(aligned_dir.glob("aligned_predictions_with_ca_and_dF_F*.h5"))
if not h5_paths:
    raise FileNotFoundError(f"No .h5 files found in {aligned_dir}")

# classify exact vs. suffix variants
exact_name  = "aligned_predictions_with_ca_and_dF_F.h5"
exact_files = [p for p in h5_paths if p.name == exact_name]
suffix_files = [p for p in h5_paths if p.name != exact_name]

# decision rules:
# 1) If any suffix file exists, pick the one with the longest stem
#    (i.e. the "more things" variant); equal-length stems are ordered by
#    file name so the pick does not depend on glob()'s arbitrary ordering.
# 2) Otherwise only the exact file matched (h5_paths is non-empty and holds
#    no suffix variant), so use it.
if suffix_files:
    hdf5_file_path = max(suffix_files, key=lambda p: (len(p.stem), p.name))
else:
    hdf5_file_path = exact_files[0]

print("Using:", hdf5_file_path)


Using: /data/big_rim/rsync_dcc_sum/Oct3V1/2024_11_06/20241015pmcr2_17_13/MIR_Aligned/aligned_predictions_with_ca_and_dF_F_wnd1000_stp700_max15_diff3.5_pnrauto.h5


In [None]:

# Loader function that I shared with everyone.

# import os
# import pandas as pd
# import glob

# # load_session_data is a function from BBOP, but copy pasted here:
# def load_session_data(rec_path):
#     """
#     Load a single session's HDF5 file by dynamically searching the MIR_Aligned folder.
    
#     If only one HDF5 file is present (typically named exactly
#     'aligned_predictions_with_ca_and_dF_F.h5'), it is used.
    
#     If more than one file exists, the function will look for one that contains
#     additional identifying information (e.g., with 'wnd1500' in the filename).
#     If found, that file is used; otherwise, the first file is chosen.

#     The function extracts metadata from the folder structure and appends it
#     to the DataFrame.
#     """
#     mir_aligned_path = os.path.join(rec_path, 'MIR_Aligned')
#     # Search for all HDF5 files in the MIR_Aligned folder
#     hdf5_files = glob.glob(os.path.join(mir_aligned_path, '*.h5'))
#     if not hdf5_files:
#         raise FileNotFoundError(f"No HDF5 files found in {mir_aligned_path}")
#     if len(hdf5_files) == 1:
#         hdf5_file_path = hdf5_files[0]
#     else:
#         # Look for a file that contains 'wnd1500' in its name
#         matching_files = [f for f in hdf5_files if 'wnd1500' in os.path.basename(f)]
#         if matching_files:
#             hdf5_file_path = matching_files[0]
#         else:
#             hdf5_file_path = hdf5_files[0]  # fallback to first file if no match found

#     df = pd.read_hdf(hdf5_file_path, key='df')
    
#     # Extract metadata based on the folder structure:
#     # e.g., /data/big_rim/rsync_dcc_sum/Oct3V1/2024_10_25/20241002PMCr2_15_42
#     norm_path = os.path.normpath(rec_path)
#     session_id = os.path.basename(norm_path)
#     recording_date = os.path.basename(os.path.dirname(norm_path))
#     experiment_name = os.path.basename(os.path.dirname(os.path.dirname(norm_path)))
    
#     df['session_id'] = session_id
#     df['recording_date'] = recording_date
#     df['experiment'] = experiment_name
#     df['session_path'] = rec_path
#     df['file_path'] = hdf5_file_path
    
#     return df

# def load_sessions_from_csv(csv_filepath, base_path, verbose=True):
#     """
#     Load session data from a CSV file containing relative session paths.
    
#     This function assumes that the CSV file contains a column (by default, it will look for 
#     a column named 'relative_path'; if not found, it will use the first column) that lists the 
#     relative paths for each session. The provided base path is then prepended to each relative path 
#     to form the full session directory. Finally, it uses your preexisting function load_session_data 
#     to load each session's data.
    
#     Parameters:
#       - csv_filepath (str): The path to the CSV file containing the relative session paths.
#       - base_path (str): The base path to be prepended to each relative path.
#       - verbose (bool): If True, print messages when a session fails to load.
    
#     Returns:
#       - sessions (list): A list of DataFrames, one for each successfully loaded session.
#     """
#     try:
#         df_paths = pd.read_csv(csv_filepath)
#     except Exception as e:
#         print(f"Error reading CSV file at {csv_filepath}: {e}")
#         return []
    
#     # Determine which column contains the relative paths. Look for a column named 'relative_path'
#     # otherwise default to the first column.
#     if 'relative_path' in df_paths.columns:
#         relative_paths = df_paths['relative_path'].tolist()
#     else:
#         relative_paths = df_paths.iloc[:, 0].tolist()
    
#     sessions = []
#     for rel_path in relative_paths:
#         # Build the full session path using the base path
#         session_path = os.path.join(base_path, rel_path)
#         try:
#             df_session = load_session_data(session_path)
#             sessions.append(df_session)
#         except Exception as e:
#             if verbose:
#                 print(f"Could not load session at {session_path}: {e}")
#     return sessions

# # =============================================================================
# # Example Usage:
# # =============================================================================
# # Set the base path that will be prepended to each relative path.
# base_path = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1"

# # CSV file that contains the relative session paths.
# csv_file = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/2504_mir_loader/250331_sum_aligned_good_path_relative.csv"

# # Load all sessions
# all_sessions = load_sessions_from_csv(csv_file, base_path)
# print(f"Loaded {len(all_sessions)} sessions.")