In [3]:
# import sys
# import os
# sys.path.append(os.path.abspath('..'))
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq
# from utlis.scan_engine_utlis.scan_engine_utlis import (
#     load_status_mapping,
#     read_failed_paths,
#     assign_status_codes,
#     find_calib_file,
#     match_date_pattern,
#     load_parquet
# )

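# # NOTE: superseded draft - the active definition in the next cell takes no `status_mapping` argument.
# # Function to log folder structure into a Parquet file with a hierarchy of rec_files and associated calib_files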
# def log_folder_to_parquet(base_folder, parquet_file, failed_paths_file, status_mapping):
#     # Read manually inputted failed paths
#     failed_paths = read_failed_paths(failed_paths_file)
    
#     # Data storage for DataFrame
#     log_data = []
    
#     # Traverse the base folder
#     for folder_name in os.listdir(base_folder):
#         folder_path = os.path.join(base_folder, folder_name)
        
#         # Check if the folder name matches the date pattern
#         if os.path.isdir(folder_path) and match_date_pattern(folder_name):
#             rec_files_data = []  # To store rec files and their status
#             calib_files = []  # To store calibration files

#             # Check for calibration files starting with 'calib'
#             for file_name in os.listdir(folder_path):
#                 if file_name.startswith("calib"):
#                     calib_files.append(file_name)

#             # Traverse subfolders within this folder
#             for subfolder_name in os.listdir(folder_path):
#                 subfolder_path = os.path.join(folder_path, subfolder_name)
                
#                 # Check for subfolders starting with a digit (rec folders)
#                 if os.path.isdir(subfolder_path) and subfolder_name[0].isdigit():
#                     # Find calibration file for each subfolder
#                     calib_file = find_calib_file(subfolder_path)
                    
#                     # Assign status codes based on folder name and subfolder path
#                     rec_file_data = assign_status_codes(
#                         folder_name, subfolder_path, calib_file, failed_paths, status_mapping
#                     )

#                     rec_file_data['rec_file'] = subfolder_name  # Add rec_file to the data
#                     rec_files_data.append(rec_file_data)
            
#             # Append data to list for DataFrame creation, including hierarchical structure
#             log_data.append({
#                 'date_folder': folder_name,
#                 'calib_files': calib_files,  # Store the calibration files under date_folder level
#                 'rec_files_data': rec_files_data  # Each rec file with its status (label3d_dannce, sync, z_adjusted)
#             })
    
#     # Cast all status columns to string before saving to Parquet
#     df = pd.json_normalize(log_data, 'rec_files_data', ['date_folder', 'calib_files'])

#     # Ensure all relevant columns are strings
#     status_columns = ['sync', 'label3d_status', 'z_adjusted', 'mir_generate_param']  # Add all your status columns here
#     df[status_columns] = df[status_columns].astype(str)

#     # Create pyarrow Table and save as Parquet
#     table = pa.Table.from_pandas(df)
#     pq.write_table(table, parquet_file)


# if __name__ == "__main__":
#     base_folder = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ"  # Replace with your base folder
#     save_path = os.path.join(base_folder, 'paret')
#     failed_paths_file = '/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ/mir_bundle_run/synced_folders/240914_failed_sum_test.txt'  # File containing failed paths

#     if not os.path.exists(save_path):
#         os.makedirs(save_path)

#     parquet_file = os.path.join(save_path, "folder_log_encoded.parquet")  # Output Parquet file

#     # Load the universal status mapping from the YAML file
#     status_mapping_file = '/hpc/group/tdunn/lq53/BBOP/scan_engine/status_mapping.yaml'  # Replace with the path to your YAML file
#     status_mapping = load_status_mapping(status_mapping_file)

#     log_folder_to_parquet(base_folder, parquet_file, failed_paths_file, status_mapping)


In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from utlis.scan_engine_utlis.scan_engine_utlis import (
    load_status_mapping,
    read_failed_paths,
    assign_status_codes,
    find_calib_file,
    match_date_pattern
)

def log_folder_to_parquet(base_folder, parquet_file, failed_paths_file):
    """Scan ``base_folder`` and write one row per recording folder to a Parquet file.

    Layout walked by this function::

        base_folder/<date folder>/<rec folder>
        base_folder/<date folder>/calib*

    A *date folder* is a directory whose name passes ``match_date_pattern``.
    A *rec folder* is a sub-directory of a date folder whose name starts with
    a digit. Every entry of a date folder whose name starts with ``"calib"``
    is collected into the ``calib_files`` value shared by all rec rows of that
    date folder.

    Args:
        base_folder: Directory that contains the date folders.
        parquet_file: Destination path handed to ``pq.write_table``.
        failed_paths_file: Path handed to ``read_failed_paths``; the result is
            forwarded unchanged to ``assign_status_codes``.

    Returns:
        None. The only result is the Parquet file written to ``parquet_file``.

    Notes:
        Status columns are cast to ``str`` only when they are present. A scan
        that finds no rec folders therefore yields an empty table instead of
        raising ``KeyError`` on the missing columns.
    """
    # Read manually inputted failed paths
    failed_paths = read_failed_paths(failed_paths_file)

    # One dict per date folder; flattened to one row per rec folder below
    log_data = []

    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)

        # Only directories named like a date are scanned
        if not (os.path.isdir(folder_path) and match_date_pattern(folder_name)):
            continue

        # List the date folder once; both uses below keep the listing order
        entries = os.listdir(folder_path)

        # Calibration entries live directly under the date folder
        calib_files = [name for name in entries if name.startswith("calib")]

        rec_files_data = []  # Status record of every rec folder in this date folder
        for subfolder_name in entries:
            subfolder_path = os.path.join(folder_path, subfolder_name)

            # Cheap name test first so non-matching entries are never stat'ed
            if not (subfolder_name[:1].isdigit() and os.path.isdir(subfolder_path)):
                continue

            # Calibration file belonging to this particular rec folder
            calib_file = find_calib_file(subfolder_path)

            # presumably a dict of status codes - it must at least support item assignment
            rec_file_data = assign_status_codes(
                folder_name, subfolder_path, calib_file, failed_paths
            )
            rec_file_data['rec_file'] = subfolder_name
            rec_files_data.append(rec_file_data)

        log_data.append({
            'date_folder': folder_name,
            'calib_files': calib_files,
            'rec_files_data': rec_files_data,
        })

    # Flatten: one row per rec folder, with the date-level fields repeated as metadata
    df = pd.json_normalize(log_data, 'rec_files_data', ['date_folder', 'calib_files'])

    # Cast status columns to string before saving to Parquet. Only the columns
    # actually present are touched: when nothing was found the frame has no
    # status columns at all and selecting them would raise KeyError.
    status_columns = ['sync', 'label3d_status', 'z_adjusted', 'mir_generate_param']  # Add all your status columns here
    present_status_columns = [col for col in status_columns if col in df.columns]
    if present_status_columns:
        df[present_status_columns] = df[present_status_columns].astype(str)

    # Create pyarrow Table and save as Parquet
    table = pa.Table.from_pandas(df)
    pq.write_table(table, parquet_file)


if __name__ == "__main__":
    # Root directory holding the date folders to scan
    base_folder = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ"  # Replace with your base folder
    # File containing failed paths
    failed_paths_file = '/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ/mir_bundle_run/synced_folders/240914_failed_sum_test.txt'

    # Output directory; exist_ok avoids the check-then-create race of a
    # separate os.path.exists() test
    save_path = os.path.join(base_folder, 'paret')
    os.makedirs(save_path, exist_ok=True)

    # Output Parquet file. This name is reused by the following cell to load the log.
    parquet_file = os.path.join(save_path, "folder_log_encoded_numb.parquet")

    # No status mapping is loaded here: log_folder_to_parquet does not take one.
    log_folder_to_parquet(base_folder, parquet_file, failed_paths_file)


In [2]:
from utlis.scan_engine_utlis.scan_engine_utlis import load_parquet
# Read the Parquet log back into a DataFrame.
# `parquet_file` comes from the cell that wrote the log; to inspect another
# file, assign its path instead, e.g.
# parquet_file = '/home/lq53/mir_data/24summ/paret/folder_log.parquet'
df = load_parquet(parquet_file)

# Request every row rather than head()'s default handful
# (df.head() alone shows only the top rows)
df.head(n=len(df))


Unnamed: 0,mir_generate_param,label3d_status,sync,z_adjusted,rec_file,date_folder,calib_files
0,0,0,0,0,1691486_left_right_habituation,2024_07_03,"[calib_before_label3d_dannce.mat, calib_1532, ..."
1,0,0,3,0,1691486_left_habituation,2024_07_03,"[calib_before_label3d_dannce.mat, calib_1532, ..."
2,0,0,3,0,1691486_left_caffeine_1050,2024_07_03,"[calib_before_label3d_dannce.mat, calib_1532, ..."
3,0,0,0,0,1691486_left_right_caffeine_1448,2024_07_03,"[calib_before_label3d_dannce.mat, calib_1532, ..."
4,0,0,0,0,1691486_left_caffeine_1051,2024_07_03,"[calib_before_label3d_dannce.mat, calib_1532, ..."
...,...,...,...,...,...,...,...
101,0,0,3,0,230815PMC_left_hole_caffeine_16_50,2024_07_01,"[calib_413pm, calib_before_2, calib_before, ca..."
102,0,0,0,0,230815PMC_no_hole_caffine_13_56,2024_07_01,"[calib_413pm, calib_before_2, calib_before, ca..."
103,0,0,0,0,20240717_PMCr2,2024_08_29,"[calib_before, calib_before_18_49]"
104,0,0,0,0,240605PMC_window2_right2holes_12_14,2024_07_19,"[calib_test_offset_3_38_label3d_dannce.mat, ca..."


In [3]:
from utlis.scan_engine_utlis.scan_engine_utlis import load_status_mapping, translate_status_code

# Example usage: turn a numeric status code into its label.
status_code = 2  # Example status code

# The code-to-label mapping is read from a YAML file; update with the correct path
status_mapping_file = '/hpc/group/tdunn/lq53/BBOP/scan_engine/status_mapping.yaml'
status_mapping = load_status_mapping(status_mapping_file)

# Look the sample code up and report the result
status_string = translate_status_code(status_code, status_mapping)
print(f"Status code {status_code} corresponds to: {status_string}")


Status code 2 corresponds to: NO-NEED
