In [3]:
import sys
import os
sys.path.append(os.path.abspath('../'))
import datetime
import concurrent.futures
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import threading
from utlis.sync_utlis.sync_df_utlis import find_calib_file
from utlis.scan_engine_utlis.scan_engine_utlis import (
    read_failed_paths,
    match_date_pattern,
    assign_status_codes,
)
from scan_engine_mini.mini_status_fields_config import STATUS_FIELDS_CONFIG

# Import scan-log helpers from utlis/scan_engine_utlis/scan_log_utlis.py
from utlis.scan_engine_utlis.scan_log_utlis import (
    load_scan_log,
    save_scan_log,
    clean_scan_log,
    update_scan_log,
    get_folders_to_scan
)

def scan_folder(folder_name, base_folder, failed_paths, config, rec_files_to_scan):
    """Scan one date folder and collect status data for the requested rec folders.

    Args:
        folder_name: Name of the date folder, relative to ``base_folder``.
        base_folder: Root directory that contains the date folders.
        failed_paths: Forwarded unchanged to ``assign_status_codes``.
        config: Status-field configuration, forwarded unchanged to
            ``assign_status_codes``.
        rec_files_to_scan: Iterable of subfolder names to examine inside the
            date folder. Names that are empty, do not start with a digit, or
            are not existing directories are skipped.

    Returns:
        A dict with three keys:
        ``'date_folder'`` (the given ``folder_name``),
        ``'calib_files'`` (names directly inside the date folder that start
        with ``"calib"``), and
        ``'rec_files_data'`` (one entry per scanned rec folder, as returned by
        ``assign_status_codes`` with ``'rec_file'`` and ``'scan_time'`` added).

    Raises:
        OSError: If the date folder cannot be listed (raised by ``os.listdir``).
    """
    folder_path = os.path.join(base_folder, folder_name)

    # Calibration files live at the date-folder level and start with 'calib'.
    calib_files = [
        file_name
        for file_name in os.listdir(folder_path)
        if file_name.startswith("calib")
    ]

    rec_files_data = []  # One status record per scanned rec folder
    for subfolder_name in rec_files_to_scan:
        # Rec folders start with a digit. Slicing instead of indexing keeps an
        # empty name from raising IndexError, and running this cheap test first
        # avoids a filesystem call for names that can never match.
        if not subfolder_name[:1].isdigit():
            continue

        subfolder_path = os.path.join(folder_path, subfolder_name)
        if not os.path.isdir(subfolder_path):
            continue

        # Find the calibration file that belongs to this rec folder
        calib_file = find_calib_file(subfolder_path)

        # Assign status codes dynamically based on the config
        rec_file_data = assign_status_codes(
            folder_name, subfolder_path, calib_file, failed_paths, config
        )

        rec_file_data['rec_file'] = subfolder_name
        # Timestamp of this scan (local time, ISO 8601), stored with the record
        rec_file_data['scan_time'] = datetime.datetime.now().isoformat()

        rec_files_data.append(rec_file_data)

    return {
        'date_folder': folder_name,
        'calib_files': calib_files,
        'rec_files_data': rec_files_data,
    }

def log_folder_to_parquet_sep(base_folder, failed_paths_file, config, force_rescan_rec_files=None, rescan_threshold_days=7):
    """Scan pending folders and write one ``folder_log.parquet`` per rec folder.

    The scan log is loaded, ``get_folders_to_scan`` decides which
    date folders / rec folders need scanning, each date folder is scanned in a
    worker thread via ``scan_folder``, and every resulting record is written to
    ``<base_folder>/<date_folder>/<rec_file>/folder_log.parquet``. The scan log
    is updated per record, then cleaned and saved once at the end.

    Args:
        base_folder: Root directory that contains the date folders.
        failed_paths_file: Path handed to ``read_failed_paths``; when falsy
            (e.g. ``""`` or ``None``) an empty set is used instead.
        config: Mapping whose keys name the status columns; those columns are
            cast to ``str`` before writing. Also forwarded to ``scan_folder``.
        force_rescan_rec_files: Optional iterable of hashable entries forwarded
            (as a set) to ``get_folders_to_scan``; ``None`` means no forced
            rescans. Presumably ``(date_folder, rec_file)`` tuples — confirm
            against ``get_folders_to_scan``.
        rescan_threshold_days: Forwarded to ``get_folders_to_scan``.

    Returns:
        None. When there is nothing to scan, the function prints a message and
        returns early without cleaning or saving the scan log.
    """
    # NOTE(review): 'paret' may be a typo of 'parquet' — left unchanged because
    # an existing scan log may already live at this path; confirm before renaming.
    scan_log_path = os.path.join(base_folder, 'paret', 'scan_log.csv')

    # Load or initialize the scan log
    scan_log_df = load_scan_log(scan_log_path)

    # Read manually inputted failed paths
    failed_paths = read_failed_paths(failed_paths_file) if failed_paths_file else set()

    if force_rescan_rec_files is None:
        force_rescan_rec_files = []
    force_rescan_rec_files_set = set(force_rescan_rec_files)

    # Determine folders to scan
    folders_to_scan = get_folders_to_scan(base_folder, scan_log_df, rescan_threshold_days, force_rescan_rec_files_set)

    if not folders_to_scan:
        print("No new or modified folders to scan.")
        return

    # The status columns depend only on the config, so compute them once
    # instead of once per rec record.
    status_columns = list(config.keys())

    # Scan the date folders in parallel; results are written from this thread
    # as they complete, so the scan log is only ever touched here.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                scan_folder,
                date_folder,
                base_folder,
                failed_paths,
                config,
                rec_files_to_scan,
            )
            for date_folder, rec_files_to_scan in folders_to_scan.items()
        ]

        for future in concurrent.futures.as_completed(futures):
            # result() re-raises any exception raised inside the worker
            folder_log = future.result()
            date_folder = folder_log['date_folder']

            # Ensure 'calib_files' is always a list of strings
            calib_files = [str(f) for f in (folder_log.get('calib_files') or [])]

            # Process and save each experiment's log separately
            for rec_file_data in folder_log['rec_files_data']:
                rec_file = rec_file_data['rec_file']
                subfolder_save_path = os.path.join(base_folder, date_folder, rec_file, "folder_log.parquet")

                # Ensure the experiment/rec_file folder exists
                os.makedirs(os.path.dirname(subfolder_save_path), exist_ok=True)

                # Add 'date_folder' and 'calib_files' to rec_file_data
                rec_file_data['date_folder'] = date_folder
                rec_file_data['calib_files'] = calib_files

                # One-row frame; status columns are stored as strings
                df = pd.DataFrame([rec_file_data])
                df[status_columns] = df[status_columns].astype(str)

                # Convert to an Arrow table and save the Parquet file
                table = pa.Table.from_pandas(df)
                pq.write_table(table, subfolder_save_path)

                print(f"Log for {rec_file} saved at {subfolder_save_path}")

                # Update the scan log
                scan_log_df = update_scan_log(scan_log_df, date_folder, rec_file)

    # Clean up the scan log
    scan_log_df = clean_scan_log(scan_log_df, base_folder)

    # Save the updated scan log
    save_scan_log(scan_log_df, scan_log_path)

if __name__ == "__main__":
    # Root directory that holds the date folders; replace with your own.
    base_folder = "/home/lq53/mir_data/24summ"

    # File containing failed paths (left empty here).
    failed_paths_file = ""

    # Entries to rescan regardless of the scan log, e.g.:
    #   ('2023-10-01', '001'),
    #   ('2023-10-02', '002'),
    force_rescan_rec_files = []

    rescan_threshold_days = 7

    log_folder_to_parquet_sep(
        base_folder,
        failed_paths_file,
        STATUS_FIELDS_CONFIG,
        force_rescan_rec_files=force_rescan_rec_files,
        rescan_threshold_days=rescan_threshold_days,
    )


No new or modified folders to scan.


In [4]:
from utlis.scan_engine_utlis.scan_engine_utlis import read_all_parquet_files_auto_exclude

# Author's note, kept verbatim below: on the cluster the plain
# read_all_parquet_files could probably be used; the calib folders are not
# available locally, so the 'calib_files' column is excluded when reading here.
"""actually can use read_all_parquet_files in cluster probably, i do not have calib folders locally so need to handle this hhh"""

# Example usage (base_folder is expected to be defined by an earlier cell):
# base_folder = "/path/to/your/base_folder"  # Replace with your base folder
combined_df = read_all_parquet_files_auto_exclude(
    base_folder,
    exclude_columns=['calib_files'],
)

# Show the combined DataFrame
print(combined_df)

   mir_generate_param sync z_adjusted                             rec_file  \
0                   1    1          1                         1686940_left   
1                   1    1          1                 1686941_left_right_2   
2                   1    1          1       1691486_left_right_habituation   
3                   1    1          1       1691485_left_hole_saline_10_35   
4                   1    1          1    1691485_no_hole_habituation_13_59   
5                   1    1          1                     1691485BMCFF1505   
6                   1    1          1                     1691485BMCFS1547   
7                   1    1          1          240605PMC1_right_hole_11_27   
8                   1    1          1                     1691485RMHBN1425   
9                   1    1          1                     1691485RMPBF1531   
10                  1    1          1                     1691485RMPBS1659   
11                  1    1          1                     169148

In [14]:
# import os
# import pyarrow.parquet as pq

# def print_parquet_schemas(base_folder):
#     for root, dirs, files in os.walk(base_folder):
#         for file in files:
#             if file.endswith('.parquet'):
#                 file_path = os.path.join(root, file)
#                 try:
#                     parquet_file = pq.ParquetFile(file_path)
#                     print(f"Schema for {file_path}:")
#                     print(parquet_file.schema_arrow)
#                     print("\n")
#                 except Exception as e:
#                     print(f"Error reading {file_path}: {e}")
# print_parquet_schemas(base_folder)

Schema for /home/lq53/mir_data/24summ/2024_08_26/20240730_PMCr2/folder_log.parquet:
mir_generate_param: string
sync: string
z_adjusted: string
rec_file: string
scan_time: string
date_folder: string
calib_files: list<element: string>
  child 0, element: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1107


Schema for /home/lq53/mir_data/24summ/2024_08_26/20240717_PMCr2/folder_log.parquet:
mir_generate_param: string
sync: string
z_adjusted: string
rec_file: string
scan_time: string
date_folder: string
calib_files: list<element: string>
  child 0, element: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1107


Schema for /home/lq53/mir_data/24summ/2024_08_26/20240717_PMCr1/folder_log.parquet:
mir_generate_param: string
sync: string
z_adjusted: string
rec_file: string
scan_time: string
date_folder: string
calib_files: list<element: string>
  child 0, element: string
-- schem

In [None]:
# 

In [14]:
# import os
# import pandas as pd
# import pyarrow.parquet as pq

# def read_all_parquet_files(base_folder):
#     """Read all Parquet files from the date folder structure and return a combined DataFrame."""
#     all_data = []  # List to store data from all Parquet files

#     # Loop through each date folder
#     for date_folder in os.listdir(base_folder):
#         date_folder_path = os.path.join(base_folder, date_folder)
        
#         # Check if it's a directory (date folder)
#         if os.path.isdir(date_folder_path):
#             # Loop through each subfolder (rec_file/experiment)
#             for rec_file_folder in os.listdir(date_folder_path):
#                 rec_file_folder_path = os.path.join(date_folder_path, rec_file_folder)
                
#                 # Check if it's a directory (rec_file folder)
#                 if os.path.isdir(rec_file_folder_path):
#                     # Look for Parquet file inside the rec_file folder
#                     parquet_file = os.path.join(rec_file_folder_path, "folder_log.parquet")
                    
#                     if os.path.exists(parquet_file):
#                         # Read the Parquet file into a DataFrame
#                         table = pq.read_table(parquet_file)
#                         df = table.to_pandas()
                        
#                         # Add the folder information (optional, for better tracking)
#                         df['date_folder'] = date_folder
#                         df['rec_file_folder'] = rec_file_folder
                        
#                         # Append to the list of data
#                         all_data.append(df)

#     # Concatenate all the DataFrames into one
#     if all_data:
#         combined_df = pd.concat(all_data, ignore_index=True)
#         return combined_df
#     else:
#         print("No Parquet files found.")
#         return pd.DataFrame()  # Return empty DataFrame if no files found

# # Example usage
# # base_folder = "/path/to/your/base_folder"  # Replace with your base folder
# combined_df = read_all_parquet_files(base_folder)

# # Display or use the combined DataFrame
# print(combined_df)


    mir_generate_param  sync  z_adjusted                             rec_file  \
0                    0     0           0                       20240730_PMCr2   
1                    0     0           0                       20240717_PMCr2   
2                    0     0           0                       20240717_PMCr1   
3                    1     0           0                         1686940_left   
4                    1     0           0                     1691485RMHBN1425   
5                    1     0           0                     1691485RMPBF1531   
6                    1     0           0                     1691485RMPBS1659   
7                    1     0           0                     1691485RMHBN1405   
8                    0     0           0                20240717_PMC_r1_11_50   
9                    0     0           0                20240717_PMC_r2_11_00   
10                   1     0           0                20240628_PMC_r1_11_43   
11                   1     0

In [1]:
# # # WARNING: running this cell deleted previously generated Parquet files, which was not intended. Do not run it again unless you really want to remove every Parquet file.

# # import os

# # def delete_parquet_files_in_date_folders(base_folder):
# #     """Delete Parquet files saved directly under date folders."""
# #     # Loop through each date folder
# #     for date_folder in os.listdir(base_folder):
# #         date_folder_path = os.path.join(base_folder, date_folder)
        
# #         # Check if it's a directory (date folder)
# #         if os.path.isdir(date_folder_path):
# #             # Find and delete all .parquet files directly inside this folder
# #             for file_name in os.listdir(date_folder_path):
# #                 if file_name.endswith(".parquet"):
# #                     file_path = os.path.join(date_folder_path, file_name)
# #                     os.remove(file_path)
# #                     print(f"Deleted: {file_path}")

# # # Usage
# # base_folder = "/home/lq53/mir_data/24summ"  # Replace with your base folder
# # delete_parquet_files_in_date_folders(base_folder)
# import os

# def delete_all_parquet_files(base_folder):
#     """Delete all Parquet files within the base folder and its subfolders."""
#     # Walk through the entire directory tree
#     for dirpath, dirnames, filenames in os.walk(base_folder):
#         # Loop through each file in the current directory
#         for file_name in filenames:
#             if file_name.endswith(".parquet"):
#                 file_path = os.path.join(dirpath, file_name)
#                 os.remove(file_path)
#                 print(f"Deleted: {file_path}")

# # Usage
# base_folder = "/home/lq53/mir_data/24summ"  # Replace with your base folder
# delete_all_parquet_files(base_folder)


Deleted: /home/lq53/mir_data/24summ/2024_08_26/20240730_PMCr2/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_26/20240717_PMCr2/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_26/20240717_PMCr1/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_06_26/1686940_left/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_07_15/1691485RMHBN1425/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_07_15/1691485RMPBF1531/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_07_15/1691485RMPBS1659/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_07_16/1691485RMHBN1405/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_16/20240717_PMC_r1_11_50/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_16/20240717_PMC_r2_11_00/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_08/20240628_PMC_r1_11_43/folder_log.parquet
Deleted: /home/lq53/mir_data/24summ/2024_08_08/20240702_PMC_r1_12_02/folder_log.pa