In [None]:
# Parallel folder scan with partial (incremental) scanning: only date folders not already in the Parquet log are scanned.

import sys
import os
import concurrent.futures
sys.path.append(os.path.abspath('..'))
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from utlis.sync_utlis.sync_df_utlis import find_calib_file
from utlis.scan_engine_utlis.scan_engine_utlis import (
    read_failed_paths,
    match_date_pattern,
    assign_status_codes
)
from status_fields_config import STATUS_FIELDS_CONFIG


# Function to scan an individual folder (for parallel processing)
def scan_folder(folder_name: str, base_folder: str, failed_paths, config) -> dict:
    """Scan one date folder and collect its calibration files and rec folders.

    The folder is listed once, in sorted order, so the result is deterministic
    (``os.listdir`` returns entries in arbitrary order).

    Args:
        folder_name: Name of the date folder, relative to ``base_folder``.
        base_folder: Directory that contains ``folder_name``.
        failed_paths: Passed through unchanged to ``assign_status_codes``.
        config: Passed through unchanged to ``assign_status_codes``.

    Returns:
        A dict with three keys:
        ``'date_folder'``: ``folder_name`` as given.
        ``'calib_files'``: sorted names of entries directly inside the folder
        whose name starts with ``"calib"``.
        ``'rec_files_data'``: one record per subdirectory whose name starts
        with a digit, as returned by ``assign_status_codes`` with an added
        ``'rec_file'`` key holding the subdirectory name.

    Raises:
        OSError: If the folder cannot be listed (e.g. it does not exist).
    """
    folder_path = os.path.join(base_folder, folder_name)

    # One listing serves both passes; sorting pins the output order.
    entries = sorted(os.listdir(folder_path))

    # Any entry (file or directory) whose name starts with 'calib' is recorded.
    calib_files = [name for name in entries if name.startswith("calib")]

    rec_files_data = []
    for subfolder_name in entries:
        # Cheap name test first so the isdir() stat only runs on candidates.
        if not subfolder_name[0].isdigit():
            continue
        subfolder_path = os.path.join(folder_path, subfolder_name)
        if not os.path.isdir(subfolder_path):
            continue

        # Calibration file lookup for this rec folder (project helper).
        calib_file = find_calib_file(subfolder_path)

        # Status fields are produced by the project helper from the config.
        rec_file_data = assign_status_codes(
            folder_name, subfolder_path, calib_file, failed_paths, config
        )
        rec_file_data['rec_file'] = subfolder_name
        rec_files_data.append(rec_file_data)

    return {
        'date_folder': folder_name,
        'calib_files': calib_files,
        'rec_files_data': rec_files_data,
    }

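# Earlier parallel version of log_folder_to_parquet with partial scan, kept commented out for reference.
# It reads the Parquet log unconditionally, so it requires the log file to already exist.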
# def log_folder_to_parquet(base_folder, parquet_file, failed_paths_file, config):
#     # Read manually inputted failed paths
#     failed_paths = read_failed_paths(failed_paths_file)

#     # Read the existing log to get already processed date folders
#     existing_df = pq.read_table(parquet_file).to_pandas()
#     logged_folders = existing_df['date_folder'].unique() if not existing_df.empty else []

#     # Get the list of current date folders that match the date pattern and are not logged yet
#     date_folders = [
#         f for f in os.listdir(base_folder) 
#         if os.path.isdir(os.path.join(base_folder, f)) and match_date_pattern(f) 
#         and f not in logged_folders  # Only include new folders
#     ]
    
#     # If there are no new folders, print a message and return
#     if not date_folders:
#         print("No new folders to scan.")
#         return

#     # Use ThreadPoolExecutor for parallel folder scanning
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Run scan_folder in parallel for each new date folder
#         log_data = list(executor.map(scan_folder, date_folders, [base_folder] * len(date_folders), [failed_paths] * len(date_folders), [config] * len(date_folders)))

#     # Convert the results into a DataFrame
#     df = pd.json_normalize(log_data, 'rec_files_data', ['date_folder', 'calib_files'])

#     # Dynamically ensure all relevant columns are strings based on config
#     status_columns = list(config.keys())
#     df[status_columns] = df[status_columns].astype(str)

#     # Create pyarrow Table and save as Parquet
#     table = pa.Table.from_pandas(df)
#     pq.write_table(table, parquet_file)

def log_folder_to_parquet(base_folder, parquet_file, failed_paths_file, config):
    """Incrementally scan date folders and append the results to a Parquet log.

    Date folders already present in the ``date_folder`` column of the existing
    log are skipped. Newly scanned rows are appended to the existing rows
    before the file is rewritten, so earlier entries are preserved.

    Args:
        base_folder: Directory whose immediate subdirectories are candidate
            date folders (filtered with ``match_date_pattern``).
        parquet_file: Path of the Parquet log to read (if present) and write.
        failed_paths_file: Path handed to ``read_failed_paths``.
        config: Mapping whose keys name the status columns; also forwarded to
            ``scan_folder``.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Manually maintained list of failed paths (project helper).
    failed_paths = read_failed_paths(failed_paths_file)

    existing_df = None
    logged_folders = set()

    if os.path.exists(parquet_file):
        # Load the current log so already-processed date folders can be skipped
        # and their rows carried over into the rewritten file.
        existing_df = pq.read_table(parquet_file).to_pandas()
        logged_folders = set(existing_df['date_folder'].unique())
    else:
        print("No existing Parquet file found. Running full scan.")

    # Candidate date folders: directories whose name matches the date pattern.
    date_folders = [
        f for f in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, f)) and match_date_pattern(f)
    ]

    # Keep only folders that are not in the log yet (set lookup is O(1)).
    new_folders = [f for f in date_folders if f not in logged_folders]

    if not new_folders:
        print("No new folders to scan.")
        return

    # Scanning is filesystem-bound, so threads overlap the I/O waits.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        log_data = list(executor.map(
            lambda folder: scan_folder(folder, base_folder, failed_paths, config),
            new_folders,
        ))

    # One row per rec folder, with the date-folder-level fields repeated.
    df = pd.json_normalize(log_data, 'rec_files_data', ['date_folder', 'calib_files'])

    if df.empty:
        # No rec folders in any new date folder: there are no status columns
        # to cast and nothing to append, so the existing log is left as is.
        print("No recording folders found in the new date folders; log left unchanged.")
        return

    # Status columns are stored as strings, driven by the config keys.
    status_columns = list(config.keys())
    df[status_columns] = df[status_columns].astype(str)

    # Append to the previous log instead of replacing it; otherwise every
    # incremental run would drop all previously logged folders.
    if existing_df is not None and not existing_df.empty:
        df = pd.concat([existing_df, df], ignore_index=True)

    table = pa.Table.from_pandas(df)
    pq.write_table(table, parquet_file)
    print("all scannning done")






if __name__ == "__main__":
    # Script entry point: scan the recording tree and update the Parquet log.
    base_folder = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ"  # Replace with your base folder
    save_path = os.path.join(base_folder, 'paret')
    failed_paths_file = '/hpc/group/tdunn/Bryan_Rigs/BigOpenField/24summ/mir_bundle_run/synced_folders/240914_failed_sum_test.txt'  # File containing failed paths

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)

    parquet_file = os.path.join(save_path, "folder_log_encoded_numb_paralle_test_3.parquet")  # Output Parquet file

    # Scan in parallel; date folders already in the log are skipped.
    log_folder_to_parquet(base_folder, parquet_file, failed_paths_file, STATUS_FIELDS_CONFIG)

