In [7]:
import os
import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Function to log folder structure into a Parquet file
def log_folder_to_parquet(base_folder, parquet_file):
    # Regex to match date format yyyy_mm_dd
    date_pattern = re.compile(r"\d{4}_\d{2}_\d{2}")
    
    # Data storage for DataFrame
    log_data = []
    
    # Traverse the base folder
    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)
        
        # Check if the folder name matches the date pattern
        if os.path.isdir(folder_path) and date_pattern.match(folder_name):
            calib_status = 'NO'
            rec_status = 'NO'
            calib_files = []
            rec_files = []
            
            # Check for calibration files starting with 'calib'
            for file_name in os.listdir(folder_path):
                if file_name.startswith("calib"):
                    calib_status = 'YES'
                    calib_files.append(file_name)
            # check for rec
                if file_name[0].isdigit():
                    rec_status = 'YES'
                    rec_files.append(file_name)
            
            # Append data to list for DataFrame creation
            log_data.append({
                'date_folder': folder_name,
                'calib_status': calib_status,
                'calib_files': ', '.join(calib_files),
                'folder_path': folder_path,
                'rec_status': rec_status,
                'rec_files': ', '.join(rec_files)
                
            })
    
    # Create DataFrame and save as Parquet file
    df = pd.DataFrame(log_data)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, parquet_file)

if __name__ == "__main__":
    base_folder = "/home/lq53/mir_data/24summ"  # Replace with your base folder
    save_path = os.path.join(base_folder, 'paret')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    parquet_file = os.path.join(save_path, "folder_log.parquet")   # Output Parquet file
    log_folder_to_parquet(base_folder, parquet_file)


In [8]:
import pandas as pd
import pyarrow.parquet as pq
import os

# Function to load the Parquet file
def load_parquet(parquet_file):
    # Read the Parquet file into a pandas DataFrame
    table = pq.read_table(parquet_file)
    df = table.to_pandas()
    return df

# Function to display the DataFrame in the notebook
def display_parquet(parquet_file):
    df = load_parquet(parquet_file)
    
    # Display the DataFrame
    display(df)

if __name__ == "__main__":
    base_folder = "/home/lq53/mir_data/24summ"  # Replace with your base folder
    parquet_file = os.path.join(base_folder, 'paret', 'folder_log.parquet')  # Path to your parquet file
    
    # Call function to display the Parquet file
    display_parquet(parquet_file)


Unnamed: 0,date_folder,calib_status,calib_files,folder_path,rec_status,rec_files
0,2024_08_26,YES,calib_before,/home/lq53/mir_data/24summ/2024_08_26,YES,"20240730_PMCr2, 20240717_PMCr2, 20240717_PMCr1"
1,2024_06_26,NO,,/home/lq53/mir_data/24summ/2024_06_26,YES,1686940_left
2,2024_07_15,NO,,/home/lq53/mir_data/24summ/2024_07_15,YES,"1691485RMHBN1425, 1691485RMPBF1531, 1691485RMP..."
3,2024_07_16,NO,,/home/lq53/mir_data/24summ/2024_07_16,YES,1691485RMHBN1405
4,2024_09_18,NO,,/home/lq53/mir_data/24summ/2024_09_18,NO,
5,2024_08_16,YES,"calib_before_3, calib_before_2, calib_before",/home/lq53/mir_data/24summ/2024_08_16,YES,"20240717_PMC_r1_11_50, 20240717_PMC_r2_11_00"
6,2024_08_08,YES,"calib_after, calib_before",/home/lq53/mir_data/24summ/2024_08_08,YES,"20240628_PMC_r1_11_43, 20240702_PMC_r1_12_02, ..."
7,2024_07_12,NO,,/home/lq53/mir_data/24summ/2024_07_12,YES,240605PMC1_right_hole_11_27
8,2024_07_19,NO,,/home/lq53/mir_data/24summ/2024_07_19,YES,"240605PMC_window2_right2holes_12_14, 240605PMC..."
9,2024_06_28,NO,,/home/lq53/mir_data/24summ/2024_06_28,YES,1686941_left_right_2
