In [1]:
import sys
import os
sys.path.append(os.path.abspath('../..'))
import datetime
import concurrent.futures
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import threading


from utlis.sync_utlis.sync_df_utlis import find_calib_file
from utlis.scan_engine_utlis.scan_engine_utlis import (
    read_failed_paths,
    match_date_pattern,
    assign_status_codes,
)
from scan_engine.status_fields_config_oct3v1 import STATUS_FIELDS_CONFIG

# Import the scan-log helpers from utlis/scan_engine_utlis/scan_log_utlis.py
from utlis.scan_engine_utlis.scan_log_utlis import (
    load_scan_log,
    save_scan_log,
    clean_scan_log,
    update_scan_log,
    get_folders_to_scan
)

def scan_folder(folder_name, base_folder, failed_paths, config, rec_files_to_scan):
    """Scan one date folder and build status records for the requested rec folders.

    Args:
        folder_name: Name of the date folder, relative to ``base_folder``.
        base_folder: Root directory that contains the date folders.
        failed_paths: Forwarded unchanged to ``assign_status_codes``.
        config: Status-field configuration, forwarded to ``assign_status_codes``.
        rec_files_to_scan: Names of the subfolders of the date folder to inspect.
            Only names that start with a digit and exist as directories are
            processed; everything else (including empty names) is skipped.

    Returns:
        A dict with three keys:
        ``'date_folder'`` (the given ``folder_name``),
        ``'calib_files'`` (entries of the date folder whose names start with
        ``"calib"``), and
        ``'rec_files_data'`` (one record per processed rec folder, as returned
        by ``assign_status_codes`` plus ``'rec_file'`` and ``'scan_time'``).

    Raises:
        OSError: If the date folder cannot be listed.
    """
    folder_path = os.path.join(base_folder, folder_name)

    # Calibration files live directly under the date folder.
    calib_files = [
        file_name
        for file_name in os.listdir(folder_path)
        if file_name.startswith("calib")
    ]

    rec_files_data = []
    for subfolder_name in rec_files_to_scan:
        # Slice instead of index: an empty name yields '' (not a digit) rather
        # than raising IndexError. os.path.join(folder, '') is still a
        # directory, so the isdir check alone does not protect against it.
        if not subfolder_name[:1].isdigit():
            continue

        subfolder_path = os.path.join(folder_path, subfolder_name)
        if not os.path.isdir(subfolder_path):
            continue

        calib_file = find_calib_file(subfolder_path)

        # Status fields are derived by the helper according to `config`.
        rec_file_data = assign_status_codes(
            folder_name, subfolder_path, calib_file, failed_paths, config
        )
        rec_file_data['rec_file'] = subfolder_name
        # Local wall-clock time of this scan, ISO-8601 formatted.
        rec_file_data['scan_time'] = datetime.datetime.now().isoformat()

        rec_files_data.append(rec_file_data)

    return {
        'date_folder': folder_name,
        'calib_files': calib_files,
        'rec_files_data': rec_files_data,
    }

def log_folder_to_parquet_sep(base_folder, failed_paths_file, config, force_rescan_rec_files=None, rescan_threshold_days=7):
    """Scan the folders that are due and write one Parquet log per rec folder.

    The scan log at ``<base_folder>/paret/scan_log.csv`` is loaded, the set of
    folders to scan is obtained from ``get_folders_to_scan``, and each date
    folder is scanned by ``scan_folder`` on a thread pool. For every rec folder
    returned, a single-row ``folder_log.parquet`` is written inside
    ``<base_folder>/<date_folder>/<rec_file>/`` and the scan log is updated.
    Finally the scan log is cleaned and saved back to disk.

    Args:
        base_folder: Root directory containing the date folders.
        failed_paths_file: Path handed to ``read_failed_paths``; if falsy, an
            empty set of failed paths is used instead.
        config: Status-field configuration; its keys name the columns that are
            cast to ``str`` before writing.
        force_rescan_rec_files: Optional iterable forwarded (as a set) to
            ``get_folders_to_scan``; ``None`` means no forced rescans.
        rescan_threshold_days: Threshold forwarded to ``get_folders_to_scan``.

    Returns:
        None. When there is nothing to scan, a message is printed and the
        function returns without touching the scan log on disk.
    """
    scan_log_path = os.path.join(base_folder, 'paret', 'scan_log.csv')
    scan_log_df = load_scan_log(scan_log_path)

    # Manually maintained list of failed paths (optional).
    failed_paths = read_failed_paths(failed_paths_file) if failed_paths_file else set()

    forced_rescans = set() if force_rescan_rec_files is None else set(force_rescan_rec_files)

    folders_to_scan = get_folders_to_scan(
        base_folder, scan_log_df, rescan_threshold_days, forced_rescans
    )
    if not folders_to_scan:
        print("No new or modified folders to scan.")
        return

    # One task per date folder; results are handled as soon as each finishes.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(scan_folder, date_folder, base_folder, failed_paths, config, rec_files)
            for date_folder, rec_files in folders_to_scan.items()
        ]

        for finished in concurrent.futures.as_completed(pending):
            folder_log = finished.result()
            date_folder = folder_log['date_folder']

            # Normalise to a list of strings, tolerating a missing/empty entry.
            calib_files = [str(name) for name in (folder_log.get('calib_files', []) or [])]

            for rec_file_data in folder_log['rec_files_data']:
                rec_file = rec_file_data['rec_file']
                rec_dir = os.path.join(base_folder, date_folder, rec_file)
                subfolder_save_path = os.path.join(rec_dir, "folder_log.parquet")

                # The rec folder must exist before the Parquet file is written.
                os.makedirs(rec_dir, exist_ok=True)

                rec_file_data.update(date_folder=date_folder, calib_files=calib_files)

                # Every column named in the config is stored as a string.
                status_columns = list(config.keys())
                df = pd.DataFrame([rec_file_data])
                df[status_columns] = df[status_columns].astype(str)

                pq.write_table(pa.Table.from_pandas(df), subfolder_save_path)

                print(f"Log for {rec_file} saved at {subfolder_save_path}")

                scan_log_df = update_scan_log(scan_log_df, date_folder, rec_file)

    # Clean the scan log, then persist it.
    save_scan_log(clean_scan_log(scan_log_df, base_folder), scan_log_path)

if __name__ == "__main__":
    # Root folder of the recordings to scan (replace with your own base folder).
    base_folder = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1"
    # Text file listing the paths whose sync failed.
    failed_paths_file = '/hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/sync_failed.txt'

    # Entries to rescan regardless of the scan log; presumably
    # (date_folder, rec_file) pairs such as ('2023-10-01', '001') - confirm
    # against get_folders_to_scan.
    force_rescan_rec_files = []

    # Threshold in days. The function default is 7; this near-zero value
    # (0.0001 days is about 8.6 seconds) is a deliberate override, per the
    # original author's note, so that everything gets rescanned.
    rescan_threshold_days = 0.0001

    log_folder_to_parquet_sep(
        base_folder=base_folder,
        failed_paths_file=failed_paths_file,
        config=STATUS_FIELDS_CONFIG,
        force_rescan_rec_files=force_rescan_rec_files,
        rescan_threshold_days=rescan_threshold_days,
    )


Log for 2social_mini_0605pmc_single_15_38 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_30/2social_mini_0605pmc_single_15_38/folder_log.parquet
Log for 2social_mini_0605pmc_single_15_18 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_30/2social_mini_0605pmc_single_15_18/folder_log.parquet
Log for 2social_mini_0605pmc_single_15_00 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_30/2social_mini_0605pmc_single_15_00/folder_log.parquet
Log for 20240819V1r1_13_41 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_17/20240819V1r1_13_41/folder_log.parquet
Log for 20240819V1r1_14_40 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_17/20240819V1r1_14_40/folder_log.parquet
Log for 20240819V1r1_14_25 saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_17/20240819V1r1_14_25/folder_log.parquet
Log for 24Anshu_f_paint_2mice saved at /hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1/2024_10_03_micecolor_test

In [2]:
# Make the directory two levels above the working directory importable so the
# project package `utlis` can be found. (Appends again on every re-run of the cell.)
sys.path.append(os.path.abspath('../..'))
from utlis.scan_engine_utlis.scan_engine_utlis import read_all_parquet_files
# Root folder of the recordings whose logs are read back.
base_folder = "/hpc/group/tdunn/Bryan_Rigs/BigOpenField/Oct3V1"
# NOTE(review): presumably collects the per-recording folder_log.parquet files
# under base_folder into one table - confirm against read_all_parquet_files.
all_df = read_all_parquet_files(base_folder)

In [3]:
import pyarrow.compute as pc
# Work on the table loaded in the previous cell (a second name for the same
# object, not a copy).
table = all_df  # combined_df

# Keep only the rows whose 'dannce' column equals the string '1'.
for_dannce_vis = table.filter(pc.equal(table['dannce'], '1')) #filter_mask

# Convert the filtered table to pandas and print it for inspection.
print(for_dannce_vis.to_pandas())  # Display the filtered data in a familiar pandas-like format


   mir_generate_param sync dropf_handle com social miniscope test  \
0                   1    1            0   1      0         1    0   
1                   1    1            0   1      0         1    0   
2                   1    1            0   1      0         1    0   
3                   1    1            0   1      0         1    0   
4                   1    1            0   1      0         1    0   
5                   1    1            0   1      0         1    0   
6                   1    1            0   1      0         1    0   
7                   1    1            0   1      0         1    0   
8                   1    1            0   1      0         1    0   
9                   1    1            1   1      0         1    0   
10                  1    1            0   1      0         1    0   
11                  1    1            0   1      0         1    0   
12                  1    1            0   1      0         1    0   
13                  1    1        

In [None]:
import pyarrow as pa
import os
from datetime import datetime
import re
import csv

# Requires `for_dannce_vis` (the filtered PyArrow Table) and `base_folder`
# to have been defined by the earlier cells.

# One dict per output CSV row; filled by the loop over `rows` below.
data_list = []

# Convert the PyArrow Table to a list of dictionaries (one per table row).
rows = for_dannce_vis.to_pylist()

def extract_animalid_and_time(rec_file):
    """Split a recording folder name into an animal id and an ``HH:MM`` time.

    A trailing ``_test`` component (any letter case) is dropped first. The time
    is then looked for at the end of the remaining name, in this order:

    1. a final ``_``-separated component of exactly four digits (``HHMM``);
    2. two final components of one or two digits each (``HH_MM``), which are
       zero-padded;
    3. four digits glued to the end of the final component
       (e.g. ``mouse1341``).

    The digits are not range-checked, so any four-digit suffix is treated as a
    time.

    Args:
        rec_file: Recording folder name, e.g. ``"20240819V1r1_13_41"``.

    Returns:
        ``(animalid, time_formatted)``. ``animalid`` is the name with the
        ``test`` suffix and the time removed; ``time_formatted`` is ``"HH:MM"``
        or ``''`` when no time could be recognised.
    """
    parts = rec_file.split('_')
    # Remove 'test' if it is the last part
    if parts[-1].lower() == 'test':
        parts.pop()
    if not parts:
        # The whole name was just 'test'.
        return '', ''

    last_part = parts[-1]

    # 'HHMM' as its own component. fullmatch (unlike '^...$' with match)
    # does not accept a trailing newline.
    if re.fullmatch(r'\d{4}', last_part):
        parts.pop()
        return '_'.join(parts), f"{last_part[:2]}:{last_part[2:]}"

    # 'HH_MM' spread over the last two components.
    if (
        len(parts) >= 2
        and re.fullmatch(r'\d{1,2}', last_part)
        and re.fullmatch(r'\d{1,2}', parts[-2])
    ):
        minute = parts.pop()
        hour = parts.pop()
        return '_'.join(parts), f"{hour.zfill(2)}:{minute.zfill(2)}"

    # 'HHMM' glued to the end of the last component. This looks at the name
    # *after* the 'test' suffix was stripped, and insists on four real digits
    # so that short names such as '123' are not turned into a bogus '12:3'.
    tail = last_part[-4:]
    if len(tail) == 4 and tail.isdigit():
        parts[-1] = last_part[:-4]
        return '_'.join(parts), f"{tail[:2]}:{tail[2:]}"

    return '_'.join(parts), ''

# Build one metadata record per filtered recording.
for row in rows:
    date_folder, rec_file = row['date_folder'], row['rec_file']

    # Condition flags; a missing column counts as '0'.
    social = row.get('social', '0')
    miniscope = row.get('miniscope', '0')
    test = row.get('test', '0')
    after_oxytocin = row.get('after_oxytocin', '0')
    before_oxytocin = row.get('before_oxytocin', '0')

    # Folder of this recording; stored in the CSV as the prediction path.
    Prediction_path = f"{base_folder}/{date_folder}/{rec_file}"

    # Animal id and HH:MM time parsed from the recording folder name.
    AnimalID, time_formatted = extract_animalid_and_time(rec_file)

    # No time in the name: fall back to the modification time of the
    # recording's 'videos' folder, if that folder exists.
    if not time_formatted:
        videos_path = f"{Prediction_path}/videos"
        if os.path.exists(videos_path):
            time_formatted = datetime.fromtimestamp(
                os.path.getmtime(videos_path)
            ).strftime('%H:%M')

    date = date_folder

    # Sex is hard-coded for every row.
    Sex = 'male'

    def is_true(value):
        """Return True when the flag, as text with whitespace stripped, is '1'."""
        return str(value).strip() == '1'

    # Labels appear in this fixed order in the Condition column.
    flag_values = (
        ('test', test),
        ('miniscope', miniscope),
        ('social', social),
        ('after_oxytocin', after_oxytocin),
        ('before_oxytocin', before_oxytocin),
    )
    conditions = [label for label, value in flag_values if is_true(value)]

    # Comma-joined labels, or 'baseline' when no flag is set.
    Condition = ','.join(conditions) or 'baseline'

    data_list.append({
        'AnimalID': AnimalID,
        'Sex': Sex,
        'Condition': Condition,
        'date': date,
        'time': time_formatted,
        'Prediction_path': Prediction_path
    })

# Save to CSV: one row per entry of data_list, columns in the order below.
csv_columns = ['AnimalID', 'Sex', 'Condition', 'date', 'time', 'Prediction_path']
# The output file is named after the last component of base_folder.
csv_file = os.path.join('/hpc/group/tdunn/lq53/dappy_24_nov/mir_modif_dappy/meta_datas', os.path.basename(base_folder) + '.csv')

try:
    # newline='' is what the csv module expects so it controls line endings itself.
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(data_list)
except OSError as err:
    # Best-effort write: report the target and the reason (missing directory,
    # permissions, ...) instead of a bare "I/O error" so the failure is actionable.
    print(f"I/O error occurred while writing the CSV file {csv_file}: {err}")
else:
    print(f"CSV file saved to {csv_file}")
