In [19]:
import os
import pandas as pd
import random
import time
from tqdm import tqdm

def merge_random_sampled_csvs(input_folder: str, percent_sites: float = 100.0, percent_pixels: float = 1.0, output_folder: str = ".", min_age_minutes: int = 10):
    """
    Merges a subset of CSV files and a subset of rows from each file into a single CSV.
    The output file is named after the input folder and includes a progress bar.
    Returns a list of skipped files due to errors.

    Parameters:
        input_folder (str): Path to the folder containing CSV files.
        percent_sites (float): Percentage of files to randomly select (default: 100%).
        percent_pixels (float): Percentage of rows to randomly select from each file (default: 1%).
        output_folder (str): Path to save the merged output CSV.
        min_age_minutes (int): Minimum age (in minutes) a file must be before processing (default: 10).

    Returns:
        list: List of files that were skipped due to errors.
    """
    folder_name = os.path.basename(os.path.normpath(input_folder))
    output_file = os.path.join(output_folder, f"{folder_name}_summary.csv")

    # Skip processing if summary already exists
    if os.path.exists(output_file):
        print(f"✅ Skipping {folder_name}: Summary file already exists.")
        return []

    # Get current time and filter out recently updated files
    current_time = time.time()
    csv_files = [
        os.path.join(input_folder, f) for f in os.listdir(input_folder)
        if f.endswith('.csv') and (current_time - os.path.getmtime(os.path.join(input_folder, f)) > min_age_minutes * 60)
    ]

    if not csv_files:
        print(f"Skipping {folder_name}: No CSV files found or all files are too new (<{min_age_minutes} min).")
        return []

    num_files = max(1, int(len(csv_files) * (percent_sites / 100.0)))
    num_files = min(num_files, len(csv_files))  # Ensure we don't select more than available
    sampled_files = random.sample(csv_files, num_files)

    merged_data = []
    skipped_files = []

    for file in tqdm(sampled_files, desc=f"Processing {folder_name}", unit="file"):
        try:
            df = pd.read_csv(file, low_memory=False)
            num_rows = max(1, int(len(df) * (percent_pixels / 100.0)))
            num_rows = min(num_rows, len(df))  # Ensure we don't sample more rows than available
            sampled_rows = df.sample(n=num_rows, random_state=42)
            merged_data.append(sampled_rows)
        except Exception as e:
            skipped_files.append((file, str(e)))

    if merged_data:
        final_df = pd.concat(merged_data, ignore_index=True)
        try:
            final_df.to_csv(output_file, index=False)
            print(f"✅ Merged file saved to {output_file}")
        except OSError as e:
            print(f"⚠️ Remote I/O error while saving {output_file}: {e}")
            skipped_files.append((output_file, str(e)))
    else:
        print(f"No data to merge in {folder_name}")

    return skipped_files


def merge_all_folders_with_resume_and_retry(parent_directory: str, percent_sites: float = 100.0, percent_pixels: float = 1.0, summary_output_folder: str = ".", max_retries: int = 3, min_age_minutes: int = 10):
    """
    Applies the merge_random_sampled_csvs function to all subfolders within a parent directory,
    excluding the 'summary' and 'Uncategorized' folders. If a folder fails, it will retry up to max_retries times.

    Parameters:
        parent_directory (str): Path to the parent directory containing multiple folders with CSV files.
        percent_sites (float): Percentage of files to randomly select (default: 100%).
        percent_pixels (float): Percentage of rows to randomly select from each file (default: 1%).
        summary_output_folder (str): Path to save all summary CSV outputs.
        max_retries (int): Number of times to retry failed folders (default: 3).
        min_age_minutes (int): Minimum file age (in minutes) before processing (default: 10).

    Returns:
        dict: A dictionary mapping folder names to lists of skipped files.
    """
    skipped_files_report = {}
    excluded_folders = {"summary", "uncategorized"}
    
    os.makedirs(summary_output_folder, exist_ok=True)

    for attempt in range(max_retries + 1):
        if attempt > 0:
            print(f"\n🔄 Retrying failed folders (Attempt {attempt}/{max_retries})...\n")

        failed_folders = skipped_files_report.copy()
        skipped_files_report = {}

        folders_to_process = failed_folders.keys() if attempt > 0 else os.listdir(parent_directory)

        for folder in sorted(folders_to_process):
            folder_path = os.path.join(parent_directory, folder)
            output_file = os.path.join(summary_output_folder, f"{folder}_summary.csv")

            if os.path.isdir(folder_path) and folder.lower() not in excluded_folders:
                if os.path.exists(output_file):
                    print(f"✅ Skipping {folder}: Already processed.")
                    continue

                print(f"Processing folder: {folder}")

                try:
                    skipped_files = merge_random_sampled_csvs(folder_path, percent_sites, percent_pixels, summary_output_folder, min_age_minutes)
                    if skipped_files:
                        skipped_files_report[folder] = skipped_files
                except OSError as e:
                    print(f"⚠️ Remote I/O error on {folder}: {e}")
                    skipped_files_report[folder] = [("ERROR", str(e))]

        if not skipped_files_report:
            print("✅ All folders processed successfully.")
            break

        time.sleep(5)

    if skipped_files_report:
        print("\n❌ Some folders still failed after retries:", skipped_files_report)

    return skipped_files_report





In [20]:

skipped_files_report = merge_all_folders_with_resume_and_retry(
    parent_directory="home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked",
    percent_sites=100,
    percent_pixels=100,
    summary_output_folder="home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary",
    max_retries=3,
    min_age_minutes=10  # Ensure files are at least 10 minutes old
)

# Print final skipped files
for folder, skipped_files in skipped_files_report.items():
    print(f"\n❌ Still failed after retries: {folder}")
    for file, error in skipped_files:
        print(f" - {file}: {error}")


Processing folder: .ipynb_checkpoints
Skipping .ipynb_checkpoints: No CSV files found or all files are too new (<10 min).
✅ Skipping Landsat_5_TM: Already processed.
✅ Skipping Landsat_7_ETMplus: Already processed.
✅ Skipping Landsat_8_OLI: Already processed.
Processing folder: Landsat_9_OLI-2


Processing Landsat_9_OLI-2: 100%|██████████| 12/12 [00:07<00:00,  1.50file/s]


⚠️ Remote I/O error while saving home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/Landsat_9_OLI-2_summary.csv: [Errno 121] Remote I/O error: 'home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/Landsat_9_OLI-2_summary.csv'
Processing folder: MicaSense


Processing MicaSense: 100%|██████████| 12/12 [00:10<00:00,  1.09file/s]


✅ Merged file saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_summary.csv
Processing folder: MicaSense_to_match_OLI


Processing MicaSense_to_match_OLI: 100%|██████████| 13/13 [00:14<00:00,  1.08s/file]


✅ Merged file saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_OLI_summary.csv
Processing folder: MicaSense_to_match_TM


Processing MicaSense_to_match_TM: 100%|██████████| 13/13 [00:12<00:00,  1.02file/s]


⚠️ Remote I/O error while saving home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_TM_summary.csv: [Errno 121] Remote I/O error: 'home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_TM_summary.csv'
✅ Skipping corrected: Already processed.
Processing folder: original


Processing original: 100%|██████████| 2/2 [00:03<00:00,  1.94s/file]


✅ Merged file saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/original_summary.csv

🔄 Retrying failed folders (Attempt 1/3)...

Processing folder: Landsat_9_OLI-2


Processing Landsat_9_OLI-2: 100%|██████████| 12/12 [00:10<00:00,  1.11file/s]


✅ Merged file saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/Landsat_9_OLI-2_summary.csv
✅ Skipping MicaSense_to_match_OLI: Already processed.
Processing folder: MicaSense_to_match_TM


Processing MicaSense_to_match_TM: 100%|██████████| 13/13 [00:06<00:00,  2.03file/s]


⚠️ Remote I/O error while saving home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_TM_summary.csv: [Errno 121] Remote I/O error: 'home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_TM_summary.csv'
✅ Skipping original: Already processed.

🔄 Retrying failed folders (Attempt 2/3)...

Processing folder: MicaSense_to_match_TM


Processing MicaSense_to_match_TM: 100%|██████████| 13/13 [00:10<00:00,  1.21file/s]


✅ Merged file saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/masked/summary/MicaSense_to_match_TM_summary.csv
✅ All folders processed successfully.


In [None]:
skipped_files_report = merge_all_folders_with_resume_and_retry(
    parent_directory="home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/unmasked",
    percent_sites=100,
    percent_pixels=100,
    summary_output_folder="home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/unmasked/summary",
    max_retries=3,
    min_age_minutes=10  # Ensure files are at least 10 minutes old
)

# Print final skipped files
for folder, skipped_files in skipped_files_report.items():
    print(f"\n❌ Still failed after retries: {folder}")
    for file, error in skipped_files:
        print(f" - {file}: {error}")

Processing folder: Landsat_5_TM


Processing Landsat_5_TM:   0%|          | 0/1 [00:00<?, ?file/s]