In [5]:
import os
import pandas as pd
from collections import defaultdict

def generate_file_move_list(base_folder, destination_folder, category_keywords):
    """
    Generates a table of file paths to be moved and their destination paths based on predefined category keywords.

    Parameters:
    - base_folder (str): The path to the base directory containing the subdirectories to sort.
    - destination_folder (str): The path to the folder where sorted directories will be stored.
    - category_keywords (dict): A dictionary mapping category names to lists of keywords.
    
    Returns:
    - DataFrame: A Pandas DataFrame with columns ['Source Path', 'Destination Path'].
    """
    # Dictionary to store folders by category
    categorized_folders = defaultdict(list)
    
    # Identify subdirectories
    subdirectories = [
        d for d in os.listdir(base_folder)
        if os.path.isdir(os.path.join(base_folder, d)) and not d.startswith('.')
    ]
    
    # Categorize each folder
    move_list = []
    for subdir in subdirectories:
        subdir_path = os.path.join(base_folder, subdir)
        assigned_category = None
        
        for category, keywords in category_keywords.items():
            if any(keyword in subdir.lower() for keyword in keywords):
                assigned_category = category
                break
        
        # If no category is found, assign to "Uncategorized"
        if assigned_category is None:
            assigned_category = "Uncategorized"
        
        category_path = os.path.join(destination_folder, "sorted_files", assigned_category, subdir)
        move_list.append((subdir_path, category_path))
    
    # Create DataFrame
    df = pd.DataFrame(move_list, columns=['Source Path', 'Destination Path'])
    
    # Ensure sorted_files directory exists
    sorted_files_path = os.path.join(destination_folder, "sorted_files")
    os.makedirs(sorted_files_path, exist_ok=True)
    
    # Save DataFrame to CSV
    csv_path = os.path.join(sorted_files_path, "file_move_list.csv")
    df.to_csv(csv_path, index=False)
    print(f"✅ Move list saved to {csv_path}")
    
    return df

# Example usage:
# Define categories and associated keywords
category_keywords = {
    "Reflectance": ["reflectance", "brdf"],
    "Ancillary Data": ["ancillary", "config"],
    "Masking": ["mask", "masked"],
    "Topography": ["topo"],
}

# Generate file move list
df_move_list = generate_file_move_list("home/shared/earthlab/macrosystems/processed_flight_lines", "cross-sensor-cal", category_keywords)

# Display the move list
df_move_list


✅ Move list saved to cross-sensor-cal/sorted_files/file_move_list.csv


Unnamed: 0,Source Path,Destination Path
0,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Reflectance/NEON...
1,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Reflectance/NEON...
2,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/ne...
3,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/NI...
4,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/NI...
5,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/NI...
6,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/NI...
7,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/NI...
8,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/RM...
9,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/Uncategorized/WR...


In [4]:
validate_output_files('YELL_2023_07_B', debug=False)


🔍 Starting validation of output files in base folder: YELL_2023_07_B



Validating subdirectories:   0%|          | 0/12 [00:00<?, ?it/s]

❌ Subdirectory: NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance
   🚨 Missing Files:
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance__envi_masked
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance__envi_masked.aux.xml
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance__envi_masked.hdr
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance__envi_masked_spectral_data.csv
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance_masked
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance_masked.aux.xml
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance_masked.hdr
     - NEON_D12_YELL_DP1_L001-1_20230711_directional_reflectance_masked_spectral_data.csv



Validating subdirectories:  17%|█▋        | 2/12 [00:00<00:01,  7.30it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L001-1_20230719_directional_reflectance - All expected files are present and valid.



Validating subdirectories:  33%|███▎      | 4/12 [00:00<00:01,  5.32it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L007-1_20230718_directional_reflectance - All expected files are present and valid.

✅ Subdirectory: NEON_D12_YELL_DP1_L011-1_20230718_directional_reflectance - All expected files are present and valid.



Validating subdirectories:  50%|█████     | 6/12 [00:01<00:01,  5.32it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L013-1_20230718_directional_reflectance - All expected files are present and valid.

✅ Subdirectory: NEON_D12_YELL_DP1_L014-1_20230718_directional_reflectance - All expected files are present and valid.



Validating subdirectories:  67%|██████▋   | 8/12 [00:01<00:00,  5.42it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L024-1_20230718_directional_reflectance - All expected files are present and valid.

✅ Subdirectory: NEON_D12_YELL_DP1_L024-1_20230719_directional_reflectance - All expected files are present and valid.



Validating subdirectories:  83%|████████▎ | 10/12 [00:01<00:00,  5.52it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L025-1_20230719_directional_reflectance - All expected files are present and valid.

✅ Subdirectory: NEON_D12_YELL_DP1_L029-1_20230715_directional_reflectance - All expected files are present and valid.



Validating subdirectories:  92%|█████████▏| 11/12 [00:01<00:00,  5.51it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L031-1_20230715_directional_reflectance - All expected files are present and valid.



Validating subdirectories: 100%|██████████| 12/12 [00:02<00:00,  5.36it/s]

✅ Subdirectory: NEON_D12_YELL_DP1_L032-1_20230715_directional_reflectance - All expected files are present and valid.






In [14]:
import os
import pandas as pd

def generate_envi_file_move_list(base_folder, destination_folder, category_keywords):
    """
    Scans subdirectories two levels deep in `base_folder` to generate
    a list of ENVI files and sort them by category into `destination_folder`.
    Appends _Masked if 'mask' or 'masked' is in filename (including for resample).
    """

    resample_types = {
        "resample_landsat_5_tm": "Reflectance_Resample_Landsat_5_TM",
        "resample_landsat_7_etmplus": "Reflectance_Resample_Landsat_7_ETMplus",
        "resample_landsat_8_oli": "Reflectance_Resample_Landsat_8_OLI",
        "resample_landsat_9_oli-2": "Reflectance_Resample_Landsat_9_OLI-2",
        "resample_micasense-to-match_oli_and_oli-2": "Reflectance_Resample_MicaSense_to_match_OLI",
        "resample_micasense-to-match_tm_and_etmplus": "Reflectance_Resample_MicaSense_to_match_TM",
        "resample_micasense": "Reflectance_Resample_MicaSense"
    }

    move_list = []

    for dir_lvl1 in os.listdir(base_folder):
        lvl1_path = os.path.join(base_folder, dir_lvl1)
        if not os.path.isdir(lvl1_path) or dir_lvl1.startswith('.'):
            continue

        for repo in os.listdir(lvl1_path):
            repo_path = os.path.join(lvl1_path, repo)
            if not os.path.isdir(repo_path) or repo.startswith('.'):
                continue

            files = [f for f in os.listdir(repo_path)
                     if os.path.isfile(os.path.join(repo_path, f)) and not f.lower().endswith('.csv')]

            for fname in files:
                lower_fname = fname.lower()
                assigned_category = None

                # Check for resample mapping first
                if "_resample" in lower_fname:
                    for key, category in resample_types.items():
                        if key in lower_fname:
                            assigned_category = category
                            break
                    if not assigned_category:
                        assigned_category = "Reflectance_Resample"
                else:
                    # Reflectance-style logic
                    if "reflectance__envi_masked" in lower_fname or "reflectance__envi_mask" in lower_fname:
                        assigned_category = "Reflectance__ENVI_Masked"
                    elif "reflectance_envi_masked" in lower_fname or "reflectance_envi_mask" in lower_fname:
                        assigned_category = "Reflectance_ENVI_Masked"
                    elif "reflectance__envi" in lower_fname:
                        assigned_category = "Reflectance__ENVI"
                    elif "reflectance_envi" in lower_fname:
                        assigned_category = "Reflectance_ENVI"
                    elif "reflectance_masked" in lower_fname or "reflectance_mask" in lower_fname:
                        assigned_category = "Reflectance_Masked"
                    elif "reflectance" in lower_fname:
                        assigned_category = "Reflectance"
                    else:
                        for category, keywords in category_keywords.items():
                            if any(k in lower_fname for k in keywords):
                                assigned_category = category
                                break
                        if not assigned_category:
                            assigned_category = "Uncategorized"

                # Append _Masked to *any* category if the file is mask-related
                if ("mask" in lower_fname or "masked" in lower_fname) and "masked" not in assigned_category.lower():
                    assigned_category += "_Masked"

                # Construct iRODS-style paths
                source_path = os.path.join(repo_path, fname)
                dest_path = os.path.join(destination_folder, "sorted_files", "envi", assigned_category, fname)
                move_list.append((f"i:/iplant/{source_path}", f"i:/iplant/{dest_path}"))

    df = pd.DataFrame(move_list, columns=["Source Path", "Destination Path"])

    # Save the move list
    os.makedirs(os.path.join(destination_folder, "sorted_files"), exist_ok=True)
    csv_path = os.path.join(destination_folder, "sorted_files", "envi_file_move_list.csv")
    df.to_csv(csv_path, index=False)
    print(f"✅ Move list saved to {csv_path}")

    return df

category_keywords = {
    "Reflectance": ["reflectance", "brdf"],
    "Ancillary Data": ["ancillary", "config"],
    "Masking": ["mask", "masked"],
    "Topography": ["topo"]
}

# Adjust these paths to match your actual directory structure
base_folder = "home/shared/earthlab/macrosystems/processed_flight_lines"
destination_folder = "home/shared/earthlab/macrosystems/cross-sensor-cal"

# Call the function to generate the move list
df_envi_move_list = generate_envi_file_move_list(base_folder, destination_folder, category_keywords)


✅ Move list saved to home/shared/earthlab/macrosystems/cross-sensor-cal/sorted_files/envi_file_move_list.csv


Unnamed: 0,Source Path,Destination Path
0,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
1,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
2,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
3,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
4,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
...,...,...
87,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
88,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
89,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...
90,home/shared/earthlab/macrosystems/processed_fl...,home/shared/earthlab/macrosystems/cross-sensor...


In [None]:
chmod +x cross-sensor-cal/batch_transfer_parallel.sh
./cross-sensor-cal/batch_transfer_parallel.sh
