In [42]:
import os
import pandas as pd

# Ordered (file-name marker, destination subfolder) pairs; the first marker
# found in a file name wins. The two "MicaSense-to-match" markers must stay
# ahead of the generic "resample_MicaSense" marker, which is a prefix of both.
_RESAMPLE_SUBFOLDERS = (
    ("resample_Landsat_5_TM", "Landsat_5_TM"),
    ("resample_Landsat_7_ETMplus", "Landsat_7_ETMplus"),
    ("resample_Landsat_8_OLI", "Landsat_8_OLI"),
    ("resample_Landsat_9_OLI-2", "Landsat_9_OLI-2"),
    ("resample_MicaSense-to-match_OLI_and_OLI-2", "MicaSense_to_match_OLI"),
    ("resample_MicaSense-to-match_TM_and_ETMplus", "MicaSense_to_match_TM"),
    ("resample_MicaSense", "MicaSense"),
)


def _visible_subdirs(folder):
    """Return the full paths of the non-hidden subdirectories of `folder`, sorted by name."""
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if not name.startswith('.') and os.path.isdir(os.path.join(folder, name))
    ]


def _mask_status(csv_name):
    """Return "masked" or "unmasked" for a CSV file name."""
    # "unmasked" itself contains "masked", so strip it before the substring
    # test; otherwise explicitly unmasked files would be labelled "masked".
    if "masked" in csv_name.replace("unmasked", ""):
        return "masked"
    return "unmasked"


def _subfolder_for(csv_name):
    """Return the category subfolder for a CSV file name ("Uncategorized" if nothing matches)."""
    if "resample" not in csv_name:
        if "envi_masked_spectral_data" in csv_name:
            return "corrected"
        if "masked_spectral_data" in csv_name and "envi" not in csv_name:
            return "original"
    for marker, subfolder in _RESAMPLE_SUBFOLDERS:
        if marker in csv_name:
            return subfolder
    return "Uncategorized"


def generate_file_move_list(base_folder: str, destination_folder: str) -> pd.DataFrame:
    """
    Generates a table of CSV file paths to be moved and their destination paths.

    The directory layout scanned is base_folder/<parent>/<repository>/<file>.csv.
    Hidden entries (names starting with '.') are skipped at every level, and only
    regular files ending in '.csv' are listed. Each file is mapped to
    destination_folder/sorted_files/<masked|unmasked>/<category>/<file name>,
    where the category is derived from markers in the file name.

    No files are moved. As a side effect the table is written to
    destination_folder/sorted_files/file_move_list.csv (the directory is created
    if needed) and the saved path is printed.

    Parameters:
    - base_folder (str): The path to the base directory containing subdirectories that each contain repositories.
    - destination_folder (str): The path to the folder where sorted directories will be stored.

    Returns:
    - DataFrame: A Pandas DataFrame with columns ['Source Path', 'Destination Path'],
      ordered by directory and file name so repeated runs give the same table.
    """
    sorted_files_path = os.path.join(destination_folder, "sorted_files")
    move_list = []

    for parent_subdir in _visible_subdirs(base_folder):
        for repo in _visible_subdirs(parent_subdir):
            # os.listdir returns entries in arbitrary order; sort for a stable table.
            for csv_name in sorted(os.listdir(repo)):
                if csv_name.startswith('.') or not csv_name.endswith('.csv'):
                    continue
                source_path = os.path.join(repo, csv_name)
                # A directory that happens to be named "*.csv" is not a file to move.
                if not os.path.isfile(source_path):
                    continue
                destination_path = os.path.join(
                    sorted_files_path,
                    _mask_status(csv_name),
                    _subfolder_for(csv_name),
                    csv_name,
                )
                move_list.append((source_path, destination_path))

    df = pd.DataFrame(move_list, columns=['Source Path', 'Destination Path'])

    # Ensure sorted_files directory exists before saving the table into it
    os.makedirs(sorted_files_path, exist_ok=True)
    csv_path = os.path.join(sorted_files_path, "file_move_list.csv")
    df.to_csv(csv_path, index=False)
    print(f"Move list saved to {csv_path}")

    return df



# Build the move list for the processed flight lines; both paths are relative
# to the current working directory.
df_move_list = generate_file_move_list(
    base_folder="home/shared/earthlab/macrosystems/processed_flight_lines",
    destination_folder="cross-sensor-cal",
)
df_move_list
# Execute gocmd transfer with optional test row limit
# NOTE(review): the path below is under processed_flight_lines/sorted_files, but the
# move list is saved under <destination_folder>/sorted_files — confirm before enabling.
#execute_gocmd_transfer("home/shared/earthlab/macrosystems/processed_flight_lines/sorted_files/file_move_list.csv", irods_username, irods_password, test_rows=None)


Move list saved to cross-sensor-cal/sorted_files/file_move_list.csv


Unnamed: 0,Source Path,Destination Path
0,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/masked/corrected...
1,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/unmasked/Uncateg...
2,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/masked/Landsat_7...
3,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/masked/MicaSense...
4,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/masked/MicaSense...
...,...,...
737,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/unmasked/Landsat...
738,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/unmasked/MicaSen...
739,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/masked/MicaSense...
740,home/shared/earthlab/macrosystems/processed_fl...,cross-sensor-cal/sorted_files/unmasked/MicaSen...
