In [1]:
## Load libraries (the packages below must already be installed in the kernel environment)
import pandas as pd
import os
import glob
from pathlib import Path
import csv
from datetime import datetime, timezone
import shutil
from tqdm import tqdm


In [2]:
## 2. Load and Aggregate Deep Feature Data

# Define the path to the main classified directory
dir_path = "classified_output"

# Recursively collect every per-taxon feature table. glob returns matches in
# arbitrary, filesystem-dependent order, so sort them to get a reproducible
# row order in the combined table.
ellipse_files = sorted(
    glob.glob(os.path.join(dir_path, "**", "ellipse_data_*.csv"), recursive=True)
)

# Fail early with an actionable message: pd.concat() on an empty list only
# raises a generic "No objects to concatenate" ValueError.
if not ellipse_files:
    raise FileNotFoundError(
        f"No 'ellipse_data_*.csv' files found under '{dir_path}' "
        f"(current working directory: {os.getcwd()})."
    )

all_ellipse_dfs = []
for file in ellipse_files:
    # The taxon name is the file stem with the 'ellipse_data_' prefix removed
    taxon_name = Path(file).stem.replace("ellipse_data_", "")

    temp_df = pd.read_csv(file)
    temp_df['object_speciesID'] = taxon_name  # Tag every record with its taxon
    all_ellipse_dfs.append(temp_df)

# Combine all data into a single master DataFrame
master_df = pd.concat(all_ellipse_dfs, ignore_index=True)

print(f"Loaded {len(master_df)} deep feature records from {len(ellipse_files)} files.")
master_df.head()

Loaded 4560 deep feature records from 35 files.


Unnamed: 0,filename,object_major,object_minor,object_area,object_circularity,object_perimeter,object_width,object_height,object_sharpness,object_saturation,object_redness,object_greeness,object_blueness,object_colorfulness,object_speciesID
0,20250127_175115.761.0.png,61.10984,39.730301,1134.5,0.134707,325.320848,60,48,287.186931,86.849626,44.88141,46.26469,41.541667,25.418107,acantharea
1,20250127_182854.570.0.png,87.294754,44.348526,1550.0,0.09735,447.303603,56,98,679.520833,97.125661,57.013228,56.994213,43.671296,51.950046,acantharea
2,20250129_224543.614.0.png,94.639442,43.62307,2123.0,0.164367,402.877197,98,56,243.814675,83.703869,56.118132,55.747024,68.444139,25.538225,acantharea
3,20250129_224803.015.0.png,57.560436,41.655426,1622.0,0.295141,262.793937,63,53,172.10126,66.433532,54.522619,54.848115,64.574206,28.898972,acantharea
4,20250130_111618.661.0.png,94.78434,31.535536,1316.0,0.108239,390.877197,44,112,1221.368467,94.418831,72.232143,71.717127,76.379058,42.586178,acantharea


In [3]:
## 3. Load and Prepare SFER Cruise Data

# SFER_data.csv is read from the parent directory of 'classified_output'
sfer_path = os.path.join(Path(dir_path).parent, 'SFER_data.csv')
sfer_data = pd.read_csv(sfer_path, low_memory=False)

# Create a proper datetime column for merging.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
sfer_data['GMT.datetime'] = pd.to_datetime(sfer_data['datetime'], format='mixed', errors='coerce')

# Define the specific fields you want to merge from the SFER data
sfer_fields_to_merge = [
    'keyfield',
    'cruise_id',
    'GMT.datetime', # This key is required for the merge itself
    'station',
    'lat_dec',
    'lon_dec',
    'depth',
    'temp',
    'sal',
    'chl',
    'o2_ctd',
    'no3_no2',
    'nh4',
    'po4',
    'si'
]

# pd.merge_asof() refuses merge keys that contain nulls, so rows whose
# timestamp could not be parsed are dropped here (and reported) rather than
# causing a failure later in the merge step.
has_valid_time = sfer_data['GMT.datetime'].notna()
n_unparsed = int((~has_valid_time).sum())
if n_unparsed:
    print(f"⚠️ Warning: Dropping {n_unparsed} SFER rows with a missing or unparseable 'datetime'.")

# Create a new, smaller DataFrame with only the required columns and sort it
# by time (merge_asof requires sorted keys)
sfer_data_filtered = (
    sfer_data.loc[has_valid_time, sfer_fields_to_merge]
    .sort_values('GMT.datetime')
)

print("SFER data loaded and filtered to the following columns:")
print(sfer_data_filtered.columns.tolist())

SFER data loaded and filtered to the following columns:
['keyfield', 'cruise_id', 'GMT.datetime', 'station', 'lat_dec', 'lon_dec', 'depth', 'temp', 'sal', 'chl', 'o2_ctd', 'no3_no2', 'nh4', 'po4', 'si']


In [4]:
## 4. Merge Image Data with Cruise Data

# The first 19 characters of each image file name are parsed as its timestamp
# (format YYYYMMDD_HHMMSS.fff). The records are then ordered by that time,
# because merge_asof expects sorted keys.
master_df = master_df.rename(columns={'filename': 'img_file_name'})
timestamp_text = master_df['img_file_name'].str[:19]
master_df['datetime'] = pd.to_datetime(timestamp_text, format='%Y%m%d_%H%M%S.%f')
master_df = master_df.sort_values(by='datetime')

# Attach to every image the SFER record whose timestamp is closest in time,
# whether earlier or later (direction='nearest'). No tolerance is set, so a
# match is made regardless of how large the time gap is.
final_table = pd.merge_asof(
    left=master_df,
    right=sfer_data_filtered,
    left_on='datetime',
    right_on='GMT.datetime',
    direction='nearest',
)

print("\nSuccessfully merged image data with SFER cruise data.")
print(f"Final table shape: {final_table.shape}")
final_table.head()


Successfully merged image data with SFER cruise data.
Final table shape: (4560, 31)


Unnamed: 0,img_file_name,object_major,object_minor,object_area,object_circularity,object_perimeter,object_width,object_height,object_sharpness,object_saturation,...,lon_dec,depth,temp,sal,chl,o2_ctd,no3_no2,nh4,po4,si
0,20250127_174616.568.0.png,51.203804,19.600698,628.0,0.364936,147.053823,32,46,142.890039,98.736111,...,-82.8894,0.0,18.0796,36.753,0.395777,7.631,0.05,0.42,0.07,0.7
1,20250127_174617.260.0.png,124.038612,19.411459,1507.0,0.21699,295.421354,52,86,54.962349,107.958821,...,-82.8894,0.0,18.0796,36.753,0.395777,7.631,0.05,0.42,0.07,0.7
2,20250127_174617.864.0.png,30.025755,27.661806,629.0,0.702371,106.08326,30,30,204.216392,111.376705,...,-82.8894,0.0,18.0796,36.753,0.395777,7.631,0.05,0.42,0.07,0.7
3,20250127_174645.761.0.png,142.024139,18.894503,1992.5,0.236171,325.605119,116,78,48.962889,107.934524,...,-82.8894,0.0,18.0796,36.753,0.395777,7.631,0.05,0.42,0.07,0.7
4,20250127_174647.864.0.png,74.323059,16.69743,820.0,0.236234,208.852812,34,72,486.58526,119.326042,...,-82.8894,0.0,18.0796,36.753,0.395777,7.631,0.05,0.42,0.07,0.7


In [5]:
## 5. Create Final Metadata and Format for Export

# --- Create the dynamic acquisition ID ---
# Count the total number of .png files in the directory (searched recursively)
total_png_files = len(glob.glob(os.path.join(dir_path, "**", "*.png"), recursive=True))

# Get the current GMT/UTC time and format it
utc_now = datetime.now(timezone.utc)
timestamp_str = utc_now.strftime('%Y%m%d_%H%M')

# Construct the final acq_id string: export_<png count>_<UTC timestamp>_sfer
acq_id_str = f"export_{total_png_files}_{timestamp_str}_sfer"
print(f"Generated Acquisition ID: {acq_id_str}")


# --- Add and format final metadata columns ---
final_table['acq_instrument'] = "CPICS"
final_table['acq_author'] = "Enrique Montes (NOAA AOML)"
final_table['acq_id'] = acq_id_str

# Create other required columns from existing data.
# object_id is the image file name without its '.png' extension. Stripping the
# suffix (instead of keeping a fixed 21 characters) keeps the ID complete when
# the trailing index in the file name has more than one digit.
final_table['object_id'] = final_table['img_file_name'].str.removesuffix('.png')
final_table['object_date'] = final_table['datetime'].dt.strftime('%Y%m%d')
final_table['object_time'] = final_table['datetime'].dt.strftime('%H%M%S')

# Surface duplicate IDs instead of exporting them silently
n_duplicate_ids = int(final_table['object_id'].duplicated().sum())
if n_duplicate_ids:
    print(f"⚠️ Warning: {n_duplicate_ids} duplicated object_id value(s) found in the final table.")

# Rename SFER columns to match the final EcoTaxa format (depth columns removed)
final_table = final_table.rename(columns={
    'lat_dec': 'object_lat',
    'lon_dec': 'object_lon'
})

# --- Define the exact columns and order for the EcoTaxa table (depth columns removed) ---
# Maps each exported column to the format code written in the row below the
# header: '[t]' for text columns, '[f]' for numeric columns.
ecotaxa_cols = {
    'img_file_name': '[t]', 'object_id': '[t]', 'object_date': '[t]', 'object_time': '[t]',
    'object_lat': '[f]', 'object_lon': '[f]',
    'object_major': '[f]', 'object_minor': '[f]', 'object_area': '[f]', 'object_circularity': '[f]',
    'object_perimeter': '[f]', 'object_width': '[f]', 'object_height': '[f]', 'object_sharpness': '[f]',
    'object_saturation': '[f]', 'object_redness': '[f]', 'object_greeness': '[f]', 'object_blueness': '[f]',
    'object_colorfulness': '[f]', 'acq_instrument': '[t]', 'acq_author': '[t]', 'acq_id': '[t]'
}

# Create the format codes header row as a one-row DataFrame
format_codes_df = pd.DataFrame([ecotaxa_cols])

# Select and order the data columns from the merged table
# (an explicit list is used as the column indexer rather than a dict_keys view)
ecotaxa_data = final_table[list(ecotaxa_cols)]

# Combine the format-code row and the data; the format codes end up as row 0
ecotaxa_table = pd.concat([format_codes_df, ecotaxa_data], ignore_index=True)

print("\nFinal EcoTaxa table created. Sample:")
ecotaxa_table.head()

Generated Acquisition ID: export_4560_20250911_0010_sfer

Final EcoTaxa table created. Sample:


Unnamed: 0,img_file_name,object_id,object_date,object_time,object_lat,object_lon,object_major,object_minor,object_area,object_circularity,...,object_height,object_sharpness,object_saturation,object_redness,object_greeness,object_blueness,object_colorfulness,acq_instrument,acq_author,acq_id
0,[t],[t],[t],[t],[f],[f],[f],[f],[f],[f],...,[f],[f],[f],[f],[f],[f],[f],[t],[t],[t]
1,20250127_174616.568.0.png,20250127_174616.568.0,20250127,174616,26.8771,-82.8894,51.203804,19.600698,628.0,0.364936,...,46,142.890039,98.736111,38.108974,45.83547,44.467949,21.687068,CPICS,Enrique Montes (NOAA AOML),export_4560_20250911_0010_sfer
2,20250127_174617.260.0.png,20250127_174617.260.0,20250127,174617,26.8771,-82.8894,124.038612,19.411459,1507.0,0.21699,...,86,54.962349,107.958821,24.688127,33.75439,35.553512,14.489614,CPICS,Enrique Montes (NOAA AOML),export_4560_20250911_0010_sfer
3,20250127_174617.864.0.png,20250127_174617.864.0,20250127,174617,26.8771,-82.8894,30.025755,27.661806,629.0,0.702371,...,30,204.216392,111.376705,47.460227,52.614773,45.230682,34.7886,CPICS,Enrique Montes (NOAA AOML),export_4560_20250911_0010_sfer
4,20250127_174645.761.0.png,20250127_174645.761.0,20250127,174645,26.8771,-82.8894,142.024139,18.894503,1992.5,0.236171,...,78,48.962889,107.934524,23.073016,32.398909,34.063889,11.22437,CPICS,Enrique Montes (NOAA AOML),export_4560_20250911_0010_sfer


In [6]:
## 6. Validation Check: Verify All Files Exist

# --- Step 1: Collect the file names referenced by the EcoTaxa table ---
# Row 0 holds the format codes, so only rows 1..end are real file names.
try:
    filenames_in_table = set(ecotaxa_table['img_file_name'].iloc[1:])
except NameError:
    print("Error: The 'ecotaxa_table' DataFrame does not exist. Please run the previous cells first.")
    # An empty set makes the rest of the cell a no-op
    filenames_in_table = set()
else:
    print(f"Found {len(filenames_in_table)} unique filenames listed in the final table.")


if filenames_in_table:
    # --- Step 2: Collect the base names of every .png physically on disk ---
    dir_path = "classified_output"

    png_paths = glob.glob(os.path.join(dir_path, "**", "*.png"), recursive=True)
    actual_files_on_disk = {os.path.basename(png_path) for png_path in png_paths}
    print(f"Found {len(actual_files_on_disk)} actual .png files on disk.")

    # --- Step 3: Names in the table that have no matching file on disk ---
    missing_files = filenames_in_table - actual_files_on_disk

    # --- Step 4: Report the results ---
    if missing_files:
        print(f"\n⚠️ Warning: Found {len(missing_files)} missing files.")
        print("The following files are listed in the table but were not found on disk:")
        # Only the first 10 missing names are listed, for brevity
        for filename in list(missing_files)[:10]:
            print(f" - {filename}")
        if len(missing_files) > 10:
            print(f"   ...and {len(missing_files) - 10} more.")
    else:
        print("\n✅ Success: All files listed in the 'ecotaxa_table' are present in the 'classified_output' directory.")

Found 4560 unique filenames listed in the final table.
Found 4560 actual .png files on disk.

✅ Success: All files listed in the 'ecotaxa_table' are present in the 'classified_output' directory.


In [None]:
## 7. Save the Final EcoTaxa Table and the merged DataFrame with SFER Data

# Define the output paths for the two TSV files (both under ~/plankton_imaging)
destination_path = os.path.join(os.path.expanduser("~"), "plankton_imaging")
output_path_ecotaxa = os.path.join(destination_path, "ecotaxa_sfer-mbon.tsv")
output_path_merged = os.path.join(destination_path, "merged_sfer-mbon.tsv")

# Make sure the destination folder exists before writing into it
os.makedirs(destination_path, exist_ok=True)

# Save both tables as tab-separated text without the index column.
# csv.QUOTE_NONE writes every field as-is, with no quoting.
ecotaxa_table.to_csv(output_path_ecotaxa, sep='\t', index=False, quoting=csv.QUOTE_NONE)
final_table.to_csv(output_path_merged, sep='\t', index=False, quoting=csv.QUOTE_NONE)

print(f"\nSuccessfully created the final EcoTaxa table at: {output_path_ecotaxa}")
print(f"\nSuccessfully created the final merged table at: {output_path_merged}")

In [7]:
# Create a zip archive containing all classified PNG files and the final TSV table
def package_ecotaxa_results(source_images_dir=None, source_tsv_file=None, base_dir=None):
    """
    Aggregates all classified PNG files and the final TSV table into a
    single flat folder, then compresses that folder into a zip archive.

    Parameters
    ----------
    source_images_dir : str or Path, optional
        Folder searched recursively for '*.png' files.
        Defaults to '<base_dir>/classified_output'.
    source_tsv_file : str or Path, optional
        TSV table to include in the package.
        Defaults to '~/plankton_imaging/ecotaxa_sfer-mbon.tsv'.
    base_dir : str or Path, optional
        Folder in which the temporary 'ecotaxa_package' directory and the
        final 'ecotaxa_package.zip' are created. Defaults to the current
        working directory.

    Returns
    -------
    Path or None
        Path of the created zip archive, or None if archiving failed.

    Notes
    -----
    Images are copied into one flat folder, so files that share a name in
    different sub-folders collapse into a single entry; a warning is printed
    when that happens. An existing '<base_dir>/ecotaxa_package' folder is
    deleted at start, and the temporary folder is removed at the end even if
    an error occurs.
    """
    print("--- Starting EcoTaxa Packaging Routine ---")

    # --- 1. Define Paths ---
    # By default, assumes the script is run from the directory containing 'classified_output'
    base_dir = Path.cwd() if base_dir is None else Path(base_dir)
    if source_images_dir is None:
        source_images_dir = base_dir / "classified_output"
    else:
        source_images_dir = Path(source_images_dir)
    if source_tsv_file is None:
        landing_path = Path(os.path.expanduser("~")) / "plankton_imaging"
        source_tsv_file = landing_path / "ecotaxa_sfer-mbon.tsv"
    else:
        source_tsv_file = Path(source_tsv_file)

    # Define the temporary packaging folder and final zip file name
    package_dir = base_dir / "ecotaxa_package"
    zip_output_name = "ecotaxa_package"

    # --- 2. Setup Packaging Directory ---
    print(f"Creating temporary packaging folder: {package_dir}")
    # Remove the directory if it exists to ensure a clean start
    if package_dir.exists():
        shutil.rmtree(package_dir)
    package_dir.mkdir()

    try:
        # --- 3. Copy the TSV Table ---
        if source_tsv_file.exists():
            print(f"Copying TSV table: {source_tsv_file.name}")
            shutil.copy(source_tsv_file, package_dir)
        else:
            print(f"⚠️ Warning: TSV file not found at {source_tsv_file}. It will be missing from the package.")

        # --- 4. Find and Copy all PNG Files ---
        print(f"Finding all .png files in '{source_images_dir}'...")
        png_files_to_copy = list(source_images_dir.rglob("*.png"))

        if not png_files_to_copy:
            print("⚠️ Warning: No .png files were found in the 'classified_output' directory.")
        else:
            # The package is flat: same-named files from different sub-folders
            # overwrite each other, so report that instead of hiding it.
            seen_names = set()
            duplicate_names = set()
            for file_path in png_files_to_copy:
                if file_path.name in seen_names:
                    duplicate_names.add(file_path.name)
                seen_names.add(file_path.name)
            if duplicate_names:
                print(f"⚠️ Warning: {len(duplicate_names)} image name(s) occur in more than one folder; "
                      "only one copy of each will be kept in the package.")

            print(f"Found {len(png_files_to_copy)} images. Copying to package folder...")
            # Use tqdm for a progress bar during the copy operation
            for file_path in tqdm(png_files_to_copy, desc="Copying images"):
                shutil.copy(file_path, package_dir)
            print("Image copying complete.")

        # --- 5. Compress the Packaging Folder ---
        print(f"\nCompressing '{package_dir.name}' into a zip archive...")
        try:
            # Anchor the archive to base_dir explicitly so its location does
            # not depend on the process working directory.
            shutil.make_archive(str(base_dir / zip_output_name), 'zip', root_dir=str(package_dir))
            final_zip_path = base_dir / f"{zip_output_name}.zip"
            print(f"✅ Success! Archive created at: {final_zip_path}")
        except Exception as e:
            print(f"❌ Error: Failed to create zip archive. Reason: {e}")
            return None
    finally:
        # --- 6. Clean Up Temporary Folder ---
        # Runs on success and on failure so copied images are never left behind
        print("Cleaning up temporary directory...")
        shutil.rmtree(package_dir, ignore_errors=True)

    print("\n--- Packaging Complete! ---")
    return final_zip_path


# Run the packaging routine when this cell (or script) is executed directly.
# Prerequisite: the third-party 'tqdm' package must be installed
# (pip install tqdm).
if __name__ == "__main__":
    package_ecotaxa_results()

--- Starting EcoTaxa Packaging Routine ---
Creating temporary packaging folder: /space/home/enrique.montes@CNS.local/plankton_imaging/ecotaxa_package
Copying TSV table: ecotaxa_sfer-mbon.tsv
Finding all .png files in '/space/home/enrique.montes@CNS.local/plankton_imaging/classified_output'...
Found 4560 images. Copying to package folder...


Copying images: 100%|██████████| 4560/4560 [00:03<00:00, 1256.19it/s]


Image copying complete.

Compressing 'ecotaxa_package' into a zip archive...
✅ Success! Archive created at: /space/home/enrique.montes@CNS.local/plankton_imaging/ecotaxa_package.zip
Cleaning up temporary directory...

--- Packaging Complete! ---
