In [9]:
import os
import numpy as np
import pandas as pd
import re
import shutil

from pandas.errors import EmptyDataError

from sqlalchemy import create_engine
from sqlalchemy import text

from tqdm import tqdm

### Database connection & check contents

In [30]:
# Database connection settings.
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited into (or shared with) the notebook.
# The fallbacks reproduce the previous hard-coded local setup.
from urllib.parse import quote_plus

username = os.environ.get('PISCO_DB_USER', 'plankton')
password = os.environ.get('PISCO_DB_PASSWORD', 'piscodisco')
host = os.environ.get('PISCO_DB_HOST', 'localhost')  # e.g. 'deepseavision' or the IP address of the database server
port = os.environ.get('PISCO_DB_PORT', '5432')       # default port for PostgreSQL
database = os.environ.get('PISCO_DB_NAME', 'pisco_crop_db')

# Create an engine that connects to the PostgreSQL server.
# User name and password are URL-quoted so that characters such as '@', ':'
# or '/' in a password cannot corrupt the connection URL.
engine = create_engine(
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{database}'
)

In [31]:
# List every table that lives in the "public" schema of the database.
query_tables = "SELECT tablename FROM pg_tables WHERE schemaname='public'"
with engine.connect() as conn:
    table_names = list(pd.read_sql_query(text(query_tables), conn)['tablename'])
print(table_names)
print(f"{len(table_names)} entries found")

['M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614', 'M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614_updated', 'M181-005-1_CTD-002_16deg00S-011deg34E_20220422-0039', 'M181-005-1_CTD-002_16deg00S-011deg34E_20220422-0039_updated', 'M181-007-1_CTD-003_15deg00S-012deg04E_20220422-0731', 'M181-007-1_CTD-003_15deg00S-012deg04E_20220422-0731_updated', 'M181-009-1_CTD-004_14deg00S-012deg12E_20220422-1342', 'M181-009-1_CTD-004_14deg00S-012deg12E_20220422-1342_updated', 'M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140', 'M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140_updated', 'M181-013-1_CTD-006_10deg50S-013deg00E_20220423-1216', 'M181-013-1_CTD-006_10deg50S-013deg00E_20220423-1216_updated', 'M181-017-1_CTD-007_10deg36S-013deg21E_20220423-1715', 'M181-017-1_CTD-007_10deg36S-013deg21E_20220423-1715_updated', 'M181-019-1_CTD-008_10deg38S-013deg18E_20220423-1845', 'M181-019-1_CTD-008_10deg38S-013deg18E_20220423-1845_updated', 'M181-021-1_CTD-009_10deg40S-013deg15E_20220423-2013', 

### Functions to create columns for coordinates, time, date and object id

In [None]:
def convert_to_decimal(coord):
    """Convert a degrees/minutes coordinate string to signed decimal degrees.

    Accepted forms are ``<deg>°<min><dir>`` and ``<deg>deg<min><dir>``
    (e.g. ``'17°30S'`` or ``'011deg15E'``), where ``<dir>`` is one of
    N, S, E, W. South and West yield negative values.

    Args:
        coord: Coordinate string. Anything that is not a string (``None``,
            NaN, numbers) is treated as "no coordinate".

    Returns:
        float | None: Decimal degrees, or ``None`` when ``coord`` is not a
        string or does not start with a recognisable coordinate.
    """
    # Missing values may arrive as None or as float NaN once they have been
    # through pandas; re.match would raise TypeError on either.
    if not isinstance(coord, str):
        return None

    match = re.match(r'(\d+)(?:°|deg)(\d+)([NSWE])', coord)
    if match is None:
        return None

    degrees, minutes, direction = match.groups()
    decimal = int(degrees) + int(minutes) / 60.0
    if direction in ('S', 'W'):  # South and West are negative
        decimal = -decimal
    return decimal


def extract_coordinates(path):
    """Find the ``<lat>-<lon>`` coordinate pair embedded in a path.

    Both the ``°`` and the ``deg`` spelling of the degree separator are
    recognised, e.g. ``17°00S-011°11E`` or ``17deg00S-011deg11E``.

    Args:
        path: Path (or any string) that may contain a coordinate pair.

    Returns:
        tuple: ``(lat, lon)`` as the matched substrings, or ``(None, None)``
        when ``path`` is not a string or contains no coordinate pair.
    """
    # Non-string input (None / NaN from pandas) cannot be searched.
    if not isinstance(path, str):
        return None, None

    match = re.search(r'(\d+(?:°|deg)\d+[NS])-(\d+(?:°|deg)\d+[EW])', path)
    if match is None:
        return None, None
    lat, lon = match.groups()
    return lat, lon


def process_crop_data(df):
    """Add object id, date/time and decimal coordinates to a crop table.

    The frame is modified in place (nothing is returned). The columns
    ``object_id``, ``date``, ``time``, ``lat`` and ``lon`` are added, in that
    order; the source columns ``date-time``, ``index`` and ``img_id`` are
    kept.

    Args:
        df: DataFrame with at least the columns ``img_id``, ``index``,
            ``date-time`` and ``full_path``.
    """
    # Object id: "<img_id>_<index>"
    df['object_id'] = df['img_id'].astype(str) + '_' + df['index'].astype(str)

    # Split "date-time" at the first dash only. Indexing the split lists
    # (instead of expand=True) also works for an empty frame and leaves
    # 'time' missing when a value has no dash.
    date_time_parts = df['date-time'].str.split('-', n=1)
    df['date'] = date_time_parts.str[0]
    df['time'] = date_time_parts.str[1]

    # Extract the coordinate substrings once per row. Building one
    # pd.Series per row inside .apply() is very slow on large tables.
    coordinate_pairs = [extract_coordinates(path) for path in df['full_path']]
    df['lat'] = [lat for lat, _ in coordinate_pairs]
    df['lon'] = [lon for _, lon in coordinate_pairs]

    # Convert latitude and longitude to decimal degrees
    df['lat'] = df['lat'].apply(convert_to_decimal)
    df['lon'] = df['lon'].apply(convert_to_decimal)


# profile_name = 'M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025'
# query = f'SELECT * FROM "{profile_name}"'
# df = pd.read_sql_query(query, engine)
# process_crop_data(df)
# print(df.columns)
# print(df.head())

### Apply functions, combine metadata with predictions, and upload to the database

In [10]:
pred_base_path = '/home/fanny/M181_output'

# Keep rows where 'w' is NOT significantly smaller than 'h'
threshold_ratio = 0.005  # Adjust this ratio as needed

# The prediction folders do not change while the loop runs; list them once.
prediction_folders = os.listdir(pred_base_path)

for profile_name in table_names:
    # table_names can already contain results of an earlier run. Those must
    # not be processed again (that would create "<name>_updated_updated").
    if profile_name.endswith('_updated'):
        continue

    print(profile_name)

    if 'CTD-' not in profile_name:
        print(f"No matching folder found for profile: {profile_name}")
        continue

    # First prediction folder whose name contains 'CTD-<number>_'
    ctd_number = profile_name.split('CTD-')[1].split('_')[0]
    matching_folder = next(
        (folder for folder in prediction_folders if f"CTD-{ctd_number}_" in folder),
        None,
    )
    if not matching_folder:
        print(f"No matching folder found for profile: {profile_name}")
        continue
    prediction_path = os.path.join(pred_base_path, matching_folder, 'ViT_predictions.csv')

    try:
        pred_df = pd.read_csv(prediction_path)
    except (FileNotFoundError, EmptyDataError) as err:
        print(f"Skipping {profile_name}: cannot read {prediction_path} ({err})")
        continue

    query = f'SELECT * FROM "{profile_name}"'
    with engine.connect() as conn:
        df = pd.read_sql_query(text(query), conn)

    process_crop_data(df)
    df_filtered = df[df['w'] >= threshold_ratio * df['h']]

    # Reduce the prediction paths to bare file names so they can be matched
    # against the 'filename' column of the crop table.
    pred_df['filename'] = pred_df['filename'].apply(os.path.basename)

    # Crop metadata wins when both tables carry a column of the same name,
    # and each file name contributes at most one prediction row.
    overlapping_columns = [
        col for col in pred_df.columns
        if col != 'filename' and col in df_filtered.columns
    ]
    pred_unique = (
        pred_df
        .drop(columns=overlapping_columns)
        .drop_duplicates(subset='filename', keep='first')
    )

    n_unmatched = int((~df_filtered['filename'].isin(pred_unique['filename'])).sum())
    if n_unmatched:
        print(f"{n_unmatched} crops without prediction in {prediction_path}")

    # Join on the file name. Pasting the two sorted tables side by side only
    # pairs the right rows when both contain exactly the same files; a keyed
    # merge stays correct when rows were filtered out or predictions are
    # missing (those crops get empty prediction columns).
    combined_df = (
        df_filtered
        .sort_values(by='filename')
        .merge(pred_unique, on='filename', how='left')
        .reset_index(drop=True)
    )
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    # Save the combined DataFrame back to the database
    table_name = f"{profile_name}_updated"  # Define a table name
    combined_df.to_sql(table_name, engine, if_exists='replace', index=False)
    print(f"updated data saved to table: {table_name}")

    # # Save the combined DataFrame to a CSV file
    # output_csv_path = os.path.join(pred_base_path, matching_folder, 'combined_metadata.csv')
    # combined_df.to_csv(output_csv_path, index=False)


M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025
updated data saved to table: M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025_updated
M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140
updated data saved to table: M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140_updated
M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101
updated data saved to table: M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updated
M181-181-1_CTD-051_00deg00S-020deg00W_20220509-1454
updated data saved to table: M181-181-1_CTD-051_00deg00S-020deg00W_20220509-1454_updated
M181-216-1_CTD-058_00deg00S-025deg00W_20220511-2352
updated data saved to table: M181-216-1_CTD-058_00deg00S-025deg00W_20220511-2352_updated
M181-091-1_CTD-030_00deg00S-002deg00W_20220502-0238
updated data saved to table: M181-091-1_CTD-030_00deg00S-002deg00W_20220502-0238_updated
M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614
updated data saved to table: M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614_updated
M181-200-1_CT

In [11]:
# Re-read the list of tables in the "public" schema (now including the
# tables written by the previous cell).
query_tables = "SELECT tablename FROM pg_tables WHERE schemaname='public'"
with engine.connect() as conn:
    table_names = pd.read_sql_query(text(query_tables), conn).loc[:, 'tablename'].to_list()
print(table_names)
print("{} entries found".format(len(table_names)))

['M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025', 'M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140', 'M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101', 'M181-181-1_CTD-051_00deg00S-020deg00W_20220509-1454', 'M181-216-1_CTD-058_00deg00S-025deg00W_20220511-2352', 'M181-091-1_CTD-030_00deg00S-002deg00W_20220502-0238', 'M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614', 'M181-200-1_CTD-055_00deg00S-023deg00W_20220511-0023', 'M181-023-1_CTD-010_10deg42S-013deg12E_20220423-2212', 'M181-049-1_CTD-020_11deg20S-012deg15E_20220425-2126', 'M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313', 'M181-071-1_CTD-025_00deg00S-002deg00E_20220430-0653', 'M181-283-1_CTD-074_00deg00S-038deg50W_20220517-1445', 'M181-297-1_CTD-081_00deg00S-041deg50W_20220519-0218', 'M181-233-1_CTD-061_00deg00S-028deg00W_20220513-0203', 'M181-175-1_CTD-050_00deg00S-019deg00W_20220509-0543', 'M181-029-1_CTD-013-10deg48S-013deg03E_20220424-0539', 'M181-117-1_CTD-038_00deg00S-009deg00W_20220505-0250', 'M181-153

In [43]:
# Collect the tables produced by the combine step (names ending in "_updated").
query_updated_profiles = "SELECT tablename FROM pg_tables WHERE schemaname='public' AND tablename LIKE '%_updated'"
with engine.connect() as conn:
    # In SQL LIKE, '_' matches any single character, so the pattern above is
    # slightly too permissive; keep only names with the literal suffix.
    updated_profiles = [
        name
        for name in pd.read_sql_query(text(query_updated_profiles), conn)['tablename'].tolist()
        if name.endswith('_updated')
    ]
print(f"Updated profiles: {updated_profiles}")
print(len(updated_profiles),'entries found')

Updated profiles: ['M181-002-1_CTD-001_17deg00S-011deg11E_20220421-1614_updated', 'M181-005-1_CTD-002_16deg00S-011deg34E_20220422-0039_updated', 'M181-007-1_CTD-003_15deg00S-012deg04E_20220422-0731_updated', 'M181-009-1_CTD-004_14deg00S-012deg12E_20220422-1342_updated', 'M181-011-1_CTD-005_13deg00S-012deg43E_20220422-2140_updated', 'M181-013-1_CTD-006_10deg50S-013deg00E_20220423-1216_updated', 'M181-017-1_CTD-007_10deg36S-013deg21E_20220423-1715_updated', 'M181-019-1_CTD-008_10deg38S-013deg18E_20220423-1845_updated', 'M181-021-1_CTD-009_10deg40S-013deg15E_20220423-2013_updated', 'M181-023-1_CTD-010_10deg42S-013deg12E_20220423-2212_updated', 'M181-033-1_CTD-014_10deg28S-013deg33E_20220424-1553_updated', 'M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025_updated', 'M181-038-1_CTD-016_10deg32S-013deg27E_20220425-0123_updated', 'M181-040-1_CTD-017_10deg34S-013deg24E_20220425-0446_updated', 'M181-044-1_CTD-018_10deg55S-012deg52E_20220425-1234_updated', 'M181-049-1_CTD-020_11deg20S-012deg1

### List of columns needed for the EcoTaxa metadata file

In [None]:
# NOTE(review): `profile_name` is not set in this cell; it is whatever an
# earlier cell left behind. Uncomment the next line to pin a profile.
#profile_name = 'M181-035-1_CTD-015_10deg30S-013deg30E_20220424-2025'

# One-off column rename, kept for reference:
# query = f'ALTER TABLE "{profile_name}" RENAME COLUMN "object_%area" TO "object_perc_area";'
# with engine.connect() as conn:
#     conn.execute(text(query))
#     print(f"Column renamed in table {profile_name}")

# Columns to read from the profile table, in output order.
columns = [
    # acquisition / file information
    'date-time', 'pressure [dbar]', 'depth [m]', 'filename', 'area',
    # object shape and intensity features
    'object_bound_box_w', 'object_bound_box_h', 'object_circularity',
    'object_area_exc', 'object_area_rprops', 'object_%area',
    'object_major_axis_len', 'object_minor_axis_len',
    'object_centroid_y', 'object_centroid_x', 'object_convex_area',
    'object_min_intensity', 'object_max_intensity', 'object_mean_intensity',
    'object_int_density', 'object_perimeter', 'object_elongation',
    'object_range', 'object_perim_area_excl', 'object_perim_major',
    'object_circularity_area_excl', 'object_angle', 'object_boundbox_area',
    'object_eccentricity', 'object_equivalent_diameter', 'object_euler_nr',
    'object_extent', 'object_local_centroid_col', 'object_local_centroid_row',
    'object_solidity', 'esd',
    # identifiers and paths
    'img_id', 'index', 'full_path',
    # classifier output
    'top1', 'top2', 'top3', 'top4', 'top5',
    'prob1', 'prob2', 'prob3', 'prob4', 'prob5',
]

# Double-quote every name: several contain spaces, brackets or '%'.
columns_str = ', '.join(f'"{col}"' for col in columns)
print(columns_str)

query = text(f'SELECT {columns_str} FROM "{profile_name}"')
df = pd.read_sql_query(query, engine)
print(df.columns)


### Functions to adjust metadata to EcoTaxa (ET) criteria

In [61]:
def filter_defect_crops(df):
    """Return the rows whose 'TAG_event' and 'part_based_filter' are both 0.

    The result is a new frame with a fresh 0..n-1 index; ``df`` itself is
    left untouched.
    """
    no_tag_event = df['TAG_event'] == 0
    passes_part_filter = df['part_based_filter'] == 0
    return df[no_tag_event & passes_part_filter].reset_index(drop=True)

def prepare_ETdata(df, mapping_csv, sep="\t"):
    """Build an EcoTaxa-style table from a combined crop/prediction table.

    Steps:
      1. add ``object_annotation_status = 'predicted'``;
      2. translate the class names in ``top1`` .. ``top5`` through the
         mapping file, keep only the part before the first space, ``;``,
         ``:`` or ``/`` and replace underscores with spaces;
      3. rename columns to their EcoTaxa names;
      4. insert one row of type codes (see ``determine_dtype``) as the
         first data row.

    Args:
        df: Table containing at least ``top1`` .. ``top5``. It is not
            modified; the function works on a copy.
        mapping_csv: Path of the class mapping file.
        sep: Field separator of the mapping file.

    Returns:
        pandas.DataFrame: New table whose row 0 holds the type codes and
        whose remaining rows hold the data.
    """
    # Load the class-name mapping
    polytaxo_classes_df = pd.read_csv(mapping_csv, sep=sep)

    # Work on a copy so the caller's frame keeps its columns and names.
    df = df.copy()

    # Add annotation status
    df['object_annotation_status'] = 'predicted'

    # NOTE(review): the source-column name below looks like two headers run
    # together - confirm it against the header of the mapping file.
    mapping_dict = dict(zip(
        polytaxo_classes_df["Dataset Class NamePolyTaxo Description"],
        polytaxo_classes_df["PolyTaxo Description"]
    ))

    # Columns to update
    columns_to_replace = ["top1", "top2", "top3", "top4", "top5"]

    # Split on space, semicolon, colon, or slash
    split_pattern = re.compile(r"[ ;:/]")

    def _clean_label(value):
        # Missing predictions must stay missing. Converting to str first
        # would turn them into the literal category 'nan'.
        if pd.isna(value):
            return value
        return split_pattern.split(str(value))[0].replace("_", " ")

    mapped_labels = df[columns_to_replace].replace(mapping_dict)
    for col in columns_to_replace:
        # object dtype keeps the column a text column even if every label is missing
        df[col] = mapped_labels[col].map(_clean_label).astype(object)

    # Adjust header names
    rename_mapping = {
        'pressure [dbar]': 'object_pressure',
        'date': 'object_date',
        'time': 'object_time',
        'filename': 'img_file_name',
        'depth [m]': 'object_depth_min',
        'area': 'object_area',
        'esd': 'object_esd',
        'top1': 'object_annotation_category',
        'top2': 'object_annotation_category_2',
        'top3': 'object_annotation_category_3',
        'top4': 'object_annotation_category_4',
        'top5': 'object_annotation_category_5',
        'prob1': 'object_prob_1',
        'prob2': 'object_prob_2',
        'prob3': 'object_prob_3',
        'prob4': 'object_prob_4',
        'prob5': 'object_prob_5',
        'lat': 'object_lat',
        'lon': 'object_lon',
        'w': 'object_width',
        'h': 'object_height',
        'interpolated_s': 'object_interpolated_s',
        'interpolated_o': 'object_interpolated_o',
        'interpolated_chl': 'object_interpolated_chl',
        'interpolated_t': 'object_interpolated_t'
    }
    df = df.rename(columns=rename_mapping)

    # One type code per column. Iterating df.dtypes (instead of looking each
    # name up) also works if the rename produced duplicate column names.
    dtype_row = [determine_dtype(dtype) for dtype in df.dtypes]

    # Put the type-code row directly below the header
    type_row_df = pd.DataFrame([dtype_row], columns=df.columns)
    return pd.concat([type_row_df, df], ignore_index=True)




def determine_dtype(dtype):
    """Return the EcoTaxa type code for a pandas/numpy dtype.

    Args:
        dtype: dtype of a column.

    Returns:
        str: ``'[f]'`` for numeric dtypes, ``'[t]'`` for everything else.
        Booleans count as text because they are written as True/False, which
        cannot be read back as numbers.
    """
    # pandas classifies bool as numeric, so it has to be checked first.
    if pd.api.types.is_bool_dtype(dtype):
        return '[t]'
    if pd.api.types.is_numeric_dtype(dtype):
        return '[f]'
    # Strings, datetimes, categories, ... are all exported as text. The
    # format only has the two codes, so there is no separate "other" marker.
    return '[t]'

In [24]:
import re

def extract_lat_lon_from_profile(profile_name):
    """
    Read the station position out of a profile name.

    The name is searched for '<deg>deg<min>[N|S]-<deg>deg<min>[E|W]', e.g.
    'M181-252-1_CTD-066_00deg00S-032deg00W_20220514-1919'.

    Returns: (lat, lon) in decimal degrees (South and West negative), or
    (None, None) if the name contains no such pattern.
    """
    found = re.search(r'(\d+)deg(\d+)([NS])-(\d+)deg(\d+)([EW])', profile_name)
    if found is None:
        return None, None

    def to_decimal(degree_text, minute_text, negative):
        # degrees + minutes/60, negated for the southern/western hemisphere
        value = int(degree_text) + int(minute_text) / 60.0
        return -value if negative else value

    latitude = to_decimal(found.group(1), found.group(2), found.group(3) == 'S')
    longitude = to_decimal(found.group(4), found.group(5), found.group(6) == 'W')
    return latitude, longitude

### Function to add scale bar to crops

In [74]:
from PIL import Image, ImageDraw, ImageFont

def add_scale_bar(image_path, output_path, pixel_resolution=23, scale_length_mm=1):
    """
    Adds a scale bar below the image and saves it to the output path.

    The picture is pasted at the top-left of a white canvas that has a
    50 px strip underneath containing a black bar and the label
    "<scale_length_mm> mm". The canvas is enlarged where necessary so that
    neither the bar nor the label is cut off (small crops can be narrower
    than the bar itself).

    Args:
        image_path (str): Path to the input image.
        output_path (str): Path to save the image with the scale bar.
        pixel_resolution (int): Micrometers per pixel.
        scale_length_mm (int): Length of the scale bar in millimeters.

    Raises:
        ValueError: If pixel_resolution is not positive.
    """
    if pixel_resolution <= 0:
        raise ValueError(f"pixel_resolution must be positive, got {pixel_resolution}")

    # Length of the scale bar in pixels
    scale_length_px = int((scale_length_mm * 1000) / pixel_resolution)

    strip_height = 50  # Nominal space below the image for bar and label
    bar_height = 10    # Height of the scale bar in pixels
    margin = 20        # Margin left (and right) of the bar
    text_gap = 5       # Gap between bar and label

    # Prepare the label first so that its size can be taken into account
    label = f"{scale_length_mm} mm"
    font_size = 20
    try:
        font = ImageFont.truetype("arial.ttf", font_size)  # Use a system font
    except IOError:
        font = ImageFont.load_default()  # Fallback to default font if unavailable
    text_bbox = font.getbbox(label)
    text_width = text_bbox[2] - text_bbox[0]

    # The context manager closes the source file once it has been pasted.
    with Image.open(image_path) as img:
        # Bar position: left-aligned with a margin, vertically centred in the strip
        bar_x_start = margin
        bar_x_end = bar_x_start + scale_length_px
        bar_y_start = img.height + (strip_height - bar_height) // 2
        bar_y_end = bar_y_start + bar_height

        # Label position: centred under the bar, never left of the canvas
        text_x = max(0, bar_x_start + (scale_length_px - text_width) // 2)
        text_y = bar_y_end + text_gap

        # Grow the canvas if the bar or the label would not fit otherwise
        canvas_width = int(max(img.width, bar_x_end + margin, text_x + text_bbox[2]))
        canvas_height = int(max(img.height + strip_height, text_y + text_bbox[3]))

        new_img = Image.new("RGB", (canvas_width, canvas_height), "white")
        new_img.paste(img, (0, 0))

    draw = ImageDraw.Draw(new_img)

    # Draw the scale bar (black rectangle) and the label below it
    draw.rectangle([bar_x_start, bar_y_start, bar_x_end, bar_y_end], fill="black")
    draw.text((text_x, text_y), label, fill="black", font=font)

    # Save the modified image
    new_img.save(output_path)

### Main function to filter, copy and zip outputs from prior processing

In [None]:
mapping_csv='/home/fanny/taxonomic_data/Polytaxo_classes.csv'
output_folder = '/home/fanny/EcoTaxa'
os.makedirs(output_folder, exist_ok=True)

for profile_name in updated_profiles:
    query = f'SELECT * FROM "{profile_name}"'
    with engine.connect() as conn:
        df = pd.read_sql_query(text(query), conn)
    df_filtered = filter_defect_crops(df).copy()

    # Extract Lat and Lon from profile_name
    lat, lon = extract_lat_lon_from_profile(profile_name)
    df_filtered['lat'] = lat
    df_filtered['lon'] = lon

    # Remove duplicates in the 'full_path' column (keeping the first
    # occurrence) and order the rows by object id
    df_filtered = (
        df_filtered
        .drop_duplicates(subset='full_path', keep='first')
        .sort_values(by='object_id')
        .reset_index(drop=True)
    )

    # Target folder for the deconvolved crops. It is created once per
    # profile, so it also exists when the profile has no rows and the
    # archive step below still has a folder to zip.
    # (Export of the raw crops into '<profile>/crops' is currently disabled.)
    deconv_target_dir = os.path.join(output_folder, profile_name, 'deconv_crops')
    os.makedirs(deconv_target_dir, exist_ok=True)

    n_written = 0
    n_missing = 0
    for full_path in df_filtered['full_path']:
        # The deconvolved image sits next to the raw crop ('/Crops' -> '/Deconv_crops')
        deconv_path = full_path.replace('/Crops', '/Deconv_crops')
        deconv_target_path = os.path.join(deconv_target_dir, os.path.basename(deconv_path))

        if os.path.exists(deconv_path):
            add_scale_bar(deconv_path, deconv_target_path, pixel_resolution=23, scale_length_mm=1)
            n_written += 1
        else:
            print(f"File not found: {deconv_path}")
            n_missing += 1

    # Report what actually happened instead of claiming that everything was copied
    print(f"Image export finished for profile {profile_name}: {n_written} written, {n_missing} missing.")

    df_ET = prepare_ETdata(df_filtered, mapping_csv).copy()
    df_ET['object_depth_max'] = df_ET['object_depth_min']
    # Keep only columns with 'img' or 'object' in their names. The copy makes
    # the following assignments act on an independent frame.
    df_ET = df_ET[[col for col in df_ET.columns if 'img' in col or 'object' in col]].copy()
    sample_profile_id = "_".join(profile_name.split("_")[:2])
    df_ET['sample_id'] = sample_profile_id
    # Row 0 is the type-code row added by prepare_ETdata. The assignment
    # above put the sample id there as well; restore the text type code.
    df_ET.loc[0, 'sample_id'] = '[t]'
    # rename img_id to img_rank
    df_ET = df_ET.rename(columns={'img_id': 'img_rank'})

    output_path_2 = os.path.join(deconv_target_dir, f"ecotaxa_{profile_name}.tsv")
    # NOTE(review): writing the metadata file is disabled, so the archive
    # only contains a .tsv if one is already present in the folder.
    # df_ET.to_csv(output_path_2, sep="\t", index=False)

    # Folder to archive; the zip is written next to it as '<folder>.zip'
    folder_2 = os.path.dirname(output_path_2)
    zip_path_2 = f"{folder_2}.zip"

    # Create zip archive
    shutil.make_archive(folder_2, 'zip', folder_2)

    print(f"Exported and zipped {profile_name} to {output_path_2}")


### Split large archives into smaller parts and clean up the metadata files

In [3]:
import os
import pandas as pd
import shutil
import math
import zipfile

# Annotation columns that must all be filled for a row to be kept
_ANNOTATION_COLUMNS = [
    'object_annotation_category',
    'object_annotation_category_2',
    'object_annotation_category_3',
    'object_annotation_category_4',
    'object_annotation_category_5',
]
# File extensions treated as images when a folder is split
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tif', '.bmp')
# Values that identify the type-code row below the header
_TYPE_CODES = ['[f]', '[t]']
# Old column names and their replacements
_COLUMN_RENAMES = {'sample_profile_id': 'sample_id', 'img_id': 'img_rank'}


def _load_and_clean_metadata(folder_path, subfolder_path):
    """Read the metadata TSV of a subfolder, normalise it and drop unannotated rows.

    Images that belong to dropped rows are deleted from the subfolder and
    from the profile's 'deconv_crops' folder.

    Returns:
        tuple | None: ``(metadata_file, metadata_df, type_row)`` where
        ``type_row`` is a one-row frame with the type codes (or ``None`` if
        the file has no such row); ``None`` if the subfolder has no .tsv.
    """
    metadata_files = [f for f in os.listdir(subfolder_path) if f.endswith('.tsv')]
    if not metadata_files:
        print(f"No metadata file found in {subfolder_path}")
        return None

    metadata_file = metadata_files[0]
    metadata_path = os.path.join(subfolder_path, metadata_file)

    # Look at the first row below the header: it holds the type codes unless
    # the file was written without them. Only skip it in the first case,
    # otherwise a real data row would be lost on every run.
    first_row = pd.read_csv(metadata_path, sep='\t', nrows=1, dtype=str)
    has_type_row = (not first_row.empty) and bool(first_row.iloc[0].isin(_TYPE_CODES).any())

    metadata_df = pd.read_csv(
        metadata_path,
        sep='\t',
        skiprows=[1] if has_type_row else None,
        dtype={'object_time': str},
    )

    # Update column names (data and type row alike, so they stay aligned)
    metadata_df = metadata_df.rename(columns=_COLUMN_RENAMES)
    type_row = first_row.rename(columns=_COLUMN_RENAMES) if has_type_row else None

    # Ensure 'object_time' has 8 characters by padding with leading zeros
    metadata_df['object_time'] = metadata_df['object_time'].apply(
        lambda x: x.zfill(8) if isinstance(x, str) else x
    )

    # Rows with an empty cell in any annotation column are dropped
    rows_to_drop = metadata_df[_ANNOTATION_COLUMNS].isnull().any(axis=1)
    images_to_drop = metadata_df.loc[rows_to_drop, 'img_file_name'].dropna().tolist()
    metadata_df = metadata_df[~rows_to_drop].reset_index(drop=True)

    # Remove the corresponding images. The deconv folder is addressed
    # explicitly instead of string-replacing 'crops' in the whole path, which
    # would also rewrite other parts of the path.
    image_dirs = {subfolder_path, os.path.join(folder_path, 'deconv_crops')}
    for img in images_to_drop:
        for image_dir in image_dirs:
            image_path = os.path.join(image_dir, str(img))
            if os.path.exists(image_path):
                os.remove(image_path)

    return metadata_file, metadata_df, type_row


def _with_type_row(metadata_df, type_row):
    """Return the metadata with the type-code row (if any) put back on top."""
    if type_row is None:
        return metadata_df
    return pd.concat([type_row, metadata_df], ignore_index=True)


def _split_subfolder(subfolder_path, metadata_file, metadata_df, type_row, num_parts):
    """Distribute images and metadata over '<subfolder>_partN' folders and zip each part."""
    # Sorted so that the split does not depend on directory listing order
    image_files = sorted(
        f for f in os.listdir(subfolder_path) if f.lower().endswith(_IMAGE_EXTENSIONS)
    )
    if not image_files:
        print(f"No images found in {subfolder_path}; nothing to split")
        return

    images_per_part = math.ceil(len(image_files) / num_parts)

    for part_num in range(num_parts):
        part_images = image_files[part_num * images_per_part:(part_num + 1) * images_per_part]
        if not part_images:
            continue  # fewer images than parts: do not create empty parts

        part_metadata = _with_type_row(
            metadata_df[metadata_df['img_file_name'].isin(part_images)], type_row
        )

        # Create a new folder for this part and move its images there
        part_folder = f"{subfolder_path}_part{part_num + 1}"
        os.makedirs(part_folder, exist_ok=True)
        for img in part_images:
            shutil.move(os.path.join(subfolder_path, img), os.path.join(part_folder, img))

        # Save the split metadata file
        part_metadata_path = os.path.join(part_folder, metadata_file)
        part_metadata.to_csv(part_metadata_path, sep='\t', index=False)

        # Zip the part folder
        zip_path = f"{part_folder}.zip"
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(part_folder):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, part_folder)
                    zipf.write(file_path, arcname)

        print(f"Created zip file: {zip_path}")

    # Remove the original subfolder after splitting
    shutil.rmtree(subfolder_path)
    print(f"Removed original subfolder: {subfolder_path}")


def process_folders(base_path, max_size_mb=500):
    """
    Clean the metadata of every profile folder and split oversized exports.

    For each entry of ``base_path`` that contains a 'deconv_crops.zip', the
    subfolders 'deconv_crops' and 'crops' are processed: columns are
    renamed, 'object_time' is zero-padded to 8 characters, and rows without
    a complete set of annotation categories are removed together with
    their images.

    If 'deconv_crops.zip' is larger than ``max_size_mb``, each subfolder is
    then split into ceil(zip size / max_size_mb) parts
    ('<subfolder>_partN' plus '<subfolder>_partN.zip') and the original
    subfolder is deleted. Otherwise the cleaned metadata is written back
    and the subfolder is zipped again. The type-code row below the header is
    preserved in both cases.

    Args:
        base_path (str): Path to the base directory containing folders.
        max_size_mb (int): Maximum size of each zip file in MB.

    Raises:
        ValueError: If max_size_mb is not positive.
    """
    if max_size_mb <= 0:
        raise ValueError(f"max_size_mb must be positive, got {max_size_mb}")

    for folder in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder)
        deconv_crops_zip_path = os.path.join(folder_path, 'deconv_crops.zip')

        # Check if the zip file exists
        if not os.path.exists(deconv_crops_zip_path):
            print(f"Zip file not found: {deconv_crops_zip_path}")
            continue

        # Size of the zip file in MB
        zip_size_mb = os.path.getsize(deconv_crops_zip_path) / (1024 * 1024)
        needs_split = zip_size_mb > max_size_mb

        if needs_split:
            print(f"Processing folder: {folder_path} (Zip size: {zip_size_mb:.2f} MB)")
            # Number of parts needed
            num_parts = math.ceil(zip_size_mb / max_size_mb)
        else:
            print(f"Folder {folder_path} is within the size limit ({zip_size_mb:.2f} MB). Renaming.")

        for subfolder in ['deconv_crops', 'crops']:
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.exists(subfolder_path):
                print(f"Subfolder not found: {subfolder_path}")
                continue

            loaded = _load_and_clean_metadata(folder_path, subfolder_path)
            if loaded is None:
                continue
            metadata_file, metadata_df, type_row = loaded

            if needs_split:
                _split_subfolder(subfolder_path, metadata_file, metadata_df, type_row, num_parts)
            else:
                metadata_path = os.path.join(subfolder_path, metadata_file)
                _with_type_row(metadata_df, type_row).to_csv(metadata_path, sep='\t', index=False)

                shutil.make_archive(subfolder_path, 'zip', subfolder_path)

                print(f"Updated metadata file: {metadata_path}")
                


# Example usage: run process_folders over every entry of the EcoTaxa output
# directory, using the default size limit.
base_path = '/home/fanny/EcoTaxa'  # Replace with the actual base path
process_folders(base_path)


Folder /home/fanny/EcoTaxa/M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313_updated is within the size limit (61.30 MB). Renaming.
Updated metadata file: /home/fanny/EcoTaxa/M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313_updated/deconv_crops/ecotaxa_M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313_updated.tsv
Updated metadata file: /home/fanny/EcoTaxa/M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313_updated/crops/ecotaxa_M181-272-1_CTD-071_00deg00S-036deg00W_20220516-1313_updated.tsv
Folder /home/fanny/EcoTaxa/M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updated is within the size limit (107.92 MB). Renaming.
Updated metadata file: /home/fanny/EcoTaxa/M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updated/deconv_crops/ecotaxa_M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updated.tsv
Updated metadata file: /home/fanny/EcoTaxa/M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updated/crops/ecotaxa_M181-265-1_CTD-069_00deg00S-035deg00W_20220516-0101_updat

### EcoTaxa Upload

In [6]:
from pyecotaxa.remote import Remote

def login_to_ecotaxa(username, password):
    """Open a pyecotaxa ``Remote`` session and log in.

    Returns the logged-in ``Remote`` object, or ``None`` if creating the
    session or logging in raised an exception (the error is printed).
    """
    remote = None
    try:
        session = Remote()
        session.login(username, password)
    except Exception as e:
        print(f"Error during login: {e}")
    else:
        print("Successfully logged into EcoTaxa.")
        remote = session
    return remote

def ET_upload(remote, project_id, folder_path):
    """Push one archive/folder to an EcoTaxa project.

    Args:
        remote: Object with a ``push`` method that accepts a list of
            ``(path, project_id)`` tuples.
        project_id: Id of the target project.
        folder_path: Path of the archive or folder to upload.

    Returns:
        bool: ``True`` if ``push`` returned normally, ``False`` if it raised
        (the error is printed, not re-raised), so callers can tell whether
        the upload went through.
    """
    try:
        remote.push([(folder_path, project_id)])
    except Exception as e:
        print(f"Error during upload: {e}")
        return False
    print(f"Successfully uploaded {folder_path} to project {project_id}.")
    return True



def upload_all_zips(remote, base_folder, deconv_project_id=15862, raw_project_id=15753):
    """Upload the zip archives of every profile folder below ``base_folder``.

    For each sub-directory: if it contains zip files with "part" in their
    name, only those are uploaded (files with "deconv" in the name go to
    ``deconv_project_id``, the others to ``raw_project_id``). Otherwise
    'deconv_crops.zip' and 'crops.zip' are uploaded to the respective
    project. Archives that do not exist are reported and skipped.

    Args:
        remote: Logged-in remote session, passed on to ``ET_upload``.
        base_folder: Directory that contains one folder per profile.
        deconv_project_id: Target project for deconvolved crops.
        raw_project_id: Target project for raw crops.
    """
    for profile in sorted(os.listdir(base_folder)):
        profile_path = os.path.join(base_folder, profile)
        if not os.path.isdir(profile_path):
            print(f"Skipping non-directory item: {profile_path}")
            continue

        print(f"Processing EcoTaxa folder: {profile_path}")

        # Find all zip files in the sub folder
        zip_files = sorted(f for f in os.listdir(profile_path) if f.endswith('.zip'))
        # Zip files with "part" in their names take precedence
        part_zip_files = [f for f in zip_files if "part" in f]

        if part_zip_files:
            uploads = [
                (part_zip, deconv_project_id if "deconv" in part_zip else raw_project_id)
                for part_zip in part_zip_files
            ]
        else:
            # Otherwise, upload the default deconv_crops.zip and crops.zip
            uploads = [
                ("deconv_crops.zip", deconv_project_id),
                ("crops.zip", raw_project_id),
            ]

        for zip_name, project_id in uploads:
            zip_path = os.path.join(profile_path, zip_name)
            if not os.path.exists(zip_path):
                print(f"Zip file not found, skipping: {zip_path}")
                continue
            # ET_upload prints the outcome itself; no unconditional
            # "Uploaded ..." message here, which would also appear after a
            # failed upload.
            ET_upload(remote, project_id, zip_path)


def run_ecotaxa_upload(base_folder='/home/fanny/EcoTaxa'):
    """Log into EcoTaxa and upload all archives below ``base_folder``.

    Credentials are taken from the environment variables ECOTAXA_USERNAME
    and ECOTAXA_PASSWORD; whatever is missing is asked for interactively.
    They are deliberately not stored in the notebook.
    """
    import getpass

    username = os.environ.get('ECOTAXA_USERNAME') or input('EcoTaxa username: ')
    password = os.environ.get('ECOTAXA_PASSWORD') or getpass.getpass('EcoTaxa password: ')

    remote = login_to_ecotaxa(username, password)
    if remote:
        upload_all_zips(remote, base_folder)
    else:
        print("Failed to log into EcoTaxa. Exiting...")


# The upload is not started automatically when this cell runs.
# Call run_ecotaxa_upload() explicitly to start it.
            