In [2]:
from segmenter import run_segmenter
import pandas as pd
import os
import csv
from tqdm import tqdm
from pandas.errors import EmptyDataError
import numpy as np
import shutil
import subprocess

In [3]:
def gen_crop_df(path:str, small:bool, size_filter:int = 0):
    """
    A function to generate a DataFrame from a directory of CSV files, with options to filter out small objects.
    Parameters:
    path (str): The path to the directory containing the CSV files.
    small (bool): A flag indicating whether to filter out small objects.

    Returns:
    pandas.DataFrame: The concatenated and processed DataFrame with additional columns for analysis.
    """

    def area_to_esd(area: float) -> float:
        pixel_size = 13.5*2 #in µm/pixel @ 2560x2560 
        return 2 * np.sqrt(area * pixel_size**2 / np.pi)

    # Function to concatenate directory and filename
    def join_strings(dir, filename):
        return os.path.join(dir, filename)

    directory = os.path.dirname(path)
    directory = os.path.join(directory,'Data')

    files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith(".csv")]
    dataframes = []
    empty_file_counter = 0
    id = 1
    for file in tqdm(files):
        try:
            df = pd.read_csv(file, delimiter=",", header=None, index_col=None)
            if len(df.columns) == 44:
                df.insert(0,'',id)            
                dataframes.append(df)
                id+=1
            else:
                continue
        except EmptyDataError:
            empty_file_counter += 1
            print(f"File {file} is empty")

    df = pd.concat(dataframes, ignore_index=True)
    headers = ["img_id","index", "filename", "mean_raw", "std_raw", "mean", "std", "area", "x", "y", "w", "h", 
               "saved", "object_bound_box_w", "object_bound_box_h", "bound_box_x", "bound_box_y", "object_circularity", "object_area_exc", 
               "object_area_rprops", "object_%area", "object_major_axis_len", "object_minor_axis_len", "object_centroid_y", "object_centroid_x", 
               "object_convex_area", "object_min_intensity", "object_max_intensity", "object_mean_intensity", "object_int_density", "object_perimeter", 
               "object_elongation", "object_range", "object_perim_area_excl", "object_perim_major", "object_circularity_area_excl", "object_angle", 
               "object_boundbox_area", "object_eccentricity", "object_equivalent_diameter", "object_euler_nr", "object_extent", 
               "object_local_centroid_col", "object_local_centroid_row", "object_solidity"
]
    df.columns = headers
    df.reset_index(drop=True, inplace=True)
    df.drop("index", axis=1, inplace=True)

    if not small:
        df = df[df["saved"] == 1]
    df_unique = df.drop_duplicates(subset=['img_id'])
    
    #df.drop("saved", axis=1, inplace=True)

    # Split the 'filename' column
    split_df = df['filename'].str.split('_', expand=True)
    if small:# bug fix for segmenter where small objects are saved with _mask.png extension instead of .png: needs to be fixed if segmenter is fixed
        headers = ["date", "time", "pressure", "temperature", "index", "mask_ext"]
        split_df.columns = headers
        split_df.drop("mask_ext", axis=1, inplace=True)
    else:
        headers = ["date-time", "pressure", "temperature", "index"]#, 'drop']
        split_df.columns = headers
        #split_df.drop("drop", axis=1, inplace=True)
    
    # split date-time
    split_df[['date', 'time']] = split_df['date-time'].str.split('-', expand=True)
    split_df.drop(columns=['date-time'], inplace=True)

    split_df['pressure'] = split_df['pressure'].str.replace('bar', '', regex=False).astype(float)
    split_df['temperature'] = split_df['temperature'].str.replace('C', '', regex=False).astype(float)
    split_df['index'] = split_df['index'].str.replace('.png', '', regex=False).astype(int)
    
    # Concatenate the new columns with the original DataFrame
    df = pd.concat([split_df, df], axis=1)

    # Extend the original 'filename' column
    df['full_path'] = df.apply(lambda x: join_strings(directory, x['filename']), axis=1)
    #df = df.drop('filename', axis=1)

    df['esd'] = df['area'].apply(area_to_esd).round(2)
    df['pressure'] = (df['pressure']-1)*10
    df.rename(columns={'pressure': 'pressure [dbar]'}, inplace=True)

    # Sort the DataFrame by the 'date-time' column
    df = df.sort_values(by=['date', 'time','index'], ascending=True)
    df.reset_index(drop=True, inplace=True)

    #filter the df for objects where 1 dimension is larger than ca. 1mm
    df = df[(df['w'] > size_filter) | (df['h'] > size_filter)]
    df_unique = df.drop_duplicates(subset=['img_id'])
    print(f'{empty_file_counter} files were empty and were dropped; Number of uniue images: {len(df_unique)}')

    return df

file_path = '/home/fanny/M181-2_output_test/M181-117-1_CTD-038_00°00S-009°00W_20220505-0250/Data'
segmentation_df = gen_crop_df(file_path, False)
segmentation_df

100%|██████████| 19339/19339 [00:49<00:00, 393.00it/s]


0 files were empty and were dropped; Number of uniue images: 13244


Unnamed: 0,pressure [dbar],temperature,index,date,time,img_id,filename,mean_raw,std_raw,mean,...,object_boundbox_area,object_eccentricity,object_equivalent_diameter,object_euler_nr,object_extent,object_local_centroid_col,object_local_centroid_row,object_solidity,full_path,esd
0,0.01,29.97,16,20220505,03025559,1,20220505-03025559_001.001bar_29.97C_16.png,95.101949,65.612728,108.226437,...,3717.0,0.644103,60.186099,1.0,0.765402,31.348682,27.876977,0.957590,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,1597.95
1,0.01,29.97,17,20220505,03025559,1,20220505-03025559_001.001bar_29.97C_17.png,95.101949,65.612728,108.226437,...,1480.0,0.291325,37.830163,1.0,0.759459,20.257117,17.767794,0.945332,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,994.25
2,0.01,29.97,18,20220505,03025559,1,20220505-03025559_001.001bar_29.97C_18.png,95.101949,65.612728,108.226437,...,20276.0,0.976962,111.000626,2.0,0.477264,122.772140,45.303296,0.497021,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,2840.31
3,0.01,29.97,23,20220505,03025559,1,20220505-03025559_001.001bar_29.97C_23.png,95.101949,65.612728,108.226437,...,1240.0,0.899237,26.221162,1.0,0.435484,20.968519,17.824074,0.707733,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,658.03
4,0.01,29.97,32,20220505,03025559,1,20220505-03025559_001.001bar_29.97C_32.png,95.101949,65.612728,108.226437,...,600.0,0.276787,24.042686,1.0,0.756667,11.143172,11.828194,0.949791,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,623.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111253,4276.12,2.14,660,20220505,04235560,19339,20220505-04235560_428.612bar_02.14C_660.png,167.868812,34.487374,1.942304,...,3192.0,0.569987,38.464326,1.0,0.364035,27.527539,27.420826,0.481558,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,965.35
111254,4276.12,2.14,667,20220505,04235560,19339,20220505-04235560_428.612bar_02.14C_667.png,167.868812,34.487374,1.942304,...,4128.0,0.960751,48.834185,1.0,0.453731,20.106247,50.789108,0.722887,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,1273.04
111255,4276.12,2.14,683,20220505,04235560,19339,20220505-04235560_428.612bar_02.14C_683.png,167.868812,34.487374,1.942304,...,6762.0,0.970612,61.120249,1.0,0.433895,24.666667,63.794819,0.570595,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,1579.55
111256,4276.12,2.14,763,20220505,04235560,19339,20220505-04235560_428.612bar_02.14C_763.png,167.868812,34.487374,1.942304,...,1624.0,0.940013,29.185566,1.0,0.411946,15.860987,32.390135,0.669000,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,738.14


In [5]:
# Prepare prediction data
import re

prediction_df = pd.read_csv("/home/fanny/M181-2_output_test/M181-117-1_CTD-038_00°00S-009°00W_20220505-0250/ViT_predictions.csv")
polytaxo_classes_df = pd.read_csv('/home/fanny/taxonomic_data/Polytaxo_classes.csv', sep="\t")

print(len(polytaxo_classes_df.columns))

prediction_df['object_annotation_status'] = 'predicted'

mapping_dict = dict(zip(polytaxo_classes_df["Dataset Class NamePolyTaxo Description"], polytaxo_classes_df["PolyTaxo Description"]))
columns_to_replace = ["top1", "top2", "top3", "top4", "top5"]

# Define regex pattern to split on space, semicolon, colon, or slash
split_pattern = r"[ ;:/]"

'''
#prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict)
prediction_df[columns_to_replace] = (
    prediction_df[columns_to_replace]
    .replace(mapping_dict)
    .applymap(lambda x: re.split(split_pattern, str(x))[0] if pd.notna(x) else x)
)
'''

'''
# Replace values and extract only the first word
prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict).apply(
    lambda col: col.astype(str).apply(lambda x: re.split(split_pattern, x)[0] if pd.notna(x) else x)
)

prediction_df[columns_to_replace ]
for row
'''


# Replace values, extract first word, and replace underscores with spaces
prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict).apply(
    lambda col: col.astype(str).apply(
        lambda x: re.split(split_pattern, x)[0].replace("_", " ") if pd.notna(x) else x
    )
)


prediction_df

4


Unnamed: 0,filename,top1,top2,top3,top4,top5,prob1,prob2,prob3,prob4,prob5,object_annotation_status
0,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Detritus,Unknowns,Fecal pellets,0.424959,0.235316,0.210945,0.062445,0.027407,predicted
1,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Physonectae,Detritus,Doliolida,Salpidae,Detritus,0.822883,0.046476,0.035859,0.013201,0.008981,predicted
2,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Calanoida,Artefact,Detritus,Detritus,0.950195,0.004265,0.004085,0.003793,0.003041,predicted
3,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Detritus,Fecal pellets,Radiolaria,0.587833,0.381747,0.010812,0.003838,0.002334,predicted
4,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Unknowns,Detritus,Unknowns,0.850994,0.057319,0.035589,0.012789,0.011884,predicted
...,...,...,...,...,...,...,...,...,...,...,...,...
111232,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Unknowns,Detritus,Doliolida,0.619663,0.162431,0.141514,0.013338,0.010430,predicted
111233,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Tunicata,Unknowns,Unknowns,0.741055,0.048907,0.041340,0.030568,0.022196,predicted
111234,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Unknowns,Detritus,Protista,0.681353,0.279987,0.024871,0.002106,0.001889,predicted
111235,/home/fanny/M181-2_output_test/M181-117-1_CTD-...,Detritus,Detritus,Detritus,Unknowns,Radiolaria,0.923617,0.028933,0.013302,0.011292,0.005039,predicted


In [None]:
# lets try something else
#taxoexport_df = pd.read_csv("/home/fanny/taxonomic_data/taxoexport_20250212_140806.tsv")

# Find common categories
#common_categories = set(polytaxo_classes_df['Dataset Class NamePolyTaxo Description']).intersection(set(taxoexport_df['display_name']))
#print("Direct matches found:", len(common_categories))


In [6]:
# combine segmentation data frame with prediction data frame
# Sort both DataFrames by 'filename'
segmentation_df_sorted = segmentation_df.sort_values(by='filename').reset_index(drop=True)
prediction_df_sorted = prediction_df.sort_values(by='filename').reset_index(drop=True)

# concatenate data frames
segm_and_prediction_df = pd.concat([segmentation_df_sorted, prediction_df_sorted], axis=1)
segm_and_prediction_df = segm_and_prediction_df.loc[:, ~segm_and_prediction_df.columns.duplicated(keep='first')]

# add object id
segm_and_prediction_df['object_id'] = segm_and_prediction_df['img_id'].astype(str) + '_' + segm_and_prediction_df['index'].astype(str)

# Create list of file paths from the 'file_paths' column
#filepaths = segm_and_prediction_df['file_paths'].tolist()

# delete some columns
columns_to_delete = ['temperature', 'mean_raw', 'std_raw', 'mean', 'std', 'x', 'y', 'w', 'h', 'saved', 'bound_box_x', 'bound_box_y', 'full_path', 'img_id', 'index']
segm_and_prediction_df.drop(columns_to_delete, axis=1, inplace=True)

# adjust header names
segm_and_prediction_df = segm_and_prediction_df.rename(columns={
    'pressure [dbar]' : 'object_pressure',
    'date': 'object_date',
    'time' : 'object_time',
    'filename' : 'img_file_name',
    'area' : 'object_area',
    'esd' : 'object_esd',
    'top1' : 'object_annotation_category',
    'top2' : 'object_annotation_category_2',
    'top3' : 'object_annotation_category_3',
    'top4' : 'object_annotation_category_4',
    'top5' : 'object_annotation_category_5',
    'prob1' : 'object_prob_1',
    'prob2' : 'object_prob_2',
    'prob3': 'object_prob_3',
    'prob4' : 'object_prob_4',
    'prob5' : 'object_prob_5'    
})
#print(segm_and_prediction_df.columns)


# function to determine the data type of each column
def determine_dtype(dtype):
    if pd.api.types.is_numeric_dtype(dtype):
        return '[f]' 
    elif pd.api.types.is_string_dtype(dtype):
        return '[t]'
    else:
        return 'other'

dtype_row = []
for col in segm_and_prediction_df.columns:
    dtype_row.append(determine_dtype(segm_and_prediction_df[col].dtype))

# insert row
segm_and_prediction_df.loc[0] = dtype_row

# reset index to maintain order
segm_and_prediction_df = segm_and_prediction_df.sort_index().reset_index(drop=True)
eco_taxa_folder = "/home/fanny/segmentation_output/EcoTaxa"
os.makedirs(eco_taxa_folder, exist_ok=True)

# save everything as tsv file
segm_and_prediction_df.to_csv('/home/fanny/segmentation_output/EcoTaxa/ecotaxa_metadata.tsv', sep="\t", index=False)

  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] 

In [11]:
# combine data and compress file

def zip_data(folder_path, zip_path, extra_file=None):
    """
    Zips a folder and optionally includes an extra file inside the zip.

    Parameters:
        folder_path (str): The folder to zip.
        zip_path (str): The destination zip file path.
        extra_file (str, optional): metadata file
    """
    if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
        print(f"Warning: Folder {folder_path} does not exist.")
        return
    
    # Move meta data file into the folder before zipping
    if extra_file and os.path.exists(extra_file):
        shutil.move(extra_file, os.path.join(folder_path, os.path.basename(extra_file)))

    # Create ZIP archive
    shutil.make_archive(zip_path.replace(".zip", ""), 'zip', folder_path)
    print(f"Zipped {folder_path} to {zip_path}")


deconv_crops_folder = '/home/fanny/segmentation_output/Deconv_crops'
raw_crops_folder = '/home/fanny/segmentation_output/Crops'
metadata = '/home/fanny/segmentation_output/EcoTaxa/ecotaxa_metadata.tsv'

# zip paths
zip_path_deconv = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_deconv.zip"
zip_path_raw = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_raw.zip"

#zip_data(deconv_crops_folder, zip_path_deconv, metadata)
zip_data(raw_crops_folder, zip_path_raw, metadata)

print("Folders have been zipped")

Zipped /home/fanny/segmentation_output/Crops to /home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_raw.zip
Folders have been zipped


In [3]:
# EcoTaxa upload
zip_path_deconv = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_deconv.zip"
zip_path_raw = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_raw.zip"


def ET_upload(project_id, folder_path):
    #Uploads a zip file to an EcoTaxa project using terminal commands.
    try:
        #login to ET project
        login_cmd = ["pyecotaxa", "login"]
        subprocess.run(login_cmd, check=True)
        print("logged into EcoTaxa.")
        
        # upload file
        cmd = ['pyecotaxa', 'push', '--to', str(project_id), folder_path]
        subprocess.run(cmd, check=True)
        print(f"successfully uploaded {folder_path} to project {project_id}.")

    except subprocess.CalledProcessError as e:
        print(f"error during upload: {e}")




#ET_upload(15753, zip_path_deconv)
ET_upload(15753, zip_path_raw)

NameError: name 'subprocess' is not defined

In [None]:
from pyecotaxa.remote import Remote

ecotaxa_client = Remote()

# Login
ecotaxa_client.login()

ecotaxa_client.push(15753, '/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload.zip')

print(dir(ecotaxa_client))

# in terminal:
#pyecotaxa login
#pyecotaxa push --to 15753 --validate /home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload.zip    # --validate validates archives locally before uploading


In [1]:
from EcoTaxa_preparation import ET_upload
import logging
import os
from pyecotaxa.remote import Remote

# #login to ET project
#     login_cmd = ["pyecotaxa", "login"]
#     subprocess.run(login_cmd, check=True)

def ET_upload(project_id, folder_path): #Uploads a zip file to an EcoTaxa project using terminal commands. 
    # Upload file
    # cmd = ['pyecotaxa', 'push', '--to', str(project_id), folder_path]
    # result = subprocess.run(cmd, check=True, capture_output=True, text=True)

    remote = Remote()

    remote.current_user()

    remote.push([(folder_path, project_id)])
    
    logging.info(f"Successfully uploaded {folder_path} to project {project_id}.")
    logging.info(f"Upload output: {result.stdout}")
    logging.info(f"Upload errors: {result.stderr}")


 # Define the paths to the zipped files
    zip_path_raw = '/home/fanny/M181-2_output_test/M181-117-1_CTD-038_00°00S-009°00W_20220505-0250/EcoTaxa/ecotaxa_upload_raw.zip'
    zip_path_deconv = '/home/fanny/M181-2_output_test/M181-117-1_CTD-038_00°00S-009°00W_20220505-0250/EcoTaxa/ecotaxa_upload_deconv.zip'

# Upload the zipped files to EcoTaxa
ET_upload(15753, zip_path_raw) 
ET_upload(15862, zip_path_deconv)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'zip_path_raw' is not defined

In [None]:
classlist = ['acantharia_protist',
'acantharia_protist_big_center',
'acantharia_protist_halo',
'amphipods',
'appendicularian_fritillaridae',
'appendicularian_s_shape',
'appendicularian_slight_curve',
'appendicularian_straight',
'artifacts',
'artifacts_edge',
'chaetognath_non_sagitta',
'chaetognath_other',
'chaetognath_sagitta',
'chordate_type1',
'copepod_calanoid',
'copepod_calanoid_eggs',
'copepod_calanoid_eucalanus',
'copepod_calanoid_flatheads',
'copepod_calanoid_frillyAntennae',
'copepod_calanoid_large',
'copepod_calanoid_large_side_antennatucked',
'copepod_calanoid_octomoms',
'copepod_calanoid_small_longantennae',
'copepod_cyclopoid_copilia',
'copepod_cyclopoid_oithona',
'copepod_cyclopoid_oithona_eggs',
'copepod_other',
'crustacean_other',
'ctenophore_cestid',
'ctenophore_cydippid_no_tentacles',
'ctenophore_cydippid_tentacles',
'ctenophore_lobate',
'decapods',
'detritus_blob',
'detritus_filamentous',
'detritus_other',
'diatom_chain_string',
'diatom_chain_tube',
'echinoderm_larva_pluteus_brittlestar',
'echinoderm_larva_pluteus_early',
'echinoderm_larva_pluteus_typeC',
'echinoderm_larva_pluteus_urchin',
'echinoderm_larva_seastar_bipinnaria',
'echinoderm_larva_seastar_brachiolaria',
'echinoderm_seacucumber_auricularia_larva',
'echinopluteus',
'ephyra',
'euphausiids',
'euphausiids_young',
'fecal_pellet',
'fish_larvae_deep_body',
'fish_larvae_leptocephali',
'fish_larvae_medium_body',
'fish_larvae_myctophids',
'fish_larvae_thin_body',
'fish_larvae_very_thin_body',
'heteropod',
'hydromedusae_aglaura',
'hydromedusae_bell_and_tentacles',
'hydromedusae_h15',
'hydromedusae_haliscera',
'hydromedusae_haliscera_small_sideview',
'hydromedusae_liriope',
'hydromedusae_narco_dark',
'hydromedusae_narco_young',
'hydromedusae_narcomedusae',
'hydromedusae_other',
'hydromedusae_partial_dark',
'hydromedusae_shapeA',
'hydromedusae_shapeA_sideview_small',
'hydromedusae_shapeB',
'hydromedusae_sideview_big',
'hydromedusae_solmaris',
'hydromedusae_solmundella',
'hydromedusae_typeD',
'hydromedusae_typeD_bell_and_tentacles',
'hydromedusae_typeE',
'hydromedusae_typeF',
'invertebrate_larvae_other_A',
'invertebrate_larvae_other_B',
'jellies_tentacles',
'polychaete',
'protist_dark_center',
'protist_fuzzy_olive',
'protist_noctiluca',
'protist_other',
'protist_star',
'pteropod_butterfly',
'pteropod_theco_dev_seq',
'pteropod_triangle',
'radiolarian_chain',
'radiolarian_colony',
'shrimp-like_other',
'shrimp_caridean',
'shrimp_sergestidae',
'shrimp_zoea',
'siphonophore_calycophoran_abylidae',
'siphonophore_calycophoran_rocketship_adult',
'siphonophore_calycophoran_rocketship_young',
'siphonophore_calycophoran_sphaeronectes',
'siphonophore_calycophoran_sphaeronectes_stem',
'siphonophore_calycophoran_sphaeronectes_young',
'siphonophore_other_parts',
'siphonophore_partial',
'siphonophore_physonect',
'siphonophore_physonect_young',
'stomatopod',
'tornaria_acorn_worm_larvae',
'trichodesmium_bowtie',
'trichodesmium_multiple',
'trichodesmium_puff',
'trichodesmium_tuft',
'trochophore_larvae',
'tunicate_doliolid',
'tunicate_doliolid_nurse',
'tunicate_partial',
'tunicate_salp',
'tunicate_salp_chains',
'unknown_blobs_and_smudges',
'unknown_sticks',
'unknown_unclassified']

classlist_mapping = {}

for item in classlist:
    new_name = input(f"Enter new name for {item}: ")
    classlist_mapping[item] = new_name

print(classlist_mapping)

In [None]:
import os
from PIL import Image
image_folder = '/home/fanny/segmentation_output/Deconv_crops'

def is_valid_image(image_path):
    try:
        with Image.open(image_path) as img:
            img.verify()  # Verify integrity
        return True
    except Exception as e:
        print(f"Corrupt image detected: {image_path} - {e}")
        return False
    
valid_images = []
for filename in os.listdir(image_folder):
    image_path = os.path.join(image_folder, filename)
    
    # Ensure it's a valid image
    if is_valid_image(image_path):
        valid_images.append(image_path)

print(f"Found {len(valid_images)} valid images.")

In [None]:
# EcoTaxa upload

from segmenter import run_segmenter
import pandas as pd
import os
import csv
from tqdm import tqdm
from pandas.errors import EmptyDataError
import numpy as np
import re
import shutil
import subprocess

def gen_crop_df(path:str, small:bool, size_filter:int = 0):
    """
    A function to generate a DataFrame from a directory of CSV files, with options to filter out small objects.
    Parameters:
    path (str): The path to the directory containing the CSV files.
    small (bool): A flag indicating whether to filter out small objects.

    Returns:
    pandas.DataFrame: The concatenated and processed DataFrame with additional columns for analysis.
    """

    def area_to_esd(area: float) -> float:
        pixel_size = 13.5*2 #in µm/pixel @ 2560x2560 
        return 2 * np.sqrt(area * pixel_size**2 / np.pi)

    # Function to concatenate directory and filename
    def join_strings(dir, filename):
        return os.path.join(dir, filename)

    directory = os.path.dirname(path)
    directory = os.path.join(directory,'Data')

    files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith(".csv")]
    dataframes = []
    empty_file_counter = 0
    id = 1
    for file in tqdm(files):
        try:
            df = pd.read_csv(file, delimiter=",", header=None, index_col=None)
            if len(df.columns) == 44:
                df.insert(0,'',id)            
                dataframes.append(df)
                id+=1
            else:
                continue
        except EmptyDataError:
            empty_file_counter += 1
            print(f"File {file} is empty")

    df = pd.concat(dataframes, ignore_index=True)
    headers = ["img_id","index", "filename", "mean_raw", "std_raw", "mean", "std", "area", "x", "y", "w", "h", 
               "saved", "object_bound_box_w", "object_bound_box_h", "bound_box_x", "bound_box_y", "object_circularity", "object_area_exc", 
               "object_area_rprops", "object_%area", "object_major_axis_len", "object_minor_axis_len", "object_centroid_y", "object_centroid_x", 
               "object_convex_area", "object_min_intensity", "object_max_intensity", "object_mean_intensity", "object_int_density", "object_perimeter", 
               "object_elongation", "object_range", "object_perim_area_excl", "object_perim_major", "object_circularity_area_excl", "object_angle", 
               "object_boundbox_area", "object_eccentricity", "object_equivalent_diameter", "object_euler_nr", "object_extent", 
               "object_local_centroid_col", "object_local_centroid_row", "object_solidity"
]
    df.columns = headers
    df.reset_index(drop=True, inplace=True)
    df.drop("index", axis=1, inplace=True)

    if not small:
        df = df[df["saved"] == 1]
    df_unique = df.drop_duplicates(subset=['img_id'])
    
    #df.drop("saved", axis=1, inplace=True)

    # Split the 'filename' column
    split_df = df['filename'].str.split('_', expand=True)
    if small:# bug fix for segmenter where small objects are saved with _mask.png extension instead of .png: needs to be fixed if segmenter is fixed
        headers = ["date", "time", "pressure", "temperature", "index", "mask_ext"]
        split_df.columns = headers
        split_df.drop("mask_ext", axis=1, inplace=True)
    else:
        headers = ["date-time", "pressure", "temperature", "index"]#, 'drop']
        split_df.columns = headers
        #split_df.drop("drop", axis=1, inplace=True)
    
    # split date-time
    split_df[['date', 'time']] = split_df['date-time'].str.split('-', expand=True)
    split_df.drop(columns=['date-time'], inplace=True)

    split_df['pressure'] = split_df['pressure'].str.replace('bar', '', regex=False).astype(float)
    split_df['temperature'] = split_df['temperature'].str.replace('C', '', regex=False).astype(float)
    split_df['index'] = split_df['index'].str.replace('.png', '', regex=False).astype(int)
    
    # Concatenate the new columns with the original DataFrame
    df = pd.concat([split_df, df], axis=1)

    # Extend the original 'filename' column
    df['full_path'] = df.apply(lambda x: join_strings(directory, x['filename']), axis=1)
    #df = df.drop('filename', axis=1)

    df['esd'] = df['area'].apply(area_to_esd).round(2)
    df['pressure'] = (df['pressure']-1)*10
    df.rename(columns={'pressure': 'pressure [dbar]'}, inplace=True)

    # Sort the DataFrame by the 'date-time' column
    df = df.sort_values(by=['date', 'time','index'], ascending=True)
    df.reset_index(drop=True, inplace=True)

    #filter the df for objects where 1 dimension is larger than ca. 1mm
    df = df[(df['w'] > size_filter) | (df['h'] > size_filter)]
    df_unique = df.drop_duplicates(subset=['img_id'])
    print(f'{empty_file_counter} files were empty and were dropped; Number of uniue images: {len(df_unique)}')

    return df

file_path = '/home/fanny/segmentation_output/Data'
segmentation_df = gen_crop_df(file_path, False)



#Loads prediction and mapping data, processes class names, and updates object annotation status
def prepare_prediction_data(prediction_csv, mapping_csv, sep="\t"):    
    # Load CSV files
    prediction_df = pd.read_csv(prediction_csv)
    polytaxo_classes_df = pd.read_csv(mapping_csv, sep=sep)

    # Add annotation status
    prediction_df['object_annotation_status'] = 'predicted'

    # Create mapping dictionary
    mapping_dict = dict(zip(
        polytaxo_classes_df["Dataset Class NamePolyTaxo Description"],
        polytaxo_classes_df["PolyTaxo Description"]
    ))

    # Columns to update
    columns_to_replace = ["top1", "top2", "top3", "top4", "top5"]

    # Define regex pattern to split on space, semicolon, colon, or slash
    split_pattern = r"[ ;:/]"

    # Replace values using mapping_dict, extract first word, and replace underscores with spaces
    prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict).apply(
        lambda col: col.astype(str).apply(
            lambda x: re.split(split_pattern, x)[0].replace("_", " ") if pd.notna(x) else x
        )
    )

    return prediction_df  # Return the processed DataFrame


def combine_segmentation_and_prediction(segmentation_df, prediction_df):
        
    # Sort both DataFrames by 'filename'
    segmentation_df_sorted = segmentation_df.sort_values(by='filename').reset_index(drop=True)
    prediction_df_sorted = prediction_df.sort_values(by='filename').reset_index(drop=True)

    # Concatenate data frames
    combined_df = pd.concat([segmentation_df_sorted, prediction_df_sorted], axis=1)
    
    # Remove duplicate columns (keeping the first occurrence)
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated(keep='first')]

    # Add object ID column
    combined_df['object_id'] = combined_df['img_id'].astype(str) + '_' + combined_df['index'].astype(str)

    # Define columns to delete
    columns_to_delete = [
        'temperature', 'mean_raw', 'std_raw', 'mean', 'std', 'x', 'y', 'w', 'h', 
        'saved', 'bound_box_x', 'bound_box_y', 'full_path', 'img_id', 'index'
    ]
    # Remove unwanted columns if they exist
    combined_df.drop(columns=[col for col in columns_to_delete if col in combined_df.columns], axis=1, inplace=True)

    # Adjust header names
    rename_mapping = {
        'pressure [dbar]': 'object_pressure',
        'date': 'object_date',
        'time': 'object_time',
        'filename': 'img_file_name',
        'area': 'object_area',
        'esd': 'object_esd',
        'top1': 'object_annotation_category',
        'top2': 'object_annotation_category_2',
        'top3': 'object_annotation_category_3',
        'top4': 'object_annotation_category_4',
        'top5': 'object_annotation_category_5',
        'prob1': 'object_prob_1',
        'prob2': 'object_prob_2',
        'prob3': 'object_prob_3',
        'prob4': 'object_prob_4',
        'prob5': 'object_prob_5'
    }
    combined_df.rename(columns=rename_mapping, inplace=True)

    return combined_df  # Return the processed DataFrame



# combine data and compress file
def zip_data(folder_path, zip_path, extra_file=None):
    """
    Zips a folder and optionally includes an extra file inside the zip.

    Parameters:
        folder_path (str): The folder to zip.
        zip_path (str): The destination zip file path.
        extra_file (str, optional): metadata file
    """
    if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
        print(f"Warning: Folder {folder_path} does not exist.")
        return
    
    # Move meta data file into the folder before zipping
    if extra_file and os.path.exists(extra_file):
        shutil.move(extra_file, os.path.join(folder_path, os.path.basename(extra_file)))

    # Create ZIP archive
    shutil.make_archive(zip_path.replace(".zip", ""), 'zip', folder_path)
    print(f"Zipped {folder_path} to {zip_path}")


deconv_crops_folder = '/home/fanny/segmentation_output/Deconv_crops'
raw_crops_folder = '/home/fanny/segmentation_output/Crops'
metadata = '/home/fanny/segmentation_output/EcoTaxa/ecotaxa_metadata.tsv'

# zip paths
zip_path_deconv = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_deconv.zip"
zip_path_raw = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload_raw.zip"

#zip_data(deconv_crops_folder, zip_path_deconv, metadata)
zip_data(raw_crops_folder, zip_path_raw, metadata)

print("Folders have been zipped")




def ET_upload(project_id, folder_path): #Uploads a zip file to an EcoTaxa project using terminal commands.
    try:
        #login to ET project
        login_cmd = ["pyecotaxa", "login"]
        subprocess.run(login_cmd, check=True)
        
        # upload file
        cmd = ['pyecotaxa', 'push', '--to', str(project_id), folder_path]
        subprocess.run(cmd, check=True)
        print(f"successfully uploaded {folder_path} to project {project_id}.")

    except subprocess.CalledProcessError as e:
        print(f"error during upload: {e}")




#ET_upload(15862, zip_path_deconv)
ET_upload(15753, zip_path_raw)