In [1]:
from segmenter import run_segmenter
import pandas as pd
import os
import csv
from tqdm import tqdm
from pandas.errors import EmptyDataError
import numpy as np
import shutil

In [None]:

folder_path = '/home/fanny/segmentation_output/Data' 
csv_files = []

#create one list of csv files
for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        csv_files.append(os.path.join(folder_path, file))

#print(csv_files)

# select the crops only
def select_crops(data_file):
    list_crops = []
    with open(data_file, "r") as file:
        reader = csv.reader(file)
        for row in reader:
            if len(row) > 12:
                list_crops.append(row)
    return list_crops


# create list of data frames
df_list = []

for csv_file in csv_files:
    selected_rows = select_crops(csv_file)
    if selected_rows: # if there are any rows
        df = pd.DataFrame(selected_rows)
        df_list.append(df)
    

# Combine all DataFrames into one
if df_list: # Check if df_list has any DataFrames to concatenate
    combined_df = pd.concat(df_list, ignore_index=True)
    print(combined_df.head())

    # Save the combined DataFrame to a CSV file
    combined_df.to_csv("/home/fanny/output_dataframes/combined_data.csv", index=False, header=True)
else:
    print("no valid data found")



In [None]:
folder_path = '/home/fanny/segmentation_output/Data' 
csv_files = []

for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        csv_files.append(os.path.join(folder_path, file))


df = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    df_list.append(df)

df_combined = pd.concat(map(pd.read_csv, csv_files))

print(df.head())

In [4]:
def gen_crop_df(path:str, small:bool, size_filter:int = 0):
    """
    A function to generate a DataFrame from a directory of CSV files, with options to filter out small objects.
    Parameters:
    path (str): The path to the directory containing the CSV files.
    small (bool): A flag indicating whether to filter out small objects.

    Returns:
    pandas.DataFrame: The concatenated and processed DataFrame with additional columns for analysis.
    """

    def area_to_esd(area: float) -> float:
        pixel_size = 13.5*2 #in µm/pixel @ 2560x2560 
        return 2 * np.sqrt(area * pixel_size**2 / np.pi)

    # Function to concatenate directory and filename
    def join_strings(dir, filename):
        return os.path.join(dir, filename)

    directory = os.path.dirname(path)
    directory = os.path.join(directory,'Data')

    files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith(".csv")]
    dataframes = []
    empty_file_counter = 0
    id = 1
    for file in tqdm(files):
        try:
            df = pd.read_csv(file, delimiter=",", header=None, index_col=None)
            if len(df.columns) == 44:
                df.insert(0,'',id)            
                dataframes.append(df)
                id+=1
            else:
                continue
        except EmptyDataError:
            empty_file_counter += 1
            print(f"File {file} is empty")

    df = pd.concat(dataframes, ignore_index=True)
    headers = ["img_id","index", "filename", "mean_raw", "std_raw", "mean", "std", "area", "x", "y", "w", "h", 
               "saved", "object_bound_box_w", "object_bound_box_h", "bound_box_x", "bound_box_y", "object_circularity", "object_area_exc", 
               "object_area_rprops", "object_%area", "object_major_axis_len", "object_minor_axis_len", "object_centroid_y", "object_centroid_x", 
               "object_convex_area", "object_min_intensity", "object_max_intensity", "object_mean_intensity", "object_int_density", "object_perimeter", 
               "object_elongation", "object_range", "object_perim_area_excl", "object_perim_major", "object_circularity_area_excl", "object_angle", 
               "object_boundbox_area", "object_eccentricity", "object_equivalent_diameter", "object_euler_nr", "object_extent", 
               "object_local_centroid_col", "object_local_centroid_row", "object_solidity"
]
    df.columns = headers
    df.reset_index(drop=True, inplace=True)
    df.drop("index", axis=1, inplace=True)

    if not small:
        df = df[df["saved"] == 1]
    df_unique = df.drop_duplicates(subset=['img_id'])
    
    #df.drop("saved", axis=1, inplace=True)

    # Split the 'filename' column
    split_df = df['filename'].str.split('_', expand=True)
    if small:# bug fix for segmenter where small objects are saved with _mask.png extension instead of .png: needs to be fixed if segmenter is fixed
        headers = ["date", "time", "pressure", "temperature", "index", "mask_ext"]
        split_df.columns = headers
        split_df.drop("mask_ext", axis=1, inplace=True)
    else:
        headers = ["date-time", "pressure", "temperature", "index", 'drop']
        split_df.columns = headers
        split_df.drop("drop", axis=1, inplace=True)
    
    # split date-time
    split_df[['date', 'time']] = split_df['date-time'].str.split('-', expand=True)
    split_df.drop(columns=['date-time'], inplace=True)

    split_df['pressure'] = split_df['pressure'].str.replace('bar', '', regex=False).astype(float)
    split_df['temperature'] = split_df['temperature'].str.replace('C', '', regex=False).astype(float)
    split_df['index'] = split_df['index'].str.replace('.png', '', regex=False).astype(int)

    # Concatenate the new columns with the original DataFrame
    df = pd.concat([split_df, df], axis=1)

    # Extend the original 'filename' column
    df['full_path'] = df.apply(lambda x: join_strings(directory, x['filename']), axis=1)
    #df = df.drop('filename', axis=1)

    df['esd'] = df['area'].apply(area_to_esd).round(2)
    df['pressure'] = (df['pressure']-1)*10
    df.rename(columns={'pressure': 'pressure [dbar]'}, inplace=True)

    # Sort the DataFrame by the 'date-time' column
    df = df.sort_values(by=['date', 'time','index'], ascending=True)
    df.reset_index(drop=True, inplace=True)

    #filter the df for objects where 1 dimension is larger than ca. 1mm
    df = df[(df['w'] > size_filter) | (df['h'] > size_filter)]
    df_unique = df.drop_duplicates(subset=['img_id'])
    print(f'{empty_file_counter} files were empty and were dropped; Number of uniue images: {len(df_unique)}')

    return df

file_path = '/home/fanny/segmentation_output/Data'
segmentation_df = gen_crop_df(file_path, False)

100%|██████████| 51/51 [00:00<00:00, 268.79it/s]


0 files were empty and were dropped; Number of uniue images: 51


In [5]:
# Prepare prediction data
import re

prediction_df = pd.read_csv("/home/fanny/segmentation_output/ViT_predictions.csv")
polytaxo_classes_df = pd.read_csv('/home/fanny/taxonomic_data/Polytaxo_classes(1).csv', sep=";")

print(len(polytaxo_classes_df.columns))

prediction_df['object_annotation_status'] = 'predicted'

mapping_dict = dict(zip(polytaxo_classes_df["Dataset Class NamePolyTaxo Description"], polytaxo_classes_df["PolyTaxo Description"]))
columns_to_replace = ["top1", "top2", "top3", "top4", "top5"]

# Define regex pattern to split on space, semicolon, colon, or slash
split_pattern = r"[ ;:/]"

'''
#prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict)
prediction_df[columns_to_replace] = (
    prediction_df[columns_to_replace]
    .replace(mapping_dict)
    .applymap(lambda x: re.split(split_pattern, str(x))[0] if pd.notna(x) else x)
)
'''

'''
# Replace values and extract only the first word
prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict).apply(
    lambda col: col.astype(str).apply(lambda x: re.split(split_pattern, x)[0] if pd.notna(x) else x)
)

prediction_df[columns_to_replace ]
for row
'''


# Replace values, extract first word, and replace underscores with spaces
prediction_df[columns_to_replace] = prediction_df[columns_to_replace].replace(mapping_dict).apply(
    lambda col: col.astype(str).apply(
        lambda x: re.split(split_pattern, x)[0].replace("_", " ") if pd.notna(x) else x
    )
)


prediction_df

4


Unnamed: 0,filename,top1,top2,top3,top4,top5,prob1,prob2,prob3,prob4,prob5,object_annotation_status
0,/home/fanny/segmentation_output/Deconv_crops/2...,Acantharia,Unknowns,Protista,Protista,Trichodesmium,0.436174,0.127273,0.116326,0.076356,0.045707,predicted
1,/home/fanny/segmentation_output/Deconv_crops/2...,Protista,Unknowns,Protista,Acantharia,Unknowns,0.664016,0.146700,0.054679,0.027206,0.016088,predicted
2,/home/fanny/segmentation_output/Deconv_crops/2...,Detritus,Detritus,Unknowns,Fecal pellets,Detritus,0.494211,0.430913,0.021329,0.010284,0.008775,predicted
3,/home/fanny/segmentation_output/Deconv_crops/2...,Protista,Protista,Detritus,Detritus,Trichodesmium,0.311248,0.190141,0.181070,0.077617,0.067613,predicted
4,/home/fanny/segmentation_output/Deconv_crops/2...,Trichodesmium,Detritus,Unknowns,Fecal pellets,Chaetognatha,0.845670,0.071942,0.058176,0.009616,0.001538,predicted
...,...,...,...,...,...,...,...,...,...,...,...,...
1255,/home/fanny/segmentation_output/Deconv_crops/2...,Unknowns,Protista,Unknowns,Detritus,Hydromedusae,0.338783,0.323545,0.092405,0.046433,0.043990,predicted
1256,/home/fanny/segmentation_output/Deconv_crops/2...,Detritus,Unknowns,Trichodesmium,Unknowns,Crustacea,0.471309,0.341310,0.041979,0.023661,0.017594,predicted
1257,/home/fanny/segmentation_output/Deconv_crops/2...,Unknowns,Detritus,Trichodesmium,Unknowns,Doliolida,0.704372,0.025734,0.023721,0.022455,0.019465,predicted
1258,/home/fanny/segmentation_output/Deconv_crops/2...,Detritus,Detritus,Unknowns,Detritus,Artefact,0.613329,0.235042,0.048409,0.026416,0.010740,predicted


In [1]:
# lets try something else
#taxoexport_df = pd.read_csv("/home/fanny/taxonomic_data/taxoexport_20250212_140806.tsv")

# Find common categories
#common_categories = set(polytaxo_classes_df['Dataset Class NamePolyTaxo Description']).intersection(set(taxoexport_df['display_name']))
#print("Direct matches found:", len(common_categories))


In [6]:
# combine segmentation data frame with prediction data frame
# Sort both DataFrames by 'filename'
segmentation_df_sorted = segmentation_df.sort_values(by='filename').reset_index(drop=True)
prediction_df_sorted = prediction_df.sort_values(by='filename').reset_index(drop=True)

# concatenate data frames
segm_and_prediction_df = pd.concat([segmentation_df_sorted, prediction_df_sorted], axis=1)
segm_and_prediction_df = segm_and_prediction_df.loc[:, ~segm_and_prediction_df.columns.duplicated(keep='first')]

# add object id
segm_and_prediction_df['object_id'] = segm_and_prediction_df['img_id'].astype(str) + '_' + segm_and_prediction_df['index'].astype(str)

# Create list of file paths from the 'file_paths' column
#filepaths = segm_and_prediction_df['file_paths'].tolist()

# delete some columns
columns_to_delete = ['temperature', 'mean_raw', 'std_raw', 'mean', 'std', 'x', 'y', 'w', 'h', 'saved', 'bound_box_x', 'bound_box_y', 'full_path', 'img_id', 'index']
segm_and_prediction_df.drop(columns_to_delete, axis=1, inplace=True)

# adjust header names
segm_and_prediction_df = segm_and_prediction_df.rename(columns={
    'pressure [dbar]' : 'object_pressure',
    'date': 'object_date',
    'time' : 'object_time',
    'filename' : 'img_file_name',
    'area' : 'object_area',
    'esd' : 'object_esd',
    'top1' : 'object_annotation_category',
    'top2' : 'object_annotation_category_2',
    'top3' : 'object_annotation_category_3',
    'top4' : 'object_annotation_category_4',
    'top5' : 'object_annotation_category_5',
    'prob1' : 'object_prob_1',
    'prob2' : 'object_prob_2',
    'prob3': 'object_prob_3',
    'prob4' : 'object_prob_4',
    'prob5' : 'object_prob_5'    
})
#print(segm_and_prediction_df.columns)


# function to determine the data type of each column
def determine_dtype(dtype):
    if pd.api.types.is_numeric_dtype(dtype):
        return '[f]' 
    elif pd.api.types.is_string_dtype(dtype):
        return '[t]'
    else:
        return 'other'

dtype_row = []
for col in segm_and_prediction_df.columns:
    dtype_row.append(determine_dtype(segm_and_prediction_df[col].dtype))

# insert row
segm_and_prediction_df.loc[0] = dtype_row

# reset index to maintain order
segm_and_prediction_df = segm_and_prediction_df.sort_index().reset_index(drop=True)
eco_taxa_folder = "/home/fanny/segmentation_output/EcoTaxa"
os.makedirs(eco_taxa_folder, exist_ok=True)

# save everything as tsv file
segm_and_prediction_df.to_csv('/home/fanny/segmentation_output/Deconv_crops/ecotaxa_metadata.tsv', sep="\t", index=False)

deconv_crops_folder = '/home/fanny/segmentation_output/Deconv_crops'

# Define the output zip file path
zip_path = "/home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload.zip"

# Create a zip archive of the entire Deconv_crops folder (including images and metadata file)
shutil.make_archive(zip_path.replace(".zip", ""), 'zip', deconv_crops_folder)

print(f"Deconv_crops folder has been zipped to {zip_path}")

  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] = dtype_row
  segm_and_prediction_df.loc[0] 

Deconv_crops folder has been zipped to /home/fanny/segmentation_output/EcoTaxa/ecotaxa_upload.zip


In [None]:
classlist = ['acantharia_protist',
'acantharia_protist_big_center',
'acantharia_protist_halo',
'amphipods',
'appendicularian_fritillaridae',
'appendicularian_s_shape',
'appendicularian_slight_curve',
'appendicularian_straight',
'artifacts',
'artifacts_edge',
'chaetognath_non_sagitta',
'chaetognath_other',
'chaetognath_sagitta',
'chordate_type1',
'copepod_calanoid',
'copepod_calanoid_eggs',
'copepod_calanoid_eucalanus',
'copepod_calanoid_flatheads',
'copepod_calanoid_frillyAntennae',
'copepod_calanoid_large',
'copepod_calanoid_large_side_antennatucked',
'copepod_calanoid_octomoms',
'copepod_calanoid_small_longantennae',
'copepod_cyclopoid_copilia',
'copepod_cyclopoid_oithona',
'copepod_cyclopoid_oithona_eggs',
'copepod_other',
'crustacean_other',
'ctenophore_cestid',
'ctenophore_cydippid_no_tentacles',
'ctenophore_cydippid_tentacles',
'ctenophore_lobate',
'decapods',
'detritus_blob',
'detritus_filamentous',
'detritus_other',
'diatom_chain_string',
'diatom_chain_tube',
'echinoderm_larva_pluteus_brittlestar',
'echinoderm_larva_pluteus_early',
'echinoderm_larva_pluteus_typeC',
'echinoderm_larva_pluteus_urchin',
'echinoderm_larva_seastar_bipinnaria',
'echinoderm_larva_seastar_brachiolaria',
'echinoderm_seacucumber_auricularia_larva',
'echinopluteus',
'ephyra',
'euphausiids',
'euphausiids_young',
'fecal_pellet',
'fish_larvae_deep_body',
'fish_larvae_leptocephali',
'fish_larvae_medium_body',
'fish_larvae_myctophids',
'fish_larvae_thin_body',
'fish_larvae_very_thin_body',
'heteropod',
'hydromedusae_aglaura',
'hydromedusae_bell_and_tentacles',
'hydromedusae_h15',
'hydromedusae_haliscera',
'hydromedusae_haliscera_small_sideview',
'hydromedusae_liriope',
'hydromedusae_narco_dark',
'hydromedusae_narco_young',
'hydromedusae_narcomedusae',
'hydromedusae_other',
'hydromedusae_partial_dark',
'hydromedusae_shapeA',
'hydromedusae_shapeA_sideview_small',
'hydromedusae_shapeB',
'hydromedusae_sideview_big',
'hydromedusae_solmaris',
'hydromedusae_solmundella',
'hydromedusae_typeD',
'hydromedusae_typeD_bell_and_tentacles',
'hydromedusae_typeE',
'hydromedusae_typeF',
'invertebrate_larvae_other_A',
'invertebrate_larvae_other_B',
'jellies_tentacles',
'polychaete',
'protist_dark_center',
'protist_fuzzy_olive',
'protist_noctiluca',
'protist_other',
'protist_star',
'pteropod_butterfly',
'pteropod_theco_dev_seq',
'pteropod_triangle',
'radiolarian_chain',
'radiolarian_colony',
'shrimp-like_other',
'shrimp_caridean',
'shrimp_sergestidae',
'shrimp_zoea',
'siphonophore_calycophoran_abylidae',
'siphonophore_calycophoran_rocketship_adult',
'siphonophore_calycophoran_rocketship_young',
'siphonophore_calycophoran_sphaeronectes',
'siphonophore_calycophoran_sphaeronectes_stem',
'siphonophore_calycophoran_sphaeronectes_young',
'siphonophore_other_parts',
'siphonophore_partial',
'siphonophore_physonect',
'siphonophore_physonect_young',
'stomatopod',
'tornaria_acorn_worm_larvae',
'trichodesmium_bowtie',
'trichodesmium_multiple',
'trichodesmium_puff',
'trichodesmium_tuft',
'trochophore_larvae',
'tunicate_doliolid',
'tunicate_doliolid_nurse',
'tunicate_partial',
'tunicate_salp',
'tunicate_salp_chains',
'unknown_blobs_and_smudges',
'unknown_sticks',
'unknown_unclassified']

classlist_mapping = {}

for item in classlist:
    new_name = input(f"Enter new name for {item}: ")
    classlist_mapping[item] = new_name

print(classlist_mapping)