# Clustering Zooniverse Marks to count Iguanas
The goal is to find the best method to cluster the data and find the best number of clusters.
The benchmark is a gold standard dataset obtained by experts.

In [6]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("../")
sys.path.append("../zooniverse")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Intro
### Retrieve a Classification report from Zooniverse
Export the classification export from your zooniverse project.
https://www.zooniverse.org/lab/11905/data-exports

This leads to a csv file which can be used for the analysis which should be renamed to `iguanas-from-above-classifications.csv` and placed in the `input_path` directory.
The methods do not use methods from zooniverse. It is a custom implementation.

An alternative would be to use the [code provided by zooniverse](https://github.com/zooniverse/Data-digging/tree/master/notebooks_ProcessExports)
(Bird Count Example)[https://github.com/zooniverse/Data-digging/blob/master/scripts_ProjectExamples/seabirdwatch/bird_count.py]

This notebooks assumes the data is flat and prepared. An alternative format would be the [cesar aggregation format](https://github.com/zooniverse/aggregation-for-caesar)

Used Methods are:

### DBSCAN 
It does not require the number of clusters to be specified. It is used here because, but has min_samples and eps as hyperparameters which need to be found. [Link](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html)
For finding eps and min_sample a simple **grid search** is used.
Additionally, DBSCAN not assume a specific shape for the clusters (K-means assumes clusters are gaussian in shape) even though we should assume that points around an iguana is gaussian shaped.

### HDBSCAN
It is an extension of DBSCAN which is more robust to hyperparameter settings as it finds epsilon and min_samples automatically. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html)

### Processing workflow

The Data is flatted and filtered
![Image](images/dataprocessing-DataFiltering.png)

In each phase we have the following number of images if they are filtered for at least 4 true marks by users
1. phase 105
2. phase 160
3. phase 86


## Load the data

In [7]:
from zooniverse.utils.data_format import data_prep
from pathlib import Path

import pandas as pd
from zooniverse.analysis import get_annotation_count_stats
from zooniverse.utils.filters import filter_df_user_threshold
from zooniverse.config import get_config

## Input Path of all the data
input_path =Path("/Users/christian/data/zooniverse")

reprocess = True # if True, the raw classification data is reprocessed. If False, the data is loaded from disk

# Phase Selection
phase_tag = "Iguanas 1st launch"
# phase_tag = "Iguanas 2nd launch"
# phase_tag = "Iguanas 3rd launch"


debug = False # debugging with a smaller dataset
plot_diagrams = False # plot the diagrams to disk for the clustering methods
show_plots = False # show the plots in the notebook
user_threshold = None # in a number, filter records which have less than these user interactions.


### use either the subset of the subset
use_gold_standard_subset = "expert_goldstandard" # Use the X-T2-GS-results-5th-0s as the basis
# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis

# Location for the analysis Results
output_path = Path(input_path.joinpath(f"2024_03_07_{use_gold_standard_subset}_analysis").joinpath(phase_tag))
output_path.mkdir(exist_ok=True, parents=True)

# Location for plots
output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)


## Look into the config
This Config points to all files necessary for the analysis + the result files

In [8]:
config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)
config

{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-1stphase.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/1-T2-GS-results-5th-0s.csv'),
 'image_source': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/1st launch'),
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_07_expert_goldstandard_analysis/Iguanas 1st launch/yes_no_dataset_Iguanas 1st launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_07_expert_goldstandard_analysis/Iguanas 1st launch/flat_dataset_Iguanas 1st launch.csv'),
 'merged_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_07_expert_goldstandard_analysis/Iguanas 1st launch/merged_dataset_gold_standard_expe

In [9]:
if reprocess:
    ds_stats = data_prep(phase_tag=phase_tag, 
                         output_path=output_path, 
                         input_path=input_path,
                         filter_combination=use_gold_standard_subset, config=config)
    print(ds_stats)

[32m2024-03-08 14:57:57.571[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: Iguanas 1st launch[0m


{'retired': None, 'link': 'https://www.flickr.com/photos/hellie55/6713782851', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '3', 'attribution': 'hehaden', 'image_name_1': '6713782851_82fc8c73e5_z.jpg', '#secret_description': 'juvenile kittehs practice break-in at the catnip factory'}
{'retired': None, 'link': 'https://www.flickr.com/photos/davebloggs007/15899553966', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '16', 'attribution': 'davebloggs007', 'image_name_1': '15899553966_5debf88369_z.jpg', '#secret_description': 'makeover kitteh not sure these new contacts are sassy enough'}
{'retired': None, 'link': 'https://www.flickr.com/photos/odoketa/8762013777', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '4', 'attribution': 'David Barber', 'image_name_1': '8762013777_1fa7d91e98_z.jpg', '#secret_description': 'hunter kitteh only willing to go so far to capture

[32m2024-03-08 14:58:05.808[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: Main Workflow[0m
[32m2024-03-08 14:58:07.309[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: survey tool 061417[0m
[32m2024-03-08 14:58:08.973[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: Iguanas 2nd launch[0m
[32m2024-03-08 14:58:13.225[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: Iguanas 3rd launch[0m
[32m2024-03-08 14:58:27.570[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m175[0m - [1mfound a new phase tag: Iguanas 4th launch[0m
[32m2024-03-08 14:5

                                            filename  images
0                flat_dataset_Iguanas 1st launch.csv    8260
1                                         1st launch    2737
2                             expert-GS-1stphase.csv    2733
3                         1-T2-GS-results-5th-0s.csv     107
4  merged_dataset_gold_standard_expert_Iguanas 1s...     107


### Optional Debugging

In [10]:
if plot_diagrams == False:
    output_plot_path = None

# the flattened, filtered marks from zooniverse.
df_merged_dataset = pd.read_csv(config["merged_dataset"])

# data for reference
df_goldstandard_expert_count = pd.read_csv(config["goldstandard_data"], sep=";")

## Debugging helpers
if phase_tag == "Iguanas 1st launch":    
    if debug:

        df_merged_dataset = df_merged_dataset[df_merged_dataset.image_name.isin(["SFM01-2-2-2_333.jpg", "SFM01-2-2-2_334.jpg", "SFM01-2-2-3_201.jpg"])]

elif phase_tag == "Iguanas 2nd launch":
    if debug:
        df_merged_dataset = df_merged_dataset[
           df_merged_dataset.image_name.isin(["FMO03-1_65.jpg", "FMO03-1_72.jpg", "MBN04-2_182.jpg", "EGI08-2_78.jpg"])]
           # df_merged_dataset.image_name.isin(["FMO03-1_72.jpg"])]

    
elif phase_tag == "Iguanas 3rd launch":

    # this user is a spammer
    df_merged_dataset = df_merged_dataset[df_merged_dataset.user_id != 2581179]
    
    if debug:
        df_merged_dataset = df_merged_dataset[
           df_merged_dataset.image_name.isin(["FMO03-2_70.jpg", "MBN04-2_182.jpg", "EGI08-2_78.jpg"])]
            
    


## Look at the data


In [11]:
## Look at the data
df_merged_dataset.drop("user_name", axis=1)


Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,mission_name,image_path,width,height
0,0,SFM1,SFM01-2-2-3_137.jpg,47970105,376.160156,84.558594,Adult Male alone,Iguanas 1st launch,2068663.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
1,1,SFM1,SFM01-2-2-3_137.jpg,47970105,272.695312,297.894531,Adult Male alone,Iguanas 1st launch,2068663.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
2,2,SFM1,SFM01-2-2-3_137.jpg,47970105,111.898438,410.011719,Adult Male alone,Iguanas 1st launch,2068663.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
3,3,SFM1,SFM01-1-2_264.jpg,47968406,462.800781,390.786377,Adult Male in a lek,Iguanas 1st launch,1878692.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
4,4,SFM1,SFM01-1-2_264.jpg,47968406,545.598145,423.652527,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1878692.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,3042,SFM1,SFM01-2-2-1_120.jpg,47969236,406.704041,383.049408,"Could be an iguana, not sure",Iguanas 1st launch,,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648
3043,3043,SFM1,SFM01-2-2-1_120.jpg,47969236,577.184998,270.231110,"Could be an iguana, not sure",Iguanas 1st launch,,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648
3044,3044,SFM1,SFM01-1-2_210.jpg,47968359,316.314850,392.354614,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,2136122.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
3045,3045,SFM1,SFM01-1-2_210.jpg,47968359,326.542725,402.156372,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,2136122.0,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630


### Filter User if necessary and Marks


In [12]:
print(f"Before filtering: {df_merged_dataset.subject_id.nunique()}")
# There images in which some people said there are iguanas, but then didn't mark them. Clustering with fewer than 3 dots doesn't make sense
if user_threshold is not None:
    print(f"filtering records which have less than {user_threshold} interactions.")
    df_merged_dataset = filter_df_user_threshold(df_merged_dataset, user_threshold=user_threshold)
    
    
from zooniverse.utils.filters import filter_remove_marks
# Check if partials are still in the data. There shouldn't be any
df_merged_dataset = filter_remove_marks(df_merged_dataset)




Before filtering: 107


### Are there anonymous users in the data?
There should be

In [13]:
df_merged_dataset[df_merged_dataset.user_id.isnull().values]

Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name,mission_name,image_path,width,height
13,13,SFM1,SFM01-1-2_59.jpg,47968506,116.633331,153.966675,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,not-logged-in-33bc5b21c88460dbfea6,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
15,15,SFM1,SFM01-1-1_184.jpg,47968013,279.644714,199.890869,Adult Male alone,Iguanas 1st launch,,not-logged-in-33bc5b21c88460dbfea6,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,600,630
16,16,SFM1,SFM01-1-1_184.jpg,47968013,407.698547,321.648682,Adult Male alone,Iguanas 1st launch,,not-logged-in-33bc5b21c88460dbfea6,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,600,630
21,21,SFM1,SFM01-2-2-2_302.jpg,47969853,549.633301,540.966675,Adult Male alone,Iguanas 1st launch,,not-logged-in-33bc5b21c88460dbfea6,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
25,25,SFM1,SFM01-2-2-3_149.jpg,47970139,57.739017,178.533081,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,not-logged-in-33bc5b21c88460dbfea6,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2986,2986,SFM1,SFM01-1-2_284.jpg,47968423,162.702652,474.069214,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,not-logged-in-93e07d6b2a99b8725f21,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
2987,2987,SFM1,SFM01-1-2_284.jpg,47968423,85.247139,474.450745,Adult Male in a lek,Iguanas 1st launch,,not-logged-in-93e07d6b2a99b8725f21,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
3041,3041,SFM1,SFM01-2-2-1_120.jpg,47969236,231.432205,154.631561,Adult Male alone,Iguanas 1st launch,,not-logged-in-341c84c0e52bdc34f8f8,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648
3042,3042,SFM1,SFM01-2-2-1_120.jpg,47969236,406.704041,383.049408,"Could be an iguana, not sure",Iguanas 1st launch,,not-logged-in-341c84c0e52bdc34f8f8,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648


In [14]:
df_merged_dataset["subject_id"].nunique()

107

In [15]:
## After filtering there
df_merged_dataset

Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name,mission_name,image_path,width,height
0,0,SFM1,SFM01-2-2-3_137.jpg,47970105,376.160156,84.558594,Adult Male alone,Iguanas 1st launch,2068663.0,aymor,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
1,1,SFM1,SFM01-2-2-3_137.jpg,47970105,272.695312,297.894531,Adult Male alone,Iguanas 1st launch,2068663.0,aymor,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
2,2,SFM1,SFM01-2-2-3_137.jpg,47970105,111.898438,410.011719,Adult Male alone,Iguanas 1st launch,2068663.0,aymor,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,589,573
3,3,SFM1,SFM01-1-2_264.jpg,47968406,462.800781,390.786377,Adult Male in a lek,Iguanas 1st launch,1878692.0,drewavery,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
4,4,SFM1,SFM01-1-2_264.jpg,47968406,545.598145,423.652527,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1878692.0,drewavery,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,3042,SFM1,SFM01-2-2-1_120.jpg,47969236,406.704041,383.049408,"Could be an iguana, not sure",Iguanas 1st launch,,not-logged-in-341c84c0e52bdc34f8f8,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648
3043,3043,SFM1,SFM01-2-2-1_120.jpg,47969236,577.184998,270.231110,"Could be an iguana, not sure",Iguanas 1st launch,,not-logged-in-341c84c0e52bdc34f8f8,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,666,648
3044,3044,SFM1,SFM01-1-2_210.jpg,47968359,316.314850,392.354614,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,2136122.0,H.axson,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630
3045,3045,SFM1,SFM01-1-2_210.jpg,47968359,326.542725,402.156372,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,2136122.0,H.axson,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,599,630


In [16]:
# how many marks per user
df_merged_dataset[["user_id", "x"]].groupby("user_id").count().head()

Unnamed: 0_level_0,x
user_id,Unnamed: 1_level_1
1476.0,2
2842.0,18
109942.0,1
114901.0,4
120287.0,2


### gold standard data
For reference

In [17]:
df_goldstandard_expert_count[df_goldstandard_expert_count["image_name"].isin(["SFM01-2-2-2_282.jpg", "SFM01-2-2-2_323.jpg"])]

Unnamed: 0,subspecies,island,site_name,subject_group,image_name,subject_id,presence_absence,count_male-lek,count_male-no-lek,count_others,count_partial,count_total,quality,condition,comment
0,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-2-2-2_282.jpg,47969795,Y,0,2,0,2,2,Good,Hard,
701,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-2-2-2_323.jpg,47969912,Y,1,1,4,0,6,Good,Visible,


In [18]:
df_goldstandard_expert_count.count_total.sum()

422

In [19]:
# look at the
df_goldstandard_expert_count.count()

subspecies           2733
island               2733
site_name            2733
subject_group        2733
image_name           2733
subject_id           2733
presence_absence     2733
count_male-lek       2733
count_male-no-lek    2733
count_others         2733
count_partial        2733
count_total          2733
quality               150
condition             150
comment                18
dtype: int64

In [20]:
fsum = df_goldstandard_expert_count[
    df_goldstandard_expert_count.image_name.isin(df_merged_dataset.image_name.unique())]

print(f"filtering the zooniverse classifications dataset for gold standard images the count_total of iguanas is: {fsum.count_total.sum()}, but it should be {df_goldstandard_expert_count.count_total.sum()}")
fsum


filtering the zooniverse classifications dataset for gold standard images the count_total of iguanas is: 331, but it should be 422


Unnamed: 0,subspecies,island,site_name,subject_group,image_name,subject_id,presence_absence,count_male-lek,count_male-no-lek,count_others,count_partial,count_total,quality,condition,comment
0,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-2-2-2_282.jpg,47969795,Y,0,2,0,2,2,Good,Hard,
1,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-2-2-1_344.jpg,47969531,Y,0,2,2,1,4,Good,Hard,not consider number 4 marked in the image
5,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-2-2-2_293.jpg,47969828,Y,1,0,6,1,7,Good,Hard,
22,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-1-1_114.jpg,47967876,Y,0,1,0,0,1,Bad,Visible,
60,A. c. trillmichi,Santa Fe,El Miedo,SFM1,SFM01-1-1_154.jpg,47967959,Y,0,1,0,0,1,Good,Visible,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2445,A. c. godzilla,San Cristobal,Playa Blanca (Punta Pitt),SRPB1,SRPB02-2-1_67.jpg,47987920,Y,0,1,0,0,1,Good,Visible,
2506,A. c. godzilla,San Cristobal,Playa Blanca (Punta Pitt),SRPB1,SRPB02-2-2_143.jpg,47988474,Y,1,0,3,0,4,Bad,Visible,
2516,A. c. godzilla,San Cristobal,Playa Blanca (Punta Pitt),SRPB1,SRPB02-2-2_153.jpg,47988580,Y,0,1,0,0,1,Good,Visible,
2524,A. c. godzilla,San Cristobal,Playa Blanca (Punta Pitt),SRPB1,SRPB02-2-2_161.jpg,47988636,Y,0,1,0,0,1,Good,Visible,


In [21]:
# How many images are left in the zooniverse dataset?
len(list(df_merged_dataset.image_name.unique()))

107

In [22]:
#Is there an image in the goldstandard, which is not in the classifcations?
len(set(df_goldstandard_expert_count.subject_id) - set(df_merged_dataset.subject_id.unique()))


df_goldstandard_expert_count.count_total.sum()
# df_merged_dataset[df_merged_dataset.image_name.isin(["SRL01-1-2_105.jpg"])]

422

In [23]:
df_goldstandard_expert_count[df_goldstandard_expert_count.image_name.isin(["SRL01-1-2_105.jpg"])]

Unnamed: 0,subspecies,island,site_name,subject_group,image_name,subject_id,presence_absence,count_male-lek,count_male-no-lek,count_others,count_partial,count_total,quality,condition,comment
1070,A. c. mertensi,San Cristobal,Loberia,SRL1,SRL01-1-2_105.jpg,47979415,N,0,0,0,0,0,,,


In [24]:
T2_GS_results_5th_0s = pd.read_csv(config["gold_standard_image_subset"], sep=";")
T2_GS_results_5th_0s

Unnamed: 0,subject_id,Median0s,Mean0s,Max0s,Std0s,Median.r,Mean.r,Mode0s
0,47967876,1.0,1.444444,3,0.726483,1,1,1
1,47967959,1.0,1.181818,2,0.404520,1,1,1
2,47967961,9.0,9.000000,12,2.581989,9,9,12
3,47967975,2.0,2.000000,2,0.000000,2,2,2
4,47968013,1.0,1.250000,2,0.500000,1,1,1
...,...,...,...,...,...,...,...,...
102,48026608,6.0,5.705882,7,1.263166,6,6,6
103,48026629,1.0,1.000000,1,0.000000,1,1,1
104,48026645,2.0,2.333333,6,1.345185,2,2,3
105,48026817,1.0,1.222222,3,0.666667,1,1,1


## The gold standard vs. the expert count

In [25]:
# Double checking for the counts
gstd_5th = df_goldstandard_expert_count[df_goldstandard_expert_count.subject_id.isin(T2_GS_results_5th_0s.subject_id)].count_total.sum()
print(f"If the expert count ({config['goldstandard_data']})  is filtered for the subject ids in {config['gold_standard_image_subset']} the count_total is {gstd_5th} iguanas")

If the expert count (/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-1stphase.csv)  is filtered for the subject ids in /Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/1-T2-GS-results-5th-0s.csv the count_total is 331 iguanas


In [26]:
len(set(T2_GS_results_5th_0s.subject_id.unique()) - set(df_goldstandard_expert_count.subject_id))
df_goldstandard_expert_count["count_total"].sum()

422

In [27]:
df_merged_dataset["subject_id"].nunique()

107

In [28]:

df_goldstandard_expert_count = df_goldstandard_expert_count[
    df_goldstandard_expert_count.subject_id.isin(df_merged_dataset.subject_id.unique())]
df_goldstandard_expert_count = df_goldstandard_expert_count[["image_name", "subject_id", "count_total"]]

df_goldstandard_expert_count["count_total"].sum()

331

In [29]:
## plot some of the marks
from zooniverse.utils.plotting import plot_zooniverse_user_marks_v2
# FMO03-1_65
# EIG05-1_83.jpg # phase 
# MBN04-2_182.jpg # phase 3
# df_merged_dataset_filtered = df_merged_dataset[df_merged_dataset.image_name.isin(["ESCG02-1_19.jpg"])]
if phase_tag in["Iguanas 1st launch", "Iguanas 2nd launch"]  and  ( plot_diagrams or show_plots ) :
    for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
        
        ## plot the marks
        markers_plot_path = plot_zooniverse_user_marks_v2(df_image_name,
                                                          image_path=df_image_name.iloc[0]["image_path"],
                                                          image_name=image_name,
                                                          output_path=output_plot_path, show=show_plots, title=f"Markers for {image_name}", fig_size=(5,5))
        

## Clustering

### Basic Statics like mean, median, mode

In [30]:
from sklearn.metrics import mean_squared_error
from zooniverse.analysis import kmeans_knee, get_mark_overview

basic_stats = []
kmeans_knee_stats = []
kmeans_silouettes = []
mse_errors = {}


for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    annotations_count = get_mark_overview(df_image_name)


    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=df_image_name.iloc[0]["image_name"])


    ### basic statistics like mean, median
    basic_stats.append(annotations_count_stats)
    

df_basic_stats = pd.DataFrame(basic_stats)    

df_comparison = df_goldstandard_expert_count.merge(df_basic_stats, on='image_name', how='left')

df_comparison["count_total"].sum()
df_goldstandard_expert_count["count_total"].sum()

331

In [31]:
# There might be records with too few annotations
df_comparison[(df_comparison.count_total > 0) & (df_comparison.sum_annotations_count < 5)].sort_values(by="users", ascending=False)

Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count


In [32]:
# images with an expert count of more than 0 and less than 5 different users
df_comparison[(df_comparison.count_total > 0) & (df_comparison.users < 5)].sort_values(by="users", ascending=False)


Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
29,SFM01-2-2-1_293.jpg,47969478,9,4.5,5.0,2,4,20,"[2, 2, 7, 9]"
54,SFM01-2-2-3_123.jpg,47970063,1,1.0,2.75,1,4,11,"[1, 1, 1, 8]"
0,SFM01-2-2-2_282.jpg,47969795,2,2.0,2.0,1,3,6,"[1, 2, 3]"
42,SFM01-2-2-2_323.jpg,47969912,6,5.0,6.33,5,3,19,"[1, 5, 13]"


In [33]:
df_comparison["count_total"].sum()

331

### Fill NaN values with 0 because the errors can't be calculated otherwise

In [34]:

## Fill NaN values with 0 because the errors can't be calculated otherwise
df_comparison.fillna(0, inplace=True)


In [35]:

mse_errors["median_count_rmse"] = mean_squared_error(df_comparison.count_total, df_comparison.median_count,
                                                     squared=False)
mse_errors["mean_count_rmse"] = mean_squared_error(df_comparison.count_total, df_comparison.mean_count, squared=False)
mse_errors["mode_count_rmse"] = mean_squared_error(df_comparison.count_total, df_comparison.mode_count, squared=False)

pd.Series(mse_errors)

median_count_rmse    2.380313
mean_count_rmse      2.098189
mode_count_rmse      2.916277
dtype: float64

It can be seen the knee method has a very high Root mean squared error. 

In [36]:
df_comparison

Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
0,SFM01-2-2-2_282.jpg,47969795,2,2.0,2.00,1,3,6,"[1, 2, 3]"
1,SFM01-2-2-1_344.jpg,47969531,4,1.0,1.70,1,10,17,"[1, 1, 1, 1, 1, 1, 1, 3, 3, 4]"
2,SFM01-2-2-2_293.jpg,47969828,7,2.0,2.57,1,14,36,"[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 5, 8]"
3,SFM01-1-1_114.jpg,47967876,1,1.0,1.44,1,9,13,"[1, 1, 1, 1, 1, 1, 1, 2, 4]"
4,SFM01-1-1_154.jpg,47967959,1,1.0,1.00,1,11,11,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...
102,SRPB02-2-1_67.jpg,47987920,1,1.0,1.07,1,14,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]"
103,SRPB02-2-2_143.jpg,47988474,4,4.0,3.89,4,9,35,"[2, 3, 3, 4, 4, 4, 4, 5, 6]"
104,SRPB02-2-2_153.jpg,47988580,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
105,SRPB02-2-2_161.jpg,47988636,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [37]:

pd.Series(mse_errors)

median_count_rmse    2.380313
mean_count_rmse      2.098189
mode_count_rmse      2.916277
dtype: float64

In [38]:
df_comparison

Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
0,SFM01-2-2-2_282.jpg,47969795,2,2.0,2.00,1,3,6,"[1, 2, 3]"
1,SFM01-2-2-1_344.jpg,47969531,4,1.0,1.70,1,10,17,"[1, 1, 1, 1, 1, 1, 1, 3, 3, 4]"
2,SFM01-2-2-2_293.jpg,47969828,7,2.0,2.57,1,14,36,"[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 5, 8]"
3,SFM01-1-1_114.jpg,47967876,1,1.0,1.44,1,9,13,"[1, 1, 1, 1, 1, 1, 1, 2, 4]"
4,SFM01-1-1_154.jpg,47967959,1,1.0,1.00,1,11,11,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...
102,SRPB02-2-1_67.jpg,47987920,1,1.0,1.07,1,14,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]"
103,SRPB02-2-2_143.jpg,47988474,4,4.0,3.89,4,9,35,"[2, 3, 3, 4, 4, 4, 4, 5, 6]"
104,SRPB02-2-2_153.jpg,47988580,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
105,SRPB02-2-2_161.jpg,47988636,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [39]:
df_comparison.sum()

image_name               SFM01-2-2-2_282.jpgSFM01-2-2-1_344.jpgSFM01-2-...
subject_id                                                      5133430839
count_total                                                            331
median_count                                                         235.5
mean_count                                                          271.41
mode_count                                                             221
users                                                                 1204
sum_annotations_count                                                 3047
annotations_count        [1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 1, 1, ...
dtype: object

### DBSCAN clustering and take the variant with the best silouette score for each image


In [40]:
### The old variant
# from zooniverse.analysis import compare_dbscan_hyp_v2
# 
# eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
# min_samples_variants = [3, 5, 8, 10]
# if debug:
#     eps_variants = [0.3]
#     min_samples_variants = [3]
# params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]
# 
# db_scan_results = {}
# db_scan_best_results = []
# db_scan_best_bic_results = []
# for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
# 
#     dbscan_localization = compare_dbscan_hyp_v2(
#         # phase_tag=phase_tag,
#         params=params,
#         df_flat=df_image_name,
#         # output_path=output_path,
#         output_plot_path=output_plot_path,
#         plot=show_plots,
#         
#     )
# 
#     db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
#     
#     # TODO Here lies the main problem with DBSCAN.
#     ## DBSCAN tends to classfy all points as noise if min_samples is too high. Often only a single user marked an iguana.
#     ## Sillouette Scoring needs a minimum of 2 clusters
#     ## if there are points in decent radius they will belong to a cluster
#     # if pd.DataFrame(dbscan_localization).dbscan_count.max() == 1:
#     #     db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
#     #     db_scan_best_bic_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
#     # # If two or more cluster seem to exists take ones with the best BIC or Silouette score
#     # else:  
#     # take the best result by silouette score if there are more clusters then 1
#     ## TODO make the sorting deterministic
#     db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_silouette_score", ascending=False).iloc[0])
#     
# df_dbscan_localization = pd.concat([*db_scan_results.values()])
# df_scan_best_results = pd.DataFrame(db_scan_best_results)



In [41]:
# df_scan_best_results

In [42]:
## fixes the problem with the silouette score sorting
from zooniverse.analysis import compare_dbscan_hyp_v2

eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
min_samples_variants = [3, 5, 8, 10]
if debug:
    eps_variants = [0.3]
    min_samples_variants = [3]
params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]

db_scan_results = {}
db_scan_best_results = []
db_scan_best_bic_results = []
for image_name, df_image_name in df_merged_dataset.groupby("image_name"):

    dbscan_localization = compare_dbscan_hyp_v2(
        # phase_tag=phase_tag,
        params=params,
        df_flat=df_image_name,
        # output_path=output_path,
        output_plot_path=output_plot_path,
        plot=show_plots,
        
    )

    db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
    
    # TODO Here lies the main problem with DBSCAN.
    # DBSCAN tends to classfy all points as noise if min_samples is too high. Often only a single user marked an iguana.
    # Sillouette Scoring needs a minimum of 2 clusters
    # if there are points in decent radius they will belong to a cluster
    if pd.DataFrame(dbscan_localization).dbscan_count.max() == 1:
        db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
        db_scan_best_bic_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
        # If two or more cluster seem to exists take ones with the best Silouette score
    else:  
        # take the best result by silouette score if there are more clusters then 1
        db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values(["dbscan_silouette_score", "dbscan_count"], ascending=[False, False]).iloc[0])
    
df_dbscan_localization = pd.concat([*db_scan_results.values()])
df_scan_best_results = pd.DataFrame(db_scan_best_results)



  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluste

In [43]:
df_scan_best_results

Unnamed: 0,image_name,dbscan_count,dbscan_noise,dbscan_silouette_score,dbscan_BIC_score,eps,min_samples
14,SFM01-1-1_114.jpg,1,4,,,0.20,8
24,SFM01-1-1_154.jpg,1,8,,,0.50,3
12,SFM01-1-1_155.jpg,2,5,0.449423,-270.096525,0.20,3
24,SFM01-1-1_163.jpg,1,3,,,0.50,3
16,SFM01-1-1_184.jpg,1,1,,,0.30,3
...,...,...,...,...,...,...,...
12,SRPB02-2-1_67.jpg,2,9,-0.102094,-154.532513,0.20,3
24,SRPB02-2-2_143.jpg,4,2,0.709825,-344.263386,0.50,3
0,SRPB02-2-2_153.jpg,0,10,,,0.01,3
0,SRPB02-2-2_161.jpg,0,10,,,0.01,3


Here it can be seen why the silouette score is difficult because it is often undefined.

In [44]:
## save the combinations of parameters, which maximized the silouette score.

df_dbscan_localization.to_csv(config["dbscan_hyperparam_grid"])
df_scan_best_results

Unnamed: 0,image_name,dbscan_count,dbscan_noise,dbscan_silouette_score,dbscan_BIC_score,eps,min_samples
14,SFM01-1-1_114.jpg,1,4,,,0.20,8
24,SFM01-1-1_154.jpg,1,8,,,0.50,3
12,SFM01-1-1_155.jpg,2,5,0.449423,-270.096525,0.20,3
24,SFM01-1-1_163.jpg,1,3,,,0.50,3
16,SFM01-1-1_184.jpg,1,1,,,0.30,3
...,...,...,...,...,...,...,...
12,SRPB02-2-1_67.jpg,2,9,-0.102094,-154.532513,0.20,3
24,SRPB02-2-2_143.jpg,4,2,0.709825,-344.263386,0.50,3
0,SRPB02-2-2_153.jpg,0,10,,,0.01,3
0,SRPB02-2-2_161.jpg,0,10,,,0.01,3


In [45]:
df_scan_best_results.rename(columns={"dbscan_count": "dbscan_count_sil" }, inplace=True)

df_comparison = df_comparison.merge(df_scan_best_results, on='image_name', how='left')

In [46]:
df_comparison.fillna(0, inplace=True)

mse_errors["dbscan_count_sil_rmse"] = mean_squared_error(df_comparison.count_total, df_comparison.dbscan_count_sil, squared=False)

pd.Series(mse_errors)

median_count_rmse        2.380313
mean_count_rmse          2.098189
mode_count_rmse          2.916277
dbscan_count_sil_rmse    2.451397
dtype: float64

In [47]:

df_comparison = df_comparison.drop(["dbscan_noise", "dbscan_silouette_score", "eps", "min_samples", "dbscan_BIC_score", "with_noise", "bic_avg"], axis=1, errors="ignore")
df_comparison

Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count,dbscan_count_sil
0,SFM01-2-2-2_282.jpg,47969795,2,2.0,2.00,1,3,6,"[1, 2, 3]",1
1,SFM01-2-2-1_344.jpg,47969531,4,1.0,1.70,1,10,17,"[1, 1, 1, 1, 1, 1, 1, 3, 3, 4]",3
2,SFM01-2-2-2_293.jpg,47969828,7,2.0,2.57,1,14,36,"[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 5, 8]",3
3,SFM01-1-1_114.jpg,47967876,1,1.0,1.44,1,9,13,"[1, 1, 1, 1, 1, 1, 1, 2, 4]",1
4,SFM01-1-1_154.jpg,47967959,1,1.0,1.00,1,11,11,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
...,...,...,...,...,...,...,...,...,...,...
102,SRPB02-2-1_67.jpg,47987920,1,1.0,1.07,1,14,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]",2
103,SRPB02-2-2_143.jpg,47988474,4,4.0,3.89,4,9,35,"[2, 3, 3, 4, 4, 4, 4, 5, 6]",4
104,SRPB02-2-2_153.jpg,47988580,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
105,SRPB02-2-2_161.jpg,47988636,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0


### HDBSCAN clustering for each image

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html#sklearn.cluster.HDBSCAN states: "A distance threshold. Clusters below this value will be merged."



In [48]:
from zooniverse.analysis import HDBSCAN_Wrapper

hdbscan_values = []

eps_variants = [0.0] # 0 is the default
min_cluster_sizes = [5] # 5 is the default


for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    annotations_count = get_mark_overview(df_image_name)
    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=df_image_name.iloc[0]["image_name"])
    
    
    if df_image_name.shape[0] >= 5: # if less then min_cluster_sizes points are available clustering makes no sense
        params = [(eps, min_cluster_size, max_cluster_size) 
                    for eps in eps_variants
                    for min_cluster_size in min_cluster_sizes
                    for max_cluster_size in [None]
              ]

        df_hdbscan = HDBSCAN_Wrapper(df_marks=df_image_name[["x", "y"]],
                                     annotations_count=annotations_count,
                                     output_path=output_plot_path,
                                     plot=show_plots,
                                     show=show_plots,
                                     image_name=image_name,
                                     params=params)
        hdbscan_values.append(df_hdbscan)


df_hdbscan = pd.concat(hdbscan_values)



  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluste

In [49]:
df_hdbscan.drop(["with_noise", "bic_avg"], axis=1, inplace=True)
df_hdbscan

Unnamed: 0,image_name,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,SFM01-1-1_114.jpg,1,0.0,5,,8
0,SFM01-1-1_154.jpg,1,0.0,5,,6
0,SFM01-1-1_155.jpg,1,0.0,5,,22
0,SFM01-1-1_163.jpg,1,0.0,5,,1
0,SFM01-1-1_184.jpg,1,0.0,5,,1
...,...,...,...,...,...,...
0,SRPB02-2-1_67.jpg,1,0.0,5,,9
0,SRPB02-2-2_143.jpg,4,0.0,5,,2
0,SRPB02-2-2_153.jpg,1,0.0,5,,4
0,SRPB02-2-2_161.jpg,1,0.0,5,,5


In [50]:
df_comparison = df_comparison.merge(df_hdbscan, on='image_name', how='left')


df_comparison["count_total"].sum()

331

In [51]:
df_comparison.to_csv(config["comparison_dataset"])
print(f"saved {config['comparison_dataset']}")

saved /Users/christian/data/zooniverse/2024_03_07_expert_goldstandard_analysis/Iguanas 1st launch/Iguanas 1st launch_method_comparison.csv


In [52]:
df_comparison.count_total.fillna(0, inplace=True)
df_comparison.HDBSCAN_count.fillna(0, inplace=True)

mse_errors["hdbscan_count_rmse"] = mean_squared_error(df_comparison.count_total, df_comparison.HDBSCAN_count, squared=False)


# A look into the results
Root Means Squared Error for the different methods

In [53]:
df_rmse = pd.DataFrame(pd.Series(mse_errors).sort_values())

df_rmse.to_csv(config["rmse_errors"])
df_rmse

Unnamed: 0,0
mean_count_rmse,2.098189
hdbscan_count_rmse,2.100289
median_count_rmse,2.380313
dbscan_count_sil_rmse,2.451397
mode_count_rmse,2.916277


## The sum of the clustering
What is the sum of the methods

In [54]:

df_comparison_sum = df_comparison[["count_total", "median_count", "mean_count", "mode_count", "dbscan_count_sil", "HDBSCAN_count"]].sum().sort_values()
df_comparison_sum.to_csv(config["method_sums"])
df_comparison_sum

mode_count          221.00
median_count        235.50
HDBSCAN_count       247.00
dbscan_count_sil    256.00
mean_count          271.41
count_total         331.00
dtype: float64

In [55]:
print(f"phase_tag: {phase_tag}, user_threshold: {user_threshold}")

phase_tag: Iguanas 1st launch, user_threshold: None


## Compare the numbers
The counts are only for images which were in the dataset after filtering.

### Sum of all the Methods

In [56]:
print(f"{config['method_sums'].name}")
pd.read_csv(config["method_sums"])

Iguanas 1st launch_method_sums.csv


Unnamed: 0.1,Unnamed: 0,0
0,mode_count,221.0
1,median_count,235.5
2,HDBSCAN_count,247.0
3,dbscan_count_sil,256.0
4,mean_count,271.41
5,count_total,331.0


### Root Mean Squared Error

In [57]:
print(f"{config['rmse_errors'].name}")
pd.read_csv(config["rmse_errors"])

Iguanas 1st launch_rmse_errors.csv


Unnamed: 0.1,Unnamed: 0,0
0,mean_count_rmse,2.098189
1,hdbscan_count_rmse,2.100289
2,median_count_rmse,2.380313
3,dbscan_count_sil_rmse,2.451397
4,mode_count_rmse,2.916277


### Comparison per Image Level

In [58]:
print(f"load {config['comparison_dataset']}")
pd.read_csv(config["comparison_dataset"])

load /Users/christian/data/zooniverse/2024_03_07_expert_goldstandard_analysis/Iguanas 1st launch/Iguanas 1st launch_method_comparison.csv


Unnamed: 0.1,Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,0,SFM01-2-2-2_282.jpg,47969795,2,2.0,2.00,1,3,6,"[1, 2, 3]",1,1,0.0,5,,1
1,1,SFM01-2-2-1_344.jpg,47969531,4,1.0,1.70,1,10,17,"[1, 1, 1, 1, 1, 1, 1, 3, 3, 4]",3,1,0.0,5,,12
2,2,SFM01-2-2-2_293.jpg,47969828,7,2.0,2.57,1,14,36,"[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 5, 8]",3,1,0.0,5,,30
3,3,SFM01-1-1_114.jpg,47967876,1,1.0,1.44,1,9,13,"[1, 1, 1, 1, 1, 1, 1, 2, 4]",1,1,0.0,5,,8
4,4,SFM01-1-1_154.jpg,47967959,1,1.0,1.00,1,11,11,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1,1,0.0,5,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,102,SRPB02-2-1_67.jpg,47987920,1,1.0,1.07,1,14,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]",2,1,0.0,5,,9
103,103,SRPB02-2-2_143.jpg,47988474,4,4.0,3.89,4,9,35,"[2, 3, 3, 4, 4, 4, 4, 5, 6]",4,4,0.0,5,,2
104,104,SRPB02-2-2_153.jpg,47988580,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0,1,0.0,5,,4
105,105,SRPB02-2-2_161.jpg,47988636,1,1.0,1.00,1,10,10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0,1,0.0,5,,5


## Discussion:
Clustering works, it yields better numbers than just taking mode,median or mean annotations from the volunteers, because it takes the spatial location of the marker dots into consideration.



Assert the numbers haven't changed

In [59]:
## these are the numbers before the sorting was repaired
# if phase_tag == "Iguanas 1st launch" and not debug:
#     if user_threshold == 3:
#         assert df_comparison_sum["mode_count"] == 215
#         assert df_comparison_sum["dbscan_count_sil"] == 222
#         assert df_comparison_sum["median_count"] == 228.5
#         assert df_comparison_sum["HDBSCAN_count"] == 244
#         assert df_comparison_sum["count_total"] == 323
#     if user_threshold is None:
#         assert df_comparison_sum["mode_count"] == 221
#         assert df_comparison_sum["dbscan_count_sil"] == 224
#         assert df_comparison_sum["median_count"] == 235.5
#         assert df_comparison_sum["HDBSCAN_count"] == 247
#         assert df_comparison_sum["count_total"] == 331
#         
# if phase_tag == "Iguanas 2nd launch" and not debug:
#     if user_threshold == 3:
#         assert df_comparison_sum["mode_count"] == 502
#         assert df_comparison_sum["dbscan_count_sil"] == 484
#         assert df_comparison_sum["median_count"] == 475
#         assert df_comparison_sum["HDBSCAN_count"] == 541
#         assert df_comparison_sum["count_total"] == 586
#     if user_threshold is None:
#         assert df_comparison_sum["mode_count"] == 511
#         assert df_comparison_sum["dbscan_count_sil"] == 484
#         assert df_comparison_sum["median_count"] == 484.5
#         assert df_comparison_sum["HDBSCAN_count"] == 541.0
#         assert df_comparison_sum["count_total"] == 589.0
#         
# if phase_tag == "Iguanas 3rd launch" and not debug:
#     if user_threshold == 3:
#         assert df_comparison_sum["mode_count"] == 302
#         assert df_comparison_sum["dbscan_count_sil"] == 309
#         assert df_comparison_sum["median_count"] == 313
#         assert df_comparison_sum["HDBSCAN_count"] == 357
#         assert df_comparison_sum["count_total"] == 351
#     if user_threshold is None:
#         assert df_comparison_sum["mode_count"] == 304
#         assert df_comparison_sum["dbscan_count_sil"] == 309
#         assert df_comparison_sum["median_count"] == 315
#         assert df_comparison_sum["HDBSCAN_count"] == 357
#         assert df_comparison_sum["count_total"] == 351
