# Clustering Zooniverse Marks to count Iguanas
Count all the iguanas in the images by clustering the marks from the zooniverse volunteers.
This does not compare the results to the gold standard and requires only a single file, the flattened zooniverse data export.

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("./")
sys.path.append("./zooniverse")

## Intro
### Retrieve a Classification report from Zooniverse
This notebook is used to cluster the marks from the zooniverse volunteers to count the iguanas in the images saved in results/<phase_tag>/flat_dataset_filtered_Iguanas 3rd launch.csv

Used Methods are:

### DBSCAN 
It does not require the number of clusters to be specified. It is used here because, but has min_samples and eps as hyperparameters which need to be found. [Link](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html)
For finding eps and min_sample a simple **grid search** is used.
Additionally, DBSCAN not assume a specific shape for the clusters (K-means assumes clusters are gaussian in shape) even though we should assume that points around an iguana is gaussian shaped.

### HDBSCAN
It is an extension of DBSCAN which is more robust to hyperparameter settings as it finds epsilon and min_samples automatically. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html)

## Load the data

In [2]:
from pathlib import Path

import pandas as pd
from zooniverse.analysis import get_annotation_count_stats
from zooniverse.utils.filters import filter_df_user_threshold



# Phase Selection
# phase_tag = "Iguanas 1st launch"
phase_tag = "Iguanas 2nd launch"
# phase_tag = "Iguanas 3rd launch"


debug = False # debugging with a smaller dataset
plot_diagrams = False # plot the diagrams to disk for the clustering methods
show_plots = False # show the plots in the notebook

user_threshold = None # None or a number, filter records which have less than these user interactions.
use_gold_standard_subset = None # Use no filtering


## Input Path of all the data
input_path =Path("/Users/christian/data/zooniverse")
# input_path = Path("results/")
# Location for the analysis Results
output_path = Path(input_path.joinpath(f"2024_04_16_analysis").joinpath(phase_tag))
output_path.mkdir(exist_ok=True, parents=True)
reprocess = False # if True, the raw classification data is reprocessed. If False, the data is loaded from disk

# Location for plots
# output_plot_path = output_path.joinpath("plots")
# output_plot_path.mkdir(parents=True, exist_ok=True)
output_plot_path = None # do not plot

## Look into the config
This Config points to all files necessary for the analysis + the result files

In [3]:
from zooniverse.config import get_config

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)
config

{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-2ndphase.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2-T2-GS-results-5th-0s.csv'),
 'image_source': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2nd launch_without_prefix'),
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/yes_no_dataset_Iguanas 2nd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/flat_dataset_Iguanas 2nd launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/flat_panoptes_points_Iguanas 2nd launch.csv'),
 'panoptes_question': Pos

In [4]:
config["flat_panoptes_points"]

PosixPath('/Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/flat_panoptes_points_Iguanas 2nd launch.csv')

## Look at the data


In [5]:

df_flat_panoptes_points = pd.read_csv(config["flat_panoptes_points"])
df_flat_panoptes_points["phase"] = phase_tag

df_flat_panoptes_points

  df_flat_panoptes_points = pd.read_csv(config["flat_panoptes_points"])


Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y,phase
0,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,557,529,Iguanas 2nd launch
1,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,604,472,Iguanas 2nd launch
2,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,1068,917,Iguanas 2nd launch
3,406983766,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-03 23:26:39 UTC,72335747,EPS04-1_87.jpg,116,815,Iguanas 2nd launch
4,406983766,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-03 23:26:39 UTC,72335747,EPS04-1_87.jpg,173,850,Iguanas 2nd launch
...,...,...,...,...,...,...,...,...,...,...,...,...
147418,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,97,117,Iguanas 2nd launch
147419,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,113,186,Iguanas 2nd launch
147420,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,87,167,Iguanas 2nd launch
147421,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,443,124,Iguanas 2nd launch


### Filter User if necessary and Marks


In [6]:
# Use the panoptes Extraction instead of the custom extraction
df_merged_dataset = df_flat_panoptes_points

### Are there anonymous users in the data?
There should be some because anonymous users are kept.

In [7]:
df_merged_dataset[df_merged_dataset.user_id.isnull().values]

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y,phase
128822,409274355,3622446dcfa0cb24b026e40766d1382b,,20600,94.166,T2,2022-04-15 08:40:28 UTC,72332800,EGI01-1-2_111.jpg,1019,863,Iguanas 2nd launch
128823,394601142,22f2858fc7b6a25d0d8910cb081899f0,,20600,93.166,T2,2022-02-04 18:24:39 UTC,72332804,EGI01-1-2_24.jpg,588,209,Iguanas 2nd launch
128824,411354835,b9cfbb298b4d527dec7191c58d133d32,,20600,94.166,T2,2022-04-26 14:09:57 UTC,72332804,EGI01-1-2_24.jpg,604,219,Iguanas 2nd launch
128825,411545755,fe60792452f428d3a02cf49a4e4e9448,,20600,94.166,T2,2022-04-27 01:47:20 UTC,72332804,EGI01-1-2_24.jpg,597,208,Iguanas 2nd launch
128826,396273092,ac28275d5c75246a09823bc8a9b55c33,,20600,93.166,T2,2022-02-11 11:18:06 UTC,72332808,EGI01-1-2_38.jpg,1471,-659,Iguanas 2nd launch
...,...,...,...,...,...,...,...,...,...,...,...,...
147418,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,97,117,Iguanas 2nd launch
147419,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,113,186,Iguanas 2nd launch
147420,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,87,167,Iguanas 2nd launch
147421,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,443,124,Iguanas 2nd launch


In [8]:
# Amount of images
df_merged_dataset["subject_id"].nunique()

4321

In [9]:
## After filtering there
df_merged_dataset

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y,phase
0,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,557,529,Iguanas 2nd launch
1,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,604,472,Iguanas 2nd launch
2,406989334,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-04 00:07:09 UTC,72335718,EPS04-1_50.jpg,1068,917,Iguanas 2nd launch
3,406983766,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-03 23:26:39 UTC,72335747,EPS04-1_87.jpg,116,815,Iguanas 2nd launch
4,406983766,da233c785cfd7c42279d2f2b632bbc4f,001d00e0739694888f7dc3471fc76e5c,20600,94.166,T2,2022-04-03 23:26:39 UTC,72335747,EPS04-1_87.jpg,173,850,Iguanas 2nd launch
...,...,...,...,...,...,...,...,...,...,...,...,...
147418,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,97,117,Iguanas 2nd launch
147419,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,113,186,Iguanas 2nd launch
147420,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,87,167,Iguanas 2nd launch
147421,397662853,9432ca625cbcee771e54db4c895908d3,,20600,93.166,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,443,124,Iguanas 2nd launch


In [10]:
# how many marks per user
df_merged_dataset[["user_id", "user_name", "x"]].groupby("user_name").count()

Unnamed: 0_level_0,user_id,x
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1
0017d25c56dde700371a7b31c37d1bb4,10,10
001a4ac0175c2415dcffc072cd58094b,0,2
001a7e5e989807349d0dc5e90eaebf83,63,63
002d783ad543c3030deb5b904b99d044,6,6
009a3b2a0da3c28e726e27f09d201582,5,5
...,...,...
ffa31db924ebf1353aaac21d4c6a3ff1,0,3
ffac078ce61b035a5a0af18d4a9f53f8,9,9
ffdbd721ffbb85f7622630c03c4c5912,20,20
ffe50577f6ab39e113b36e68f9ea7edf,7,7


## Clustering

### Basic Statics like mean, median, mode

In [11]:
from zooniverse.analysis import get_mark_overview

basic_stats = []

for image_name, df_image_name in df_merged_dataset.groupby("subject_id"):
    annotations_count = get_mark_overview(df_image_name)

    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=df_image_name.iloc[0]["image_name"], 
                                                         subject_id=df_image_name.iloc[0]["subject_id"])

    ### basic statistics like mean, median, mode
    basic_stats.append(annotations_count_stats)
    

df_basic_stats = pd.DataFrame(basic_stats)    
df_comparison = df_basic_stats


In [12]:
df_basic_stats

Unnamed: 0,image_name,subject_id,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count
0,EGI01-1-1_11.jpg,72332772,2.0,2.00,2,2,[2],2.0,1,2,[2]
1,EGI01-1-1_19.jpg,72332775,1.0,1.00,1,1,[1],1.0,1,1,[1]
2,EGI01-1-1_54.jpg,72332788,1.0,1.00,1,1,[1],1.0,1,1,[1]
3,EGI01-1-1_69.jpg,72332794,1.0,1.00,1,1,[1],1.0,1,1,[1]
4,EGI01-1-2_110.jpg,72332799,12.0,12.00,12,12,[12],12.0,1,12,[12]
...,...,...,...,...,...,...,...,...,...,...,...
4316,ESCG02-2_59.jpg,72373345,1.0,1.00,1,1,[1],1.0,2,2,"[1, 1]"
4317,ESCG02-2_62.jpg,72373347,1.0,1.00,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
4318,ESCG02-2_63.jpg,72373348,1.0,1.25,1,1,[1],1.0,16,20,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]"
4319,ESCG02-2_64.jpg,72373350,5.0,5.47,5,5,[5],5.0,15,82,"[1, 2, 3, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 10]"


In [13]:
# There might be records with too few annotations if they were not removed before
df_comparison[(df_comparison.sum_annotations_count < 5)].sort_values(by="users", ascending=False)

Unnamed: 0,image_name,subject_id,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count
4317,ESCG02-2_62.jpg,72373347,1.0,1.0,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
1758,EPS08-2_183.jpg,72336856,1.0,1.0,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
1823,EPS08-2_340.jpg,72336954,1.0,1.0,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
3570,FSCE-1.1-2_45.jpg,72340392,1.0,1.0,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
1809,EPS08-2_318.jpg,72336934,1.0,1.0,1,1,[1],1.0,4,4,"[1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...
1661,EPS08-1_237.jpg,72336700,1.0,1.0,1,1,[1],1.0,1,1,[1]
1651,EPS08-1_212.jpg,72336679,2.0,2.0,2,2,[2],2.0,1,2,[2]
1646,EPS08-1_207.jpg,72336674,2.0,2.0,2,2,[2],2.0,1,2,[2]
1645,EPS08-1_201.jpg,72336668,1.0,1.0,1,1,[1],1.0,1,1,[1]


In [14]:
# Fill NaN values with 0
df_comparison.fillna(0, inplace=True)


### DBSCAN clustering and take the variant with the best silouette score for each image
Grid search for the best silouette score among the permutations of eps and min_samples values.

In [15]:
## sort by silouette score sorting + dbscan_count
from zooniverse.analysis import compare_dbscan

eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
min_samples_variants = [3, 5, 8, 10]
if debug:
    eps_variants = [0.3]
    min_samples_variants = [3]
params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]

db_scan_results = {}
db_scan_best_results = []
for image_name, df_image_name in df_merged_dataset.groupby("subject_id"):
    # check for at least 5 marks
    if df_image_name.shape[0] >= 5:
        dbscan_localization = compare_dbscan(
            params=params,
            df_flat=df_image_name,
            output_plot_path=output_plot_path,
            plot=show_plots
        )
    
        db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
        
        # DBSCAN tends to classify all points as noise if min_samples is too high. Often only a single user marked an iguana.
        # Sillouette Scoring needs a minimum of 2 clusters
        # if there are points in decent radius they will belong to a cluster
        if pd.DataFrame(dbscan_localization).dbscan_count.max() == 1:
            db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
            # If two or more cluster seem to exists take ones with the best Silouette score
        else:  
            # take the best result by silouette score if there are more clusters then 1
            db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values(["dbscan_silouette_score", "dbscan_count"], ascending=[False, False]).iloc[0])
    
df_dbscan_localization = pd.concat([*db_scan_results.values()])
df_scan_best_results = pd.DataFrame(db_scan_best_results)


  df_dbscan_localization = pd.concat([*db_scan_results.values()])


In [16]:
df_scan_best_results

Unnamed: 0,image_name,subject_id,dbscan_count,dbscan_noise,dbscan_silouette_score,dbscan_BIC_score,eps,min_samples
24,EGI01-1-2_110.jpg,72332799,1,9,,,0.50,3
24,EGI01-1-2_121.jpg,72332802,5,2,0.662195,,0.50,3
20,EGI01-1-2_122.jpg,72332803,2,3,0.879223,,0.40,3
24,EGI01-1-2_24.jpg,72332804,1,7,,,0.50,3
0,EGI01-1-2_62.jpg,72332814,1,18,,,0.01,3
...,...,...,...,...,...,...,...,...
12,ESCG02-2_53.jpg,72373336,9,6,0.792117,,0.20,3
0,ESCG02-2_56.jpg,72373341,0,5,,,0.01,3
8,ESCG02-2_63.jpg,72373348,2,7,0.158355,,0.10,3
12,ESCG02-2_64.jpg,72373350,4,7,0.801983,,0.20,3


Here it can be seen why the silouette score is difficult because it is often undefined.

In [17]:
## save the combinations of parameters, which maximized the silouette score.
df_dbscan_localization.to_csv(config["dbscan_hyperparam_grid"])


In [18]:
df_scan_best_results.rename(columns={"dbscan_count": "dbscan_count_sil"}, inplace=True)


df_comparison = df_comparison.merge(df_scan_best_results.drop(["image_name"], axis=1), on='subject_id', how='left')

In [19]:
# drop columns which are not interesting here
df_comparison = df_comparison.drop(["dbscan_noise", "dbscan_silouette_score", "eps", "min_samples", "dbscan_BIC_score", "with_noise", "bic_avg"], axis=1, errors="ignore")
df_comparison

Unnamed: 0,image_name,subject_id,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count,dbscan_count_sil
0,EGI01-1-1_11.jpg,72332772,2.0,2.00,2,2,[2],2.0,1,2,[2],
1,EGI01-1-1_19.jpg,72332775,1.0,1.00,1,1,[1],1.0,1,1,[1],
2,EGI01-1-1_54.jpg,72332788,1.0,1.00,1,1,[1],1.0,1,1,[1],
3,EGI01-1-1_69.jpg,72332794,1.0,1.00,1,1,[1],1.0,1,1,[1],
4,EGI01-1-2_110.jpg,72332799,12.0,12.00,12,12,[12],12.0,1,12,[12],1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4316,ESCG02-2_59.jpg,72373345,1.0,1.00,1,1,[1],1.0,2,2,"[1, 1]",
4317,ESCG02-2_62.jpg,72373347,1.0,1.00,1,1,[1],1.0,4,4,"[1, 1, 1, 1]",
4318,ESCG02-2_63.jpg,72373348,1.0,1.25,1,1,[1],1.0,16,20,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]",2.0
4319,ESCG02-2_64.jpg,72373350,5.0,5.47,5,5,[5],5.0,15,82,"[1, 2, 3, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 10]",4.0


### HDBSCAN clustering for each image


In [20]:
from zooniverse.analysis import hdbscan

hdbscan_values = []

eps_variants = [0.0] # 0 is the default
min_cluster_sizes = [5] # 5 is the default


for image_name, df_image_name in df_merged_dataset.groupby("subject_id"):
    annotations_count = get_mark_overview(df_image_name)
    
    image_name=df_image_name.iloc[0]["image_name"]
    subject_id=df_image_name.iloc[0]["subject_id"]
    
    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=image_name, 
                                                         subject_id=subject_id)
    
    # if less than min_cluster_sizes points are available clustering makes no sense
    if df_image_name.shape[0] >= 5: # If num_samples is 5 for the min_cluster_size is 5 there is no point in passing data with less than 5 samples
        params = [(eps, min_cluster_size, max_cluster_size) 
                    for eps in eps_variants
                    for min_cluster_size in min_cluster_sizes
                    for max_cluster_size in [None]
              ]

        df_hdbscan = hdbscan(df_marks=df_image_name[["x", "y"]],
                                     output_path=output_plot_path,
                                     plot=show_plots,
                                     show=show_plots,
                                     image_name=image_name,
                                     subject_id=subject_id,
                                     params=params)
        hdbscan_values.append(df_hdbscan)


df_hdbscan = pd.concat(hdbscan_values)



In [21]:
df_hdbscan.drop(["with_noise"], axis=1, inplace=True)
df_hdbscan

Unnamed: 0,image_name,subject_id,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,EGI01-1-2_110.jpg,72332799,1,0.0,5,,6
0,EGI01-1-2_121.jpg,72332802,1,0.0,5,,6
0,EGI01-1-2_122.jpg,72332803,3,0.0,5,,0
0,EGI01-1-2_24.jpg,72332804,1,0.0,5,,8
0,EGI01-1-2_62.jpg,72332814,1,0.0,5,,17
...,...,...,...,...,...,...,...
0,ESCG02-2_53.jpg,72373336,10,0.0,5,,5
0,ESCG02-2_56.jpg,72373341,1,0.0,5,,0
0,ESCG02-2_63.jpg,72373348,1,0.0,5,,14
0,ESCG02-2_64.jpg,72373350,6,0.0,5,,14


In [22]:
## Statistics for all images
df_comparison = df_comparison.merge(df_hdbscan.drop(["image_name"], axis=1), on=['subject_id'], how='left')
df_comparison.fillna(0, inplace=True)
df_comparison

Unnamed: 0,image_name,subject_id,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,EGI01-1-1_11.jpg,72332772,2.0,2.00,2,2,[2],2.0,1,2,[2],0.0,0.0,0.0,0.0,0,0.0
1,EGI01-1-1_19.jpg,72332775,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
2,EGI01-1-1_54.jpg,72332788,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
3,EGI01-1-1_69.jpg,72332794,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
4,EGI01-1-2_110.jpg,72332799,12.0,12.00,12,12,[12],12.0,1,12,[12],1.0,1.0,0.0,5.0,0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4316,ESCG02-2_59.jpg,72373345,1.0,1.00,1,1,[1],1.0,2,2,"[1, 1]",0.0,0.0,0.0,0.0,0,0.0
4317,ESCG02-2_62.jpg,72373347,1.0,1.00,1,1,[1],1.0,4,4,"[1, 1, 1, 1]",0.0,0.0,0.0,0.0,0,0.0
4318,ESCG02-2_63.jpg,72373348,1.0,1.25,1,1,[1],1.0,16,20,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]",2.0,1.0,0.0,5.0,0,14.0
4319,ESCG02-2_64.jpg,72373350,5.0,5.47,5,5,[5],5.0,15,82,"[1, 2, 3, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 10]",4.0,6.0,0.0,5.0,0,14.0


In [23]:
df_panoptes_question = pd.read_csv(config["panoptes_question"])
df_panoptes_question

Unnamed: 0,subject_id,data.no,data.yes
0,72332768,22.0,0.0
1,72332769,21.0,0.0
2,72332770,21.0,0.0
3,72332771,20.0,1.0
4,72332772,21.0,2.0
...,...,...,...
9092,72373345,19.0,2.0
9093,72373347,18.0,4.0
9094,72373348,5.0,16.0
9095,72373350,6.0,15.0


In [24]:
df_comparison.to_csv(config["comparison_dataset"])
print(f"saved {config['comparison_dataset']}")

saved /Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/Iguanas 2nd launch_method_comparison.csv


## Join the Expert Dataset for comparison
This dataset contains total_counts found by experts. 

In [34]:
df_expert_count = pd.read_csv(config["goldstandard_data"], sep=";")
# check what is in there
df_expert_count[df_expert_count.image_name == "ESCG02-1_19.jpg"]

Unnamed: 0,subspecies,island,site_name,subject_group,image_name,subject_id,presence_absence,count_male-lek,count_male-no-lek,count_others,count_partial,count_total,quality,condition,comment
179,A. c. venustissimus,Española,South Coast G,South Coast G,ESCG02-1_19.jpg,72373250,Y,2,4,29,2,35,Good,Visible,Three iguanas only Andres sees; concenso 35 ig...


In [26]:
## join the gold standard data to the basic stats
df_comparison_expert = df_expert_count[["site_name", "subject_group", "image_name", "subject_id", "count_total"]].merge(df_comparison.drop(["image_name"], axis=1), on='subject_id', how='left')

columns_to_fill = ['median_count', 'mean_count', 'mode_min_count', 'mode_max_count',
       'mode_count', 'mode_count_avg', 'users', 'sum_annotations_count',
        'dbscan_count_sil', 'HDBSCAN_count']
df_comparison_expert[columns_to_fill] = df_comparison_expert[columns_to_fill].fillna(0)

df_comparison_expert.sort_values(by="median_count", ascending=False)

Unnamed: 0,site_name,subject_group,image_name,subject_id,count_total,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
179,South Coast G,South Coast G,ESCG02-1_19.jpg,72373250,35,30.5,29.90,29.0,35.0,"[29, 30, 31, 35]",31.250000,20.0,598.0,"[1, 14, 21, 23, 26, 28, 29, 29, 30, 30, 31, 31...",28.0,33.0,0.0,5.0,0.0,33.0
1,Gardner Islet,Gardner Islet,EGI01-1-2_121.jpg,72332802,0,27.0,27.00,27.0,27.0,[27],27.000000,1.0,27.0,[27],5.0,1.0,0.0,5.0,0.0,6.0
356,South Coast E,South Coast E,FSCE-1.1-2_50.jpg,72340402,0,26.0,26.00,26.0,26.0,[26],26.000000,1.0,26.0,[26],3.0,1.0,0.0,5.0,0.0,7.0
296,Punta Ayora,Punta Ayora,FPA04_151.jpg,72339662,24,22.0,19.32,24.0,24.0,[24],24.000000,19.0,367.0,"[2, 4, 12, 13, 15, 16, 21, 21, 22, 22, 23, 23,...",25.0,25.0,0.0,5.0,0.0,15.0
205,South Coast H,South Coast H,ESCH02-1_112.jpg,72337968,20,17.5,17.40,14.0,21.0,"[14, 20, 21]",18.333333,20.0,348.0,"[11, 12, 14, 14, 14, 14, 16, 16, 16, 17, 18, 2...",18.0,20.0,0.0,5.0,0.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,Montura,Montura,FMO02-2_157.jpg,72338536,0,0.0,0.00,0.0,0.0,0,0.000000,0.0,0.0,,0.0,0.0,,,,
245,Montura,Montura,FMO04-1_37.jpg,72338729,0,0.0,0.00,0.0,0.0,0,0.000000,0.0,0.0,,0.0,0.0,,,,
246,Montura,Montura,FMO04-1_53.jpg,72338735,0,0.0,0.00,0.0,0.0,0,0.000000,0.0,0.0,,0.0,0.0,,,,
247,Montura,Montura,FMO04-1_54.jpg,72338736,0,0.0,0.00,0.0,0.0,0,0.000000,0.0,0.0,,0.0,0.0,,,,


In [27]:
print(f"df_comparison_expert: {config['comparison_dataset_expert']}")
df_comparison_expert.to_csv(config["comparison_dataset_expert"])

df_comparison_expert: /Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/Iguanas 2nd launch_method_comparison_expert.csv


In [28]:
df_comparison_yes_no = df_comparison_expert.merge(df_panoptes_question, on="subject_id", how="left")
df_comparison_yes_no

Unnamed: 0,site_name,subject_group,image_name,subject_id,count_total,median_count,mean_count,mode_min_count,mode_max_count,mode_count,...,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points,data.no,data.yes
0,Gardner Islet,Gardner Islet,EGI01-1-2_120.jpg,72332801,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,23.0,0.0
1,Gardner Islet,Gardner Islet,EGI01-1-2_121.jpg,72332802,0,27.0,27.00,27.0,27.0,[27],...,27.0,[27],5.0,1.0,0.0,5.0,0.0,6.0,19.0,1.0
2,Gardner Islet,Gardner Islet,EGI01-2-2_116.jpg,72332876,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
3,Gardner Islet,Gardner Islet,EGI02-2_114.jpg,72332972,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
4,Gardner Islet,Gardner Islet,EGI02-2_36.jpg,72332999,0,1.0,1.00,1.0,1.0,[1],...,1.0,[1],0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,South Coast J,South Coast J,FSCJ02-2_243.jpg,72341576,3,1.0,1.53,1.0,1.0,[1],...,23.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4]",2.0,2.0,0.0,5.0,0.0,2.0,6.0,15.0
452,South Coast J,South Coast J,FSCJ02-2_251.jpg,72341585,1,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
453,South Coast J,South Coast J,FSCJ02-2_269.jpg,72341593,5,4.0,5.05,4.0,5.0,"[4, 5]",...,96.0,"[2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, ...",7.0,7.0,0.0,5.0,0.0,10.0,3.0,19.0
454,South Coast J,South Coast J,FSCJ02-2_59.jpg,72341609,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,22.0,0.0


In [29]:
df_comparison_yes_no.to_csv(config["comparison_dataset_yes_no"])
df_comparison_yes_no

Unnamed: 0,site_name,subject_group,image_name,subject_id,count_total,median_count,mean_count,mode_min_count,mode_max_count,mode_count,...,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points,data.no,data.yes
0,Gardner Islet,Gardner Islet,EGI01-1-2_120.jpg,72332801,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,23.0,0.0
1,Gardner Islet,Gardner Islet,EGI01-1-2_121.jpg,72332802,0,27.0,27.00,27.0,27.0,[27],...,27.0,[27],5.0,1.0,0.0,5.0,0.0,6.0,19.0,1.0
2,Gardner Islet,Gardner Islet,EGI01-2-2_116.jpg,72332876,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
3,Gardner Islet,Gardner Islet,EGI02-2_114.jpg,72332972,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
4,Gardner Islet,Gardner Islet,EGI02-2_36.jpg,72332999,0,1.0,1.00,1.0,1.0,[1],...,1.0,[1],0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,South Coast J,South Coast J,FSCJ02-2_243.jpg,72341576,3,1.0,1.53,1.0,1.0,[1],...,23.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4]",2.0,2.0,0.0,5.0,0.0,2.0,6.0,15.0
452,South Coast J,South Coast J,FSCJ02-2_251.jpg,72341585,1,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,21.0,0.0
453,South Coast J,South Coast J,FSCJ02-2_269.jpg,72341593,5,4.0,5.05,4.0,5.0,"[4, 5]",...,96.0,"[2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, ...",7.0,7.0,0.0,5.0,0.0,10.0,3.0,19.0
454,South Coast J,South Coast J,FSCJ02-2_59.jpg,72341609,0,0.0,0.00,0.0,0.0,0,...,0.0,,0.0,0.0,,,,,22.0,0.0


# A look into the results

## The sum of the clustering
What is the sum of the methods

In [30]:
df_comparison_yes_no[df_comparison_yes_no["data.yes"] >= 5]

Unnamed: 0,site_name,subject_group,image_name,subject_id,count_total,median_count,mean_count,mode_min_count,mode_max_count,mode_count,...,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points,data.no,data.yes
10,Gardner Islet,Gardner Islet,EGI04-1_141.jpg,72333244,3,1.0,1.64,1.0,1.0,[1],...,36.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...",2.0,3.0,0.0,5.0,0.0,0.0,0.0,23.0
14,Gardner Islet,Gardner Islet,EGI08-2_78.jpg,72333835,1,1.0,1.00,1.0,1.0,[1],...,12.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1.0,1.0,0.0,5.0,0.0,6.0,10.0,12.0
38,Punta Suarez,Punta Suarez,EPS01_28.jpg,72335422,1,1.0,1.50,1.0,1.0,[1],...,12.0,"[1, 1, 1, 1, 1, 1, 1, 5]",2.0,1.0,0.0,5.0,0.0,7.0,14.0,8.0
39,Punta Suarez,Punta Suarez,EPS01_41.jpg,72335435,1,2.0,1.67,2.0,2.0,[2],...,25.0,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",2.0,2.0,0.0,5.0,0.0,0.0,5.0,17.0
41,Punta Suarez,Punta Suarez,EPS02_208.jpg,72335541,2,2.0,2.19,2.0,2.0,[2],...,46.0,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4.0,2.0,0.0,5.0,0.0,7.0,1.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,South Coast J,South Coast J,FSCJ02-2_220.jpg,72341559,1,1.0,1.25,1.0,1.0,[1],...,25.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2.0,1.0,0.0,5.0,0.0,20.0,0.0,21.0
450,South Coast J,South Coast J,FSCJ02-2_233.jpg,72341570,1,1.0,1.33,1.0,1.0,[1],...,12.0,"[1, 1, 1, 1, 1, 1, 1, 1, 4]",1.0,1.0,0.0,5.0,0.0,7.0,9.0,12.0
451,South Coast J,South Coast J,FSCJ02-2_243.jpg,72341576,3,1.0,1.53,1.0,1.0,[1],...,23.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4]",2.0,2.0,0.0,5.0,0.0,2.0,6.0,15.0
453,South Coast J,South Coast J,FSCJ02-2_269.jpg,72341593,5,4.0,5.05,4.0,5.0,"[4, 5]",...,96.0,"[2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, ...",7.0,7.0,0.0,5.0,0.0,10.0,3.0,19.0


In [31]:
# Take only the subjects which have more or equal 5 yes votes.
df_method_sums = df_comparison_yes_no[df_comparison_yes_no["data.yes"] >= 5][["count_total", "median_count", "mean_count", "mode_min_count", "mode_max_count", "mode_count_avg", "mode_min_count", "mode_max_count", "dbscan_count_sil", "HDBSCAN_count"]].sum()
df_method_sums.to_csv(config["method_sums"])
df_method_sums

count_total         589.000000
median_count        484.000000
mean_count          499.970000
mode_min_count      467.000000
mode_max_count      512.000000
mode_count_avg      487.833333
mode_min_count      467.000000
mode_max_count      512.000000
dbscan_count_sil    517.000000
HDBSCAN_count       544.000000
dtype: float64

## Compare the numbers
The counts are only for images which were in the dataset after filtering.

### Sum of all the Methods

In [32]:
print(f"{config['method_sums'].name}")
df_method_sums = pd.read_csv(config["method_sums"])
df_method_sums

Iguanas 2nd launch_method_sums.csv


Unnamed: 0.1,Unnamed: 0,0
0,count_total,589.0
1,median_count,484.0
2,mean_count,499.97
3,mode_min_count,467.0
4,mode_max_count,512.0
5,mode_count_avg,487.833333
6,mode_min_count,467.0
7,mode_max_count,512.0
8,dbscan_count_sil,517.0
9,HDBSCAN_count,544.0


### Comparison per Image Level

In [33]:
print(f"load {config['comparison_dataset']}")
pd.read_csv(config["comparison_dataset"])

load /Users/christian/data/zooniverse/2024_04_16_analysis/Iguanas 2nd launch/Iguanas 2nd launch_method_comparison.csv


Unnamed: 0.1,Unnamed: 0,image_name,subject_id,median_count,mean_count,mode_min_count,mode_max_count,mode_count,mode_count_avg,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,0,EGI01-1-1_11.jpg,72332772,2.0,2.00,2,2,[2],2.0,1,2,[2],0.0,0.0,0.0,0.0,0,0.0
1,1,EGI01-1-1_19.jpg,72332775,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
2,2,EGI01-1-1_54.jpg,72332788,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
3,3,EGI01-1-1_69.jpg,72332794,1.0,1.00,1,1,[1],1.0,1,1,[1],0.0,0.0,0.0,0.0,0,0.0
4,4,EGI01-1-2_110.jpg,72332799,12.0,12.00,12,12,[12],12.0,1,12,[12],1.0,1.0,0.0,5.0,0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4316,4316,ESCG02-2_59.jpg,72373345,1.0,1.00,1,1,[1],1.0,2,2,"[1, 1]",0.0,0.0,0.0,0.0,0,0.0
4317,4317,ESCG02-2_62.jpg,72373347,1.0,1.00,1,1,[1],1.0,4,4,"[1, 1, 1, 1]",0.0,0.0,0.0,0.0,0,0.0
4318,4318,ESCG02-2_63.jpg,72373348,1.0,1.25,1,1,[1],1.0,16,20,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]",2.0,1.0,0.0,5.0,0,14.0
4319,4319,ESCG02-2_64.jpg,72373350,5.0,5.47,5,5,[5],5.0,15,82,"[1, 2, 3, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 10]",4.0,6.0,0.0,5.0,0,14.0
