# Clustering Zooniverse Marks to count Iguanas
Count all the iguanas in the images by clustering the marks from the zooniverse volunteers.
This does not compare the results to the gold standard.

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("./")
sys.path.append("./zooniverse")

## Intro
### Retrieve a Classification report from Zooniverse
Export the classification export from your zooniverse project.
https://www.zooniverse.org/lab/11905/data-exports

This leads to a csv file which can be used for the analysis which should be renamed to `iguanas-from-above-classifications.csv` and placed in the `input_path` directory.
The methods do not use methods from zooniverse. It is a custom implementation.

An alternative would be to use the [code provided by zooniverse](https://github.com/zooniverse/Data-digging/tree/master/notebooks_ProcessExports)
[Bird Count Example](https://github.com/zooniverse/Data-digging/blob/master/scripts_ProjectExamples/seabirdwatch/bird_count.py)

This notebooks assumes the data is flat and prepared. An alternative format would be the [cesar aggregation format](https://github.com/zooniverse/aggregation-for-caesar)

Used Methods are:

### DBSCAN 
It does not require the number of clusters to be specified. It is used here because, but has min_samples and eps as hyperparameters which need to be found. [Link](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html)
For finding eps and min_sample a simple **grid search** is used.
Additionally, DBSCAN not assume a specific shape for the clusters (K-means assumes clusters are gaussian in shape) even though we should assume that points around an iguana is gaussian shaped.

### HDBSCAN
It is an extension of DBSCAN which is more robust to hyperparameter settings as it finds epsilon and min_samples automatically. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html)

## Load the data

In [2]:
from pathlib import Path

import pandas as pd
from zooniverse.analysis import get_annotation_count_stats
from zooniverse.utils.filters import filter_df_user_threshold

## Input Path of all the data
input_path =Path("/Users/christian/data/zooniverse")

reprocess = False # if True, the raw classification data is reprocessed. If False, the data is loaded from disk

# Phase Selection
# phase_tag = "Iguanas 1st launch"
# phase_tag = "Iguanas 2nd launch"
phase_tag = "Iguanas 3rd launch"


debug = False # debugging with a smaller dataset
plot_diagrams = False # plot the diagrams to disk for the clustering methods
show_plots = False # show the plots in the notebook

user_threshold = 3 # None or a number, filter records which have less than these user interactions.

use_gold_standard_subset = None # Use no filtering

# Location for the analysis Results
output_path = Path(input_path.joinpath(f"2024_03_19_analysis").joinpath(phase_tag))
output_path.mkdir(exist_ok=True, parents=True)

# Location for plots
output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)


## Look into the config
This Config points to all files necessary for the analysis + the result files

In [3]:
from zooniverse.config import get_config_all

config = get_config_all(phase_tag=phase_tag, input_path=input_path, output_path=output_path)
config

{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': None,
 'gold_standard_image_subset': None,
 'image_source': None,
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/yes_no_dataset_Iguanas 3rd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/flat_dataset_Iguanas 3rd launch.csv'),
 'merged_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/flat_dataset_filtered_Iguanas 3rd launch.csv'),
 'comparison_dataset': PosixPath('/Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/Iguanas 3rd launch_method_comparison.csv'),
 'method_sums': PosixPath('/Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/Iguanas 3rd launch_method_sums.csv'),
 'dbscan_hyperparam_grid': PosixPath('/Users/christian/d

In [4]:
from zooniverse.utils.anonymize import UserAnonymizer
from zooniverse.utils.data_format import data_prep_all

if reprocess:
    ds_stats = data_prep_all(phase_tag=phase_tag, 
                         output_path=output_path, 
                         input_path=input_path, 
                         config=config)
    print(ds_stats)
    
    anonymizer = UserAnonymizer(config["flat_dataset"])
    anonymizer.anonymize_data()
    anonymizer.save_anonymized_data(config["flat_dataset"])
    
    anonymizer = UserAnonymizer(config["merged_dataset"])
    anonymizer.anonymize_data()
    anonymizer.save_anonymized_data(config["merged_dataset"])

In [5]:
if plot_diagrams == False:
    output_plot_path = None

# the flattened, filtered marks from zooniverse.
df_merged_dataset = pd.read_csv(config["merged_dataset"])



## Look at the data


In [6]:
## Look at the data
df_merged_dataset


Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
0,0,,,78861920,937.072327,58.004669,Adult Male not in a lek,Iguanas 3rd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
1,1,,,78861920,1094.908203,1027.116699,Adult Male not in a lek,Iguanas 3rd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
2,2,,,78861920,782.393250,62.739746,"Others (females, young males, juveniles)",Iguanas 3rd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
3,3,,,78861920,861.311157,275.818115,"Others (females, young males, juveniles)",Iguanas 3rd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
4,4,,,78861920,982.844727,353.157684,"Others (females, young males, juveniles)",Iguanas 3rd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
...,...,...,...,...,...,...,...,...,...,...
106332,113868,GEB02,GEB02-3_48.jpg,78922637,857.985596,933.296021,"Others (females, young males, juveniles)",Iguanas 3rd launch,,056c845097ec85c235b306eee6a8b7ac
106333,113869,GEB02,GEB02-3_115.jpg,78922583,600.195435,856.913757,"Others (females, young males, juveniles)",Iguanas 3rd launch,,b5dd00492abed0932dee745a2ef255bd
106334,113870,GEB02,GEB02-3_197.jpg,78922625,485.176636,426.866882,"Others (females, young males, juveniles)",Iguanas 3rd launch,,b5dd00492abed0932dee745a2ef255bd
106335,113871,GEB02,GEB02-3_145.jpg,78922607,939.968628,88.864525,"Others (females, young males, juveniles)",Iguanas 3rd launch,,76e50c3b5d561e911f31905321b3a20b


### Filter User if necessary and Marks


In [7]:
print(f"Before filtering: {df_merged_dataset.subject_id.nunique()}")
# There images in which some people said there are iguanas, but then didn't mark them. Clustering with fewer than 3 dots doesn't make sense
if user_threshold is not None:
    print(f"filtering records which have less than {user_threshold} interactions.")
    df_merged_dataset = filter_df_user_threshold(df_merged_dataset, user_threshold=user_threshold)
    
    
from zooniverse.utils.filters import filter_remove_marks
# Check if partials are still in the data. There shouldn't be any
df_merged_dataset = filter_remove_marks(df_merged_dataset)




Before filtering: 7666
filtering records which have less than 3 interactions.


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



### Are there anonymous users in the data?
There should be some.

In [8]:
df_merged_dataset[df_merged_dataset.user_id.isnull().values]

Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
101280,108395,,EGI04-1_141.jpg,78963577,178.789062,142.882812,Adult Male not in a lek,Iguanas 3rd launch,,68ff1efc2df6568710cab521d1095e72
101281,108396,,EGI04-1_141.jpg,78963577,80.406250,239.140625,"Others (females, young males, juveniles)",Iguanas 3rd launch,,68ff1efc2df6568710cab521d1095e72
92404,98952,,EGI08-2_78.jpg,78963961,263.500549,597.307312,Adult Male not in a lek,Iguanas 3rd launch,,d97aa31a8ccc671b0b245f1030857543
32482,35456,,EPS01_28.jpg,78963899,404.259186,676.611877,Adult Male not in a lek,Iguanas 3rd launch,,64865a585c8844c6779ac3fb4acb0417
38206,41612,,EPS01_28.jpg,78963899,402.240112,679.267700,Adult Male not in a lek,Iguanas 3rd launch,,62762d334049b0001bfecf5937f62f32
...,...,...,...,...,...,...,...,...,...,...
10705,11577,WestCoast,PWC03-2-4_82.jpg,78963461,292.804230,1022.759766,Adult Male not in a lek,Iguanas 3rd launch,,77d1529c3ba0afd003581e1975243c28
23628,26014,WestCoast,PWC03-2-4_82.jpg,78963461,300.249969,1024.011719,"Others (females, young males, juveniles)",Iguanas 3rd launch,,ef6c7ca056c059911cdf140c2e755b22
13624,14905,WestCoast,PWC03-2-4_85.jpg,78963464,548.868530,51.378170,"Others (females, young males, juveniles)",Iguanas 3rd launch,,aef564f61364050072f0b1df2080d03d
44738,48532,WestCoast,PWC03-2-4_85.jpg,78963464,548.724854,55.921528,Adult Male not in a lek,Iguanas 3rd launch,,e7ee1905cdf749f2bf096393496876c2


In [9]:
# Amount of images
df_merged_dataset["subject_id"].nunique()

1216

In [10]:
## After filtering there
df_merged_dataset

Unnamed: 0.1,Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
13048,14294,,EGI04-1_141.jpg,78963577,174.377457,140.835648,Adult Male not in a lek,Iguanas 3rd launch,2b43a5981ef5e0c9345e32317105e429,c46159a74cc1f058c9cee2575138ae99
24783,27310,,EGI04-1_141.jpg,78963577,169.812500,141.750000,Adult Male not in a lek,Iguanas 3rd launch,0156eebf62383fedf03616142d065d39,dcaeb0f97c49f8f7fd4bd7778a65161b
25186,27760,,EGI04-1_141.jpg,78963577,176.250000,143.966675,"Others (females, young males, juveniles)",Iguanas 3rd launch,faa0df7dda7937bb487653ebfc56b546,a8054858afefe524394af1961fcc441a
25187,27761,,EGI04-1_141.jpg,78963577,81.250000,238.966675,"Others (females, young males, juveniles)",Iguanas 3rd launch,faa0df7dda7937bb487653ebfc56b546,a8054858afefe524394af1961fcc441a
29181,31981,,EGI04-1_141.jpg,78963577,173.804688,139.500000,Adult Male with a lek,Iguanas 3rd launch,e6f340f064f94a663878bc4b42405d26,5e399f2f7699370439f69cad583f99e3
...,...,...,...,...,...,...,...,...,...,...
66303,72154,WestCoast,PWC03-2-5_114.jpg,78963477,24.796158,602.070312,"Others (females, young males, juveniles)",Iguanas 3rd launch,2d2678d41ae210433fa8925fb4051ea7,820796dac33f79ab7a4e9048ee5b633f
92664,99222,WestCoast,PWC03-2-5_114.jpg,78963477,23.494946,604.926392,"Others (females, young males, juveniles)",Iguanas 3rd launch,,fdc9363749bb94c37bcd4fc2acfe48e5
92892,99476,WestCoast,PWC03-2-5_114.jpg,78963477,25.454941,601.314026,"Others (females, young males, juveniles)",Iguanas 3rd launch,c11a32c827347926881e5e1db75cb701,691500ccebe2131f83809524df652f87
98013,104954,WestCoast,PWC03-2-5_114.jpg,78963477,26.607056,604.818420,"Others (females, young males, juveniles)",Iguanas 3rd launch,a51bf56b64b52219bf601d3ded4cd557,7b2ac1ff690264fb36564345b7151831


In [11]:
# how many marks per user
df_merged_dataset[["user_id", "x"]].groupby("user_id").count().head()

Unnamed: 0_level_0,x
user_id,Unnamed: 1_level_1
002400ef36f94c5e2a6ccc49859923d8,1
00346ebf6ae91002059d21fa7090e46b,23
00793f958618604613957f477a017037,4
009387ec0ca4c7a3d03714180e93182f,3
00fe829914ab4a2c86c538876fb8469a,17


In [12]:
# How many images are left in the zooniverse dataset?
df_merged_dataset["subject_id"].nunique()

1216

## Clustering

### Basic Statics like mean, median, mode

In [13]:
from zooniverse.analysis import get_mark_overview

basic_stats = []

for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    annotations_count = get_mark_overview(df_image_name)

    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=df_image_name.iloc[0]["image_name"])

    ### basic statistics like mean, median
    basic_stats.append(annotations_count_stats)
    

df_basic_stats = pd.DataFrame(basic_stats)    
df_comparison = df_basic_stats


In [14]:
df_basic_stats

Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
0,EGI04-1_141.jpg,2.0,1.80,1,15,27,"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4]"
1,EGI08-2_78.jpg,1.0,1.00,1,8,8,"[1, 1, 1, 1, 1, 1, 1, 1]"
2,EPS01_28.jpg,1.0,1.00,1,9,9,"[1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,EPS01_41.jpg,2.0,1.89,2,19,36,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,EPS02_208.jpg,2.0,2.59,2,27,70,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
...,...,...,...,...,...,...,...
1211,PWC03-2-2_98.jpg,1.0,1.25,1,4,5,"[1, 1, 1, 2]"
1212,PWC03-2-4_66.jpg,2.0,1.85,2,13,24,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4]"
1213,PWC03-2-4_82.jpg,1.0,1.00,1,16,16,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1214,PWC03-2-4_85.jpg,1.0,1.25,1,12,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]"


In [15]:
# There might be records with too few annotations if they were not removed before
df_comparison[(df_comparison.sum_annotations_count < 5)].sort_values(by="users", ascending=False)

Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
31,ESCB-2.1-2_111.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
36,ESCB-2.1-2_52.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1167,PCIS03-5_88.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1122,PCIE08-6_49.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1108,PCIE07-3_153.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1100,PCIE06-4_125.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1072,PCIC01-3_167.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1050,PCI01-4_67.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
1023,MWBBC06-2_105.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"
949,MWBBA02-4_04.jpg,1.0,1.0,1,4,4,"[1, 1, 1, 1]"


In [16]:
# Fill NaN values with 0 because the errors can't be calculated otherwise
df_comparison.fillna(0, inplace=True)


In [17]:
df_comparison

Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count
0,EGI04-1_141.jpg,2.0,1.80,1,15,27,"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4]"
1,EGI08-2_78.jpg,1.0,1.00,1,8,8,"[1, 1, 1, 1, 1, 1, 1, 1]"
2,EPS01_28.jpg,1.0,1.00,1,9,9,"[1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,EPS01_41.jpg,2.0,1.89,2,19,36,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,EPS02_208.jpg,2.0,2.59,2,27,70,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
...,...,...,...,...,...,...,...
1211,PWC03-2-2_98.jpg,1.0,1.25,1,4,5,"[1, 1, 1, 2]"
1212,PWC03-2-4_66.jpg,2.0,1.85,2,13,24,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4]"
1213,PWC03-2-4_82.jpg,1.0,1.00,1,16,16,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1214,PWC03-2-4_85.jpg,1.0,1.25,1,12,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]"


In [18]:
df_comparison.sum()

image_name               EGI04-1_141.jpgEGI08-2_78.jpgEPS01_28.jpgEPS01...
median_count                                                        3734.5
mean_count                                                         4036.56
mode_count                                                            3669
users                                                                18275
sum_annotations_count                                                70948
annotations_count        [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, ...
dtype: object

### DBSCAN clustering and take the variant with the best silouette score for each image


In [19]:
### The old variant
# from zooniverse.analysis import compare_dbscan_hyp_v2
# 
# eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
# min_samples_variants = [3, 5, 8, 10]
# if debug:
#     eps_variants = [0.3]
#     min_samples_variants = [3]
# params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]
# 
# db_scan_results = {}
# db_scan_best_results = []
# db_scan_best_bic_results = []
# for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
# 
#     dbscan_localization = compare_dbscan_hyp_v2(
#         # phase_tag=phase_tag,
#         params=params,
#         df_flat=df_image_name,
#         # output_path=output_path,
#         output_plot_path=output_plot_path,
#         plot=show_plots,
#         
#     )
# 
#     db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
#     
#     # TODO Here lies the main problem with DBSCAN.
#     ## DBSCAN tends to classfy all points as noise if min_samples is too high. Often only a single user marked an iguana.
#     ## Sillouette Scoring needs a minimum of 2 clusters
#     ## if there are points in decent radius they will belong to a cluster
#     # if pd.DataFrame(dbscan_localization).dbscan_count.max() == 1:
#     #     db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
#     #     db_scan_best_bic_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
#     # # If two or more cluster seem to exists take ones with the best BIC or Silouette score
#     # else:  
#     # take the best result by silouette score if there are more clusters then 1
#     ## TODO make the sorting deterministic
#     db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_silouette_score", ascending=False).iloc[0])
#     
# df_dbscan_localization = pd.concat([*db_scan_results.values()])
# df_scan_best_results = pd.DataFrame(db_scan_best_results)



In [20]:
# df_scan_best_results

In [21]:
## fixes the problem with the silouette score sorting
from zooniverse.analysis import compare_dbscan_hyp_v2

eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
min_samples_variants = [3, 5, 8, 10]
if debug:
    eps_variants = [0.3]
    min_samples_variants = [3]
params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]

db_scan_results = {}
db_scan_best_results = []
db_scan_best_bic_results = []
for image_name, df_image_name in df_merged_dataset.groupby("image_name"):

    dbscan_localization = compare_dbscan_hyp_v2(
        params=params,
        df_flat=df_image_name,
        output_plot_path=output_plot_path,
        plot=show_plots
    )

    db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
    
    # TODO Here lies the main problem with DBSCAN.
    # DBSCAN tends to classfy all points as noise if min_samples is too high. Often only a single user marked an iguana.
    # Sillouette Scoring needs a minimum of 2 clusters
    # if there are points in decent radius they will belong to a cluster
    if pd.DataFrame(dbscan_localization).dbscan_count.max() == 1:
        db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
        db_scan_best_bic_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_count", ascending=False).iloc[0])
        # If two or more cluster seem to exists take ones with the best Silouette score
    else:  
        # take the best result by silouette score if there are more clusters then 1
        db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values(["dbscan_silouette_score", "dbscan_count"], ascending=[False, False]).iloc[0])
    
df_dbscan_localization = pd.concat([*db_scan_results.values()])
df_scan_best_results = pd.DataFrame(db_scan_best_results)


  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluste

In [22]:
df_scan_best_results

Unnamed: 0,image_name,dbscan_count,dbscan_noise,dbscan_silouette_score,dbscan_BIC_score,eps,min_samples
12,EGI04-1_141.jpg,2,4,0.789230,-225.342799,0.20,3
0,EGI08-2_78.jpg,0,8,,,0.01,3
0,EPS01_28.jpg,0,9,,,0.01,3
8,EPS01_41.jpg,3,2,0.930901,-320.924560,0.10,3
24,EPS02_208.jpg,4,0,0.907422,-662.351013,0.50,3
...,...,...,...,...,...,...,...
16,PWC03-2-2_98.jpg,1,1,,,0.30,3
8,PWC03-2-4_66.jpg,2,4,0.801725,-227.740439,0.10,3
24,PWC03-2-4_82.jpg,1,9,,,0.50,3
16,PWC03-2-4_85.jpg,2,0,0.967358,-112.159695,0.30,3


Here it can be seen why the silouette score is difficult because it is often undefined.

In [23]:
## save the combinations of parameters, which maximized the silouette score.

df_dbscan_localization.to_csv(config["dbscan_hyperparam_grid"])
df_scan_best_results

Unnamed: 0,image_name,dbscan_count,dbscan_noise,dbscan_silouette_score,dbscan_BIC_score,eps,min_samples
12,EGI04-1_141.jpg,2,4,0.789230,-225.342799,0.20,3
0,EGI08-2_78.jpg,0,8,,,0.01,3
0,EPS01_28.jpg,0,9,,,0.01,3
8,EPS01_41.jpg,3,2,0.930901,-320.924560,0.10,3
24,EPS02_208.jpg,4,0,0.907422,-662.351013,0.50,3
...,...,...,...,...,...,...,...
16,PWC03-2-2_98.jpg,1,1,,,0.30,3
8,PWC03-2-4_66.jpg,2,4,0.801725,-227.740439,0.10,3
24,PWC03-2-4_82.jpg,1,9,,,0.50,3
16,PWC03-2-4_85.jpg,2,0,0.967358,-112.159695,0.30,3


In [24]:
df_scan_best_results.rename(columns={"dbscan_count": "dbscan_count_sil" }, inplace=True)

df_comparison = df_comparison.merge(df_scan_best_results, on='image_name', how='left')

In [25]:

df_comparison = df_comparison.drop(["dbscan_noise", "dbscan_silouette_score", "eps", "min_samples", "dbscan_BIC_score", "with_noise", "bic_avg"], axis=1, errors="ignore")
df_comparison

Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count,dbscan_count_sil
0,EGI04-1_141.jpg,2.0,1.80,1,15,27,"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4]",2
1,EGI08-2_78.jpg,1.0,1.00,1,8,8,"[1, 1, 1, 1, 1, 1, 1, 1]",0
2,EPS01_28.jpg,1.0,1.00,1,9,9,"[1, 1, 1, 1, 1, 1, 1, 1, 1]",0
3,EPS01_41.jpg,2.0,1.89,2,19,36,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3
4,EPS02_208.jpg,2.0,2.59,2,27,70,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4
...,...,...,...,...,...,...,...,...
1211,PWC03-2-2_98.jpg,1.0,1.25,1,4,5,"[1, 1, 1, 2]",1
1212,PWC03-2-4_66.jpg,2.0,1.85,2,13,24,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4]",2
1213,PWC03-2-4_82.jpg,1.0,1.00,1,16,16,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1214,PWC03-2-4_85.jpg,1.0,1.25,1,12,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]",2


### HDBSCAN clustering for each image


In [26]:
from zooniverse.analysis import HDBSCAN_Wrapper

hdbscan_values = []

eps_variants = [0.0] # 0 is the default
min_cluster_sizes = [5] # 5 is the default


for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    annotations_count = get_mark_overview(df_image_name)
    annotations_count_stats = get_annotation_count_stats(annotations_count=annotations_count,
                                                         image_name=df_image_name.iloc[0]["image_name"])
    
    # if less than min_cluster_sizes points are available clustering makes no sense
    if df_image_name.shape[0] >= 5: # If num_samples is 5 for the min_cluster_size is 5 there is no point in passing data with less than 5 samples
        params = [(eps, min_cluster_size, max_cluster_size) 
                    for eps in eps_variants
                    for min_cluster_size in min_cluster_sizes
                    for max_cluster_size in [None]
              ]

        df_hdbscan = HDBSCAN_Wrapper(df_marks=df_image_name[["x", "y"]],
                                     annotations_count=annotations_count,
                                     output_path=output_plot_path,
                                     plot=show_plots,
                                     show=show_plots,
                                     image_name=image_name,
                                     params=params)
        hdbscan_values.append(df_hdbscan)


df_hdbscan = pd.concat(hdbscan_values)



  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
  variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluste

In [27]:
df_hdbscan.drop(["with_noise", "bic_avg"], axis=1, inplace=True)
df_hdbscan

Unnamed: 0,image_name,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,EGI04-1_141.jpg,2,0.0,5,,3
0,EGI08-2_78.jpg,1,0.0,5,,3
0,EPS01_28.jpg,1,0.0,5,,4
0,EPS01_41.jpg,2,0.0,5,,2
0,EPS02_208.jpg,5,0.0,5,,0
...,...,...,...,...,...,...
0,PWC03-2-2_98.jpg,1,0.0,5,,0
0,PWC03-2-4_66.jpg,2,0.0,5,,0
0,PWC03-2-4_82.jpg,1,0.0,5,,11
0,PWC03-2-4_85.jpg,1,0.0,5,,10


In [28]:
df_comparison = df_comparison.merge(df_hdbscan, on='image_name', how='left')
df_comparison.fillna(0, inplace=True)
df_comparison

Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,EGI04-1_141.jpg,2.0,1.80,1,15,27,"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4]",2,2.0,0.0,5.0,0,3.0
1,EGI08-2_78.jpg,1.0,1.00,1,8,8,"[1, 1, 1, 1, 1, 1, 1, 1]",0,1.0,0.0,5.0,0,3.0
2,EPS01_28.jpg,1.0,1.00,1,9,9,"[1, 1, 1, 1, 1, 1, 1, 1, 1]",0,1.0,0.0,5.0,0,4.0
3,EPS01_41.jpg,2.0,1.89,2,19,36,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3,2.0,0.0,5.0,0,2.0
4,EPS02_208.jpg,2.0,2.59,2,27,70,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4,5.0,0.0,5.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,PWC03-2-2_98.jpg,1.0,1.25,1,4,5,"[1, 1, 1, 2]",1,1.0,0.0,5.0,0,0.0
1212,PWC03-2-4_66.jpg,2.0,1.85,2,13,24,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4]",2,2.0,0.0,5.0,0,0.0
1213,PWC03-2-4_82.jpg,1.0,1.00,1,16,16,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1,1.0,0.0,5.0,0,11.0
1214,PWC03-2-4_85.jpg,1.0,1.25,1,12,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]",2,1.0,0.0,5.0,0,10.0


In [29]:
df_comparison.to_csv(config["comparison_dataset"])
print(f"saved {config['comparison_dataset']}")

saved /Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/Iguanas 3rd launch_method_comparison.csv


In [30]:
df_comparison.HDBSCAN_count.fillna(0, inplace=True)


# A look into the results

## The sum of the clustering
What is the sum of the methods

In [31]:

df_comparison_sum = df_comparison[["median_count", "mean_count", "mode_count", "dbscan_count_sil", "HDBSCAN_count"]].sum().sort_values()
df_comparison_sum.to_csv(config["method_sums"])
df_comparison_sum

mode_count          3669.00
median_count        3734.50
dbscan_count_sil    4018.00
mean_count          4036.56
HDBSCAN_count       4135.00
dtype: float64

In [32]:
print(f"phase_tag: {phase_tag}, user_threshold: {user_threshold}")

phase_tag: Iguanas 3rd launch, user_threshold: 3


## Compare the numbers
The counts are only for images which were in the dataset after filtering.

### Sum of all the Methods

In [33]:
print(f"{config['method_sums'].name}")
df_method_sums = pd.read_csv(config["method_sums"])
df_method_sums

Iguanas 3rd launch_method_sums.csv


Unnamed: 0.1,Unnamed: 0,0
0,mode_count,3669.0
1,median_count,3734.5
2,dbscan_count_sil,4018.0
3,mean_count,4036.56
4,HDBSCAN_count,4135.0


### Comparison per Image Level

In [34]:
print(f"load {config['comparison_dataset']}")
pd.read_csv(config["comparison_dataset"])

load /Users/christian/data/zooniverse/2024_03_19_analysis/Iguanas 3rd launch/Iguanas 3rd launch_method_comparison.csv


Unnamed: 0.1,Unnamed: 0,image_name,median_count,mean_count,mode_count,users,sum_annotations_count,annotations_count,dbscan_count_sil,HDBSCAN_count,eps,min_cluster_size,max_cluster_size,noise_points
0,0,EGI04-1_141.jpg,2.0,1.80,1,15,27,"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4]",2,2.0,0.0,5.0,0,3.0
1,1,EGI08-2_78.jpg,1.0,1.00,1,8,8,"[1, 1, 1, 1, 1, 1, 1, 1]",0,1.0,0.0,5.0,0,3.0
2,2,EPS01_28.jpg,1.0,1.00,1,9,9,"[1, 1, 1, 1, 1, 1, 1, 1, 1]",0,1.0,0.0,5.0,0,4.0
3,3,EPS01_41.jpg,2.0,1.89,2,19,36,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3,2.0,0.0,5.0,0,2.0
4,4,EPS02_208.jpg,2.0,2.59,2,27,70,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",4,5.0,0.0,5.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,1211,PWC03-2-2_98.jpg,1.0,1.25,1,4,5,"[1, 1, 1, 2]",1,1.0,0.0,5.0,0,0.0
1212,1212,PWC03-2-4_66.jpg,2.0,1.85,2,13,24,"[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4]",2,2.0,0.0,5.0,0,0.0
1213,1213,PWC03-2-4_82.jpg,1.0,1.00,1,16,16,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1,1.0,0.0,5.0,0,11.0
1214,1214,PWC03-2-4_85.jpg,1.0,1.25,1,12,15,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]",2,1.0,0.0,5.0,0,10.0
