# Clustering Zooniverse Marks to count Iguanas
Count all the iguanas in the images by clustering the marks placed by the Zooniverse volunteers.
This notebook does not compare the results to the gold standard and requires only a single file: the flattened Zooniverse data export.

In [8]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

import sys

# Put the notebook directory and the local `zooniverse` folder on the import path.
sys.path.append("./")
sys.path.append("./zooniverse")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Intro
### Retrieve a Classification report from Zooniverse
This notebook clusters the marks from the Zooniverse volunteers to count the iguanas in the images. The marks are saved in `results/<phase_tag>/flat_dataset_filtered_<phase_tag>.csv`, e.g. `flat_dataset_filtered_Iguanas 3rd launch.csv`.

Used Methods are:

### DBSCAN 
DBSCAN does not require the number of clusters to be specified, which is why it is used here, but it has `min_samples` and `eps` as hyperparameters which need to be found. [Link](https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html)
For finding `eps` and `min_samples` a simple **grid search** is used.
Additionally, DBSCAN does not assume a specific shape for the clusters (K-means assumes clusters are Gaussian in shape), even though we should assume that the points around an iguana are Gaussian shaped.

### HDBSCAN
It is an extension of DBSCAN which is more robust to hyperparameter settings as it finds epsilon and min_samples automatically. [Link](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html)

## Load the data

In [9]:
import os
from pathlib import Path

import pandas as pd
from zooniverse.analysis import get_annotation_count_stats
from zooniverse.utils.filters import filter_df_user_threshold

# --- Phase selection ---------------------------------------------------------
# Exactly one phase is analysed per run; switch by (un)commenting.
phase_tag = "Iguanas 1st launch"
# phase_tag = "Iguanas 2nd launch"
# phase_tag = "Iguanas 3rd launch"

# --- Run options -------------------------------------------------------------
debug = False  # debugging with a smaller dataset
plot_diagrams = False  # plot the diagrams to disk for the clustering methods
show_plots = False  # show the plots in the notebook

# None or a number: filter records which have fewer than this many user interactions.
user_threshold = None

use_gold_standard_subset = None  # Use no filtering

# --- Paths -------------------------------------------------------------------
# Root folder of all the input data. It can be overridden without editing the notebook
# by setting the ZOONIVERSE_DATA_DIR environment variable; the default is the original location.
input_path = Path(os.environ.get("ZOONIVERSE_DATA_DIR", "/Users/christian/data/zooniverse"))
# input_path = Path("results/")

# Location for the analysis results (one sub-folder per phase).
output_path = input_path / "2024_04_10_analysis" / phase_tag
output_path.mkdir(exist_ok=True, parents=True)

# If True, the raw classification data is reprocessed. If False, the data is loaded from disk.
reprocess = True

# Location for plots
output_plot_path = output_path / "plots"
output_plot_path.mkdir(parents=True, exist_ok=True)


## Look into the config
This Config points to all files necessary for the analysis + the result files

In [10]:
from zooniverse.config import get_config_all

# Collect every input and result file location for the selected phase in one mapping.
config = get_config_all(
    phase_tag=phase_tag,
    input_path=input_path,
    output_path=output_path,
)
config

{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': None,
 'gold_standard_image_subset': None,
 'image_source': None,
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/yes_no_dataset_Iguanas 1st launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_Iguanas 1st launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_panoptes_points_Iguanas 1st launch.csv'),
 'merged_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_filtered_Iguanas 1st launch.csv'),
 'comparison_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/Iguanas 1st launch_method_comparison.csv'),
 'method_sums': PosixPath('/Users/chri

In [11]:
from zooniverse.utils.anonymize import UserAnonymizer
from zooniverse.utils.data_format import data_prep_all

if reprocess:
    # Reprocess the raw classification data and report what was produced.
    ds_stats = data_prep_all(
        phase_tag=phase_tag,
        output_path=output_path,
        input_path=input_path,
        config=config,
    )
    print(ds_stats)

    # TODO the data should be anonymized already
    # Anonymise the data to prevent usernames and user_ids from becoming public.
    # Each file is anonymised and written back to the path it was read from.
    for dataset_key in ("flat_dataset", "merged_dataset"):
        anonymizer = UserAnonymizer(config[dataset_key])
        anonymizer.anonymize_data()
        anonymizer.save_anonymized_data(config[dataset_key])

[32m2024-04-10 21:42:21.957[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 1st launch[0m


{'retired': None, 'link': 'https://www.flickr.com/photos/hellie55/6713782851', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '3', 'attribution': 'hehaden', 'image_name_1': '6713782851_82fc8c73e5_z.jpg', '#secret_description': 'juvenile kittehs practice break-in at the catnip factory'}
{'retired': None, 'link': 'https://www.flickr.com/photos/davebloggs007/15899553966', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '16', 'attribution': 'davebloggs007', 'image_name_1': '15899553966_5debf88369_z.jpg', '#secret_description': 'makeover kitteh not sure these new contacts are sassy enough'}
{'retired': None, 'link': 'https://www.flickr.com/photos/odoketa/8762013777', 'origin': 'Flickr', 'license': 'Creative Commons - share adapt attribute', 'subject_id': '4', 'attribution': 'David Barber', 'image_name_1': '8762013777_1fa7d91e98_z.jpg', '#secret_description': 'hunter kitteh only willing to go so far to capture

[32m2024-04-10 21:42:41.511[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Main Workflow[0m
[32m2024-04-10 21:42:44.958[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: survey tool 061417[0m
[32m2024-04-10 21:42:48.786[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 2nd launch[0m
[32m2024-04-10 21:42:58.557[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 3rd launch[0m
[32m2024-04-10 21:43:31.668[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 4th launch[0m
[32m2024-04-10 21:4

                                       filename  images
0           flat_dataset_Iguanas 1st launch.csv    8260
1  flat_dataset_filtered_Iguanas 1st launch.csv    3929
Data loaded successfully.
Anonymization completed.
Anonymized data saved to /Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_Iguanas 1st launch.csv
Data loaded successfully.
Anonymization completed.
Anonymized data saved to /Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_filtered_Iguanas 1st launch.csv


In [12]:
# Show the configuration again after the data preparation step.
config

{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': None,
 'gold_standard_image_subset': None,
 'image_source': None,
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/yes_no_dataset_Iguanas 1st launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_Iguanas 1st launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_panoptes_points_Iguanas 1st launch.csv'),
 'merged_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_dataset_filtered_Iguanas 1st launch.csv'),
 'comparison_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/Iguanas 1st launch_method_comparison.csv'),
 'method_sums': PosixPath('/Users/chri

In [28]:
# Without a plot path nothing is handed to the clustering helpers for writing diagrams.
if not plot_diagrams:
    output_plot_path = None

# the flattened, filtered marks from zooniverse.
df_merged_dataset = pd.read_csv(config["merged_dataset"])

# Inspect the marks of one example subject.
df_merged_dataset[df_merged_dataset.subject_id == 47968406]

Unnamed: 0.1,Unnamed: 0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
2584,3451,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,462.800781,390.786377,Adult Male in a lek,Iguanas 1st launch,08cbd51d46cf6d3d7af3f5d1d4f909b6,dd741bfe7c2eabd0265422e728bd1738
2585,3452,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,545.598145,423.652527,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,08cbd51d46cf6d3d7af3f5d1d4f909b6,dd741bfe7c2eabd0265422e728bd1738
2586,3453,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,522.844666,412.275787,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,08cbd51d46cf6d3d7af3f5d1d4f909b6,dd741bfe7c2eabd0265422e728bd1738
2587,3454,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,507.675720,388.258240,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,08cbd51d46cf6d3d7af3f5d1d4f909b6,dd741bfe7c2eabd0265422e728bd1738
2588,3455,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,450.159973,434.397217,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,08cbd51d46cf6d3d7af3f5d1d4f909b6,dd741bfe7c2eabd0265422e728bd1738
...,...,...,...,...,...,...,...,...,...,...,...,...
21597,35696,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,461.436340,386.567505,Adult Male in a lek,Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21598,35697,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,506.434204,387.221497,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21599,35698,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,543.688904,419.596954,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21600,35699,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,530.580627,404.817047,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945


In [29]:
# All marks for one image name; the recorded output lists this image under more than one subject_id.
df_merged_dataset[df_merged_dataset.image_name == "SFM01-1-2_264.jpg"]


Unnamed: 0.1,Unnamed: 0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
52,64,SFM01-1,14370,121.144,SFM01-1-2_264.jpg,44660616,462.710938,382.437500,Adult Male in a lek,Iguanas 1st launch,da0196ebab87e82a7f2e5b6bee065aa3,14597090a1ca2ae233ec4ccbaf00de67
53,65,SFM01-1,14370,121.144,SFM01-1-2_264.jpg,44660616,522.082031,410.906250,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,da0196ebab87e82a7f2e5b6bee065aa3,14597090a1ca2ae233ec4ccbaf00de67
54,66,SFM01-1,14370,121.144,SFM01-1-2_264.jpg,44660616,507.128906,389.792969,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,da0196ebab87e82a7f2e5b6bee065aa3,14597090a1ca2ae233ec4ccbaf00de67
55,67,SFM01-1,14370,121.144,SFM01-1-2_264.jpg,44660616,548.996094,425.800781,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,da0196ebab87e82a7f2e5b6bee065aa3,14597090a1ca2ae233ec4ccbaf00de67
56,68,SFM01-1,14370,121.144,SFM01-1-2_264.jpg,44660616,370.199219,486.675781,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,da0196ebab87e82a7f2e5b6bee065aa3,14597090a1ca2ae233ec4ccbaf00de67
...,...,...,...,...,...,...,...,...,...,...,...,...
21597,35696,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,461.436340,386.567505,Adult Male in a lek,Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21598,35697,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,506.434204,387.221497,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21599,35698,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,543.688904,419.596954,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945
21600,35699,SFM1,14370,134.236,SFM01-1-2_264.jpg,47968406,530.580627,404.817047,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,4856c2c566a02d7b50d780ee9f282eb2,25ffd006d4afb04d099ef4c46ff9c945


In [14]:
# Location of the flat panoptes points CSV for this phase.
config["flat_panoptes_points"]

PosixPath('/Users/christian/data/zooniverse/2024_04_10_analysis/Iguanas 1st launch/flat_panoptes_points_Iguanas 1st launch.csv')

In [15]:

# Load the flat panoptes points and inspect the same example subject as above.
df_flat_panoptes_points = pd.read_csv(config["flat_panoptes_points"])
df_flat_panoptes_points[df_flat_panoptes_points.subject_id == 47968406]

Unnamed: 0.1,Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
1197,23738,263876270,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-26 21:30:00 UTC,47968406,SFM01-1-2_264.jpg,462,390
1198,23738,263876270,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-26 21:30:00 UTC,47968406,SFM01-1-2_264.jpg,545,423
1199,23738,263876270,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-26 21:30:00 UTC,47968406,SFM01-1-2_264.jpg,522,412
1200,23738,263876270,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-26 21:30:00 UTC,47968406,SFM01-1-2_264.jpg,507,388
1201,23738,263876270,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-26 21:30:00 UTC,47968406,SFM01-1-2_264.jpg,450,434
...,...,...,...,...,...,...,...,...,...,...,...
18595,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,548,435
18596,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,360,463
18597,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,362,488
18598,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,456,429


In [30]:
# Points for one image name; the trailing comment lists the subject_ids seen for it.
df_flat_panoptes_points[df_flat_panoptes_points.image_name == "SFM01-1-2_264.jpg"] # 44660616, 47968406

Unnamed: 0.1,Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
56,2218,257526095,4d40c60cfa2948ad5bb0624e9b9c9721,015cbacf0a06e22f4543c55afd7d166b,14370,T2,2020-07-03 18:47:12 UTC,44660616,SFM01-1-2_264.jpg,466,390
57,2218,257526095,4d40c60cfa2948ad5bb0624e9b9c9721,015cbacf0a06e22f4543c55afd7d166b,14370,T2,2020-07-03 18:47:12 UTC,44660616,SFM01-1-2_264.jpg,523,414
58,2218,257526095,4d40c60cfa2948ad5bb0624e9b9c9721,015cbacf0a06e22f4543c55afd7d166b,14370,T2,2020-07-03 18:47:12 UTC,44660616,SFM01-1-2_264.jpg,462,431
59,2218,257526095,4d40c60cfa2948ad5bb0624e9b9c9721,015cbacf0a06e22f4543c55afd7d166b,14370,T2,2020-07-03 18:47:12 UTC,44660616,SFM01-1-2_264.jpg,525,436
60,2218,257526095,4d40c60cfa2948ad5bb0624e9b9c9721,015cbacf0a06e22f4543c55afd7d166b,14370,T2,2020-07-03 18:47:12 UTC,44660616,SFM01-1-2_264.jpg,257,592
...,...,...,...,...,...,...,...,...,...,...,...
18595,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,548,435
18596,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,360,463
18597,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,362,488
18598,23739,266898927,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-08 02:08:11 UTC,47968406,SFM01-1-2_264.jpg,456,429


In [16]:
# Overview of all flat panoptes points.
df_flat_panoptes_points

Unnamed: 0.1,Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
0,230846,283007416,77dd9c009b5200eea39788f7f15862f0,003abe9e6bb90e03c50377a33f1137a5,14370,T2,2020-10-26 09:28:46 UTC,47974351,SMF02-1-2-2_570.jpg,186,310
1,372187,271011060,fcbbe951169dc1df0b1f803a199f0995,004eb4442ae5c85fcb6b5c78ed902f41,14370,T2,2020-08-30 16:22:33 UTC,47978494,SFP01_06-2_59.jpg,272,265
2,502904,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,358,194
3,502904,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,485,106
4,502904,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,458,205
...,...,...,...,...,...,...,...,...,...,...,...
24243,251731,282327563,abc5cb03709e2f07494e3e603b1ba8eb,,14370,T2,2020-10-22 19:32:52 UTC,48034447,SRBS03-4_25.jpg,441,879
24244,251731,282327563,abc5cb03709e2f07494e3e603b1ba8eb,,14370,T2,2020-10-22 19:32:52 UTC,48034447,SRBS03-4_25.jpg,136,1013
24245,251731,282327563,abc5cb03709e2f07494e3e603b1ba8eb,,14370,T2,2020-10-22 19:32:52 UTC,48034447,SRBS03-4_25.jpg,1674,811
24246,251731,282327563,abc5cb03709e2f07494e3e603b1ba8eb,,14370,T2,2020-10-22 19:32:52 UTC,48034447,SRBS03-4_25.jpg,1306,852


## Look at the data


In [17]:
# Look at the merged (flattened and filtered) marks.
df_merged_dataset


Unnamed: 0.1,Unnamed: 0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
0,0,,14370,96.620,,44273024,484.234375,277.609375,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
1,1,,14370,104.720,,44273037,623.601562,236.324219,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
2,2,,14370,104.720,,44272996,427.359375,163.644531,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
3,3,,14370,104.720,,44272996,443.312500,181.160156,Adult Male alone,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
4,4,,14370,104.720,,44272996,541.656250,221.363281,Adult Male alone,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
...,...,...,...,...,...,...,...,...,...,...,...,...
24354,40279,SFB,14370,134.236,SFB02-4-2_69.jpg,47970106,127.448914,471.677856,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,3cc4249a9be2c13956ff0ba70de7ed21
24355,40283,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,351.080933,136.850372,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,1c659139c7b28aca84febf1686e50f4e
24356,40290,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,358.369049,137.822800,Adult Male alone,Iguanas 1st launch,,197be33cb4df4e4a90e590cba7524e26
24357,40291,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,355.557495,143.556152,Adult Male alone,Iguanas 1st launch,,0abea201d4258f8a7de6e9e301ae4b11


### Filter users (if necessary) and marks


In [18]:
from zooniverse.utils.filters import filter_remove_marks

print(f"Before filtering: {df_merged_dataset.subject_id.nunique()}")

# There are images in which some people said there are iguanas, but then didn't mark them.
# Clustering with fewer than 3 dots doesn't make sense.
if user_threshold is not None:
    print(f"filtering records which have less than {user_threshold} interactions.")
    df_merged_dataset = filter_df_user_threshold(
        df_merged_dataset,
        user_threshold=user_threshold,
    )

# Check if partials are still in the data. There shouldn't be any.
df_merged_dataset = filter_remove_marks(df_merged_dataset)




Before filtering: 3967


### Are there anonymous users in the data?
There should be some.

In [19]:
# Marks whose user_id is missing, i.e. marks made by anonymous users.
df_merged_dataset[df_merged_dataset["user_id"].isna()]

Unnamed: 0.1,Unnamed: 0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
6,9,SRIL03,14370,108.740,SRIL03-2-2_116.jpg,44281583,829.133301,599.966675,Adult Male alone,Iguanas 1st launch,,14fcdfb20e7001d8e866e24fef9780ec
7,10,SRIL03,14370,108.740,SRIL03-2-2_132.jpg,44281389,393.633331,614.966675,Adult Male in a lek,Iguanas 1st launch,,14fcdfb20e7001d8e866e24fef9780ec
8,11,SRIL03,14370,108.740,SRIL03-2-2_132.jpg,44281389,226.633331,561.966675,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,14fcdfb20e7001d8e866e24fef9780ec
9,12,SRIL03,14370,108.740,SRIL03-2-2_132.jpg,44281389,223.633331,500.966675,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,14fcdfb20e7001d8e866e24fef9780ec
10,13,SRIL03,14370,108.740,SRIL03-2-2_132.jpg,44281389,303.633331,499.966675,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,14fcdfb20e7001d8e866e24fef9780ec
...,...,...,...,...,...,...,...,...,...,...,...,...
24354,40279,SFB,14370,134.236,SFB02-4-2_69.jpg,47970106,127.448914,471.677856,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,3cc4249a9be2c13956ff0ba70de7ed21
24355,40283,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,351.080933,136.850372,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,1c659139c7b28aca84febf1686e50f4e
24356,40290,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,358.369049,137.822800,Adult Male alone,Iguanas 1st launch,,197be33cb4df4e4a90e590cba7524e26
24357,40291,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,355.557495,143.556152,Adult Male alone,Iguanas 1st launch,,0abea201d4258f8a7de6e9e301ae4b11


In [20]:
# Number of images, counted as distinct subject_ids.
df_merged_dataset["subject_id"].nunique()

3967

In [21]:
# The dataset as it looks after the filtering step.
df_merged_dataset

Unnamed: 0.1,Unnamed: 0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
0,0,,14370,96.620,,44273024,484.234375,277.609375,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
1,1,,14370,104.720,,44273037,623.601562,236.324219,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
2,2,,14370,104.720,,44272996,427.359375,163.644531,Adult Male in a lek,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
3,3,,14370,104.720,,44272996,443.312500,181.160156,Adult Male alone,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
4,4,,14370,104.720,,44272996,541.656250,221.363281,Adult Male alone,Iguanas 1st launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
...,...,...,...,...,...,...,...,...,...,...,...,...
24354,40279,SFB,14370,134.236,SFB02-4-2_69.jpg,47970106,127.448914,471.677856,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,3cc4249a9be2c13956ff0ba70de7ed21
24355,40283,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,351.080933,136.850372,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,,1c659139c7b28aca84febf1686e50f4e
24356,40290,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,358.369049,137.822800,Adult Male alone,Iguanas 1st launch,,197be33cb4df4e4a90e590cba7524e26
24357,40291,SFB,14370,134.236,SFB02-4-2_96.jpg,47970166,355.557495,143.556152,Adult Male alone,Iguanas 1st launch,,0abea201d4258f8a7de6e9e301ae4b11


In [22]:
# Number of marks per user (first few users only).
df_merged_dataset.groupby("user_id")[["x"]].count().head()

Unnamed: 0_level_0,x
user_id,Unnamed: 1_level_1
003abe9e6bb90e03c50377a33f1137a5,1
004eb4442ae5c85fcb6b5c78ed902f41,1
006c16c227f4152ffff80d195fca443b,4
006e42e5ce984ed7c6b340949ae4d0ee,4
00747fbc3b2f5109c0ad9f52f9bf6784,9


In [23]:
# How many images (distinct subject_ids) are left in the zooniverse dataset?
df_merged_dataset["subject_id"].nunique()

3967

## Clustering

### Basic statistics like mean, median, mode

In [24]:
from zooniverse.analysis import get_mark_overview

# Basic per-image statistics (like mean and median) of the marks.
# NOTE(review): the last recorded run of this cell failed with
# "AttributeError: 'list' object has no attribute 'mean'". Presumably one helper returns a
# list where the other expects a pandas object; verify that the two helpers agree.
# NOTE(review): groupby drops rows whose image_name is NaN by default; confirm this is intended.
basic_stats = []

for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    annotations_count = get_mark_overview(df_image_name)

    # The group key is the image name shared by every row of this group.
    annotations_count_stats = get_annotation_count_stats(
        annotations_count=annotations_count,
        image_name=image_name,
    )
    basic_stats.append(annotations_count_stats)

df_basic_stats = pd.DataFrame(basic_stats)
# Independent copy, so in-place edits of df_comparison do not also change df_basic_stats.
df_comparison = df_basic_stats.copy()


AttributeError: 'list' object has no attribute 'mean'

In [None]:
# Per-image basic statistics.
df_basic_stats

In [None]:
# There might be records with too few annotations if they were not removed before.
# List them, ordered by the "users" column (largest first).
too_few_annotations = df_comparison["sum_annotations_count"] < 5
df_comparison[too_few_annotations].sort_values(by="users", ascending=False)

In [None]:
# Fill NaN values with 0 because the errors can't be calculated otherwise.
# Rebinding (instead of inplace=True) leaves any other frame sharing this data untouched.
df_comparison = df_comparison.fillna(0)


In [None]:
# Comparison table after filling missing values.
df_comparison

In [None]:
# Column totals. Restricted to numeric columns so that text columns such as image_name
# are not concatenated into one long string.
df_comparison.sum(numeric_only=True)

### DBSCAN clustering, taking the variant with the best silhouette score for each image


In [None]:
### The old variant
# from zooniverse.analysis import compare_dbscan
# 
# eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
# min_samples_variants = [3, 5, 8, 10]
# if debug:
#     eps_variants = [0.3]
#     min_samples_variants = [3]
# params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]
# 
# db_scan_results = {}
# db_scan_best_results = []
# db_scan_best_bic_results = []
# for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
# 
#     dbscan_localization = compare_dbscan(
#         # phase_tag=phase_tag,
#         params=params,
#         df_flat=df_image_name,
#         # output_path=output_path,
#         output_plot_path=output_plot_path,
#         plot=show_plots,
#         
#     )
# 
#     db_scan_results[image_name] = pd.DataFrame(dbscan_localization)
#     db_scan_best_results.append(pd.DataFrame(dbscan_localization).sort_values("dbscan_silouette_score", ascending=False).iloc[0])
#     
# df_dbscan_localization = pd.concat([*db_scan_results.values()])
# df_scan_best_results = pd.DataFrame(db_scan_best_results)



In [None]:
# df_scan_best_results

In [None]:
## fixes the problem with the silhouette score sorting
from zooniverse.analysis import compare_dbscan

# Hyperparameter grid: every (eps, min_samples) combination is evaluated for each image.
eps_variants = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
min_samples_variants = [3, 5, 8, 10]
if debug:
    # A single combination keeps debugging runs short.
    eps_variants = [0.3]
    min_samples_variants = [3]
params = [(eps, min_samples) for eps in eps_variants for min_samples in min_samples_variants]

db_scan_results = {}
db_scan_best_results = []
db_scan_best_bic_results = []
for image_name, df_image_name in df_merged_dataset.groupby("image_name"):

    dbscan_localization = compare_dbscan(
        params=params,
        df_flat=df_image_name,
        output_plot_path=output_plot_path,
        plot=show_plots
    )

    # Build the per-image result frame once and reuse it for every selection below.
    df_image_results = pd.DataFrame(dbscan_localization)
    db_scan_results[image_name] = df_image_results

    # DBSCAN tends to classify all points as noise if min_samples is too high.
    # Often only a single user marked an iguana.
    # Silhouette scoring needs a minimum of 2 clusters, so when at most one cluster was found
    # the variant is picked by cluster count instead:
    # if there are points in a decent radius they will belong to a cluster.
    if df_image_results.dbscan_count.max() == 1:
        best_by_count = df_image_results.sort_values("dbscan_count", ascending=False).iloc[0]
        db_scan_best_results.append(best_by_count)
        db_scan_best_bic_results.append(best_by_count)
    else:
        # If two or more clusters seem to exist, take the variant with the best
        # silhouette score; ties are broken by the larger cluster count.
        best_by_silhouette = df_image_results.sort_values(
            ["dbscan_silouette_score", "dbscan_count"], ascending=[False, False]
        ).iloc[0]
        db_scan_best_results.append(best_by_silhouette)

# All evaluated parameter combinations for all images, and the selected row per image.
df_dbscan_localization = pd.concat(list(db_scan_results.values()))
df_scan_best_results = pd.DataFrame(db_scan_best_results)


In [None]:
# The selected DBSCAN result per image.
df_scan_best_results

Here it can be seen why the silhouette score is difficult to use: it is often undefined.

In [None]:
# Save the results of the whole hyperparameter grid (all evaluated combinations for all images).
# The per-image selection is only displayed here, not written by this cell.

df_dbscan_localization.to_csv(config["dbscan_hyperparam_grid"])
df_scan_best_results

In [None]:
# Rename the DBSCAN count column, then attach the DBSCAN results to the comparison table
# by image name.
df_scan_best_results = df_scan_best_results.rename(columns={"dbscan_count": "dbscan_count_sil"})

df_comparison = df_comparison.merge(df_scan_best_results, on="image_name", how="left")

In [None]:

# Remove the DBSCAN score and hyperparameter columns from the comparison table;
# columns that are not present are ignored.
dbscan_columns_to_drop = [
    "dbscan_noise",
    "dbscan_silouette_score",
    "eps",
    "min_samples",
    "dbscan_BIC_score",
    "with_noise",
    "bic_avg",
]
df_comparison = df_comparison.drop(columns=dbscan_columns_to_drop, errors="ignore")
df_comparison

### HDBSCAN clustering for each image


In [None]:
from zooniverse.analysis import hdbscan

hdbscan_values = []

eps_variants = [0.0]  # 0 is the default
min_cluster_sizes = [5]  # 5 is the default

# The parameter grid does not depend on the image, so it is built once.
params = [
    (eps, min_cluster_size, max_cluster_size)
    for eps in eps_variants
    for min_cluster_size in min_cluster_sizes
    for max_cluster_size in [None]
]

# If fewer points than the smallest min_cluster_size are available, clustering makes no sense.
min_marks_required = min(min_cluster_sizes)

for image_name, df_image_name in df_merged_dataset.groupby("image_name"):
    if len(df_image_name) < min_marks_required:
        continue

    df_hdbscan_image = hdbscan(
        df_marks=df_image_name[["x", "y"]],
        output_path=output_plot_path,
        plot=show_plots,
        show=show_plots,
        image_name=image_name,
        params=params,
    )
    hdbscan_values.append(df_hdbscan_image)

# One combined frame with the HDBSCAN results of all clustered images.
df_hdbscan = pd.concat(hdbscan_values)



In [None]:
# The "with_noise" column is not needed in the comparison table.
df_hdbscan = df_hdbscan.drop(columns=["with_noise"])
df_hdbscan

In [None]:
# Attach the HDBSCAN results by image name; images without an HDBSCAN row get 0.
df_comparison = df_comparison.merge(df_hdbscan, on="image_name", how="left").fillna(0)
df_comparison

In [None]:
# Write the per-image comparison table to the configured CSV file.
df_comparison.to_csv(config["comparison_dataset"])
print(f"saved {config['comparison_dataset']}")

# A look into the results

## The sum of the clustering
What is the total count for each of the methods?

In [None]:

# Total count per method over all images, sorted ascending, written to the configured CSV file.
method_count_columns = [
    "median_count",
    "mean_count",
    "mode_count",
    "dbscan_count_sil",
    "HDBSCAN_count",
]
df_comparison_sum = df_comparison[method_count_columns].sum().sort_values()
df_comparison_sum.to_csv(config["method_sums"])


In [None]:
# Print which phase and user threshold this run used.
print(f"phase_tag: {phase_tag}, user_threshold: {user_threshold}")

## Compare the numbers
The counts are only for images which were in the dataset after filtering.

### Sum of all the Methods

In [None]:
# Read the saved method totals back from disk and display them.
print(config["method_sums"].name)
df_method_sums = pd.read_csv(config["method_sums"])
df_method_sums

### Comparison per Image Level

In [None]:
# Read the saved per-image comparison back from disk and display it.
print(f"load {config['comparison_dataset']}")
pd.read_csv(config["comparison_dataset"])