# Recalculate errors of DBSCAN / HDBSCAN


In [18]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

# Choose the launch phase to analyse; exactly one `phase_tag` assignment stays active.
# phase_tag = "Iguanas 1st launch"
# phase_tag = "Iguanas 2nd launch"
phase_tag = "Iguanas 3rd launch"
input_path = Path("/Users/christian/data/zooniverse")

# Gold-standard variant to use as the reference; exactly one assignment stays active.
use_gold_standard_subset = "expert_goldstandard"  # Use the X-T2-GS-results-5th-0s as the basis
# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis

# Per-phase output directory. The relative prefix is resolved against the
# current working directory, so the absolute location depends on where the
# kernel was started.
output_path = (Path("../data/4th_december/") / phase_tag).resolve()
output_path.mkdir(parents=True, exist_ok=True)

output_plot_path = output_path / "plots"
output_plot_path.mkdir(parents=True, exist_ok=True)

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)

# Load the method comparison table and keep the subject id plus one count
# column per aggregation method.
df_method_comparison = pd.read_csv(config["comparison_dataset"], sep=",", index_col=0)[
    [
        "subject_id",
        "dbscan_count",
        "HDBSCAN_count",
        "median_count",
        "mean_count",
        "mode_count",
    ]
]



FileNotFoundError: [Errno 2] No such file or directory: '/Users/christian/PycharmProjects/data/4th_december/Iguanas 3rd launch/Iguanas 3rd launch_method_comparison.csv'

In [406]:
# Display the loaded method comparison table (one row per subject_id, one count column per method).
df_method_comparison

Unnamed: 0,subject_id,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count
0,78926344,2.0,2.0,1.5,1.67,1.0
1,78928708,2.0,2.0,2.0,2.13,2.0
2,78938221,5.0,4.0,3.5,3.56,4.0
3,78938603,2.0,1.0,1.0,1.29,1.0
4,78938992,2.0,2.0,2.0,1.86,2.0
...,...,...,...,...,...,...
81,78925388,0.0,1.0,1.0,1.00,1.0
82,78925457,0.0,1.0,1.0,1.00,1.0
83,78925467,0.0,1.0,1.0,1.14,1.0
84,78925536,2.0,1.0,1.0,1.05,1.0


In [407]:
# Expert (gold standard) counts; this file is semicolon-separated.
df_expert_count = pd.read_csv(config["goldstandard_data"], sep=";")

# NOTE: despite the "_1plus" suffix, no filtering is applied here - the frame
# still contains every expert image, including those with count_total == 0.
# The filtered alternative is kept below for reference:
# df_expert_count_1plus = df_expert_count[df_expert_count["count_total"] > 0]
df_expert_count_1plus = df_expert_count

n_images_with_count = int((df_expert_count["count_total"] > 0).sum())
print(f"Number of all expert images: {len(df_expert_count_1plus)}")
print(f"Number of images with count_total > 0: {n_images_with_count}")

Number of all expert images: 1156
Number of images with count_total > 0: 111


In [408]:
# Keep only the identifying columns and the expert count.
df_expert_count_1plus = df_expert_count_1plus.loc[
    :,
    ["image_name", "subject_id", "count_total"],
]

## Set the predicted count to 0 for all images in which volunteers did not count anything

In [409]:
# Display the expert table reduced to image_name, subject_id and count_total.
df_expert_count_1plus

Unnamed: 0,image_name,subject_id,count_total
0,MBN02_72.jpg,78925728,0
1,MBN02_74.jpg,78925730,0
2,MBN02_95.jpg,78925747,0
3,MBN03-2_06.jpg,78925781,0
4,MBN03-2_38.jpg,78925808,0
...,...,...,...
1151,GWB01-3_66.jpg,78925600,1
1152,GWB01-3_70.jpg,78925604,0
1153,GWB01-3_71.jpg,78925605,0
1154,GWB01-3_82.jpg,78925608,0


In [410]:
# Sanity check: rows of the method comparison whose subject_id does not occur
# in the expert table at all. An empty result means every subject with
# volunteer-derived counts also has an expert record.
df_method_comparison[~df_method_comparison['subject_id'].isin(df_expert_count_1plus['subject_id'])]

Unnamed: 0,subject_id,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count


In [411]:
# Left join on subject_id: every row of the expert table is kept, and the
# per-method counts are attached where a matching subject exists. Expert images
# without a match get NaN in the method columns.
# NOTE(review): the result is assigned back to df_expert_count_1plus, so running
# this cell twice merges the method columns a second time. Re-run from the cell
# that loads the expert data instead.
df_expert_count_1plus = df_expert_count_1plus.merge(df_method_comparison, on="subject_id", how="left")
df_expert_count_1plus

Unnamed: 0,image_name,subject_id,count_total,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count
0,MBN02_72.jpg,78925728,0,,,,,
1,MBN02_74.jpg,78925730,0,,,,,
2,MBN02_95.jpg,78925747,0,,,,,
3,MBN03-2_06.jpg,78925781,0,,,,,
4,MBN03-2_38.jpg,78925808,0,,,,,
...,...,...,...,...,...,...,...,...
1151,GWB01-3_66.jpg,78925600,1,,,,,
1152,GWB01-3_70.jpg,78925604,0,,,,,
1153,GWB01-3_71.jpg,78925605,0,,,,,
1154,GWB01-3_82.jpg,78925608,0,,,,,


In [412]:
# Column totals over all images: the expert count next to each aggregation method.
(
    df_expert_count_1plus
    .drop(columns=["image_name", "subject_id"])
    .sum()
    .to_frame(name="sum")
)

Unnamed: 0,sum
count_total,388.0
dbscan_count,310.0
HDBSCAN_count,357.0
median_count,316.5
mean_count,319.7
mode_count,314.0


In [413]:
from sklearn.metrics import mean_squared_error

# Missing method counts (NaN) are treated as a predicted count of 0.
# Reassigning instead of `inplace=True` keeps the cell free of in-place mutation.
df_expert_count_1plus = df_expert_count_1plus.fillna(0)

# Root mean squared error of the DBSCAN counts against the expert counts.
# The square root is taken explicitly: the `squared` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.dbscan_count) ** 0.5, 4)

0.6322

In [414]:
# RMSE of the HDBSCAN counts against the expert counts. The explicit square
# root replaces `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.HDBSCAN_count) ** 0.5, 4)


0.3891

In [415]:
# RMSE of the mean-based counts against the expert counts. The explicit square
# root replaces `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.mean_count) ** 0.5, 4)

0.4717

In [416]:
# RMSE of the median-based counts against the expert counts. The explicit
# square root replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.median_count) ** 0.5, 4)

0.4813

In [417]:
# RMSE of the mode-based counts against the expert counts. The explicit square
# root replaces `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.mode_count) ** 0.5, 4)

0.5487

## Compare to error rates for images where volunteers did find something
Let's double-check.

In [418]:
# Reload the method comparison table, this time including the `count_total`
# column stored in the same file, so errors can be computed on this table alone.
df_method_comparison = pd.read_csv(config["comparison_dataset"], sep=",", index_col=0)[
    [
        "subject_id",
        "count_total",
        "dbscan_count",
        "HDBSCAN_count",
        "median_count",
        "mean_count",
        "mode_count",
    ]
]



In [419]:
# Treat missing counts as 0 before computing the errors.
df_method_comparison = df_method_comparison.fillna(0)

In [420]:
# RMSE of the DBSCAN counts against `count_total` on the method comparison
# table. The explicit square root replaces `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.dbscan_count) ** 0.5, 4)

2.1754

In [421]:
# RMSE of the HDBSCAN counts against `count_total` on the method comparison
# table. The explicit square root replaces `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.HDBSCAN_count) ** 0.5, 4)



1.1812

In [422]:
# RMSE of the mean-based counts against `count_total` on the method comparison
# table. The explicit square root replaces `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.mean_count) ** 0.5, 4)


1.5336

In [423]:
# RMSE of the median-based counts against `count_total` on the method
# comparison table. The explicit square root replaces `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.median_count) ** 0.5, 4)


1.5728

In [424]:
# RMSE of the mode-based counts against `count_total` on the method comparison
# table. The explicit square root replaces `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.mode_count) ** 0.5, 4)

1.8458

### Double checking the calculation of basic statistics


In [39]:
import pandas as pd

base_path = Path("/Users/christian/data/zooniverse/2024_03_24_expert_goldstandard_analysis/Iguanas 1st launch/")

# Read the flat annotation table and keep only the rows of the single subject
# used for this manual check.
df_flat_dataset = pd.read_csv(
    base_path.joinpath("flat_dataset_filtered_Iguanas 1st launch.csv"),
    sep=",",
    index_col=0,
).loc[lambda frame: frame["subject_id"] == 47969478]
df_flat_dataset

Unnamed: 0,flight_site_code,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name,mission_name,image_path,width,height
49,SFM1,SFM01-2-2-1_293.jpg,47969478,385.480804,314.101715,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
50,SFM1,SFM01-2-2-1_293.jpg,47969478,297.649719,333.019196,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
51,SFM1,SFM01-2-2-1_293.jpg,47969478,308.459686,372.205383,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
52,SFM1,SFM01-2-2-1_293.jpg,47969478,357.104614,391.122833,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
53,SFM1,SFM01-2-2-1_293.jpg,47969478,385.480804,358.692902,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
54,SFM1,SFM01-2-2-1_293.jpg,47969478,334.133392,335.72168,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
55,SFM1,SFM01-2-2-1_293.jpg,47969478,271.976013,160.059494,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
56,SFM1,SFM01-2-2-1_293.jpg,47969478,247.653549,172.220734,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
57,SFM1,SFM01-2-2-1_293.jpg,47969478,266.571014,180.328217,"Others (females, young males, juveniles and ta...",Iguanas 1st launch,1911549.0,Darkstar1977,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649
74,SFM1,SFM01-2-2-1_293.jpg,47969478,369.232056,313.836273,"Could be an iguana, not sure",Iguanas 1st launch,2165180.0,sopeyJoe1,SFM1,/Users/christian/data/zooniverse/Images/Zooniv...,667,649


In [40]:
# Number of rows (non-null subject_id values) per user_name for the selected subject.
df_group_by = df_flat_dataset.groupby("user_name")[["subject_id"]].count()
df_group_by

Unnamed: 0_level_0,subject_id
user_name,Unnamed: 1_level_1
Darkstar1977,9
Pamelavans,7
not-logged-in-bcac79bef32f402fb848,2
sopeyJoe1,2


In [41]:
# Per-user counts in ascending order, as a plain list.
df_group_by["subject_id"].sort_values().to_list()

[2, 2, 7, 9]

In [42]:
# Persist the per-user counts next to the input data for later inspection.
df_group_by.to_csv(base_path.joinpath("user_count.csv"))

In [43]:
import statistics

# Cross-check with the standard library: most common per-user count.
statistics.mode(df_group_by["subject_id"])

2

In [44]:
# Cross-check with the standard library: median of the per-user counts.
statistics.median(df_group_by["subject_id"])

4.5