# Recalculate errors of DBSCAN / HDBSCAN


In [405]:
import pandas as pd
from config import get_config
from pathlib import Path

# Choose the Zooniverse launch phase to analyse: keep exactly one `phase_tag` line active.
# phase_tag = "Iguanas 1st launch"
# phase_tag = "Iguanas 2nd launch"
phase_tag = "Iguanas 3rd launch"
input_path = Path("/Users/christian/data/zooniverse")

# Which gold standard variant serves as the reference; keep exactly one line active.
use_gold_standard_subset = "expert_goldstandard"  # Use the X-T2-GS-results-5th-0s as the basis
# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis

# One output directory per phase, created on demand, with a sub-directory for plots.
output_path = (Path("../data/4th_december/") / phase_tag).resolve()
output_path.mkdir(parents=True, exist_ok=True)

output_plot_path = output_path / "plots"
output_plot_path.mkdir(exist_ok=True, parents=True)

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)

# Per-subject counts of the aggregation methods; the first CSV column is used as the index.
df_method_comparison = pd.read_csv(config["comparison_dataset"], sep=",", index_col=0)[
    [
        "subject_id",
        "dbscan_count",
        "HDBSCAN_count",
        "median_count",
        "mean_count",
        "mode_count",
    ]
]



In [406]:
# Show the per-subject method counts loaded from the comparison dataset.
df_method_comparison

Unnamed: 0,subject_id,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count
0,78926344,2.0,2.0,1.5,1.67,1.0
1,78928708,2.0,2.0,2.0,2.13,2.0
2,78938221,5.0,4.0,3.5,3.56,4.0
3,78938603,2.0,1.0,1.0,1.29,1.0
4,78938992,2.0,2.0,2.0,1.86,2.0
...,...,...,...,...,...,...
81,78925388,0.0,1.0,1.0,1.00,1.0
82,78925457,0.0,1.0,1.0,1.00,1.0
83,78925467,0.0,1.0,1.0,1.14,1.0
84,78925536,2.0,1.0,1.0,1.05,1.0


In [407]:
# Expert gold standard counts (semicolon-separated CSV).
df_expert_count = pd.read_csv(config["goldstandard_data"], sep=";")

# NOTE: despite its name, `df_expert_count_1plus` currently refers to ALL expert images;
# the "count_total > 0" filter below is intentionally disabled.
df_expert_count_1plus = df_expert_count
# df_expert_count_1plus = df_expert_count[df_expert_count["count_total"] > 0]
print(f"Number of all expert images: {len(df_expert_count_1plus)}")
print(f"Number of images with count_total > 0: {(df_expert_count['count_total'] > 0).sum()}")

Number of all expert images: 1156
Number of images with count_total > 0: 111


In [408]:
# Keep only the image name, the subject id and the expert's total count.
df_expert_count_1plus = df_expert_count_1plus.loc[
    :, ["image_name", "subject_id", "count_total"]
]

## Set the predicted count to 0 for all images in which volunteers did not count anything

In [409]:
# Show the expert table reduced to image name, subject id and total count.
df_expert_count_1plus

Unnamed: 0,image_name,subject_id,count_total
0,MBN02_72.jpg,78925728,0
1,MBN02_74.jpg,78925730,0
2,MBN02_95.jpg,78925747,0
3,MBN03-2_06.jpg,78925781,0
4,MBN03-2_38.jpg,78925808,0
...,...,...,...
1151,GWB01-3_66.jpg,78925600,1
1152,GWB01-3_70.jpg,78925604,0
1153,GWB01-3_71.jpg,78925605,0
1154,GWB01-3_82.jpg,78925608,0


In [410]:
# Sanity check: subjects that appear in the volunteer method comparison but are
# missing from the expert table. An empty result means every such subject is covered.
df_method_comparison.loc[
    ~df_method_comparison["subject_id"].isin(df_expert_count_1plus["subject_id"])
]

Unnamed: 0,subject_id,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count


In [411]:
# Left join on the expert table: every expert image is kept, and images without
# an entry in the method comparison get NaN in the method count columns.
df_expert_count_1plus = df_expert_count_1plus.merge(
    df_method_comparison,
    how="left",
    on="subject_id",
)
df_expert_count_1plus

Unnamed: 0,image_name,subject_id,count_total,dbscan_count,HDBSCAN_count,median_count,mean_count,mode_count
0,MBN02_72.jpg,78925728,0,,,,,
1,MBN02_74.jpg,78925730,0,,,,,
2,MBN02_95.jpg,78925747,0,,,,,
3,MBN03-2_06.jpg,78925781,0,,,,,
4,MBN03-2_38.jpg,78925808,0,,,,,
...,...,...,...,...,...,...,...,...
1151,GWB01-3_66.jpg,78925600,1,,,,,
1152,GWB01-3_70.jpg,78925604,0,,,,,
1153,GWB01-3_71.jpg,78925605,0,,,,,
1154,GWB01-3_82.jpg,78925608,0,,,,,


In [412]:
# Total count per column (expert and each method); NaN entries are skipped by sum().
pd.DataFrame(
    df_expert_count_1plus.drop(columns=["image_name", "subject_id"]).sum(),
    columns=["sum"],
)

Unnamed: 0,sum
count_total,388.0
dbscan_count,310.0
HDBSCAN_count,357.0
median_count,316.5
mean_count,319.7
mode_count,314.0


In [413]:
from sklearn.metrics import mean_squared_error

# Expert images without a row in the method comparison have NaN counts after the
# merge; treat them as a predicted count of 0.
df_expert_count_1plus.fillna(0, inplace=True)

# Root mean squared error of the DBSCAN counts against the expert counts.
# The square root is taken explicitly because `squared=False` was deprecated in
# scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.dbscan_count) ** 0.5, 4)

0.6322

In [414]:
# RMSE of the HDBSCAN counts against the expert counts; explicit square root because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.HDBSCAN_count) ** 0.5, 4)


0.3891

In [415]:
# RMSE of the mean counts against the expert counts; explicit square root because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.mean_count) ** 0.5, 4)

0.4717

In [416]:
# RMSE of the median counts against the expert counts; explicit square root because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.median_count) ** 0.5, 4)

0.4813

In [417]:
# RMSE of the mode counts against the expert counts; explicit square root because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_expert_count_1plus.count_total, df_expert_count_1plus.mode_count) ** 0.5, 4)

0.5487

## Compare to the error rates when volunteers did find something
Let's double-check by recomputing the RMSE only on the subjects contained in the comparison dataset.

In [418]:
# Reload the comparison dataset, this time including its own `count_total` column,
# so the errors can be computed on the comparison subjects alone.
df_method_comparison = pd.read_csv(config["comparison_dataset"], sep=",", index_col=0)[
    [
        "subject_id",
        "count_total",
        "dbscan_count",
        "HDBSCAN_count",
        "median_count",
        "mean_count",
        "mode_count",
    ]
]



In [419]:
# Replace missing values with 0 so every row enters the error computation.
df_method_comparison = df_method_comparison.fillna(0)

In [420]:
# RMSE of the DBSCAN counts against `count_total` on the comparison dataset; explicit square
# root because `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.dbscan_count) ** 0.5, 4)

2.1754

In [421]:
# RMSE of the HDBSCAN counts against `count_total` on the comparison dataset; explicit square
# root because `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.HDBSCAN_count) ** 0.5, 4)



1.1812

In [422]:
# RMSE of the mean counts against `count_total` on the comparison dataset; explicit square
# root because `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.mean_count) ** 0.5, 4)


1.5336

In [423]:
# RMSE of the median counts against `count_total` on the comparison dataset; explicit square
# root because `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.median_count) ** 0.5, 4)


1.5728

In [424]:
# RMSE of the mode counts against `count_total` on the comparison dataset; explicit square
# root because `squared=False` was deprecated in scikit-learn 1.4 and removed in later releases.
round(mean_squared_error(df_method_comparison.count_total, df_method_comparison.mode_count) ** 0.5, 4)

1.8458