# Looking into the workflow processes
This notebook is used for debugging and understanding the workflow processes of the Zooniverse project "Iguanas from above".

In [13]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

# Select the launch phase to analyse: leave exactly one `phase_tag` line uncommented.
# phase_tag = "Iguanas 1st launch"
phase_tag = "Iguanas 2nd launch"
# phase_tag = "Iguanas 3rd launch"

# NOTE(review): the paths below are absolute and machine-specific; adjust them before
# running this notebook on another machine.
input_path = Path("/Users/christian/data/zooniverse")

# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis
# Every phase writes into its own sub-folder of the analysis directory.
output_path = (Path("/Users/christian/data/zooniverse/2024_04_15_expert_goldstandard_analysis") / phase_tag).resolve()

# parents=True also creates the output folder itself if it does not exist yet.
output_plot_path = output_path / "plots"
output_plot_path.mkdir(parents=True, exist_ok=True)

# Look up the configuration for the chosen phase (the displayed result maps names to file paths).
config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)


config


{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-2ndphase.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2-T2-GS-results-5th-0s.csv'),
 'image_source': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2nd launch_without_prefix'),
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_15_expert_goldstandard_analysis/Iguanas 2nd launch/yes_no_dataset_Iguanas 2nd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_15_expert_goldstandard_analysis/Iguanas 2nd launch/flat_dataset_Iguanas 2nd launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_15_expert_goldstandard_analysis/Iguanas 2nd launch/flat_panopte

In [14]:
# Load the flat annotation table of this phase; the first CSV column becomes the index.
df_flat = pd.read_csv(config["flat_dataset"], sep=",", index_col=0)
df_flat

Unnamed: 0_level_0,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
flight_site_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,20600,35.102,iguanas-EPS03_69.jpg,72029511,703.617371,920.212830,Adult Male in a lek,Iguanas 2nd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
,20600,35.102,iguanas-EPS03_69.jpg,72029511,804.783386,977.087891,"Others (females, young males, juveniles)",Iguanas 2nd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
,20600,35.102,iguanas-EPS03_69.jpg,72029511,785.983765,1052.907104,"Others (females, young males, juveniles)",Iguanas 2nd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
,20600,35.102,iguanas-EPS03_69.jpg,72029511,259.066772,293.473267,Adult Male in a lek,Iguanas 2nd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
,20600,35.102,iguanas-EPS03_69.jpg,72029511,183.902252,302.392700,"Others (females, young males, juveniles)",Iguanas 2nd launch,ea57b1088a10fa7fef30ed0b344e2ca3,386fc0ec047b7e259744e72e8e64b9f9
...,...,...,...,...,...,...,...,...,...,...
EGI06,20600,94.166,EGI06-1_62.jpg,72333497,1416.692139,881.247559,"Others (females, young males, juveniles)",Iguanas 2nd launch,,50cb2f4ab2a0cb20578e4108c3e3ddc3
EGI04,20600,94.166,EGI04-2_261.jpg,72333410,348.615356,170.206467,Adult Male with a lek,Iguanas 2nd launch,,d51caf2b0d8a68548a70de2a954d48dd
EGI04,20600,94.166,EGI04-2_261.jpg,72333410,310.700073,242.079559,Adult Male with a lek,Iguanas 2nd launch,,d51caf2b0d8a68548a70de2a954d48dd
EGI04,20600,94.166,EGI04-2_261.jpg,72333410,516.515869,411.027252,Adult Male with a lek,Iguanas 2nd launch,,d51caf2b0d8a68548a70de2a954d48dd


In [15]:
# Number of rows per workflow version, largest group first.
(
    df_flat
    .groupby('workflow_version')
    .size()
    .sort_values(ascending=False)
)

workflow_version
93.166    95867
94.166    62639
89.162      484
77.155      469
47.109       26
89.156       25
35.102       20
35.104       10
52.128        7
93.165        3
83.156        2
89.161        1
dtype: int64

In [16]:
# Load the merged dataset of the same phase; the first CSV column becomes the index.
df_flat_filtered = pd.read_csv(config["merged_dataset"], sep=",", index_col=0)
df_flat_filtered

Unnamed: 0_level_0,flight_site_code,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name,mission_name,image_path,width,height
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,ESCG02,20600,89.162,ESCG02-1_13.jpg,72373241,823.109619,480.843323,"Others (females, young males, juveniles)",Iguanas 2nd launch,e420d35a1ea646006eea5b67a80ae4c4,74a407ea10c24ff0e2c30672740b06d1,South Coast G,/Users/christian/data/zooniverse/Images/Zooniv...,1078,1074
1,ESCG02,20600,89.162,ESCG02-1_13.jpg,72373241,746.761475,456.873260,"Others (females, young males, juveniles)",Iguanas 2nd launch,e420d35a1ea646006eea5b67a80ae4c4,74a407ea10c24ff0e2c30672740b06d1,South Coast G,/Users/christian/data/zooniverse/Images/Zooniv...,1078,1074
2,ESCG02,20600,89.162,ESCG02-1_13.jpg,72373241,613.714417,418.871735,"Others (females, young males, juveniles)",Iguanas 2nd launch,e420d35a1ea646006eea5b67a80ae4c4,74a407ea10c24ff0e2c30672740b06d1,South Coast G,/Users/christian/data/zooniverse/Images/Zooniv...,1078,1074
3,ESCG02,20600,89.162,ESCG02-1_13.jpg,72373241,486.640900,415.071564,"Others (females, young males, juveniles)",Iguanas 2nd launch,e420d35a1ea646006eea5b67a80ae4c4,74a407ea10c24ff0e2c30672740b06d1,South Coast G,/Users/christian/data/zooniverse/Images/Zooniv...,1078,1074
4,ESCG02,20600,89.162,ESCG02-1_13.jpg,72373241,93.405724,368.645630,"Others (females, young males, juveniles)",Iguanas 2nd launch,e420d35a1ea646006eea5b67a80ae4c4,74a407ea10c24ff0e2c30672740b06d1,South Coast G,/Users/christian/data/zooniverse/Images/Zooniv...,1078,1074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8608,FMO04,20600,94.166,FMO04-2_38.jpg,72338789,561.572083,162.728821,Adult Male alone,Iguanas 2nd launch,82bc35ac2fff84ceaf64e6e963301a55,45eff38cb7f7707fc3c2b68964bc4ba5,Montura,/Users/christian/data/zooniverse/Images/Zooniv...,1499,1445
8609,FMO04,20600,94.166,FMO04-2_38.jpg,72338789,81.808907,918.737549,Adult Male alone,Iguanas 2nd launch,82bc35ac2fff84ceaf64e6e963301a55,45eff38cb7f7707fc3c2b68964bc4ba5,Montura,/Users/christian/data/zooniverse/Images/Zooniv...,1499,1445
8610,FMO04,20600,94.166,FMO04-2_38.jpg,72338789,553.821228,173.496384,Adult Male alone,Iguanas 2nd launch,556acb90168e837d01a641f7eee6d3b4,2195c9465b7d7cf53e0b394e562d3347,Montura,/Users/christian/data/zooniverse/Images/Zooniv...,1499,1445
8611,FMO04,20600,94.166,FMO04-2_38.jpg,72338789,82.596489,927.455994,Adult Male alone,Iguanas 2nd launch,556acb90168e837d01a641f7eee6d3b4,2195c9465b7d7cf53e0b394e562d3347,Montura,/Users/christian/data/zooniverse/Images/Zooniv...,1499,1445


In [17]:
# Read the expert gold-standard table (semicolon-separated) and show its subject ids.
df_expert_count = pd.read_csv(
    config["goldstandard_data"],
    sep=";",
)
df_expert_count["subject_id"]


0      72332801
1      72332802
2      72332876
3      72332972
4      72332999
         ...   
451    72341576
452    72341585
453    72341593
454    72341609
455    72341627
Name: subject_id, Length: 456, dtype: int64

In [18]:
# Rows of the flat dataset whose subject is listed in the expert gold standard,
# counted per workflow version.
(
    df_flat.loc[df_flat["subject_id"].isin(df_expert_count["subject_id"])]
    .groupby('workflow_version')
    .size()
)

workflow_version
89.162      55
93.166    5939
94.166    3623
dtype: int64

# Panoptes Data Extraction from Zooniverse

## Panoptes Extract




In [19]:
# create a configuration file from the workflow
# !mkdir ./data/phase_1
# ! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 14370 --min_version 0 --max_version 142.245 -d ./data/phase_1
# 
# !mkdir ./data/phase_2
# ! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 20600 --min_version 0 --max_version 94.166 -d ./data/phase_2
# 
# !mkdir ./data/phase_3
# ! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 22040 --min_version 0 --max_version 9.63 -d ./data/phase_3

In [20]:
# TODO how to extract multiple workflow versions???
# !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/ ./data/phase_1/Extractor_config_workflow_14370_V142.245.yaml
# 
# !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/ ./data/phase_2/Extractor_config_workflow_20600_V94.166.yaml

# !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3 ./data/phase_3/Extractor_config_workflow_22040_V9.63.yaml

# Look into the subjects file
This contains the mappings from the subject_id to the image file



In [24]:
# Load the Zooniverse subjects export (one row per subject).
# NOTE(review): the saved output of this cell echoes the read_csv line, which is how
# Python reports the source of a warning -- presumably pandas' DtypeWarning about
# mixed-type columns (TODO confirm). low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which avoids that warning.
df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",", low_memory=False)
# df_subjects

  df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


Unnamed: 0,subject_id,project_id,workflow_id,subject_set_id,metadata,locations,classifications_count,retired_at,retirement_reason,created_at,updated_at
0,44660549,11905,,83766,"{""image_name"":""SMF02-1-2-2_740.jpg"",""subject_i...","{""0"":""https://panoptes-uploads.zooniverse.org/...",0,,,2020-05-12 11:01:33 UTC,2020-05-12 11:01:33 UTC
1,44660550,11905,,83766,"{""image_name"":""SRIL03-2-1_153.jpg"",""subject_id...","{""0"":""https://panoptes-uploads.zooniverse.org/...",0,,,2020-05-12 11:01:39 UTC,2020-05-12 11:01:39 UTC
2,44660551,11905,,83766,"{""image_name"":""SRIL03-2-2_90.jpg"",""subject_id""...","{""0"":""https://panoptes-uploads.zooniverse.org/...",0,,,2020-05-12 11:01:45 UTC,2020-05-12 11:01:45 UTC
3,44660552,11905,,83766,"{""image_name"":""SRIL03-2-2_127.jpg"",""subject_id...","{""0"":""https://panoptes-uploads.zooniverse.org/...",0,,,2020-05-12 11:01:49 UTC,2020-05-12 11:01:49 UTC
4,44660553,11905,,83766,"{""image_name"":""SRIL03-2-2_133.jpg"",""subject_id...","{""0"":""https://panoptes-uploads.zooniverse.org/...",0,,,2020-05-12 11:01:53 UTC,2020-05-12 11:01:53 UTC
...,...,...,...,...,...,...,...,...,...,...,...
68155,92469731,11905,25351.0,116619,"{""Site"":""Puerto Villamil"",""Island"":""Isabela"",""...","{""0"":""https://panoptes-uploads.zooniverse.org/...",2,,,2023-10-12 16:28:45 UTC,2023-10-12 16:28:45 UTC
68156,92469733,11905,25351.0,116619,"{""Site"":""Puerto Villamil"",""Island"":""Isabela"",""...","{""0"":""https://panoptes-uploads.zooniverse.org/...",3,,,2023-10-12 16:28:46 UTC,2023-10-12 16:28:46 UTC
68157,92469734,11905,25351.0,116619,"{""Site"":""Puerto Villamil"",""Island"":""Isabela"",""...","{""0"":""https://panoptes-uploads.zooniverse.org/...",6,,,2023-10-12 16:28:46 UTC,2023-10-12 16:28:46 UTC
68158,92469735,11905,25351.0,116619,"{""Site"":""Puerto Villamil"",""Island"":""Isabela"",""...","{""0"":""https://panoptes-uploads.zooniverse.org/...",1,,,2023-10-12 16:28:46 UTC,2023-10-12 16:28:46 UTC


## Merge the extractions if there are multiple extractions per phase

In [None]:
# for phase in ["phase_1", "phase_2", "phase_3"]:


# panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/panoptes Extractor_config_workflow_14370_V142.245.yaml

# Load the two extraction tables written for phase 1:
# the point marks (per frame) and the question answers.
df_panotes_point_extractor = pd.read_csv(
    "./data/phase_1/point_extractor_by_frame_extractions.csv",
    sep=",",
)
df_panotes_question = pd.read_csv(
    "./data/phase_1/question_extractor_extractions.csv",
    sep=",",
)



In [None]:
# Inspect the point extractions read from point_extractor_by_frame_extractions.csv.
df_panotes_point_extractor

In [None]:
def get_json_keys(json_str):
    """Return the top-level keys of a JSON object given as a string.

    Parameters
    ----------
    json_str : str
        Text expected to hold one JSON object, e.g. one cell of the subjects'
        ``metadata`` column.

    Returns
    -------
    list
        The keys of the decoded object. An empty list is returned when the input
        is not text (e.g. NaN/None for a missing value), is not valid JSON, or
        decodes to something other than an object.
    """
    # Imported locally so the function also works on a fresh kernel, where no
    # earlier cell has imported `json` yet.
    import json

    try:
        json_obj = json.loads(json_str)
    except (TypeError, json.JSONDecodeError):
        # TypeError: input is not str/bytes (missing value); JSONDecodeError: malformed text
        return []
    if not isinstance(json_obj, dict):
        # valid JSON, but a list/number/string has no keys
        return []
    return list(json_obj.keys())

# One list of metadata keys per subject row
all_keys = df_subjects['metadata'].apply(get_json_keys)

# Union of the keys seen across all subjects
unique_keys = {key for sublist in all_keys for key in sublist}

print(unique_keys)

In [None]:
import json


def _image_name_from_metadata(metadata_json):
    """Return the image file name stored in one subject's metadata JSON string.

    The file name is looked up under the keys 'Image_name', 'image_name' and
    'Filename', in that order; the first non-empty value wins. When none of them
    holds a value, the result of the last lookup (usually None) is returned.
    """
    metadata = json.loads(metadata_json)  # parse once instead of once per candidate key
    return metadata.get('Image_name') or metadata.get('image_name') or metadata.get('Filename')


# Column assignment aligns on the index, so no sorting is applied here: sorting the
# values before assigning them would not change the resulting column.
df_subjects["image_name"] = df_subjects['metadata'].apply(_image_name_from_metadata)
df_subjects

In [None]:
# Attach each subject's image name to its point extractions (inner join on subject_id).
# NOTE(review): this overwrites its own input, so running the cell twice merges twice and
# pandas then suffixes the duplicated column as image_name_x / image_name_y.
df_panotes_point_extractor = df_panotes_point_extractor.merge(
    df_subjects[["subject_id", "image_name"]],
    on="subject_id",
)
df_panotes_point_extractor

In [None]:
## Anonymise the data

In [None]:
from hashlib import blake2b


def _blake2b_hex(text):
    """Return the 16-byte BLAKE2b digest of `text` as a 32-character hex string."""
    return blake2b(text.encode(), digest_size=16).hexdigest()


def _anonymise_user_id(user_id):
    """Hash the string form of a user id; missing values (NaN/None) pass through unchanged."""
    return user_id if pd.isnull(user_id) else _blake2b_hex(str(user_id))


def _anonymise_user_name(user_name):
    """Hash a user name; anything that is not a string (e.g. NaN) passes through unchanged."""
    return _blake2b_hex(user_name) if isinstance(user_name, str) else user_name


# Replace the identifying columns of both extraction tables with their hashes.
# NOTE(review): the cell overwrites its input, so running it twice hashes the hashes.
# NOTE(review): the hash is unkeyed; ids from a small, guessable range could be recovered
# by hashing all candidates -- consider a secret key if that matters for publication.
for _extractions in (df_panotes_point_extractor, df_panotes_question):
    _extractions["user_id"] = _extractions['user_id'].apply(_anonymise_user_id)
    _extractions['user_name'] = _extractions['user_name'].apply(_anonymise_user_name)

In [None]:
# TODO check that the anonymisation did not corrupt the data
df_panotes_point_extractor

In [None]:
# Inspect the question extractions read from question_extractor_extractions.csv.
df_panotes_question

# Debugging

In [None]:
### Look at the images in question

subject_id_1 = 47968423 # median and mode 11, 11
subject_id_2 = 47969478 # median and mode 4.5, 2
# All point extractions recorded for the second subject
df_panotes_point_extractor.loc[df_panotes_point_extractor["subject_id"] == subject_id_2]

In [None]:
# Keep only the extractions of task "T2".
# .copy() makes the result an independent frame: later cells add and overwrite columns
# on it, and doing that on a filtered selection of df_panotes_point_extractor triggers
# pandas' SettingWithCopyWarning.
df_panotes_point_extractor_r = df_panotes_point_extractor[
    # (df_panotes_point_extractor.subject_id == subject_id_2) &  # filter for this subject_id only
    (df_panotes_point_extractor.task == "T2")
    # & (df_panotes_point_extractor.user_name == "jickjack")
].copy()
df_panotes_point_extractor_r.columns

### Which tool is which?
There are these tools:
- data.frame0.T2_tool0_x 
- data.frame0.T2_tool1_x 
- data.frame0.T2_tool2_x
- data.frame0.T2_tool3_x
- data.frame0.T2_tool4_x

They reflect the tools of the point_extractor_by_frame extractor in Extractor_config_workflow_14370_V134.236.yaml
and probably correspond to the following labels (in this order, taken from the lab page):
- Adult Male in a lek
- Adult Male alone
- Others (females, young males, juveniles)
- Partial iguana
- Could be an iguana, not sure

Is "Could be an iguana, not sure" included in the study?


In [None]:
# create a flat from that.
from ast import literal_eval

columns_keep_x = ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x', 'data.frame0.T2_tool4_x']
columns_keep_y = ['data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y', 'data.frame0.T2_tool4_y']

# for col in ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x', 'data.frame0.T2_tool4_x',
#             'data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y', 'data.frame0.T2_tool4_y']:

for col in columns_keep_x + columns_keep_y:
    df_panotes_point_extractor_r[col] = df_panotes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])

# Merge the lists in 'x' and 'y' coordinates
df_panotes_point_extractor_r['x'] = df_panotes_point_extractor_r[columns_keep_x].values.tolist()
df_panotes_point_extractor_r['y'] = df_panotes_point_extractor_r[columns_keep_y].values.tolist()

# Flatten the lists in each row for 'x' and 'y'
df_panotes_point_extractor_r['x'] = df_panotes_point_extractor_r['x'].apply(lambda x: [item for sublist in x for item in sublist])
df_panotes_point_extractor_r['y'] = df_panotes_point_extractor_r['y'].apply(lambda x: [item for sublist in x for item in sublist])

# Explode the DataFrame to separate rows for each x, y pair
# Note: This requires pandas >= 0.25 for simultaneous explode
# df_panotes_point_extractor_r_exploded = df_panotes_point_extractor_r.apply(pd.Series.explode)

# Explode the DataFrame based on these columns to get separate rows for each list element
# Make sure to perform the explode operation on both columns simultaneously to keep the x and y coordinates paired
df_panotes_point_extractor_r

In [None]:
# Keep the identifying columns plus the flattened coordinate lists and renumber the rows.
df_panotes_point_extractor_r = (
    df_panotes_point_extractor_r
    .loc[:, [
        'classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
        'created_at', 'subject_id', "image_name",
        'x', 'y',
    ]]
    .reset_index(drop=True)
)

df_panotes_point_extractor_r

In [None]:
# Explode only the list-valued 'x' and 'y' columns; every other column is passed through as is.
df_panotes_point_extractor_r_ex = df_panotes_point_extractor_r.apply(
    lambda column: column if column.name not in ('x', 'y') else column.explode()
)

In [None]:
# classifications without any mark have NaN in both 'x' and 'y'; drop those rows
df_panotes_point_extractor_r_ex_dropped = (
    df_panotes_point_extractor_r_ex
    .dropna(subset=['x', 'y'], how='all')
    .sort_values(by=['user_id', 'subject_id', 'task', 'created_at'])
)
df_panotes_point_extractor_r_ex_dropped

In [None]:
# cast x and y to 32-bit integers
df_panotes_point_extractor_r_ex_dropped = df_panotes_point_extractor_r_ex_dropped.astype(
    dict.fromkeys(('x', 'y'), 'int32')
)
df_panotes_point_extractor_r_ex_dropped

## Check the numbers for a single subject_id

In [22]:
# All rows of the flat dataset that belong to one image
df_flat.loc[df_flat["image_name"] == "ESCG02-1_19.jpg"]

Unnamed: 0_level_0,workflow_id,workflow_version,image_name,subject_id,x,y,tool_label,phase_tag,user_id,user_name
flight_site_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ESCG02,20600,93.166,ESCG02-1_19.jpg,72373250,783.791748,833.887268,Adult Male alone,Iguanas 2nd launch,41acc16780b40b8ebd418e7c2eed38d1,d4db022c6b48b830d8f92d5c737d6364
ESCG02,20600,93.166,ESCG02-1_19.jpg,72373250,194.347977,141.158142,"Others (females, young males, juveniles)",Iguanas 2nd launch,c11a32c827347926881e5e1db75cb701,691500ccebe2131f83809524df652f87
ESCG02,20600,93.166,ESCG02-1_19.jpg,72373250,193.243042,303.586670,"Others (females, young males, juveniles)",Iguanas 2nd launch,c11a32c827347926881e5e1db75cb701,691500ccebe2131f83809524df652f87
ESCG02,20600,93.166,ESCG02-1_19.jpg,72373250,28.604599,456.070587,"Others (females, young males, juveniles)",Iguanas 2nd launch,c11a32c827347926881e5e1db75cb701,691500ccebe2131f83809524df652f87
ESCG02,20600,93.166,ESCG02-1_19.jpg,72373250,82.747437,482.589539,"Others (females, young males, juveniles)",Iguanas 2nd launch,c11a32c827347926881e5e1db75cb701,691500ccebe2131f83809524df652f87
...,...,...,...,...,...,...,...,...,...,...
ESCG02,20600,94.166,ESCG02-1_19.jpg,72373250,578.153015,308.388885,Adult Male alone,Iguanas 2nd launch,,c6d66b18814c9df1833d0a281e272323
ESCG02,20600,94.166,ESCG02-1_19.jpg,72373250,628.282471,458.777191,Adult Male alone,Iguanas 2nd launch,,c6d66b18814c9df1833d0a281e272323
ESCG02,20600,94.166,ESCG02-1_19.jpg,72373250,18.374290,731.704163,Partial iguana,Iguanas 2nd launch,,c6d66b18814c9df1833d0a281e272323
ESCG02,20600,94.166,ESCG02-1_19.jpg,72373250,427.764709,995.556763,"Others (females, young males, juveniles)",Iguanas 2nd launch,,c6d66b18814c9df1833d0a281e272323


## Analysing the differences between the Panoptes extract and the custom data_prep method
With my data_prep I get [2, 2, 7, 9] for SFM01-2-2-1_293.jpg (subject 47969478).
Here I get [2, 7, 9] for the same image when only "Partial iguana" is removed and "Could be an iguana, not sure" is kept.

## Filter for the expert gold standard

In [23]:
from zooniverse.utils.data_format import data_prep

# Run the project's data preparation with the "expert_goldstandard" filter combination.
use_gold_standard_subset = "expert_goldstandard"
ds_stats = data_prep(
    phase_tag=phase_tag,
    output_path=output_path,
    input_path=input_path,
    filter_combination=use_gold_standard_subset,
    config=config,
)

ds_stats

[32m2024-04-16 10:36:18.640[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 1st launch[0m
[32m2024-04-16 10:36:35.873[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Main Workflow[0m
[32m2024-04-16 10:36:40.248[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: survey tool 061417[0m
[32m2024-04-16 10:36:43.855[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 2nd launch[0m
[32m2024-04-16 10:36:54.434[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 3rd launch[0m


KeyboardInterrupt: 

In [None]:
# Flat dataset as written to disk, restricted to the second debugging subject.
df_flat_data_prep = pd.read_csv(
    config["flat_dataset"],
    sep=",",
    index_col=0,
)
df_flat_data_prep.loc[df_flat_data_prep["subject_id"] == subject_id_2]

In [None]:
# Merged dataset as written to disk, restricted to the second debugging subject.
df_merged_data_prep = pd.read_csv(
    config["merged_dataset"],
    sep=",",
    index_col=0,
)
df_merged_data_prep.loc[df_merged_data_prep["subject_id"] == subject_id_2]

In [None]:
# Sorted integer x coordinates of the second debugging subject according to data_prep
sorted(
    int(value)
    for value in df_merged_data_prep.loc[df_merged_data_prep["subject_id"] == subject_id_2, "x"]
)

In [None]:
# Sorted integer x coordinates from the Panoptes extraction.
# The exploded frame keeps them in the column 'x'; it has no 'merged_x' column.
# NOTE(review): unlike the data_prep comparison this is not restricted to subject_id_2 --
# confirm whether a subject filter is intended here.
sorted([int(x) for x in list(df_panotes_point_extractor_r_ex_dropped.x)])

In [ ]:
## Compare the results from December 4th with the current results for phase 1

In [4]:
# Print the working directory; the relative ./data paths used in this notebook resolve against it.
!pwd

/Users/christian/PycharmProjects/iguanas-from-above-zooniverse


In [16]:
import pandas as pd

# Method-comparison table for the 1st launch as saved on 2023-12-04
df_method_4th = pd.read_csv(
    "./data/Iguanas 1st launch_method_comparison_2023_12_04.csv",
    sep=",",
)
df_method_4th

Unnamed: 0.1,Unnamed: 0,image_name,subject_id,count_total,median_count,mean_count,mode_count,kmeans_knee,kmeans_sillouette_count,kmeans_BIC_count,dbscan_count,with_noise,HDBSCAN_count,bic_avg,eps,min_cluster_size,max_cluster_size,noise_points
0,0,SFM01-2-2-2_282.jpg,47969795,2,0.0,0.00,0.0,0.0,0.0,0.0,0.0,,,,,,,
1,1,SFM01-2-2-1_344.jpg,47969531,4,1.0,1.88,1.0,3.0,4.0,4.0,3.0,True,1.0,-189.620995,0.0,5.0,,12.0
2,2,SFM01-2-2-2_293.jpg,47969828,7,2.0,2.58,1.0,4.0,6.0,4.0,3.0,True,1.0,-435.000001,0.0,5.0,,30.0
3,3,SFM01-1-1_114.jpg,47967876,1,1.0,1.44,1.0,2.0,2.0,2.0,0.0,True,1.0,-123.812228,0.0,5.0,,8.0
4,4,SFM01-1-1_154.jpg,47967959,1,1.0,1.00,1.0,0.0,1.0,1.0,0.0,True,1.0,-67.957653,0.0,5.0,,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,102,SRPB02-2-1_67.jpg,47987920,1,1.0,1.09,1.0,2.0,2.0,1.0,2.0,True,1.0,-156.069513,0.0,5.0,,9.0
103,103,SRPB02-2-2_143.jpg,47988474,4,4.0,3.62,4.0,3.0,6.0,1.0,4.0,True,4.0,-354.144482,0.0,5.0,,2.0
104,104,SRPB02-2-2_153.jpg,47988580,1,1.0,1.00,1.0,0.0,1.0,2.0,0.0,True,1.0,-77.270888,0.0,5.0,,4.0
105,105,SRPB02-2-2_161.jpg,47988636,1,1.0,1.00,1.0,0.0,1.0,1.0,0.0,True,1.0,-74.985771,0.0,5.0,,5.0


In [17]:

# Tag the clustering counts of the December table with "_4th" so they stay
# distinguishable after merging with the current results.
df_method_4th = df_method_4th.rename(
    columns={"dbscan_count": "dbscan_count_4th", 'HDBSCAN_count': 'HDBSCAN_count_4th'}
)
df_method_4th[['image_name', 'subject_id', 'dbscan_count_4th', 'HDBSCAN_count_4th']]

Unnamed: 0,image_name,subject_id,dbscan_count_4th,HDBSCAN_count_4th
0,SFM01-2-2-2_282.jpg,47969795,0.0,
1,SFM01-2-2-1_344.jpg,47969531,3.0,1.0
2,SFM01-2-2-2_293.jpg,47969828,3.0,1.0
3,SFM01-1-1_114.jpg,47967876,0.0,1.0
4,SFM01-1-1_154.jpg,47967959,0.0,1.0
...,...,...,...,...
102,SRPB02-2-1_67.jpg,47987920,2.0,1.0
103,SRPB02-2-2_143.jpg,47988474,4.0,4.0
104,SRPB02-2-2_153.jpg,47988580,0.0,1.0
105,SRPB02-2-2_161.jpg,47988636,0.0,1.0


In [18]:
# Current method-comparison table for the 1st launch; its clustering counts get the
# suffix "_pan".
df_method_panoptes = pd.read_csv(
    "data/zooniverse/Iguanas 1st launch/Iguanas 1st launch_method_comparison.csv",
    sep=",",
)
df_method_panoptes = df_method_panoptes.rename(
    columns={"dbscan_count_sil": "dbscan_count_pan", 'HDBSCAN_count': 'HDBSCAN_count_pan'}
)
df_method_panoptes[['image_name', 'subject_id', 'dbscan_count_pan', 'HDBSCAN_count_pan']]

Unnamed: 0,image_name,subject_id,dbscan_count_pan,HDBSCAN_count_pan
0,SFB01-3_08.jpg,47967468,0.0,0.0
1,SFB01-4-1_108.jpg,47967483,0.0,0.0
2,SFB01-4-1_129.jpg,47967503,0.0,0.0
3,SFB01-4-1_131.jpg,47967505,0.0,0.0
4,SFB01-4-1_132.jpg,47967506,0.0,0.0
...,...,...,...,...
3885,SRBS03-3-3_46.jpg,48034421,0.0,0.0
3886,SRBS03-4_25.jpg,48034447,0.0,1.0
3887,SRBS03-4_37.jpg,48034450,0.0,0.0
3888,SRBS03-4_40.jpg,48034453,1.0,1.0


In [26]:
# Put both result sets side by side: every subject of the December table is kept (left
# join) and only the columns needed for the comparison are selected.
df_method_compare = (
    df_method_4th
    .merge(df_method_panoptes, on="subject_id", how="left")
    .loc[:, [
        'image_name_x', 'image_name_y', 'subject_id',
        'dbscan_count_4th', 'dbscan_count_pan',
        'HDBSCAN_count_4th', 'HDBSCAN_count_pan',
    ]]
)
df_method_compare

Unnamed: 0,image_name_x,image_name_y,subject_id,dbscan_count_4th,dbscan_count_pan,HDBSCAN_count_4th,HDBSCAN_count_pan
0,SFM01-2-2-2_282.jpg,SFM01-2-2-2_282.jpg,47969795,0.0,0.0,,1.0
1,SFM01-2-2-1_344.jpg,SFM01-2-2-1_344.jpg,47969531,3.0,2.0,1.0,1.0
2,SFM01-2-2-2_293.jpg,SFM01-2-2-2_293.jpg,47969828,3.0,2.0,1.0,1.0
3,SFM01-1-1_114.jpg,SFM01-1-1_114.jpg,47967876,0.0,1.0,1.0,1.0
4,SFM01-1-1_154.jpg,SFM01-1-1_154.jpg,47967959,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...
102,SRPB02-2-1_67.jpg,SRPB02-2-1_67.jpg,47987920,2.0,1.0,1.0,1.0
103,SRPB02-2-2_143.jpg,SRPB02-2-2_143.jpg,47988474,4.0,4.0,4.0,4.0
104,SRPB02-2-2_153.jpg,SRPB02-2-2_153.jpg,47988580,0.0,0.0,1.0,1.0
105,SRPB02-2-2_161.jpg,SRPB02-2-2_161.jpg,47988636,0.0,0.0,1.0,1.0


In [28]:
# Difference of the HDBSCAN counts (December minus current); NaN where either count is missing.
df_method_compare["hdbscan_diff"] = df_method_compare['HDBSCAN_count_4th'].sub(
    df_method_compare['HDBSCAN_count_pan']
)
df_method_compare

Unnamed: 0,image_name_x,image_name_y,subject_id,dbscan_count_4th,dbscan_count_pan,HDBSCAN_count_4th,HDBSCAN_count_pan,hdbscan_diff
0,SFM01-2-2-2_282.jpg,SFM01-2-2-2_282.jpg,47969795,0.0,0.0,,1.0,
1,SFM01-2-2-1_344.jpg,SFM01-2-2-1_344.jpg,47969531,3.0,2.0,1.0,1.0,0.0
2,SFM01-2-2-2_293.jpg,SFM01-2-2-2_293.jpg,47969828,3.0,2.0,1.0,1.0,0.0
3,SFM01-1-1_114.jpg,SFM01-1-1_114.jpg,47967876,0.0,1.0,1.0,1.0,0.0
4,SFM01-1-1_154.jpg,SFM01-1-1_154.jpg,47967959,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...
102,SRPB02-2-1_67.jpg,SRPB02-2-1_67.jpg,47987920,2.0,1.0,1.0,1.0,0.0
103,SRPB02-2-2_143.jpg,SRPB02-2-2_143.jpg,47988474,4.0,4.0,4.0,4.0,0.0
104,SRPB02-2-2_153.jpg,SRPB02-2-2_153.jpg,47988580,0.0,0.0,1.0,1.0,0.0
105,SRPB02-2-2_161.jpg,SRPB02-2-2_161.jpg,47988636,0.0,0.0,1.0,1.0,0.0
