# Using the Panoptes Aggregation Tool from Zooniverse
https://aggregation-caesar.zooniverse.org/Scripts.html#scripts


In [1]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

### use either the subset of the subset
#phase_tag = "Iguanas 1st launch"
#data_folder = "./data/phase_1"

phase_tag = "Iguanas 2nd launch"
data_folder = "./data/phase_2"

# phase_tag = "Iguanas 3rd launch"
# data_folder = "./data/phase_3"

workflow_id_p1 = 14370.0
workflow_id_p2 = 20600.0
workflow_id_p3 = 22040.0

input_path = Path("/Users/christian/data/zooniverse")

# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis
output_path = Path("/Users/christian/data/zooniverse/2024_04_12_analysis").joinpath(phase_tag).resolve()

output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)


config


{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-2ndphase.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2-T2-GS-results-5th-0s.csv'),
 'image_source': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/2nd launch_without_prefix'),
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 2nd launch/yes_no_dataset_Iguanas 2nd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 2nd launch/flat_dataset_Iguanas 2nd launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 2nd launch/flat_panoptes_points_Iguanas 2nd launch.csv'),
 'panoptes_question': Pos

# Look into the subjects file
This contains the mappings from the subject_id to the image file

In [2]:
# read the original file
df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")



  df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


In [3]:
# filter the subjects for only the images in the three phases

df_subjects = df_subjects[df_subjects.workflow_id.isin([workflow_id_p1, workflow_id_p2, workflow_id_p3])]


In [4]:
# inspect the metadata
import json
def get_json_keys(json_str):
    try:
        json_obj = json.loads(json_str)
        return list(json_obj.keys())
    except json.JSONDecodeError:
        return []

# Apply the function to each row in the metadata column and collect all keys
all_keys = df_subjects['locations'].apply(get_json_keys)

# Flatten the list of lists and get unique keys
unique_keys = set([key for sublist in all_keys for key in sublist])

print(unique_keys)

{'0'}


Clean up the subjects file

In [5]:
df_subjects["image_name"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('Image_name') 
                                        or json.loads(x).get('image_name') 
                                        or json.loads(x).get('Filename')).sort_values(ascending=True)

# 'site', 'flight', 'Flight', 'Site', 'flight_code' depict the same
df_subjects["flight_code"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('flight_code') 
                                        or json.loads(x).get('site') 
                                        or json.loads(x).get('flight')
                                        or json.loads(x).get('Flight')
                                        or json.loads(x).get('Site')).sort_values(ascending=True)

df_subjects["url"] = df_subjects['locations'].apply(lambda x: json.loads(x)["0"])
df_subjects["filepath"] = None

In [6]:
from loguru import logger
from time import sleep
# helper function to download the images
import requests

def download_image(url, filename):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                file.write(response.content)
            return True
        else:
            logger.warning(f"Failed to download {url}")
            logger.error(response)
            sleep(5)
            return False
    except Exception as e:
        logger.error(e)
        sleep(5)
        return False



# Panoptes Data Extraction from Zooniverse
## Panoptes config
### Create the configuration files automatically
The configurations were changed to custom workflow versions.

In [7]:
# create a configuration file from the workflow
#!mkdir ./data/phase_1
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 14370 --min_version 0 --max_version 142.245 -d ./data/phase_1
# 
#!mkdir ./data/phase_2
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 20600 --min_version 0 --max_version 94.166 -d ./data/phase_2
# 
#!mkdir ./data/phase_3
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 22040 --min_version 0 --max_version 9.63 -d ./data/phase_3

## Extract the data

In [8]:
# phase 1
if data_folder == "./data/phase_1":
    !mkdir ./data/phase_1/V121.144
    !mkdir ./data/phase_1/V134.236
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V121.144 ./data/phase_1/Extractor_config_workflow_14370_V121.144.yaml
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V134.236 ./data/phase_1/Extractor_config_workflow_14370_V134.236-1.yaml


In [9]:
if data_folder == "./data/phase_2":
    # phase 2
    
    !mkdir ./data/phase_2/V89.162
    !mkdir ./data/phase_2/V93.166
    !mkdir ./data/phase_2/V94.166 
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V89.162 ./data/phase_2/Extractor_config_workflow_20600_V89.162.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V93.166 ./data/phase_2/Extractor_config_workflow_20600_V93.166.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V94.166 ./data/phase_2/Extractor_config_workflow_20600_V94.166.yaml



mkdir: ./data/phase_2/V89.162: File exists
mkdir: ./data/phase_2/V93.166: File exists
mkdir: ./data/phase_2/V94.166: File exists
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:00
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:35
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:28


In [10]:
if data_folder == "./data/phase_3":
    !mkdir ./data/phase_3/V7.63
    !mkdir ./data/phase_3/V9.63
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V7.63 ./data/phase_3/Extractor_config_workflow_22040_V7.63.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V9.63 ./data/phase_3/Extractor_config_workflow_22040_V9.63.yaml

### Merge the single point and questions extractions

In [11]:
# phase 1
if data_folder == "./data/phase_1":
    df_panoptes_point_extractor_1 = pd.read_csv(f"./data/phase_1/V121.144/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"./data/phase_1/V134.236/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "121.144"
    df_panoptes_point_extractor_2["workflow_version"] = "134.236"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V121.144/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V134.236/question_extractor_extractions.csv", sep=",")
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)
    
    df_panoptes_point_extractor

In [12]:
# # phase 2
if data_folder == "./data/phase_2":
    # read the rectangles annotations too there
    df_panotes_rectangle_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/shape_extractor_rectangle_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V93.166/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_3 = pd.read_csv(f"{data_folder}/V94.166/point_extractor_by_frame_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1["workflow_version"] = "89.162"
    df_panoptes_point_extractor_2["workflow_version"] = "93.166"
    df_panoptes_point_extractor_3["workflow_version"] = "94.166"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V89.162/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V93.166/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_3 = pd.read_csv(f"{data_folder}/V94.166/question_extractor_extractions.csv", sep=",")
    
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2, df_panoptes_question_3], axis=0)

    df_panotes_rectangle_extractor_1

In [13]:
if data_folder == "./data/phase_3":
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V7.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V9.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "7.63"
    df_panoptes_point_extractor_2["workflow_version"] = "9.63"

    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V7.63/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V9.63/question_extractor_extractions.csv", sep=",")

    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)


In [14]:
df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool0_x,...,data.frame0.T4_tool6_y,workflow_version,data.frame0.T2_tool1_x,data.frame0.T2_tool1_y,data.frame0.T2_tool0_x,data.frame0.T2_tool0_y,data.frame0.T2_tool2_x,data.frame0.T2_tool2_y,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y
0,393742576,AmyMacLeod,1443210.0,20600,T4,2022-02-01 00:09:54 UTC,72335168,point_extractor_by_frame,4.1.0,"[1481.24658203125, 64.83441925048828, 152.1798...",...,,89.162,,,,,,,,
1,393742638,AmyMacLeod,1443210.0,20600,T4,2022-02-01 00:10:20 UTC,72334984,point_extractor_by_frame,4.1.0,,...,,89.162,,,,,,,,
2,393742678,AmyMacLeod,1443210.0,20600,T4,2022-02-01 00:10:33 UTC,72340244,point_extractor_by_frame,4.1.0,,...,,89.162,,,,,,,,
3,393743049,AmyMacLeod,1443210.0,20600,T4,2022-02-01 00:12:15 UTC,72336318,point_extractor_by_frame,4.1.0,,...,,89.162,,,,,,,,
4,393768668,not-logged-in-b0c587027139fe5e6549,,20600,T4,2022-02-01 03:12:48 UTC,72372999,point_extractor_by_frame,4.1.0,,...,,89.162,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215619,406113768,Lenoxx,2434025.0,20600,T4,2022-03-30 19:24:33 UTC,72335929,point_extractor_by_frame,4.1.0,,...,,93.166,,,,,,,,
215620,406113907,Lenoxx,2434025.0,20600,T2,2022-03-30 19:25:09 UTC,72334672,point_extractor_by_frame,4.1.0,,...,,93.166,,,,,,,,
215621,406113907,Lenoxx,2434025.0,20600,T4,2022-03-30 19:25:09 UTC,72334672,point_extractor_by_frame,4.1.0,,...,,93.166,,,,,,,,
215622,406113995,Lenoxx,2434025.0,20600,T2,2022-03-30 19:25:30 UTC,72338881,point_extractor_by_frame,4.1.0,,...,,93.166,,,,,,,,


In [15]:
# join the image name from the subjects file
df_panoptes_point_extractor = df_panoptes_point_extractor.merge(df_subjects[["subject_id", "image_name"]], left_on="subject_id", right_on="subject_id")
df_panoptes_point_extractor = df_panoptes_point_extractor[df_panoptes_point_extractor.subject_id.isin(df_subjects.subject_id)]

df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool0_x,...,workflow_version,data.frame0.T2_tool1_x,data.frame0.T2_tool1_y,data.frame0.T2_tool0_x,data.frame0.T2_tool0_y,data.frame0.T2_tool2_x,data.frame0.T2_tool2_y,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,image_name
0,393742576,AmyMacLeod,1443210.0,20600,T4,2022-02-01 00:09:54 UTC,72335168,point_extractor_by_frame,4.1.0,"[1481.24658203125, 64.83441925048828, 152.1798...",...,89.162,,,,,,,,,EIG05-2_55.jpg
1,394132937,laswett,2309767.0,20600,T2,2022-02-02 19:19:04 UTC,72335168,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,EIG05-2_55.jpg
2,394132937,laswett,2309767.0,20600,T4,2022-02-02 19:19:04 UTC,72335168,point_extractor_by_frame,4.1.0,"[1474.2442626953125, 144.0193634033203, 12.096...",...,93.166,,,,,,,,,EIG05-2_55.jpg
3,394221102,colarsoledad,2360454.0,20600,T2,2022-02-03 03:24:54 UTC,72335168,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,EIG05-2_55.jpg
4,394221102,colarsoledad,2360454.0,20600,T4,2022-02-03 03:24:54 UTC,72335168,point_extractor_by_frame,4.1.0,"[1474.424560546875, 146.7071990966797, 12.0523...",...,93.166,,,,,,,,,EIG05-2_55.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431770,403414441,Fire_seeker,2439780.0,20600,T4,2022-03-18 13:15:20 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431771,403489002,Mary1863,1760559.0,20600,T2,2022-03-18 20:13:31 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431772,403489002,Mary1863,1760559.0,20600,T4,2022-03-18 20:13:31 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431773,403749038,databanana,1682891.0,20600,T2,2022-03-20 11:31:17 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg


## Anonymise the data

In [16]:
from hashlib import blake2b

df_panoptes_point_extractor["user_id"] = df_panoptes_point_extractor['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_point_extractor['user_name'] = df_panoptes_point_extractor['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

df_panoptes_question["user_id"] = df_panoptes_question['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_question['user_name'] = df_panoptes_question['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

In [17]:
df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool0_x,...,workflow_version,data.frame0.T2_tool1_x,data.frame0.T2_tool1_y,data.frame0.T2_tool0_x,data.frame0.T2_tool0_y,data.frame0.T2_tool2_x,data.frame0.T2_tool2_y,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,image_name
0,393742576,19835766ecbb50ec6fbe68caa85e4c63,8bfe78a8dc879fc7aabe1ff94a42d74f,20600,T4,2022-02-01 00:09:54 UTC,72335168,point_extractor_by_frame,4.1.0,"[1481.24658203125, 64.83441925048828, 152.1798...",...,89.162,,,,,,,,,EIG05-2_55.jpg
1,394132937,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T2,2022-02-02 19:19:04 UTC,72335168,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,EIG05-2_55.jpg
2,394132937,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T4,2022-02-02 19:19:04 UTC,72335168,point_extractor_by_frame,4.1.0,"[1474.2442626953125, 144.0193634033203, 12.096...",...,93.166,,,,,,,,,EIG05-2_55.jpg
3,394221102,07452636222fb135e92fd5bedbecd18c,1dcb5107d19f3fa3d0d40eb4fa5a32ac,20600,T2,2022-02-03 03:24:54 UTC,72335168,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,EIG05-2_55.jpg
4,394221102,07452636222fb135e92fd5bedbecd18c,1dcb5107d19f3fa3d0d40eb4fa5a32ac,20600,T4,2022-02-03 03:24:54 UTC,72335168,point_extractor_by_frame,4.1.0,"[1474.424560546875, 146.7071990966797, 12.0523...",...,93.166,,,,,,,,,EIG05-2_55.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431770,403414441,54ae6fcd86ece9f2ddceb66b12764b08,181ab00d3414bff0e41fbdb9379795e1,20600,T4,2022-03-18 13:15:20 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431771,403489002,0a044a95880379be03b7f5800b7716b9,1fd5c346537644c93a0b7d87c436545b,20600,T2,2022-03-18 20:13:31 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431772,403489002,0a044a95880379be03b7f5800b7716b9,1fd5c346537644c93a0b7d87c436545b,20600,T4,2022-03-18 20:13:31 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg
431773,403749038,74a407ea10c24ff0e2c30672740b06d1,e420d35a1ea646006eea5b67a80ae4c4,20600,T2,2022-03-20 11:31:17 UTC,72339681,point_extractor_by_frame,4.1.0,,...,93.166,,,,,,,,,FPA04_31.jpg


## Determine the amount of yes Answers for "Is there an Iguana"

In [18]:
df_panoptes_question

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.no,data.aggregation_version,data.yes
0,393742576,19835766ecbb50ec6fbe68caa85e4c63,8bfe78a8dc879fc7aabe1ff94a42d74f,20600,T0,2022-02-01 00:09:54 UTC,72335168,question_extractor,1.0,4.1.0,
1,393742638,19835766ecbb50ec6fbe68caa85e4c63,8bfe78a8dc879fc7aabe1ff94a42d74f,20600,T0,2022-02-01 00:10:20 UTC,72334984,question_extractor,1.0,4.1.0,
2,393742678,19835766ecbb50ec6fbe68caa85e4c63,8bfe78a8dc879fc7aabe1ff94a42d74f,20600,T0,2022-02-01 00:10:33 UTC,72340244,question_extractor,1.0,4.1.0,
3,393743049,19835766ecbb50ec6fbe68caa85e4c63,8bfe78a8dc879fc7aabe1ff94a42d74f,20600,T0,2022-02-01 00:12:15 UTC,72336318,question_extractor,,4.1.0,1.0
4,393768668,dc0ea5c32da0e322969ff6e6aa76ca5f,,20600,T0,2022-02-01 03:12:48 UTC,72372999,question_extractor,,4.1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
88313,412711106,d607a725449d5e3a2deaba5014fce17d,dabdbb41b2897bd5fdef6dc4ca74584b,20600,T0,2022-05-03 07:02:18 UTC,72333409,question_extractor,1.0,4.1.0,
88314,412711577,d607a725449d5e3a2deaba5014fce17d,dabdbb41b2897bd5fdef6dc4ca74584b,20600,T0,2022-05-03 07:06:48 UTC,72333423,question_extractor,1.0,4.1.0,
88315,412711830,d607a725449d5e3a2deaba5014fce17d,dabdbb41b2897bd5fdef6dc4ca74584b,20600,T0,2022-05-03 07:08:50 UTC,72333494,question_extractor,1.0,4.1.0,
88316,412712258,d607a725449d5e3a2deaba5014fce17d,dabdbb41b2897bd5fdef6dc4ca74584b,20600,T0,2022-05-03 07:12:18 UTC,72333471,question_extractor,1.0,4.1.0,


In [19]:
df_panoptes_question_r = df_panoptes_question[df_panoptes_question.task == "T0"][["subject_id", "data.no", "data.yes"]].groupby("subject_id").sum()

df_panoptes_question_r = df_panoptes_question_r.reset_index()
df_panoptes_question_r = df_panoptes_question_r[df_panoptes_question_r.subject_id.isin(df_subjects.subject_id)]
df_panoptes_question_r

Unnamed: 0,subject_id,data.no,data.yes
0,72332768,22.0,0.0
1,72332769,21.0,0.0
2,72332770,21.0,0.0
3,72332771,20.0,1.0
4,72332772,21.0,2.0
...,...,...,...
9092,72373345,19.0,2.0
9093,72373347,18.0,4.0
9094,72373348,5.0,16.0
9095,72373350,6.0,15.0


In [20]:
df_panoptes_question_r.to_csv(output_path / config["panoptes_question"], index = False)

## Get the Point Marks Analysis Ready

Filter for T2 only

In [21]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor[
    (df_panoptes_point_extractor.task == "T2")
]
df_panoptes_point_extractor_r.columns

Index(['classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
       'created_at', 'subject_id', 'extractor', 'data.aggregation_version',
       'data.frame0.T4_tool0_x', 'data.frame0.T4_tool0_y',
       'data.frame0.T4_tool2_x', 'data.frame0.T4_tool2_y',
       'data.frame0.T4_tool3_x', 'data.frame0.T4_tool3_y',
       'data.frame0.T4_tool1_x', 'data.frame0.T4_tool1_y',
       'data.frame0.T4_tool4_x', 'data.frame0.T4_tool4_y',
       'data.frame0.T4_tool5_x', 'data.frame0.T4_tool5_y',
       'data.frame0.T4_tool7_x', 'data.frame0.T4_tool7_y',
       'data.frame0.T4_tool6_x', 'data.frame0.T4_tool6_y', 'workflow_version',
       'data.frame0.T2_tool1_x', 'data.frame0.T2_tool1_y',
       'data.frame0.T2_tool0_x', 'data.frame0.T2_tool0_y',
       'data.frame0.T2_tool2_x', 'data.frame0.T2_tool2_y',
       'data.frame0.T2_tool3_x', 'data.frame0.T2_tool3_y', 'image_name'],
      dtype='object')

### Which tool is which now?
| Tool Name               | Classification                               |
|-------------------------|----------------------------------------------|
| data.frame0.T2_tool0_x  | Adult Male in a lek                          |
| data.frame0.T2_tool1_x  | Adult Male alone                             |
| data.frame0.T2_tool2_x  | Others (females, young males, juveniles)     |
| data.frame0.T2_tool3_x  | Partial iguana                               |
| data.frame0.T2_tool4_x  | Could be an iguana, not sure                 |

Is "Could be an iguana, not sure" and "Partial Iguana" are omitted.


In [22]:
# create a flat structure from the nested marks over multiple columns from that.
from ast import literal_eval

columns_keep_x = ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x']
columns_keep_y = ['data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y']

for col in columns_keep_x + columns_keep_y:
    df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])

# Merge the lists in 'x' and 'y' coordinates
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r[columns_keep_y].values.tolist()

# Flatten the lists in each row for 'x' and 'y'
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r['x'].apply(lambda x: [item for sublist in x for item in sublist])
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r['y'].apply(lambda x: [item for sublist in x for item in sublist])

# Explode the DataFrame to separate rows for each x, y pair
# Note: This requires pandas >= 0.25 for simultaneous explode
# df_panoptes_point_extractor_r_exploded = df_panoptes_point_extractor_r.apply(pd.Series.explode)

# Explode the DataFrame based on these columns to get separate rows for each list element
# Make sure to perform the explode operation on both columns simultaneously to keep the x and y coordinates paired
df_panoptes_point_extractor_r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool0_x,...,data.frame0.T2_tool1_y,data.frame0.T2_tool0_x,data.frame0.T2_tool0_y,data.frame0.T2_tool2_x,data.frame0.T2_tool2_y,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,image_name,x,y
1,394132937,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T2,2022-02-02 19:19:04 UTC,72335168,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,EIG05-2_55.jpg,[],[]
3,394221102,07452636222fb135e92fd5bedbecd18c,1dcb5107d19f3fa3d0d40eb4fa5a32ac,20600,T2,2022-02-03 03:24:54 UTC,72335168,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,EIG05-2_55.jpg,[],[]
5,394417706,51ff8f6bdcfbc629ee64b0f03456f9be,73e55824a8121ac5c7e625d8644450fc,20600,T2,2022-02-03 22:37:12 UTC,72335168,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,EIG05-2_55.jpg,[],[]
7,394681993,1f1fbd60a47a8f619cab23b0367b305c,a0763c2de16d5ffb4cca7e97cfdba769,20600,T2,2022-02-05 00:53:57 UTC,72335168,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,EIG05-2_55.jpg,[],[]
9,394699098,b9dfe23678c9bbf754d04706a6e36269,a2f79791c7443eb1b6fec361a61f896e,20600,T2,2022-02-05 03:46:10 UTC,72335168,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,EIG05-2_55.jpg,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431765,399771935,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T2,2022-03-01 19:57:47 UTC,72339681,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,FPA04_31.jpg,[],[]
431767,403260922,8405b5369ddfd9c86ad032a0bc48dcbc,42e1de77e2ea77ceafd5c4926da78cb6,20600,T2,2022-03-17 16:57:25 UTC,72339681,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,FPA04_31.jpg,[],[]
431769,403414441,54ae6fcd86ece9f2ddceb66b12764b08,181ab00d3414bff0e41fbdb9379795e1,20600,T2,2022-03-18 13:15:20 UTC,72339681,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,FPA04_31.jpg,[],[]
431771,403489002,0a044a95880379be03b7f5800b7716b9,1fd5c346537644c93a0b7d87c436545b,20600,T2,2022-03-18 20:13:31 UTC,72339681,point_extractor_by_frame,4.1.0,,...,[],[],[],[],[],,,FPA04_31.jpg,[],[]


In [23]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor_r[
    ['classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
     'created_at', 'subject_id', "image_name",
     'x', 'y'
     ]].reset_index(drop=True)

df_panoptes_point_extractor_r

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
0,394132937,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T2,2022-02-02 19:19:04 UTC,72335168,EIG05-2_55.jpg,[],[]
1,394221102,07452636222fb135e92fd5bedbecd18c,1dcb5107d19f3fa3d0d40eb4fa5a32ac,20600,T2,2022-02-03 03:24:54 UTC,72335168,EIG05-2_55.jpg,[],[]
2,394417706,51ff8f6bdcfbc629ee64b0f03456f9be,73e55824a8121ac5c7e625d8644450fc,20600,T2,2022-02-03 22:37:12 UTC,72335168,EIG05-2_55.jpg,[],[]
3,394681993,1f1fbd60a47a8f619cab23b0367b305c,a0763c2de16d5ffb4cca7e97cfdba769,20600,T2,2022-02-05 00:53:57 UTC,72335168,EIG05-2_55.jpg,[],[]
4,394699098,b9dfe23678c9bbf754d04706a6e36269,a2f79791c7443eb1b6fec361a61f896e,20600,T2,2022-02-05 03:46:10 UTC,72335168,EIG05-2_55.jpg,[],[]
...,...,...,...,...,...,...,...,...,...,...
215619,399771935,691500ccebe2131f83809524df652f87,c11a32c827347926881e5e1db75cb701,20600,T2,2022-03-01 19:57:47 UTC,72339681,FPA04_31.jpg,[],[]
215620,403260922,8405b5369ddfd9c86ad032a0bc48dcbc,42e1de77e2ea77ceafd5c4926da78cb6,20600,T2,2022-03-17 16:57:25 UTC,72339681,FPA04_31.jpg,[],[]
215621,403414441,54ae6fcd86ece9f2ddceb66b12764b08,181ab00d3414bff0e41fbdb9379795e1,20600,T2,2022-03-18 13:15:20 UTC,72339681,FPA04_31.jpg,[],[]
215622,403489002,0a044a95880379be03b7f5800b7716b9,1fd5c346537644c93a0b7d87c436545b,20600,T2,2022-03-18 20:13:31 UTC,72339681,FPA04_31.jpg,[],[]


In [24]:
# explode the lists of marks per user into one row per mark
df_panoptes_point_extractor_r_ex = df_panoptes_point_extractor_r.apply(lambda x: x.explode() if x.name in ['x', 'y'] else x)

In [25]:
# images with no marks have NaN values in the 'merged_x' and 'merged_y' columns
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex.dropna(subset=['x', 'y'], how='all').sort_values(by=['user_id', 'subject_id', 'task', 'created_at'])
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
161321,397130915,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-15 19:21:11 UTC,72332826,EGI01-1-2_83.jpg,297.09903,102.134964
161330,397130915,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-15 19:21:11 UTC,72332826,EGI01-1-2_83.jpg,297.09903,102.134964
103907,401114209,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-03-08 00:53:47 UTC,72332844,EGI01-2-1_26.jpg,1384.466797,1027.737793
103917,401114209,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-03-08 00:53:47 UTC,72332844,EGI01-2-1_26.jpg,1384.466797,1027.737793
199047,397739195,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-18 20:16:43 UTC,72332945,EGI02-1_23.jpg,679.872864,338.704895
...,...,...,...,...,...,...,...,...,...,...
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,97.770393,117.218422
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,113.634987,186.909332
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,87.571724,167.078568
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,443.786987,124.978645


In [26]:
# cast x and y to int
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex_dropped.astype({'x': 'int32', 'y': 'int32'})
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
161321,397130915,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-15 19:21:11 UTC,72332826,EGI01-1-2_83.jpg,297,102
161330,397130915,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-15 19:21:11 UTC,72332826,EGI01-1-2_83.jpg,297,102
103907,401114209,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-03-08 00:53:47 UTC,72332844,EGI01-2-1_26.jpg,1384,1027
103917,401114209,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-03-08 00:53:47 UTC,72332844,EGI01-2-1_26.jpg,1384,1027
199047,397739195,d84d31ab467c59e8a3ba49030312072e,003d2347111646e2fe7ae94b67bde94b,20600,T2,2022-02-18 20:16:43 UTC,72332945,EGI02-1_23.jpg,679,338
...,...,...,...,...,...,...,...,...,...,...
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,97,117
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,113,186
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,87,167
108350,397662853,9432ca625cbcee771e54db4c895908d3,,20600,T2,2022-02-18 12:38:57 UTC,72373351,ESCG02-2_65.jpg,443,124


In [27]:
df_panoptes_point_extractor_r_ex_dropped.to_csv(config["flat_panoptes_points"], sep=",", index = False)

## Inspecting the results
Check the numbers for a single subject_id

In [28]:
### Looks the images in question

subject_id_1 = 47968423
subject_id_2 = 47969478
df_panoptes_point_extractor_r_ex_dropped[(df_panoptes_point_extractor_r_ex_dropped.subject_id == subject_id_2)]

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y


## Download images
iguanas-from-above-subjects_with_url.csv will be used to track which url was already downlaoded.

In [29]:
## save the file the extra columns we need for downloading.
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")


# read the modified csv
df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")
df_subjects

Unnamed: 0.1,Unnamed: 0,subject_id,project_id,workflow_id,subject_set_id,metadata,locations,classifications_count,retired_at,retirement_reason,created_at,updated_at,image_name,flight_code,url,filepath
0,190,47967468,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_08.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-15 19:06:16 UTC,classification_count,2020-07-18 20:38:14 UTC,2020-07-18 20:38:14 UTC,SFB01-3_08.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
1,191,47967469,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_15.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-10-28 19:25:18 UTC,classification_count,2020-07-18 20:38:17 UTC,2020-07-18 20:38:17 UTC,SFB01-3_15.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
2,192,47967470,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_27.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-14 10:07:19 UTC,classification_count,2020-07-18 20:38:18 UTC,2020-07-18 20:38:18 UTC,SFB01-3_27.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
3,193,47967471,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_28.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-09 10:36:02 UTC,classification_count,2020-07-18 20:38:20 UTC,2020-07-18 20:38:20 UTC,SFB01-3_28.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
4,194,47967472,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_34.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-18 20:44:36 UTC,classification_count,2020-07-18 20:38:22 UTC,2020-07-18 20:38:22 UTC,SFB01-3_34.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57833,58027,78965182,11905,22040.0,106640,"{""id"":""476"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-06-04 20:04:00 UTC,classification_count,2022-07-24 11:23:51 UTC,2022-07-24 11:23:51 UTC,ESCH02-2_80.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57834,58028,78965183,11905,22040.0,106640,"{""id"":""477"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",24,2023-08-30 16:43:06 UTC,classification_count,2022-07-24 11:23:51 UTC,2023-07-28 07:13:45 UTC,ESCH02-2_81.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57835,58029,78965184,11905,22040.0,106640,"{""id"":""478"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-07-14 13:55:55 UTC,classification_count,2022-07-24 11:23:52 UTC,2022-07-24 11:23:52 UTC,ESCH02-2_91.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57836,58030,78965185,11905,22040.0,106640,"{""id"":""479"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-07-20 04:37:58 UTC,classification_count,2022-07-24 11:23:53 UTC,2022-07-24 11:23:53 UTC,ESCH02-2_92.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,


In [30]:
# df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")

# downoaded_images_path = Path("./data/downloaded_images")
# downoaded_images_path.mkdir(exist_ok=True, parents=True)
# return_val = True
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# for index, row in df_subjects[df_subjects.workflow_id.isin([workflow_id_p1])].iterrows():
#     # Only download if necessary
#     if pd.isna(row.get("filepath")) or not row.get("filepath", False):
#         flight_code = row['flight_code']
#         url = row['url']
#         image_name = Path(row['image_name']).name
#         # Extract the filename from the URL and create a unique name using index
#         filename = downoaded_images_path.joinpath(f"{image_name}_{row['subject_id']}_{flight_code}.jpeg")
#         df_subjects.loc[index, 'filepath'] = filename
#         # Download the image
#         return_val = download_image(url, filename)
# 
#         # print(f"Downloaded {filename}")
#     if return_val == False:
#         print("there was a problem")
#         # break
        

In [31]:
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")

## Filter for the expert gold standard


In [32]:
gold_standard_path = config["goldstandard_data"]
df_gold_standard = pd.read_csv(gold_standard_path, sep=";")
df_gold_standard

Unnamed: 0,subspecies,island,site_name,subject_group,image_name,subject_id,presence_absence,count_male-lek,count_male-no-lek,count_others,count_partial,count_total,quality,condition,comment
0,A. c. venustissimus,Española,Gardner Islet,Gardner Islet,EGI01-1-2_120.jpg,72332801,N,0,0,0,0,0,,,
1,A. c. venustissimus,Española,Gardner Islet,Gardner Islet,EGI01-1-2_121.jpg,72332802,N,0,0,0,0,0,,,
2,A. c. venustissimus,Española,Gardner Islet,Gardner Islet,EGI01-2-2_116.jpg,72332876,N,0,0,0,0,0,,,
3,A. c. venustissimus,Española,Gardner Islet,Gardner Islet,EGI02-2_114.jpg,72332972,N,0,0,0,0,0,,,
4,A. c. venustissimus,Española,Gardner Islet,Gardner Islet,EGI02-2_36.jpg,72332999,N,0,0,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,A. c. venustissimus,Floreana,South Coast J,South Coast J,FSCJ02-2_243.jpg,72341576,Y,0,0,3,1,3,Good,Hard,
452,A. c. venustissimus,Floreana,South Coast J,South Coast J,FSCJ02-2_251.jpg,72341585,Y,0,0,1,0,1,Good,Hard,
453,A. c. venustissimus,Floreana,South Coast J,South Coast J,FSCJ02-2_269.jpg,72341593,Y,0,0,5,0,5,Bad,Visible,
454,A. c. venustissimus,Floreana,South Coast J,South Coast J,FSCJ02-2_59.jpg,72341609,N,0,0,0,0,0,,,


In [33]:
### Use the custom zooniverse extractor

In [34]:
# from zooniverse.utils.data_format import data_prep
# use_gold_standard_subset = "expert_goldstandard"
# ds_stats = data_prep(phase_tag=phase_tag, 
#                         output_path=output_path, 
#                         input_path=input_path,
#                         filter_combination=use_gold_standard_subset, 
#                         config=config)
#
# ds_stats

[32m2024-04-12 12:05:31.291[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 1st launch[0m
[32m2024-04-12 12:05:41.418[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Main Workflow[0m
[32m2024-04-12 12:05:43.330[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: survey tool 061417[0m
[32m2024-04-12 12:05:45.477[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 2nd launch[0m
[32m2024-04-12 12:05:51.721[0m | [1mINFO    [0m | [36mzooniverse.utils.data_format[0m:[36mread_zooniverse_annotations_v2[0m:[36m179[0m - [1mfound a new phase tag: Iguanas 3rd launch[0m
[32m2024-04-12 12:0

Unnamed: 0,filename,images
0,flat_dataset_Iguanas 2nd launch.csv,4574
1,2nd launch_without_prefix,455
2,expert-GS-2ndphase.csv,456
3,2-T2-GS-results-5th-0s.csv,169
4,flat_dataset_filtered_Iguanas 2nd launch.csv,169
