# Using the Panoptes Aggregation Tool from Zooniverse
https://aggregation-caesar.zooniverse.org/Scripts.html#scripts


In [1]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

### use either the subset of the subset
#phase_tag = "Iguanas 1st launch"
#data_folder = "./data/phase_1"

# phase_tag = "Iguanas 2nd launch"
# data_folder = "./data/phase_2"

phase_tag = "Iguanas 3rd launch"
data_folder = "./data/phase_3"

workflow_id_p1 = 14370.0
workflow_id_p2 = 20600.0
workflow_id_p3 = 22040.0

input_path = Path("/Users/christian/data/zooniverse")

# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis
output_path = Path("/Users/christian/data/zooniverse/2024_04_12_analysis").joinpath(phase_tag).resolve()

output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)


config


{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-3rdphase_renamed.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/3-T2-GS-results-5th-0s.csv'),
 'image_source': None,
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/yes_no_dataset_Iguanas 3rd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/flat_dataset_Iguanas 3rd launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/flat_panoptes_points_Iguanas 3rd launch.csv'),
 'panoptes_question': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/panoptes_question

# Look into the subjects file
This contains the mappings from the subject_id to the image file

In [2]:
# read the original file
df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")



  df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


In [3]:
# filter the subjects for only the images in the three phases

df_subjects = df_subjects[df_subjects.workflow_id.isin([workflow_id_p1, workflow_id_p2, workflow_id_p3])]


In [4]:
# inspect the metadata
import json
def get_json_keys(json_str):
    try:
        json_obj = json.loads(json_str)
        return list(json_obj.keys())
    except json.JSONDecodeError:
        return []

# Apply the function to each row in the metadata column and collect all keys
all_keys = df_subjects['locations'].apply(get_json_keys)

# Flatten the list of lists and get unique keys
unique_keys = set([key for sublist in all_keys for key in sublist])

print(unique_keys)

{'0'}


Clean up the subjects file

In [5]:
df_subjects["image_name"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('Image_name') 
                                        or json.loads(x).get('image_name') 
                                        or json.loads(x).get('Filename')).sort_values(ascending=True)

# 'site', 'flight', 'Flight', 'Site', 'flight_code' depict the same
df_subjects["flight_code"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('flight_code') 
                                        or json.loads(x).get('site') 
                                        or json.loads(x).get('flight')
                                        or json.loads(x).get('Flight')
                                        or json.loads(x).get('Site')).sort_values(ascending=True)

df_subjects["url"] = df_subjects['locations'].apply(lambda x: json.loads(x)["0"])
df_subjects["filepath"] = None

In [6]:
from loguru import logger
from time import sleep
# helper function to download the images
import requests

def download_image(url, filename):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                file.write(response.content)
            return True
        else:
            logger.warning(f"Failed to download {url}")
            logger.error(response)
            sleep(5)
            return False
    except Exception as e:
        logger.error(e)
        sleep(5)
        return False



# Panoptes Data Extraction from Zooniverse
## Panoptes config
### Create the configuration files automatically
The configurations were changed to custom workflow versions.

In [7]:
# create a configuration file from the workflow
#!mkdir ./data/phase_1
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 14370 --min_version 0 --max_version 142.245 -d ./data/phase_1
# 
#!mkdir ./data/phase_2
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 20600 --min_version 0 --max_version 94.166 -d ./data/phase_2
# 
#!mkdir ./data/phase_3
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 22040 --min_version 0 --max_version 9.63 -d ./data/phase_3

## Extract the data

In [8]:
# phase 1
if data_folder == "./data/phase_1":
    !mkdir ./data/phase_1/V121.144
    !mkdir ./data/phase_1/V134.236
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V121.144 ./data/phase_1/Extractor_config_workflow_14370_V121.144.yaml
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V134.236 ./data/phase_1/Extractor_config_workflow_14370_V134.236-1.yaml


In [9]:
if data_folder == "./data/phase_2":
    # phase 2
    
    !mkdir ./data/phase_2/V89.162
    !mkdir ./data/phase_2/V93.166
    !mkdir ./data/phase_2/V94.166 
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V89.162 ./data/phase_2/Extractor_config_workflow_20600_V89.162.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V93.166 ./data/phase_2/Extractor_config_workflow_20600_V93.166.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V94.166 ./data/phase_2/Extractor_config_workflow_20600_V94.166.yaml



In [10]:
if data_folder == "./data/phase_3":
    !mkdir ./data/phase_3/V7.63
    !mkdir ./data/phase_3/V9.63
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V7.63 ./data/phase_3/Extractor_config_workflow_22040_V7.63.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V9.63 ./data/phase_3/Extractor_config_workflow_22040_V9.63.yaml

mkdir: ./data/phase_3/V7.63: File exists
mkdir: ./data/phase_3/V9.63: File exists
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:02
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:03:44


### Merge the single point and questions extractions

In [11]:
# phase 1
if data_folder == "./data/phase_1":
    df_panoptes_point_extractor_1 = pd.read_csv(f"./data/phase_1/V121.144/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"./data/phase_1/V134.236/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "121.144"
    df_panoptes_point_extractor_2["workflow_version"] = "134.236"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V121.144/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V134.236/question_extractor_extractions.csv", sep=",")
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)
    
    df_panoptes_point_extractor

In [12]:
# # phase 2
if data_folder == "./data/phase_2":
    # read the rectangles annotations too there
    df_panotes_rectangle_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/shape_extractor_rectangle_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V93.166/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_3 = pd.read_csv(f"{data_folder}/V94.166/point_extractor_by_frame_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1["workflow_version"] = "89.162"
    df_panoptes_point_extractor_2["workflow_version"] = "93.166"
    df_panoptes_point_extractor_3["workflow_version"] = "94.166"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V89.162/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V93.166/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_3 = pd.read_csv(f"{data_folder}/V94.166/question_extractor_extractions.csv", sep=",")
    
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2, df_panoptes_question_3], axis=0)

    df_panotes_rectangle_extractor_1

In [13]:
if data_folder == "./data/phase_3":
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V7.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V9.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "7.63"
    df_panoptes_point_extractor_2["workflow_version"] = "9.63"

    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V7.63/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V9.63/question_extractor_extractions.csv", sep=",")

    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)


In [14]:
df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T2_tool1_x,...,data.frame0.T4_tool7_y,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,data.frame0.T4_tool5_x,data.frame0.T4_tool5_y,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y,data.frame0.T4_tool4_x,data.frame0.T4_tool4_y,workflow_version
0,428441443,ANDREAVARELA89,1983945.0,22040,T2,2022-07-22 14:32:53 UTC,78861918,point_extractor_by_frame,4.1.0,[557.1560668945312],...,,,,,,,,,,7.63
1,428441443,ANDREAVARELA89,1983945.0,22040,T4,2022-07-22 14:32:53 UTC,78861918,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,7.63
2,428463469,ANDREAVARELA89,1983945.0,22040,T2,2022-07-22 16:29:00 UTC,78861913,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,7.63
3,428463469,ANDREAVARELA89,1983945.0,22040,T4,2022-07-22 16:29:00 UTC,78861913,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,7.63
4,428803453,Nomad_Purple,1312868.0,22040,T2,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,7.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1425241,511354194,JenniLynn11,2642909.0,22040,T4,2023-09-16 23:41:20 UTC,78922610,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,9.63
1425242,511354201,JenniLynn11,2642909.0,22040,T2,2023-09-16 23:41:27 UTC,78922599,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,9.63
1425243,511354201,JenniLynn11,2642909.0,22040,T4,2023-09-16 23:41:27 UTC,78922599,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,9.63
1425244,511354219,JenniLynn11,2642909.0,22040,T2,2023-09-16 23:41:43 UTC,78922611,point_extractor_by_frame,4.1.0,,...,,,,,,,,,,9.63


In [15]:
# join the image name from the subjects file
df_panoptes_point_extractor = df_panoptes_point_extractor.merge(df_subjects[["subject_id", "image_name"]], left_on="subject_id", right_on="subject_id")
df_panoptes_point_extractor = df_panoptes_point_extractor[df_panoptes_point_extractor.subject_id.isin(df_subjects.subject_id)]

df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T2_tool1_x,...,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,data.frame0.T4_tool5_x,data.frame0.T4_tool5_y,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y,data.frame0.T4_tool4_x,data.frame0.T4_tool4_y,workflow_version,image_name
0,428803453,Nomad_Purple,1312868.0,22040,T2,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,7.63,PCIE13-2-2_83.jpg
1,428803453,Nomad_Purple,1312868.0,22040,T4,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,7.63,PCIE13-2-2_83.jpg
2,430711757,kmorrisseyukyahoo.co.uk,1325881.0,22040,T2,2022-08-04 14:48:04 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
3,430711757,kmorrisseyukyahoo.co.uk,1325881.0,22040,T4,2022-08-04 14:48:04 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
4,432408154,Barti,2497726.0,22040,T2,2022-08-13 20:33:37 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440565,506596846,not-logged-in-43473667caa941b5af21,,22040,T4,2023-08-16 17:59:50 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440566,506724429,not-logged-in-27af61c94cd10d966386,,22040,T2,2023-08-17 13:41:27 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440567,506724429,not-logged-in-27af61c94cd10d966386,,22040,T4,2023-08-17 13:41:27 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440568,506829613,not-logged-in-d73d63da97c4d21278f3,,22040,T2,2023-08-18 03:50:13 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg


## Anonymise the data

In [16]:
from hashlib import blake2b

df_panoptes_point_extractor["user_id"] = df_panoptes_point_extractor['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_point_extractor['user_name'] = df_panoptes_point_extractor['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

df_panoptes_question["user_id"] = df_panoptes_question['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_question['user_name'] = df_panoptes_question['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

In [17]:
df_panoptes_point_extractor

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T2_tool1_x,...,data.frame0.T2_tool3_x,data.frame0.T2_tool3_y,data.frame0.T4_tool5_x,data.frame0.T4_tool5_y,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y,data.frame0.T4_tool4_x,data.frame0.T4_tool4_y,workflow_version,image_name
0,428803453,6d90c675de24df885cc880eab99a0cbe,5c78309a3a0e2fb4505af64d71e7a83d,22040,T2,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,7.63,PCIE13-2-2_83.jpg
1,428803453,6d90c675de24df885cc880eab99a0cbe,5c78309a3a0e2fb4505af64d71e7a83d,22040,T4,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,7.63,PCIE13-2-2_83.jpg
2,430711757,c46159a74cc1f058c9cee2575138ae99,2b43a5981ef5e0c9345e32317105e429,22040,T2,2022-08-04 14:48:04 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
3,430711757,c46159a74cc1f058c9cee2575138ae99,2b43a5981ef5e0c9345e32317105e429,22040,T4,2022-08-04 14:48:04 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
4,432408154,070b22b65ade1189e49fdd4074ffd19e,c902c528ae34dd70036e12f944e8dd28,22040,T2,2022-08-13 20:33:37 UTC,78961556,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,PCIE13-2-2_83.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440565,506596846,0f233d137b8f4932724d6d6ee47da149,,22040,T4,2023-08-16 17:59:50 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440566,506724429,4d9c49f508dd6e5a4bb6d87a0b7efd13,,22040,T2,2023-08-17 13:41:27 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440567,506724429,4d9c49f508dd6e5a4bb6d87a0b7efd13,,22040,T4,2023-08-17 13:41:27 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg
1440568,506829613,539dc3219172652cf6a33839a7485421,,22040,T2,2023-08-18 03:50:13 UTC,78923883,point_extractor_by_frame,4.1.0,,...,,,,,,,,,9.63,GWA01-1_180.jpg


## Determine the amount of yes Answers for "Is there an Iguana"

In [18]:
df_panoptes_question

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.yes,data.aggregation_version,data.no
0,428441443,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,22040,T0,2022-07-22 14:32:53 UTC,78861918,question_extractor,1.0,4.1.0,
1,428463469,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,22040,T0,2022-07-22 16:29:00 UTC,78861913,question_extractor,1.0,4.1.0,
2,428803453,6d90c675de24df885cc880eab99a0cbe,5c78309a3a0e2fb4505af64d71e7a83d,22040,T0,2022-07-25 08:44:40 UTC,78961556,question_extractor,,4.1.0,1.0
3,428804329,d881a54ceca557c3fcd4c41a779bdc79,8824af3b56d4acd5574a1a5e239d0348,22040,T0,2022-07-25 08:57:30 UTC,78939084,question_extractor,,4.1.0,1.0
4,428830592,3e3f36ae551c3d0c2d21da11947419ac,fb780c1c46af055b562fae4f15c7207d,22040,T0,2022-07-25 13:30:14 UTC,78957387,question_extractor,,4.1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
712618,511354180,5db78ccea92857b77aeb4228f75e2030,e2b4fd6b6d39bebba0a2185fbc78edba,22040,T0,2023-09-16 23:41:11 UTC,78922581,question_extractor,,4.1.0,1.0
712619,511354184,5db78ccea92857b77aeb4228f75e2030,e2b4fd6b6d39bebba0a2185fbc78edba,22040,T0,2023-09-16 23:41:14 UTC,78922582,question_extractor,,4.1.0,1.0
712620,511354194,5db78ccea92857b77aeb4228f75e2030,e2b4fd6b6d39bebba0a2185fbc78edba,22040,T0,2023-09-16 23:41:20 UTC,78922610,question_extractor,,4.1.0,1.0
712621,511354201,5db78ccea92857b77aeb4228f75e2030,e2b4fd6b6d39bebba0a2185fbc78edba,22040,T0,2023-09-16 23:41:27 UTC,78922599,question_extractor,,4.1.0,1.0


In [19]:
df_panoptes_question_r = df_panoptes_question[df_panoptes_question.task == "T0"][["subject_id", "data.no", "data.yes"]].groupby("subject_id").sum()

df_panoptes_question_r = df_panoptes_question_r.reset_index()
df_panoptes_question_r = df_panoptes_question_r[df_panoptes_question_r.subject_id.isin(df_subjects.subject_id)]
df_panoptes_question_r

Unnamed: 0,subject_id,data.no,data.yes
2,78921848,24.0,0.0
3,78921849,31.0,0.0
4,78921850,29.0,2.0
5,78921851,30.0,1.0
6,78921852,31.0,0.0
...,...,...,...
24365,78965182,28.0,3.0
24366,78965183,5.0,20.0
24367,78965184,30.0,1.0
24368,78965185,13.0,18.0


In [20]:
df_panoptes_question_r.to_csv(output_path / config["panoptes_question"], index = False)

## Get the Point Marks Analysis Ready

Filter for T2 only

In [21]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor[
    (df_panoptes_point_extractor.task == "T2")
]
df_panoptes_point_extractor_r.columns

Index(['classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
       'created_at', 'subject_id', 'extractor', 'data.aggregation_version',
       'data.frame0.T2_tool1_x', 'data.frame0.T2_tool1_y',
       'data.frame0.T4_tool0_x', 'data.frame0.T4_tool0_y',
       'data.frame0.T2_tool0_x', 'data.frame0.T2_tool0_y',
       'data.frame0.T2_tool2_x', 'data.frame0.T2_tool2_y',
       'data.frame0.T4_tool3_x', 'data.frame0.T4_tool3_y',
       'data.frame0.T4_tool2_x', 'data.frame0.T4_tool2_y',
       'data.frame0.T4_tool1_x', 'data.frame0.T4_tool1_y',
       'data.frame0.T4_tool7_x', 'data.frame0.T4_tool7_y',
       'data.frame0.T2_tool3_x', 'data.frame0.T2_tool3_y',
       'data.frame0.T4_tool5_x', 'data.frame0.T4_tool5_y',
       'data.frame0.T4_tool6_x', 'data.frame0.T4_tool6_y',
       'data.frame0.T4_tool4_x', 'data.frame0.T4_tool4_y', 'workflow_version',
       'image_name'],
      dtype='object')

### Which tool is which now?
| Tool Name               | Classification                               |
|-------------------------|----------------------------------------------|
| data.frame0.T2_tool0_x  | Adult Male in a lek                          |
| data.frame0.T2_tool1_x  | Adult Male alone                             |
| data.frame0.T2_tool2_x  | Others (females, young males, juveniles)     |
| data.frame0.T2_tool3_x  | Partial iguana                               |
| data.frame0.T2_tool4_x  | Could be an iguana, not sure                 |

Is "Could be an iguana, not sure" and "Partial Iguana" are omitted.


In [22]:
# create a flat structure from the nested marks over multiple columns from that.
from ast import literal_eval

columns_keep_x = ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x']
columns_keep_y = ['data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y']

for col in columns_keep_x + columns_keep_y:
    df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])

# Merge the lists in 'x' and 'y' coordinates
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r[columns_keep_y].values.tolist()

# Flatten the lists in each row for 'x' and 'y'
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r['x'].apply(lambda x: [item for sublist in x for item in sublist])
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r['y'].apply(lambda x: [item for sublist in x for item in sublist])

# Explode the DataFrame to separate rows for each x, y pair
# Note: This requires pandas >= 0.25 for simultaneous explode
# df_panoptes_point_extractor_r_exploded = df_panoptes_point_extractor_r.apply(pd.Series.explode)

# Explode the DataFrame based on these columns to get separate rows for each list element
# Make sure to perform the explode operation on both columns simultaneously to keep the x and y coordinates paired
df_panoptes_point_extractor_r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T2_tool1_x,...,data.frame0.T4_tool5_x,data.frame0.T4_tool5_y,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y,data.frame0.T4_tool4_x,data.frame0.T4_tool4_y,workflow_version,image_name,x,y
0,428803453,6d90c675de24df885cc880eab99a0cbe,5c78309a3a0e2fb4505af64d71e7a83d,22040,T2,2022-07-25 08:44:40 UTC,78961556,point_extractor_by_frame,4.1.0,[],...,,,,,,,7.63,PCIE13-2-2_83.jpg,[],[]
2,430711757,c46159a74cc1f058c9cee2575138ae99,2b43a5981ef5e0c9345e32317105e429,22040,T2,2022-08-04 14:48:04 UTC,78961556,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,PCIE13-2-2_83.jpg,[],[]
4,432408154,070b22b65ade1189e49fdd4074ffd19e,c902c528ae34dd70036e12f944e8dd28,22040,T2,2022-08-13 20:33:37 UTC,78961556,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,PCIE13-2-2_83.jpg,[],[]
6,438063583,6be90f7c5906d966fa46278cab681c33,b88ed174577219f7bc20ed439b5b8349,22040,T2,2022-09-12 14:23:10 UTC,78961556,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,PCIE13-2-2_83.jpg,[],[]
8,444573170,0814d5106f0eb5e30bef5e461b5bc509,56a1977f012ec126a5e97415b0e8b7c1,22040,T2,2022-10-14 23:11:19 UTC,78961556,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,PCIE13-2-2_83.jpg,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440560,504670840,edf0fdcbbabe07f3ed93abf632c2e0c0,b59d03b689c8f933eed4ace8f7931639,22040,T2,2023-08-04 14:32:45 UTC,78923883,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,GWA01-1_180.jpg,[],[]
1440562,505765122,4c7fe317496db725f9f7f44a9e7960fc,a9f82bdcd6b06c824928a4102a669c06,22040,T2,2023-08-11 02:32:51 UTC,78923883,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,GWA01-1_180.jpg,[],[]
1440564,506596846,0f233d137b8f4932724d6d6ee47da149,,22040,T2,2023-08-16 17:59:50 UTC,78923883,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,GWA01-1_180.jpg,[],[]
1440566,506724429,4d9c49f508dd6e5a4bb6d87a0b7efd13,,22040,T2,2023-08-17 13:41:27 UTC,78923883,point_extractor_by_frame,4.1.0,[],...,,,,,,,9.63,GWA01-1_180.jpg,[],[]


In [23]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor_r[
    ['classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
     'created_at', 'subject_id', "image_name",
     'x', 'y'
     ]].reset_index(drop=True)

df_panoptes_point_extractor_r

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
0,428803453,6d90c675de24df885cc880eab99a0cbe,5c78309a3a0e2fb4505af64d71e7a83d,22040,T2,2022-07-25 08:44:40 UTC,78961556,PCIE13-2-2_83.jpg,[],[]
1,430711757,c46159a74cc1f058c9cee2575138ae99,2b43a5981ef5e0c9345e32317105e429,22040,T2,2022-08-04 14:48:04 UTC,78961556,PCIE13-2-2_83.jpg,[],[]
2,432408154,070b22b65ade1189e49fdd4074ffd19e,c902c528ae34dd70036e12f944e8dd28,22040,T2,2022-08-13 20:33:37 UTC,78961556,PCIE13-2-2_83.jpg,[],[]
3,438063583,6be90f7c5906d966fa46278cab681c33,b88ed174577219f7bc20ed439b5b8349,22040,T2,2022-09-12 14:23:10 UTC,78961556,PCIE13-2-2_83.jpg,[],[]
4,444573170,0814d5106f0eb5e30bef5e461b5bc509,56a1977f012ec126a5e97415b0e8b7c1,22040,T2,2022-10-14 23:11:19 UTC,78961556,PCIE13-2-2_83.jpg,[],[]
...,...,...,...,...,...,...,...,...,...,...
720280,504670840,edf0fdcbbabe07f3ed93abf632c2e0c0,b59d03b689c8f933eed4ace8f7931639,22040,T2,2023-08-04 14:32:45 UTC,78923883,GWA01-1_180.jpg,[],[]
720281,505765122,4c7fe317496db725f9f7f44a9e7960fc,a9f82bdcd6b06c824928a4102a669c06,22040,T2,2023-08-11 02:32:51 UTC,78923883,GWA01-1_180.jpg,[],[]
720282,506596846,0f233d137b8f4932724d6d6ee47da149,,22040,T2,2023-08-16 17:59:50 UTC,78923883,GWA01-1_180.jpg,[],[]
720283,506724429,4d9c49f508dd6e5a4bb6d87a0b7efd13,,22040,T2,2023-08-17 13:41:27 UTC,78923883,GWA01-1_180.jpg,[],[]


In [24]:
# explode the lists of marks per user into one row per mark
df_panoptes_point_extractor_r_ex = df_panoptes_point_extractor_r.apply(lambda x: x.explode() if x.name in ['x', 'y'] else x)

In [25]:
# images with no marks have NaN values in the 'merged_x' and 'merged_y' columns
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex.dropna(subset=['x', 'y'], how='all').sort_values(by=['user_id', 'subject_id', 'task', 'created_at'])
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
22676,430180580,096835c7b506ed9511c344f420d01f74,002400ef36f94c5e2a6ccc49859923d8,22040,T2,2022-08-01 20:06:32 UTC,78963883,FPA03_54.jpg,449.09314,399.687042
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,511.535278,594.941162
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,349.569031,711.556946
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,255.988495,756.907471
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,9.079877,245.093994
...,...,...,...,...,...,...,...,...,...,...
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,267.777679,31.393913
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,221.41127,52.645176
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,198.228073,31.393913
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,582.840454,152.431137


In [26]:
# cast x and y to int
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex_dropped.astype({'x': 'int32', 'y': 'int32'})
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y
22676,430180580,096835c7b506ed9511c344f420d01f74,002400ef36f94c5e2a6ccc49859923d8,22040,T2,2022-08-01 20:06:32 UTC,78963883,FPA03_54.jpg,449,399
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,511,594
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,349,711
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,255,756
363194,501567924,9c249e3b4c1adc212dc055fabe626cfd,00346ebf6ae91002059d21fa7090e46b,22040,T2,2023-07-16 21:09:04 UTC,78925071,GWB01-1_218.jpg,9,245
...,...,...,...,...,...,...,...,...,...,...
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,267,31
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,221,52
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,198,31
462758,494172012,fab341928dc32883a4e5ffcbf12977af,,22040,T2,2023-06-06 12:35:40 UTC,78965185,ESCH02-2_92.jpg,582,152


In [27]:
df_panoptes_point_extractor_r_ex_dropped.to_csv(config["flat_panoptes_points"], sep=",", index = False)

## Inspecting the results
Check the numbers for a single subject_id

In [28]:
### Looks the images in question

subject_id_1 = 47968423
subject_id_2 = 47969478
df_panoptes_point_extractor_r_ex_dropped[(df_panoptes_point_extractor_r_ex_dropped.subject_id == subject_id_2)]

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,image_name,x,y


## Download images
iguanas-from-above-subjects_with_url.csv will be used to track which url was already downlaoded.

In [29]:
## save the file the extra columns we need for downloading.
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")


# read the modified csv
df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")
df_subjects

Unnamed: 0.1,Unnamed: 0,subject_id,project_id,workflow_id,subject_set_id,metadata,locations,classifications_count,retired_at,retirement_reason,created_at,updated_at,image_name,flight_code,url,filepath
0,190,47967468,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_08.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-15 19:06:16 UTC,classification_count,2020-07-18 20:38:14 UTC,2020-07-18 20:38:14 UTC,SFB01-3_08.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
1,191,47967469,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_15.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-10-28 19:25:18 UTC,classification_count,2020-07-18 20:38:17 UTC,2020-07-18 20:38:17 UTC,SFB01-3_15.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
2,192,47967470,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_27.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-14 10:07:19 UTC,classification_count,2020-07-18 20:38:18 UTC,2020-07-18 20:38:18 UTC,SFB01-3_27.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
3,193,47967471,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_28.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-09 10:36:02 UTC,classification_count,2020-07-18 20:38:20 UTC,2020-07-18 20:38:20 UTC,SFB01-3_28.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
4,194,47967472,11905,14370.0,86008,"{""site"":""SFB"",""image_name"":""SFB01-3_34.jpg"",""s...","{""0"":""https://panoptes-uploads.zooniverse.org/...",20,2020-11-18 20:44:36 UTC,classification_count,2020-07-18 20:38:22 UTC,2020-07-18 20:38:22 UTC,SFB01-3_34.jpg,SFB,https://panoptes-uploads.zooniverse.org/subjec...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57833,58027,78965182,11905,22040.0,106640,"{""id"":""476"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-06-04 20:04:00 UTC,classification_count,2022-07-24 11:23:51 UTC,2022-07-24 11:23:51 UTC,ESCH02-2_80.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57834,58028,78965183,11905,22040.0,106640,"{""id"":""477"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",24,2023-08-30 16:43:06 UTC,classification_count,2022-07-24 11:23:51 UTC,2023-07-28 07:13:45 UTC,ESCH02-2_81.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57835,58029,78965184,11905,22040.0,106640,"{""id"":""478"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-07-14 13:55:55 UTC,classification_count,2022-07-24 11:23:52 UTC,2022-07-24 11:23:52 UTC,ESCH02-2_91.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,
57836,58030,78965185,11905,22040.0,106640,"{""id"":""479"",""set"":""SouthCoastH"",""site"":""SouthC...","{""0"":""https://panoptes-uploads.zooniverse.org/...",30,2023-07-20 04:37:58 UTC,classification_count,2022-07-24 11:23:53 UTC,2022-07-24 11:23:53 UTC,ESCH02-2_92.jpg,SouthCoastH,https://panoptes-uploads.zooniverse.org/subjec...,


In [30]:
# df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")

# downoaded_images_path = Path("./data/downloaded_images")
# downoaded_images_path.mkdir(exist_ok=True, parents=True)
# return_val = True
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# for index, row in df_subjects[df_subjects.workflow_id.isin([workflow_id_p1])].iterrows():
#     # Only download if necessary
#     if pd.isna(row.get("filepath")) or not row.get("filepath", False):
#         flight_code = row['flight_code']
#         url = row['url']
#         image_name = Path(row['image_name']).name
#         # Extract the filename from the URL and create a unique name using index
#         filename = downoaded_images_path.joinpath(f"{image_name}_{row['subject_id']}_{flight_code}.jpeg")
#         df_subjects.loc[index, 'filepath'] = filename
#         # Download the image
#         return_val = download_image(url, filename)
# 
#         # print(f"Downloaded {filename}")
#     if return_val == False:
#         print("there was a problem")
#         # break
        

In [31]:
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")