# Using the Panoptes Aggregation Tool from Zooniverse
https://aggregation-caesar.zooniverse.org/Scripts.html#scripts


In [1]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

### Select the phase to analyse: keep exactly one phase_tag / data_folder pair active.
### Later cells compare `data_folder` against these literals to decide what to run.
#phase_tag = "Iguanas 1st launch"
#data_folder = "./data/phase_1"

# phase_tag = "Iguanas 2nd launch"
# data_folder = "./data/phase_2"

phase_tag = "Iguanas 3rd launch"
data_folder = "./data/phase_3"

# Workflow ids of the three phases; used further down to filter the subjects export.
workflow_id_p1 = 14370.0
workflow_id_p2 = 20600.0
workflow_id_p3 = 22040.0

# Root folder handed to get_config as `input_path`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_path = Path("/Users/christian/data/zooniverse")

# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis
# Results of this run go into a sub-folder named after the selected phase.
output_path = Path("/Users/christian/data/zooniverse/2024_04_12_analysis").joinpath(phase_tag).resolve()

# Folder for plots; created together with any missing parents.
output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)

# Project helper that returns the dict of input/output file locations used below.
config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)


config


{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-3rdphase_renamed.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/3-T2-GS-results-5th-0s.csv'),
 'image_source': None,
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/yes_no_dataset_Iguanas 3rd launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/flat_dataset_Iguanas 3rd launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/flat_panoptes_points_Iguanas 3rd launch.csv'),
 'panoptes_question': PosixPath('/Users/christian/data/zooniverse/2024_04_12_analysis/Iguanas 3rd launch/panoptes_question

# Look into the subjects file
This contains the mappings from the subject_id to the image file

In [2]:
# Read the unmodified subjects export; it is filtered and extended with extra
# columns in the following cells.
# NOTE(review): this relative path is independent of `input_path` defined above —
# confirm the notebook is started from the folder that contains ./data.
df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")



  df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


In [3]:
# Keep only the subjects that belong to one of the three phase workflows.
# `.copy()` detaches the result from the unfiltered frame: later cells add
# columns to df_subjects, and writing to a filtered slice would trigger
# pandas' SettingWithCopyWarning.
phase_workflow_ids = [workflow_id_p1, workflow_id_p2, workflow_id_p3]
df_subjects = df_subjects[df_subjects.workflow_id.isin(phase_workflow_ids)].copy()


In [4]:
# inspect the metadata
import json
def get_json_keys(json_str):
    """Return the top-level keys of a JSON object given as text.

    Args:
        json_str: JSON document as ``str``/``bytes``. Other values (for example
            the NaN pandas uses for empty CSV cells) are tolerated.

    Returns:
        List of the object's keys in document order. An empty list is returned
        when the input is not valid JSON, is not text at all, or decodes to
        something other than a JSON object (e.g. a list or a number).
    """
    try:
        json_obj = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: malformed text; TypeError: non-text input such as NaN/None.
        return []
    if not isinstance(json_obj, dict):
        # Only JSON objects have keys.
        return []
    return list(json_obj.keys())

# Collect the top-level JSON keys of every row of the `locations` column
all_keys = df_subjects['locations'].apply(get_json_keys)

# Deduplicate the keys across all rows
unique_keys = {name for row_keys in all_keys for name in row_keys}

print(unique_keys)

{'0'}


Clean up the subjects file

In [5]:
def _first_truthy(mapping, keys):
    """Return the first truthy ``mapping.get(key)`` over ``keys``.

    Mirrors a chained ``a or b or c``: if no looked-up value is truthy, the
    value found for the last key is returned (``None`` when that key is absent).
    """
    value = None
    for key in keys:
        value = mapping.get(key)
        if value:
            break
    return value


# Parse each metadata JSON document once; both derived columns reuse the result.
subject_metadata = df_subjects['metadata'].apply(json.loads)

# The image name is looked up under several alternative keys, in this order.
# (No sorting here: column assignment aligns on the index, so sorting the
# values beforehand would not change the stored column.)
df_subjects["image_name"] = subject_metadata.apply(
    lambda meta: _first_truthy(meta, ('Image_name', 'image_name', 'Filename')))

# 'site', 'flight', 'Flight', 'Site', 'flight_code' depict the same
df_subjects["flight_code"] = subject_metadata.apply(
    lambda meta: _first_truthy(meta, ('flight_code', 'site', 'flight', 'Flight', 'Site')))

# Each `locations` entry is a JSON object; the value stored under key "0" is used as the URL.
df_subjects["url"] = df_subjects['locations'].apply(lambda x: json.loads(x)["0"])
# Placeholder column for the local file path of the downloaded image.
df_subjects["filepath"] = None

In [6]:
from loguru import logger
from time import sleep
# helper function to download the images
import requests

def download_image(url, filename, timeout=30):
    """Download ``url`` and store the response body in ``filename``.

    Args:
        url: Address of the image to fetch.
        filename: Path of the file to write (opened in binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        ``True`` if the server answered with status 200 and the file was
        written, ``False`` otherwise. Failures are logged and followed by a
        5 second pause before returning.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            logger.warning(f"Failed to download {url}")
            logger.error(response)
            sleep(5)
            return False
        with open(filename, 'wb') as file:
            file.write(response.content)
        return True
    except Exception as e:
        # Best effort: network errors, timeouts and file errors are logged, not raised.
        logger.error(e)
        sleep(5)
        return False



# Panoptes Data Extraction from Zooniverse
## Panoptes config
### Create the configuration files automatically
The configurations were changed to custom workflow versions.

In [7]:
# create a configuration file from the workflow
#!mkdir ./data/phase_1
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 14370 --min_version 0 --max_version 142.245 -d ./data/phase_1
# 
#!mkdir ./data/phase_2
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 20600 --min_version 0 --max_version 94.166 -d ./data/phase_2
# 
#!mkdir ./data/phase_3
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 22040 --min_version 0 --max_version 9.63 -d ./data/phase_3

## Extract the data

In [8]:
# phase 1
if data_folder == "./data/phase_1":
    !mkdir ./data/phase_1/V121.144
    !mkdir ./data/phase_1/V134.236
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V121.144 ./data/phase_1/Extractor_config_workflow_14370_V121.144.yaml
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V134.236 ./data/phase_1/Extractor_config_workflow_14370_V134.236-1.yaml


In [9]:
if data_folder == "./data/phase_2":
    # phase 2
    
    !mkdir ./data/phase_2/V89.162
    !mkdir ./data/phase_2/V93.166
    !mkdir ./data/phase_2/V94.166 
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V89.162 ./data/phase_2/Extractor_config_workflow_20600_V89.162.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V93.166 ./data/phase_2/Extractor_config_workflow_20600_V93.166.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V94.166 ./data/phase_2/Extractor_config_workflow_20600_V94.166.yaml



In [None]:
if data_folder == "./data/phase_3":
    !mkdir ./data/phase_3/V7.63
    !mkdir ./data/phase_3/V9.63
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V7.63 ./data/phase_3/Extractor_config_workflow_22040_V7.63.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V9.63 ./data/phase_3/Extractor_config_workflow_22040_V9.63.yaml

mkdir: ./data/phase_3/V7.63: File exists
mkdir: ./data/phase_3/V9.63: File exists
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:02
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting:  66% |#############################                | ETA:   0:01:15

### Merge the single point and questions extractions

In [None]:
# phase 1
if data_folder == "./data/phase_1":
    # Point marks: one extraction file per workflow version. Paths are built
    # from data_folder, like the question files below and the other phases.
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V121.144/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V134.236/point_extractor_by_frame_extractions.csv", sep=",")
    # Tag every frame with the workflow version it was extracted for.
    df_panoptes_point_extractor_1["workflow_version"] = "121.144"
    df_panoptes_point_extractor_2["workflow_version"] = "134.236"

    # Yes/no question extractions of the same versions.
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V121.144/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V134.236/question_extractor_extractions.csv", sep=",")

    # Stack the per-version frames row-wise.
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)

    # NOTE: a bare expression inside an `if` body is not rendered by Jupyter.
    df_panoptes_point_extractor

In [None]:
# phase 2
if data_folder == "./data/phase_2":
    # read the rectangles annotations too there
    df_panotes_rectangle_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/shape_extractor_rectangle_extractions.csv", sep=",")

    # Point marks: one extraction file per workflow version.
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V93.166/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_3 = pd.read_csv(f"{data_folder}/V94.166/point_extractor_by_frame_extractions.csv", sep=",")

    # Tag every frame with the workflow version it was extracted for.
    df_panoptes_point_extractor_1["workflow_version"] = "89.162"
    df_panoptes_point_extractor_2["workflow_version"] = "93.166"
    df_panoptes_point_extractor_3["workflow_version"] = "94.166"

    # Yes/no question extractions of the same versions.
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V89.162/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V93.166/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_3 = pd.read_csv(f"{data_folder}/V94.166/question_extractor_extractions.csv", sep=",")

    # Stack all three versions row-wise. The third entry must be frame _3:
    # listing _2 twice would duplicate the 93.166 marks and drop the 94.166 marks.
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2, df_panoptes_point_extractor_3], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2, df_panoptes_question_3], axis=0)

    # NOTE: a bare expression inside an `if` body is not rendered by Jupyter.
    df_panotes_rectangle_extractor_1

In [None]:
if data_folder == "./data/phase_3":
    # Point marks per workflow version, each tagged with the version it belongs to.
    df_panoptes_point_extractor_1 = pd.read_csv(
        f"{data_folder}/V7.63/point_extractor_by_frame_extractions.csv", sep=","
    ).assign(workflow_version="7.63")
    df_panoptes_point_extractor_2 = pd.read_csv(
        f"{data_folder}/V9.63/point_extractor_by_frame_extractions.csv", sep=","
    ).assign(workflow_version="9.63")

    # Yes/no question extractions of the same two versions.
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V7.63/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V9.63/question_extractor_extractions.csv", sep=",")

    # Stack the per-version frames row-wise (pd.concat's default axis).
    df_panoptes_point_extractor = pd.concat(
        [df_panoptes_point_extractor_1, df_panoptes_point_extractor_2]
    )
    df_panoptes_question = pd.concat(
        [df_panoptes_question_1, df_panoptes_question_2]
    )


In [None]:
df_panoptes_point_extractor

In [None]:
# Join the image name from the subjects file onto every extraction row.
# `merge` defaults to an inner join, so extraction rows whose subject_id is not
# in df_subjects are already dropped by this step.
# NOTE(review): if df_subjects holds more than one row per subject_id, the join
# multiplies the matching extraction rows — confirm subject_id is unique there.
# NOTE(review): running this cell twice joins `image_name` a second time —
# restart from the merge cells above before re-running.
df_panoptes_point_extractor = df_panoptes_point_extractor.merge(df_subjects[["subject_id", "image_name"]], left_on="subject_id", right_on="subject_id")
# Restrict to subjects present in df_subjects (no further effect after the inner join).
df_panoptes_point_extractor = df_panoptes_point_extractor[df_panoptes_point_extractor.subject_id.isin(df_subjects.subject_id)]

df_panoptes_point_extractor

## Anonymise the data

In [None]:
from hashlib import blake2b

def _blake2b_hex(text):
    """Return the 16-byte BLAKE2b digest of ``text`` as a hex string."""
    return blake2b(text.encode(), digest_size=16).hexdigest()


def _anonymise_user_id(user_id):
    """Hash the string form of ``user_id``; missing values pass through unchanged."""
    if pd.isnull(user_id):
        return user_id
    return _blake2b_hex(str(user_id))


def _anonymise_user_name(user_name):
    """Hash ``user_name`` when it is a string; anything else passes through unchanged."""
    if isinstance(user_name, str):
        return _blake2b_hex(user_name)
    return user_name


# Replace both identifying columns in the point and the question extractions.
# Running this cell again would hash the already hashed values a second time.
for extraction_frame in (df_panoptes_point_extractor, df_panoptes_question):
    extraction_frame["user_id"] = extraction_frame['user_id'].apply(_anonymise_user_id)
    extraction_frame['user_name'] = extraction_frame['user_name'].apply(_anonymise_user_name)

In [None]:
df_panoptes_point_extractor

## Determine the number of "yes" answers to "Is there an iguana?"

In [None]:
df_panoptes_question

In [None]:
# Sum the "no" and "yes" counts of task T0 per subject.
is_task_t0 = df_panoptes_question.task == "T0"
df_panoptes_question_r = (
    df_panoptes_question.loc[is_task_t0, ["subject_id", "data.no", "data.yes"]]
    .groupby("subject_id")
    .sum()
    .reset_index()
)

# Keep only the subjects that are part of the filtered subjects file.
known_subject = df_panoptes_question_r.subject_id.isin(df_subjects.subject_id)
df_panoptes_question_r = df_panoptes_question_r[known_subject]
df_panoptes_question_r

In [None]:
# Write the per-subject yes/no counts to the configured 'panoptes_question' file.
# NOTE(review): if that config entry is an absolute path, pathlib ignores the
# `output_path /` prefix; the points table further down is written with the
# bare config path — confirm which form is intended.
df_panoptes_question_r.to_csv(output_path / config["panoptes_question"], index = False)

## Get the Point Marks Analysis Ready

Filter for T2 only

In [None]:
# Keep only the point marks of task T2. `.copy()` makes the result an
# independent frame: the following cell assigns new columns to it, which on a
# plain filtered slice triggers pandas' SettingWithCopyWarning.
df_panoptes_point_extractor_r = df_panoptes_point_extractor[
    (df_panoptes_point_extractor.task == "T2")
].copy()
df_panoptes_point_extractor_r.columns

### Which tool is which now?
| Tool Name               | Classification                               |
|-------------------------|----------------------------------------------|
| data.frame0.T2_tool0_x  | Adult Male in a lek                          |
| data.frame0.T2_tool1_x  | Adult Male alone                             |
| data.frame0.T2_tool2_x  | Others (females, young males, juveniles)     |
| data.frame0.T2_tool3_x  | Partial iguana                               |
| data.frame0.T2_tool4_x  | Could be an iguana, not sure                 |

The tools "Could be an iguana, not sure" and "Partial iguana" are omitted.


In [None]:
# create a flat structure from the nested marks over multiple columns from that.
from ast import literal_eval

columns_keep_x = ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x']
columns_keep_y = ['data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y']


def _parse_marks(cell):
    """Return the list of coordinates held by one extraction cell.

    Cells arrive from the CSV as the text form of a list, or as a missing
    value when the tool was not used. A cell that is already a list (the cell
    has been run before) is returned unchanged, which keeps this step
    re-runnable.
    """
    if isinstance(cell, list):
        return cell
    if pd.isnull(cell):
        return []
    return literal_eval(cell)


def _flatten_marks(per_tool_lists):
    """Concatenate the per-tool coordinate lists of one row into one list."""
    return [item for sublist in per_tool_lists for item in sublist]


for col in columns_keep_x + columns_keep_y:
    df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(_parse_marks)

# Merge the lists in 'x' and 'y' coordinates (one list of per-tool lists per row)
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r[columns_keep_y].values.tolist()

# Flatten the lists in each row for 'x' and 'y'. Both use the same tool order,
# so the i-th x still belongs to the i-th y.
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r['x'].apply(_flatten_marks)
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r['y'].apply(_flatten_marks)

# The lists are turned into one row per mark in a later cell by exploding
# 'x' and 'y' together.
df_panoptes_point_extractor_r

In [None]:
# Columns of the flat point table: classification/user/subject identifiers plus
# the merged coordinate lists.
point_table_columns = [
    'classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
    'created_at', 'subject_id', "image_name",
    'x', 'y',
]

df_panoptes_point_extractor_r = (
    df_panoptes_point_extractor_r[point_table_columns]
    .reset_index(drop=True)
)

df_panoptes_point_extractor_r

In [None]:
# Explode the lists of marks per user into one row per mark.
# Exploding 'x' and 'y' in one call keeps the coordinates paired and raises if
# a row's two lists differ in length. Rows with empty lists yield one row with
# NaN in both columns.
df_panoptes_point_extractor_r_ex = df_panoptes_point_extractor_r.explode(['x', 'y'])

In [None]:
# Classifications without any mark carry NaN in the 'x' and 'y' columns;
# drop a row only when both coordinates are missing.
rows_with_marks = df_panoptes_point_extractor_r_ex.dropna(subset=['x', 'y'], how='all')

# Stable ordering for inspection and export.
df_panoptes_point_extractor_r_ex_dropped = rows_with_marks.sort_values(
    by=['user_id', 'subject_id', 'task', 'created_at']
)
df_panoptes_point_extractor_r_ex_dropped

In [None]:
# Cast the exploded 'x' and 'y' values to 32-bit integers.
# NOTE(review): if the marks are fractional pixel positions, this cast discards
# the fractional part — confirm that is intended.
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex_dropped.astype({'x': 'int32', 'y': 'int32'})
df_panoptes_point_extractor_r_ex_dropped

In [None]:
# Persist the flat one-row-per-mark table at the configured 'flat_panoptes_points' path.
df_panoptes_point_extractor_r_ex_dropped.to_csv(config["flat_panoptes_points"], sep=",", index = False)

## Inspecting the results
Check the numbers for a single subject_id

In [None]:
### Look at the marks of a single image

# Two example subjects; only the second one is displayed below.
subject_id_1 = 47968423
subject_id_2 = 47969478

is_inspected_subject = df_panoptes_point_extractor_r_ex_dropped.subject_id == subject_id_2
df_panoptes_point_extractor_r_ex_dropped.loc[is_inspected_subject]

## Download images
`iguanas-from-above-subjects_with_url.csv` is used to track which URLs have already been downloaded.

In [None]:
## Save the subjects table with the extra columns we need for downloading.
# index=False keeps the row index out of the CSV. Otherwise the read below, and
# every further save/read round trip, adds another "Unnamed: 0"-style column.
subjects_with_url_path = output_path / "iguanas-from-above-subjects_with_url.csv"
df_subjects.to_csv(subjects_with_url_path, index=False)


# read the modified csv
df_subjects = pd.read_csv(subjects_with_url_path)
df_subjects

In [None]:
# df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")

# downoaded_images_path = Path("./data/downloaded_images")
# downoaded_images_path.mkdir(exist_ok=True, parents=True)
# return_val = True
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# for index, row in df_subjects[df_subjects.workflow_id.isin([workflow_id_p1])].iterrows():
#     # Only download if necessary
#     if pd.isna(row.get("filepath")) or not row.get("filepath", False):
#         flight_code = row['flight_code']
#         url = row['url']
#         image_name = Path(row['image_name']).name
#         # Extract the filename from the URL and create a unique name using index
#         filename = downoaded_images_path.joinpath(f"{image_name}_{row['subject_id']}_{flight_code}.jpeg")
#         df_subjects.loc[index, 'filepath'] = filename
#         # Download the image
#         return_val = download_image(url, filename)
# 
#         # print(f"Downloaded {filename}")
#     if return_val == False:
#         print("there was a problem")
#         # break
        

In [None]:
# Persist the subjects table again. index=False avoids writing the row index as
# an extra unnamed column that would come back as "Unnamed: 0" on the next read.
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv", index=False)