# Using the Panoptes Aggregation Tool from Zooniverse
https://aggregation-caesar.zooniverse.org/Scripts.html#scripts

This notebook combines the aggregation steps from the raw classification from zooniverse.
Installed from github.


In [1]:
from zooniverse.config import get_config
import pandas as pd
from pathlib import Path

### use either the subset of the subset
phase_tag = "Iguanas 1st launch"
data_folder = "./data/phase_1"

# phase_tag = "Iguanas 2nd launch"
# data_folder = "./data/phase_2"

#phase_tag = "Iguanas 3rd launch"
#data_folder = "./data/phase_3"

input_path = Path("/Users/christian/data/zooniverse")
# use_gold_standard_subset = "expert" # Use the expert-GS-Xphase as the basis
output_path = Path("/Users/christian/data/zooniverse/2024_04_24_analysis").joinpath(phase_tag).resolve()

workflow_id_p1 = 14370.0
workflow_id_p2 = 20600.0
workflow_id_p3 = 22040.0

output_plot_path = output_path.joinpath("plots")
output_plot_path.mkdir(parents=True, exist_ok=True)

reprocess = False

config = get_config(phase_tag=phase_tag, input_path=input_path, output_path=output_path)
config


{'annotations_source': PosixPath('/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv'),
 'goldstandard_data': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-1stphase.csv'),
 'gold_standard_image_subset': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/1-T2-GS-results-5th-0s.csv'),
 'image_source': PosixPath('/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/1st launch'),
 'yes_no_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_24_analysis/Iguanas 1st launch/yes_no_dataset_Iguanas 1st launch.csv'),
 'flat_dataset': PosixPath('/Users/christian/data/zooniverse/2024_04_24_analysis/Iguanas 1st launch/flat_dataset_Iguanas 1st launch.csv'),
 'flat_panoptes_points': PosixPath('/Users/christian/data/zooniverse/2024_04_24_analysis/Iguanas 1st launch/flat_panoptes_points_Iguanas 1st launch.csv'),
 'panoptes_question': PosixPath('/Users/

# Look into the subjects file
This contains the mappings from the subject_id to the image file

In [2]:
# read the original file
df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


  df_subjects = pd.read_csv("./data/zooniverse/iguanas-from-above-subjects.csv", sep=",")


In [3]:
# filter the subjects for only the images in the three phases

df_subjects = df_subjects[df_subjects.workflow_id.isin([workflow_id_p1, workflow_id_p2, workflow_id_p3])]


In [4]:
# inspect the metadata
import json
def get_json_keys(json_str):
    try:
        json_obj = json.loads(json_str)
        return list(json_obj.keys())
    except json.JSONDecodeError:
        return []

# Apply the function to each row in the metadata column and collect all keys
all_keys = df_subjects['locations'].apply(get_json_keys)

# Flatten the list of lists and get unique keys
unique_keys = set([key for sublist in all_keys for key in sublist])

print(unique_keys)

{'0'}


Clean up the subjects file for inconsistent naming.

In [5]:
df_subjects["image_name"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('Image_name') 
                                        or json.loads(x).get('image_name') 
                                        or json.loads(x).get('Filename')).sort_values(ascending=True)

# 'site', 'flight', 'Flight', 'Site', 'flight_code' depict the same
df_subjects["flight_code"] = df_subjects['metadata'].apply(lambda x: json.loads(x).get('flight_code') 
                                        or json.loads(x).get('site') 
                                        or json.loads(x).get('flight')
                                        or json.loads(x).get('Flight')
                                        or json.loads(x).get('Site')).sort_values(ascending=True)

df_subjects["url"] = df_subjects['locations'].apply(lambda x: json.loads(x)["0"])
df_subjects["filepath"] = None

helper function to download the images using the urls in the subjects file

In [6]:
from loguru import logger
from time import sleep

import requests

def download_image(url, filename):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                file.write(response.content)
            return True
        else:
            logger.warning(f"Failed to download {url}")
            logger.error(response)
            sleep(5)
            return False
    except Exception as e:
        logger.error(e)
        sleep(5)
        return False



# Panoptes Data Extraction from Zooniverse
## Panoptes config
### Create the configuration files automatically
The configurations were changed to custom workflow versions.

In [7]:
# create a configuration file from the workflow
#!mkdir ./data/phase_1
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 14370 --min_version 0 --max_version 142.245 -d ./data/phase_1
# 
#!mkdir ./data/phase_2
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 20600 --min_version 0 --max_version 94.166 -d ./data/phase_2
# 
#!mkdir ./data/phase_3
#! panoptes_aggregation config /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-workflows.csv 22040 --min_version 0 --max_version 9.63 -d ./data/phase_3

## Extract the data

In [8]:
# phase 1
if data_folder == "./data/phase_1":
    !mkdir ./data/phase_1/V121.144
    !mkdir ./data/phase_1/V134.236
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V121.144 ./data/phase_1/Extractor_config_workflow_14370_V121.144.yaml
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_1/V134.236 ./data/phase_1/Extractor_config_workflow_14370_V134.236-1.yaml


mkdir: ./data/phase_1/V121.144: File exists
mkdir: ./data/phase_1/V134.236: File exists
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:00:02
  classifications = pandas.read_csv(classification_csv_in, encoding='utf-8', dtype={'workflow_version': str})
Extracting: 100% |#############################################| Time:  0:02:29


In [9]:
if data_folder == "./data/phase_2" and reprocess == True:
    # phase 2
    
    !mkdir ./data/phase_2/V89.162
    !mkdir ./data/phase_2/V93.166
    !mkdir ./data/phase_2/V94.166 
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V89.162 ./data/phase_2/Extractor_config_workflow_20600_V89.162.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V93.166 ./data/phase_2/Extractor_config_workflow_20600_V93.166.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_2/V94.166 ./data/phase_2/Extractor_config_workflow_20600_V94.166.yaml



In [10]:
if data_folder == "./data/phase_3":
    !mkdir ./data/phase_3/V7.63
    !mkdir ./data/phase_3/V9.63
    
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V7.63 ./data/phase_3/Extractor_config_workflow_22040_V7.63.yaml
    !panoptes_aggregation extract /Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv -d ./data/phase_3/V9.63 ./data/phase_3/Extractor_config_workflow_22040_V9.63.yaml

### Merge the single point and questions extractions

In [11]:
# phase 1
if data_folder == "./data/phase_1":
    df_panoptes_point_extractor_1 = pd.read_csv(f"./data/phase_1/V121.144/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"./data/phase_1/V134.236/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "121.144"
    df_panoptes_point_extractor_2["workflow_version"] = "134.236"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V121.144/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V134.236/question_extractor_extractions.csv", sep=",")
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)
    
    df_panoptes_point_extractor

  df_panoptes_point_extractor_2 = pd.read_csv(f"./data/phase_1/V134.236/point_extractor_by_frame_extractions.csv", sep=",")


In [12]:
# # phase 2
if data_folder == "./data/phase_2":
    # read the rectangles annotations too there
    df_panotes_rectangle_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/shape_extractor_rectangle_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V89.162/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V93.166/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_3 = pd.read_csv(f"{data_folder}/V94.166/point_extractor_by_frame_extractions.csv", sep=",")
    
    df_panoptes_point_extractor_1["workflow_version"] = "89.162"
    df_panoptes_point_extractor_2["workflow_version"] = "93.166"
    df_panoptes_point_extractor_3["workflow_version"] = "94.166"
    
    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V89.162/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V93.166/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_3 = pd.read_csv(f"{data_folder}/V94.166/question_extractor_extractions.csv", sep=",")
    
    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2, df_panoptes_point_extractor_3], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2, df_panoptes_question_3], axis=0)

    df_panotes_rectangle_extractor_1

In [13]:
if data_folder == "./data/phase_3":
    df_panoptes_point_extractor_1 = pd.read_csv(f"{data_folder}/V7.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_2 = pd.read_csv(f"{data_folder}/V9.63/point_extractor_by_frame_extractions.csv", sep=",")
    df_panoptes_point_extractor_1["workflow_version"] = "7.63"
    df_panoptes_point_extractor_2["workflow_version"] = "9.63"

    df_panoptes_question_1 = pd.read_csv(f"{data_folder}/V7.63/question_extractor_extractions.csv", sep=",")
    df_panoptes_question_2 = pd.read_csv(f"{data_folder}/V9.63/question_extractor_extractions.csv", sep=",")

    df_panoptes_point_extractor = pd.concat([df_panoptes_point_extractor_1, df_panoptes_point_extractor_2], axis=0)
    df_panoptes_question = pd.concat([df_panoptes_question_1, df_panoptes_question_2], axis=0)


In [14]:
df_panoptes_point_extractor.drop(columns=["user_name", "user_id"], inplace=False)

Unnamed: 0,classification_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool5_x,data.frame0.T4_tool5_y,data.frame0.T4_tool3_x,...,data.frame0.T4_tool0_y,data.frame0.T2_tool1_x,data.frame0.T2_tool1_y,data.frame0.T4_tool1_x,data.frame0.T4_tool1_y,data.frame0.T4_tool2_x,data.frame0.T4_tool2_y,workflow_version,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y
0,256866778,14370,T2,2020-07-01 10:24:23 UTC,44660635,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,121.144,,
1,256866778,14370,T4,2020-07-01 10:24:23 UTC,44660635,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,121.144,,
2,256866835,14370,T2,2020-07-01 10:24:40 UTC,44660558,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,121.144,,
3,256866835,14370,T4,2020-07-01 10:24:40 UTC,44660558,point_extractor_by_frame,4.1.0,"[370.5579833984375, 295.52032470703125]","[186.3606414794922, 54.69659423828125]",,...,,,,,,,,121.144,,
4,256866880,14370,T2,2020-07-01 10:24:52 UTC,44660712,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,121.144,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000385,383293740,14370,T4,2021-12-14 19:51:54 UTC,47970069,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,134.236,,
1000386,389328206,14370,T2,2022-01-13 14:13:01 UTC,47970037,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,134.236,,
1000387,389328206,14370,T4,2022-01-13 14:13:01 UTC,47970037,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,134.236,,
1000388,389328326,14370,T2,2022-01-13 14:13:18 UTC,47969950,point_extractor_by_frame,4.1.0,,,,...,,,,,,,,134.236,,


In [15]:
# join the image name from the subjects file
df_panoptes_point_extractor = df_panoptes_point_extractor.merge(df_subjects[["subject_id", "image_name"]], left_on="subject_id", right_on="subject_id")
df_panoptes_point_extractor = df_panoptes_point_extractor[df_panoptes_point_extractor.subject_id.isin(df_subjects.subject_id)]



## Anonymise the data

In [16]:
from hashlib import blake2b

df_panoptes_point_extractor["user_id"] = df_panoptes_point_extractor['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_point_extractor['user_name'] = df_panoptes_point_extractor['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

df_panoptes_question["user_id"] = df_panoptes_question['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)
# Anonymize 'user_name' by hashing
df_panoptes_question['user_name'] = df_panoptes_question['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)

In [17]:
df_panoptes_question[df_panoptes_question["data.yes"] == 1.0].groupby("subject_id").size().sort_values(ascending=False)

subject_id
47970166    81
44660571    64
44660669    58
44660650    57
44660617    56
            ..
47971722     1
47980697     1
47980695     1
47980691     1
47980069     1
Length: 9873, dtype: int64

## Determine the amount of yes Answers for "Is there an Iguana"

In [18]:
df_panoptes_question

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.no,data.aggregation_version,data.photo-too-blurry-or-dark,data.yes
0,256866778,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,14370,T0,2020-07-01 10:24:23 UTC,44660635,question_extractor,1.0,4.1.0,,
1,256866835,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,14370,T0,2020-07-01 10:24:40 UTC,44660558,question_extractor,1.0,4.1.0,,
2,256866880,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,14370,T0,2020-07-01 10:24:52 UTC,44660712,question_extractor,1.0,4.1.0,,
3,256867624,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,14370,T0,2020-07-01 10:29:11 UTC,44660654,question_extractor,1.0,4.1.0,,
4,256867676,386fc0ec047b7e259744e72e8e64b9f9,ea57b1088a10fa7fef30ed0b344e2ca3,14370,T0,2020-07-01 10:29:26 UTC,44660552,question_extractor,1.0,4.1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
500190,383293694,cfbf6573c2dcd1386dfce451d9f6be93,945fed5a4ee1a07cb3b13647e3c1f8e9,14370,T0,2021-12-14 19:51:44 UTC,47970089,question_extractor,1.0,4.1.0,,
500191,383293720,cfbf6573c2dcd1386dfce451d9f6be93,945fed5a4ee1a07cb3b13647e3c1f8e9,14370,T0,2021-12-14 19:51:50 UTC,47970146,question_extractor,1.0,4.1.0,,
500192,383293740,cfbf6573c2dcd1386dfce451d9f6be93,945fed5a4ee1a07cb3b13647e3c1f8e9,14370,T0,2021-12-14 19:51:54 UTC,47970069,question_extractor,1.0,4.1.0,,
500193,389328206,aa5f2961151e502de5ff29fb40337fe5,,14370,T0,2022-01-13 14:13:01 UTC,47970037,question_extractor,1.0,4.1.0,,


In [19]:
df_panoptes_question_r = df_panoptes_question[df_panoptes_question.task == "T0"][["subject_id", "data.no", "data.yes"]].groupby("subject_id").sum()

df_panoptes_question_r = df_panoptes_question_r.reset_index()
df_panoptes_question_r = df_panoptes_question_r[df_panoptes_question_r.subject_id.isin(df_subjects.subject_id)]
df_panoptes_question_r

Unnamed: 0,subject_id,data.no,data.yes
190,47967468,18.0,2.0
191,47967469,19.0,0.0
192,47967470,20.0,0.0
193,47967471,20.0,0.0
194,47967472,18.0,0.0
...,...,...,...
24558,48034473,20.0,0.0
24559,48034474,18.0,0.0
24560,48034475,18.0,0.0
24561,48034476,18.0,0.0


In [20]:
df_panoptes_question_r.to_csv(output_path / config["panoptes_question"], index = False)

## Get the Point Marks Analysis Ready

Filter for T2 only

In [21]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor[
    (df_panoptes_point_extractor.task == "T2")
]
df_panoptes_point_extractor_r.columns

Index(['classification_id', 'user_name', 'user_id', 'workflow_id', 'task',
       'created_at', 'subject_id', 'extractor', 'data.aggregation_version',
       'data.frame0.T4_tool5_x', 'data.frame0.T4_tool5_y',
       'data.frame0.T4_tool3_x', 'data.frame0.T4_tool3_y',
       'data.frame0.T2_tool3_x', 'data.frame0.T2_tool3_y',
       'data.frame0.T4_tool4_x', 'data.frame0.T4_tool4_y',
       'data.frame0.T2_tool0_x', 'data.frame0.T2_tool0_y',
       'data.frame0.T2_tool2_x', 'data.frame0.T2_tool2_y',
       'data.frame0.T2_tool4_x', 'data.frame0.T2_tool4_y',
       'data.frame0.T4_tool0_x', 'data.frame0.T4_tool0_y',
       'data.frame0.T2_tool1_x', 'data.frame0.T2_tool1_y',
       'data.frame0.T4_tool1_x', 'data.frame0.T4_tool1_y',
       'data.frame0.T4_tool2_x', 'data.frame0.T4_tool2_y', 'workflow_version',
       'data.frame0.T4_tool6_x', 'data.frame0.T4_tool6_y', 'image_name'],
      dtype='object')

### Which tool is which now?
| Tool Name               | Classification                               |
|-------------------------|----------------------------------------------|
| data.frame0.T2_tool0_x  | Adult Male in a lek                          |
| data.frame0.T2_tool1_x  | Adult Male alone                             |
| data.frame0.T2_tool2_x  | Others (females, young males, juveniles)     |
| data.frame0.T2_tool3_x  | Partial iguana                               |
| data.frame0.T2_tool4_x  | Could be an iguana, not sure                 |

Is "Could be an iguana, not sure" and "Partial Iguana" are omitted.


In [22]:
# create a flat structure from the nested marks over multiple columns from that.
from ast import literal_eval

columns_keep_x = ['data.frame0.T2_tool0_x', 'data.frame0.T2_tool1_x', 'data.frame0.T2_tool2_x', 'data.frame0.T2_tool4_x']
columns_keep_y = ['data.frame0.T2_tool0_y', 'data.frame0.T2_tool1_y', 'data.frame0.T2_tool2_y', 'data.frame0.T2_tool4_y']

for col in columns_keep_x + columns_keep_y:
    df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])

# Merge the lists in 'x' and 'y' coordinates
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r[columns_keep_y].values.tolist()

# Flatten the lists in each row for 'x' and 'y'
df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r['x'].apply(lambda x: [item for sublist in x for item in sublist])
df_panoptes_point_extractor_r['y'] = df_panoptes_point_extractor_r['y'].apply(lambda x: [item for sublist in x for item in sublist])

# Explode the DataFrame to separate rows for each x, y pair
# Explode the DataFrame based on these columns to get separate rows for each list element
df_panoptes_point_extractor_r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r[col] = df_panoptes_point_extractor_r[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_panoptes_point_extractor_r['x'] = df_panoptes_point_extractor_r[columns_keep_x].values.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Unnamed: 0,classification_id,user_name,user_id,workflow_id,task,created_at,subject_id,extractor,data.aggregation_version,data.frame0.T4_tool5_x,...,data.frame0.T4_tool1_x,data.frame0.T4_tool1_y,data.frame0.T4_tool2_x,data.frame0.T4_tool2_y,workflow_version,data.frame0.T4_tool6_x,data.frame0.T4_tool6_y,image_name,x,y
0,262324331,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,T2,2020-07-20 17:19:21 UTC,47982231,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SRIL05-4_171.jpg,[],[]
2,267990486,98de0c7d6fadb9a441e4fd03bee2713d,1f3b77093f4b8b5a02710c611d6d5761,14370,T2,2020-08-13 22:56:56 UTC,47982231,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SRIL05-4_171.jpg,[],[]
4,268033101,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,T2,2020-08-14 05:47:06 UTC,47982231,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SRIL05-4_171.jpg,[],[]
6,268168511,6bac281943e323a47c309cf3b7845f2b,b265ba70f80676717aadafcf2031feec,14370,T2,2020-08-14 21:36:53 UTC,47982231,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SRIL05-4_171.jpg,[],[]
8,269118003,152ad9dcd6135dd625a8051f0efdbb72,3f5a5ac90682c77736c2eacbf774c521,14370,T2,2020-08-20 04:59:18 UTC,47982231,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SRIL05-4_171.jpg,[406.6333312988281],[265.9666748046875]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998422,287939566,a7c38b9d4c0f563d6c452d4db06db678,e410233db1e818f51a48a0d47fa4fa92,14370,T2,2020-11-18 04:32:48 UTC,47970949,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SFM04-1-1_369.jpg,[],[]
998424,288102487,58fa6a1c297b9ff1d6f96fa93f112c48,8f92f55991060f2111a7a0fab2e453ad,14370,T2,2020-11-18 19:47:25 UTC,47970949,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SFM04-1-1_369.jpg,[],[]
998426,288127706,7e82ef4b88b037461f90958b2716c1b8,2a7bf97c8666cf6289378831707942a2,14370,T2,2020-11-18 21:32:33 UTC,47970949,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SFM04-1-1_369.jpg,[],[]
998428,288168438,9e15149b8e8ef01f93697b4dfd952d33,f0e8db7c9aee1017a2699e2483bc4238,14370,T2,2020-11-19 01:03:05 UTC,47970949,point_extractor_by_frame,4.1.0,,...,,,,,134.236,,,SFM04-1-1_369.jpg,[],[]


In [23]:
df_panoptes_point_extractor_r = df_panoptes_point_extractor_r[
    ['classification_id', 'user_name', 'user_id', 'workflow_id',  'workflow_version', 'task',
     'created_at', 'subject_id', "image_name",
     'x', 'y'
     ]].reset_index(drop=True)

df_panoptes_point_extractor_r

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y
0,262324331,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,14370,134.236,T2,2020-07-20 17:19:21 UTC,47982231,SRIL05-4_171.jpg,[],[]
1,267990486,98de0c7d6fadb9a441e4fd03bee2713d,1f3b77093f4b8b5a02710c611d6d5761,14370,134.236,T2,2020-08-13 22:56:56 UTC,47982231,SRIL05-4_171.jpg,[],[]
2,268033101,ec17f4ab6a51fee747c3f1e5f0a004c5,,14370,134.236,T2,2020-08-14 05:47:06 UTC,47982231,SRIL05-4_171.jpg,[],[]
3,268168511,6bac281943e323a47c309cf3b7845f2b,b265ba70f80676717aadafcf2031feec,14370,134.236,T2,2020-08-14 21:36:53 UTC,47982231,SRIL05-4_171.jpg,[],[]
4,269118003,152ad9dcd6135dd625a8051f0efdbb72,3f5a5ac90682c77736c2eacbf774c521,14370,134.236,T2,2020-08-20 04:59:18 UTC,47982231,SRIL05-4_171.jpg,[406.6333312988281],[265.9666748046875]
...,...,...,...,...,...,...,...,...,...,...,...
499211,287939566,a7c38b9d4c0f563d6c452d4db06db678,e410233db1e818f51a48a0d47fa4fa92,14370,134.236,T2,2020-11-18 04:32:48 UTC,47970949,SFM04-1-1_369.jpg,[],[]
499212,288102487,58fa6a1c297b9ff1d6f96fa93f112c48,8f92f55991060f2111a7a0fab2e453ad,14370,134.236,T2,2020-11-18 19:47:25 UTC,47970949,SFM04-1-1_369.jpg,[],[]
499213,288127706,7e82ef4b88b037461f90958b2716c1b8,2a7bf97c8666cf6289378831707942a2,14370,134.236,T2,2020-11-18 21:32:33 UTC,47970949,SFM04-1-1_369.jpg,[],[]
499214,288168438,9e15149b8e8ef01f93697b4dfd952d33,f0e8db7c9aee1017a2699e2483bc4238,14370,134.236,T2,2020-11-19 01:03:05 UTC,47970949,SFM04-1-1_369.jpg,[],[]


In [24]:
# explode the lists of marks per user into one row per mark
df_panoptes_point_extractor_r_ex = df_panoptes_point_extractor_r.apply(lambda x: x.explode() if x.name in ['x', 'y'] else x)

In [25]:
# images with no marks have NaN values in the 'merged_x' and 'merged_y' columns
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex.dropna(subset=['x', 'y'], how='all').sort_values(by=['user_id', 'subject_id', 'task', 'created_at'])
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y
220164,283007416,77dd9c009b5200eea39788f7f15862f0,003abe9e6bb90e03c50377a33f1137a5,14370,134.236,T2,2020-10-26 09:28:46 UTC,47974351,SMF02-1-2-2_570.jpg,186.475037,310.925049
361505,271011060,fcbbe951169dc1df0b1f803a199f0995,004eb4442ae5c85fcb6b5c78ed902f41,14370,134.236,T2,2020-08-30 16:22:33 UTC,47978494,SFP01_06-2_59.jpg,272.292664,265.212067
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,358.3125,194
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,485.3125,106
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,458.3125,205
...,...,...,...,...,...,...,...,...,...,...,...
466862,269965841,46c1b35e9e7a99fe391c096e87a43677,,14370,134.236,T2,2020-08-24 15:39:55 UTC,48034453,SRBS03-4_40.jpg,484.661652,550.373962
307560,278911094,ec6e6d9157dc272dae21504deaa63687,,14370,134.236,T2,2020-10-08 15:07:18 UTC,48034455,SRBS03-4_43.jpg,742.105591,531.119507
307563,280834231,28e1c957b094499b9178b64aadab669e,,14370,134.236,T2,2020-10-16 07:02:12 UTC,48034455,SRBS03-4_43.jpg,1412.728516,1483.689331
331694,283374116,b35f4fa3408911f68c9c80ba68b8611f,,14370,134.236,T2,2020-10-27 21:11:40 UTC,48034465,SRBS03-4_61.jpg,1458.386841,1283.980591


In [26]:
# cast x and y to int
df_panoptes_point_extractor_r_ex_dropped = df_panoptes_point_extractor_r_ex_dropped.astype({'x': 'int32', 'y': 'int32'})
df_panoptes_point_extractor_r_ex_dropped

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y
220164,283007416,77dd9c009b5200eea39788f7f15862f0,003abe9e6bb90e03c50377a33f1137a5,14370,134.236,T2,2020-10-26 09:28:46 UTC,47974351,SMF02-1-2-2_570.jpg,186,310
361505,271011060,fcbbe951169dc1df0b1f803a199f0995,004eb4442ae5c85fcb6b5c78ed902f41,14370,134.236,T2,2020-08-30 16:22:33 UTC,47978494,SFP01_06-2_59.jpg,272,265
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,358,194
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,485,106
492222,286621582,59e4feed8a3a84fb6bfdc5f29a1713b4,006c16c227f4152ffff80d195fca443b,14370,134.236,T2,2020-11-12 11:08:10 UTC,47980877,SRL04-1_348.jpg,458,205
...,...,...,...,...,...,...,...,...,...,...,...
466862,269965841,46c1b35e9e7a99fe391c096e87a43677,,14370,134.236,T2,2020-08-24 15:39:55 UTC,48034453,SRBS03-4_40.jpg,484,550
307560,278911094,ec6e6d9157dc272dae21504deaa63687,,14370,134.236,T2,2020-10-08 15:07:18 UTC,48034455,SRBS03-4_43.jpg,742,531
307563,280834231,28e1c957b094499b9178b64aadab669e,,14370,134.236,T2,2020-10-16 07:02:12 UTC,48034455,SRBS03-4_43.jpg,1412,1483
331694,283374116,b35f4fa3408911f68c9c80ba68b8611f,,14370,134.236,T2,2020-10-27 21:11:40 UTC,48034465,SRBS03-4_61.jpg,1458,1283


In [27]:
df_panoptes_point_extractor_r_ex_dropped.to_csv(config["flat_panoptes_points"], sep=",", index = False)

## Inspecting the results
Check the numbers for a single subject_id

In [28]:
### Looks the images in question

subject_id_2 = 72373250 
df_debug = df_panoptes_point_extractor_r_ex_dropped[(df_panoptes_point_extractor_r_ex_dropped.subject_id == subject_id_2)]
df_debug

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y


In [29]:
df_debug.groupby('user_name').size()


Series([], dtype: int64)

In [30]:
df_debug[df_debug.user_name == "CallieSanDiego"]

Unnamed: 0,classification_id,user_name,user_id,workflow_id,workflow_version,task,created_at,subject_id,image_name,x,y


## Download images
iguanas-from-above-subjects_with_url.csv will be used to track which url was already downlaoded.

In [31]:
## save the file the extra columns we need for downloading.
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")


# read the modified csv
df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")


In [32]:
# df_subjects = pd.read_csv(output_path / "iguanas-from-above-subjects_with_url.csv")

# downoaded_images_path = Path("./data/downloaded_images")
# downoaded_images_path.mkdir(exist_ok=True, parents=True)
# return_val = True
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# # df = df_subjects[df_subjects.subject_id.isin([44660616, 47968406])]
# for index, row in df_subjects[df_subjects.workflow_id.isin([workflow_id_p1])].iterrows():
#     # Only download if necessary
#     if pd.isna(row.get("filepath")) or not row.get("filepath", False):
#         flight_code = row['flight_code']
#         url = row['url']
#         image_name = Path(row['image_name']).name
#         # Extract the filename from the URL and create a unique name using index
#         filename = downoaded_images_path.joinpath(f"{image_name}_{row['subject_id']}_{flight_code}.jpeg")
#         df_subjects.loc[index, 'filepath'] = filename
#         # Download the image
#         return_val = download_image(url, filename)
# 
#         # print(f"Downloaded {filename}")
#     if return_val == False:
#         print("there was a problem")
#         # break
        

In [33]:
df_subjects.to_csv(output_path / "iguanas-from-above-subjects_with_url.csv")