# Filter and Anonymise Zooniverse Classifications 
* replace the user_name and user_id with hashes. Keep "None" for user_id
* filter for expert subject ids because only them are part of the analysis

In [10]:
import pandas as pd
from hashlib import blake2b
from pathlib import Path

classification_file = "/Users/christian/data/zooniverse/IguanasFromAbove/2023-10-15/iguanas-from-above-classifications.csv"

subject_id_filter_p1 = "/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-1stphase.csv"
subject_id_filter_p2 = "/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-2ndphase.csv"
subject_id_filter_p3 = "/Users/christian/data/zooniverse/Images/Zooniverse_Goldstandard_images/expert-GS-3rdphase.csv"

df_subject_p1 = pd.read_csv(subject_id_filter_p1, sep=';')
df_subject_p2 = pd.read_csv(subject_id_filter_p2, sep=';')
df_subject_p3 = pd.read_csv(subject_id_filter_p3, sep=';')

df_subject = pd.concat([df_subject_p1, df_subject_p2, df_subject_p3], axis=0)

# Open a new file to write the modified chunks
output_file_path = 'data/iguanas-from-above-classifications_anonymized_GS.csv'


# Define the chunk size
chunk_size = 100000

# Create an iterator for the CSV file
csv_iterator = pd.read_csv(classification_file, chunksize=chunk_size)

if Path(output_file_path).is_file():
    print(f"deleted the file to prevent redundant entries")
    Path(output_file_path).unlink() 

# Process each chunk
for i, df in enumerate(csv_iterator):
    print(f"Chunk {i + 1}")
    #print(chunk)
    df = df[df.subject_ids.isin(df_subject.subject_id)]
    
    df.loc[:, 'user_id'] = df['user_id'].apply(lambda x: blake2b(str(x).encode(), digest_size=16).hexdigest() if not pd.isnull(x) else x)

    # Anonymize 'user_name' by hashing
    df.loc[:, 'user_name'] = df['user_name'].apply(lambda x: blake2b(x.encode(), digest_size=16).hexdigest() if isinstance(x, str) else x)
    
    
    # Write the modified chunk to the new file
    df.to_csv(output_file_path, mode='a', index=False, header=(i == 0))
    
    

deleted the file to prevent redundant entries
Chunk 1
Chunk 2
Chunk 3
Chunk 4
Chunk 5
Chunk 6
Chunk 7
Chunk 8
Chunk 9
Chunk 10
Chunk 11


  for i, df in enumerate(csv_iterator):


Chunk 12


  for i, df in enumerate(csv_iterator):


Chunk 13
Chunk 14
Chunk 15


In [11]:
df_C = pd.read_csv(output_file_path)

df_C.shape

(100515, 14)

In [13]:
output_file_path_andrea_2 = "/Users/christian/PycharmProjects/iguanas-from-above-zooniverse/data/iguanas-from-above-classifications-GoldStandard.csv"

df_A_2 = pd.read_csv(output_file_path_andrea_2)
df_A_2.shape

(100515, 10)

In [14]:
df_A_2

Unnamed: 0,classification_id,workflow_id,workflow_name,workflow_version,created_at,metadata,annotations,subject_data,subject_ids,anonymized_user_id
0,262327774,14370,Iguanas 1st launch,134.236,2020-07-20 17:27:20 UTC,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47979290"":{""retired"":{""id"":67610680,""workflo...",47979290,79dbfdb604cd19cab0309b05e1a1a7fe
1,262328317,14370,Iguanas 1st launch,134.236,2020-07-20 17:28:36 UTC,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47970534"":{""retired"":{""id"":67627516,""workflo...",47970534,79dbfdb604cd19cab0309b05e1a1a7fe
2,262337957,14370,Iguanas 1st launch,134.236,2020-07-20 17:49:40 UTC,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47987595"":{""retired"":{""id"":67605167,""workflo...",47987595,79dbfdb604cd19cab0309b05e1a1a7fe
3,262393415,14370,Iguanas 1st launch,134.236,2020-07-20 19:43:51 UTC,"{""source"":""api"",""session"":""0211b2351f7e5a1c95a...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47968991"":{""retired"":{""id"":67623024,""workflo...",47968991,79dbfdb604cd19cab0309b05e1a1a7fe
4,262394522,14370,Iguanas 1st launch,134.236,2020-07-20 19:45:55 UTC,"{""source"":""api"",""session"":""0211b2351f7e5a1c95a...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47970508"":{""retired"":{""id"":67610551,""workflo...",47970508,79dbfdb604cd19cab0309b05e1a1a7fe
...,...,...,...,...,...,...,...,...,...,...
100510,511059428,22040,Iguanas 3rd launch,9.630,2023-09-14 23:34:28 UTC,"{""source"":""api"",""session"":""1b9f6b5ed223bdb14fa...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922625"":{""retired"":{""id"":105927200,""workfl...",78922625,d3f7ee0fa9f3edd2b20e083d858662f6
100511,511159826,22040,Iguanas 3rd launch,9.630,2023-09-15 16:04:33 UTC,"{""source"":""api"",""session"":""0ab084911f920ad95b5...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922632"":{""retired"":{""id"":105913462,""workfl...",78922632,785c7059801a15ef1aab6439941b1e8d
100512,511171731,22040,Iguanas 3rd launch,9.630,2023-09-15 17:23:14 UTC,"{""source"":""api"",""session"":""0ab084911f920ad95b5...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922613"":{""retired"":{""id"":105924667,""workfl...",78922613,001fc754df90fda372d1dc680c8d109e
100513,511349810,22040,Iguanas 3rd launch,9.630,2023-09-16 22:56:04 UTC,"{""source"":""api"",""session"":""207e7382699acaa3063...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922625"":{""retired"":{""id"":105927200,""workfl...",78922625,57aacacca90d8049578c1c01db8441f2


In [15]:
df_C

Unnamed: 0,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,annotations,subject_data,subject_ids
0,262327774,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,33bc5b21c88460dbfea6,14370,Iguanas 1st launch,134.236,2020-07-20 17:27:20 UTC,,,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47979290"":{""retired"":{""id"":67610680,""workflo...",47979290
1,262328317,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,33bc5b21c88460dbfea6,14370,Iguanas 1st launch,134.236,2020-07-20 17:28:36 UTC,,,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47970534"":{""retired"":{""id"":67627516,""workflo...",47970534
2,262337957,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,33bc5b21c88460dbfea6,14370,Iguanas 1st launch,134.236,2020-07-20 17:49:40 UTC,,,"{""source"":""api"",""session"":""c06622ef692041b2d0e...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47987595"":{""retired"":{""id"":67605167,""workflo...",47987595
3,262393415,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,33bc5b21c88460dbfea6,14370,Iguanas 1st launch,134.236,2020-07-20 19:43:51 UTC,,,"{""source"":""api"",""session"":""0211b2351f7e5a1c95a...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47968991"":{""retired"":{""id"":67623024,""workflo...",47968991
4,262394522,dd741bfe7c2eabd0265422e728bd1738,08cbd51d46cf6d3d7af3f5d1d4f909b6,33bc5b21c88460dbfea6,14370,Iguanas 1st launch,134.236,2020-07-20 19:45:55 UTC,,,"{""source"":""api"",""session"":""0211b2351f7e5a1c95a...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""47970508"":{""retired"":{""id"":67610551,""workflo...",47970508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100510,511059428,b5dd00492abed0932dee745a2ef255bd,,9fe421fff053a486b442,22040,Iguanas 3rd launch,9.630,2023-09-14 23:34:28 UTC,,,"{""source"":""api"",""session"":""1b9f6b5ed223bdb14fa...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922625"":{""retired"":{""id"":105927200,""workfl...",78922625
100511,511159826,94c60cb03a82b8456f673029e0f4fe02,,e4306eba15c5f315c384,22040,Iguanas 3rd launch,9.630,2023-09-15 16:04:33 UTC,,,"{""source"":""api"",""session"":""0ab084911f920ad95b5...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922632"":{""retired"":{""id"":105913462,""workfl...",78922632
100512,511171731,54d5e8f0591f9fd8e583e963f2188a7d,,d4a679e8dd9339688be1,22040,Iguanas 3rd launch,9.630,2023-09-15 17:23:14 UTC,,,"{""source"":""api"",""session"":""0ab084911f920ad95b5...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922613"":{""retired"":{""id"":105924667,""workfl...",78922613
100513,511349810,132764123b0e3c0dbdb07bd2e1cdaaaf,,b55876b8596ff960dbb0,22040,Iguanas 3rd launch,9.630,2023-09-16 22:56:04 UTC,,,"{""source"":""api"",""session"":""207e7382699acaa3063...","[{""task"":""T0"",""task_label"":""Do you see any **M...","{""78922625"":{""retired"":{""id"":105927200,""workfl...",78922625
