In [None]:
import os
import argilla as rg
import pandas as pd
import os.path as osp
from tqdm.autonotebook import tqdm
import pickle

# Experiment slice whose results this notebook collects.
# NOTE: this name shadows the builtin ``slice``; it is kept because it is a
# notebook-level variable that other cells may rely on.
slice = '0_0'

# Root of the generated data and the absolute per-slice results folder.
data_dir = '../generated_data'
out_dir = osp.abspath(osp.join(data_dir, 'experiment_slices', 'results', slice))

In [None]:
def make_item_identifier(row):
    """Build the identifier string for one annotation row.

    Args:
        row: Object exposing ``tangram_id``, ``scene`` and ``workspace_name``
            as attributes (e.g. a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        The three attribute values joined by ``-`` in the order
        tangram id, scene, workspace name.
    """
    return '{}-{}-{}'.format(row.tangram_id, row.scene, row.workspace_name)

In [None]:
# Parse the credentials file: one ``KEY=value`` assignment per line.
credentials = {}
with open('group_argilla_credentials.sh', 'r') as f:
    for raw_line in f:
        line = raw_line.strip()
        # Skip blank lines, shell comments and anything that is not an assignment.
        if not line or line.startswith('#') or '=' not in line:
            continue
        # Split on the FIRST '=' only, so values that themselves contain '='
        # (e.g. base64-padded API keys, URLs with query strings) are not truncated.
        key, _, value = line.partition('=')
        credentials[key.strip()] = value.strip()


# Connect as owner to the Argilla server.
rg.init(
    api_url=credentials['ARGILLA_API_URL'],
    api_key=credentials['OWNER_API_KEY'],
    #extra_headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
)

# Show the owner account info as the cell output.
rg.User.me()

In [None]:
# Read the item table from JSON and index it by its 'item_id' column.
item_path = osp.join(data_dir, 'dense10_items.json')
print(f'read items from {item_path} ...')
item_df = pd.read_json(item_path)
item_df = item_df.set_index('item_id')

# Look-up table from a user's hex id to the username.
users = rg.User.list()
users_dict = dict((user.id.hex, user.username) for user in users)

In [None]:
# Fetch every workspace visible to the connected account.
workspaces = rg.Workspace.list()

# Split by name prefix: 'sws*' and 'bws*' workspaces are the ones collected here.
scene_workspaces = list(filter(lambda ws: ws.name.startswith('sws'), workspaces))
baseline_workspaces = list(filter(lambda ws: ws.name.startswith('bws'), workspaces))

# Scene workspaces first, then baseline workspaces.
annotation_workspaces = [*scene_workspaces, *baseline_workspaces]

print(f'{len(annotation_workspaces)} workspaces found')

In [None]:
# Collect every response of every record in the '02_annotation_<workspace>'
# datasets into one flat table.
all_responses = []  # one dict per (record, response) pair
ann_errors = []     # (dataset name, record as dict) for records that raised

for workspace in tqdm(annotation_workspaces, total=len(annotation_workspaces)):
    workspace_name = workspace.name
    dataset_name = f"02_annotation_{workspace_name}"

    # load feedback dataset
    feedback = rg.FeedbackDataset.from_argilla(dataset_name, workspace=workspace_name)

    for record in feedback.records:
        try:
            record_metadata = record.metadata
            item_id = record_metadata['item_id']
            # static item attributes, looked up by the record's item id
            item_data = item_df.loc[item_id].to_dict()

            for response in record.responses:
                user_id = response.user_id.hex

                # Merge into one dict; on key clashes the record metadata
                # overrides the item data, and the response fields override both.
                all_responses.append({
                    **item_data,
                    **record_metadata,
                    'user_id': user_id,
                    'user_name': users_dict[user_id],
                    'raw_annotation': response.values['response'].value,
                    'status': response.status,
                    'time': response.updated_at,
                })

        except Exception:
            # Best effort: remember the failing record and carry on.
            # `Exception` (not a bare `except`) so that interrupting the kernel
            # (KeyboardInterrupt) or SystemExit still stops the loop.
            ann_errors.append((feedback.name, dict(record)))

annotation_df = pd.DataFrame(all_responses)
if annotation_df.empty:
    # Nothing was collected: there are no columns to build the identifier from
    # or to sort by, so only add an empty identifier column for later cells.
    annotation_df['item_identifyer'] = pd.Series(dtype=object)
else:
    annotation_df['item_identifyer'] = annotation_df.apply(make_item_identifier, axis=1)
    annotation_df = annotation_df.sort_values(by=['item_id', 'workspace_name']).reset_index(drop=True)

In [None]:
# Inspect the collected annotations (one row per record response).
annotation_df

In [None]:
# Records that raised while being processed, as (dataset name, record dict) pairs.
ann_errors

In [None]:
# Output targets for the collected annotations.
csv_out_path = osp.join(out_dir, 'collected_annotations.csv')
pkl_out_path = osp.join(out_dir, 'collected_annotations.pkl')

# Merge with an earlier export, if there is one: rows from the earlier export
# are kept only when their identifier does not occur in the current pull.
if not osp.isfile(pkl_out_path):
    merged_annotation_df = annotation_df
else:
    # NOTE: unpickling executes code embedded in the file; only load pickles
    # that were written by this notebook.
    with open(pkl_out_path, 'rb') as f:
        prev_annotation_df = pickle.load(f)

    # Earlier exports may lack the identifier column; rebuild it if so.
    if 'item_identifyer' not in prev_annotation_df:
        prev_annotation_df['item_identifyer'] = prev_annotation_df.apply(make_item_identifier, axis=1)

    superseded = prev_annotation_df.item_identifyer.isin(annotation_df.item_identifyer)
    previous_records_df = prev_annotation_df[~superseded]
    merged_annotation_df = pd.concat([previous_records_df, annotation_df]).reset_index(drop=True)

In [None]:
# Re-derive the targets here instead of relying on `csv_out_path` from an
# earlier cell: that name is reassigned for other exports further down, so
# re-running this cell out of order would otherwise write to the wrong file.
csv_out_path = osp.join(out_dir, 'collected_annotations.csv')
pkl_out_path = osp.join(out_dir, 'collected_annotations.pkl')

# The results folder may not exist yet on a first run.
os.makedirs(out_dir, exist_ok=True)

# save csv
merged_annotation_df.to_csv(csv_out_path)

# save pkl
with open(pkl_out_path, 'wb') as f:
    pickle.dump(merged_annotation_df, f)

In [None]:
# Datasets whose name does not start with '02_', split by name prefix.
other_datasets = list(filter(lambda ds: not ds.name.startswith('02_'), rg.list_datasets()))
info_datasets = list(filter(lambda ds: ds.name.startswith('01_info'), other_datasets))
completion_datasets = list(filter(lambda ds: ds.name.startswith('03_completion'), other_datasets))

In [None]:
# Collect the 'prolific_id' answer of every response in the info datasets.
user_prolific_ids = []  # one dict per (record, response) pair
id_errors = []          # (dataset name, record as dict) for records that raised

for dataset in info_datasets:
    feedback = rg.FeedbackDataset.from_argilla(dataset.name, workspace=dataset.workspace.name)
    for record in feedback.records:

        try:
            record_metadata = record.metadata
            for response in record.responses:
                user_id = response.user_id.hex

                # On key clashes the response fields override the record metadata.
                user_prolific_ids.append({
                    **record_metadata,
                    'user_id': user_id,
                    'user_name': users_dict[user_id],
                    'prolific_id': response.values['prolific_id'].value,
                    'status': response.status,
                    'time': response.updated_at,
                })
        except Exception:
            # Best effort: remember the failing record and carry on.
            # `Exception` (not a bare `except`) so that interrupting the kernel
            # (KeyboardInterrupt) or SystemExit still stops the loop.
            id_errors.append((feedback.name, dict(record)))

prolific_ids_df = pd.DataFrame(user_prolific_ids)

In [None]:
# Inspect the Prolific-ID responses pulled from the info datasets.
prolific_ids_df

In [None]:
# Output target for the Prolific-ID table.
csv_out_path = osp.join(out_dir, 'prolific_ids.csv')

# Merge with an earlier export, if there is one: earlier rows are kept only
# for users that do not occur in the current pull.
if not osp.isfile(csv_out_path):
    merged_ids_df = prolific_ids_df
else:
    prev_prolific_ids_df = pd.read_csv(csv_out_path, index_col=0)
    pulled_again = prev_prolific_ids_df.user_id.isin(prolific_ids_df.user_id)
    prev_ids_df = prev_prolific_ids_df[~pulled_again]
    merged_ids_df = pd.concat([prev_ids_df, prolific_ids_df]).reset_index(drop=True)

In [None]:
# Rows from the current pull only, for comparison with the merged table.
prolific_ids_df

In [None]:
# Table that will be saved: the current pull, plus rows from an earlier CSV
# (when one existed) for users absent from the current pull.
merged_ids_df

In [None]:
# Re-derive the target here instead of relying on `csv_out_path` from an
# earlier cell: that name is reused for other exports in this notebook, so
# re-running this cell out of order would otherwise write to the wrong file.
csv_out_path = osp.join(out_dir, 'prolific_ids.csv')

# The results folder may not exist yet on a first run.
os.makedirs(out_dir, exist_ok=True)

# save csv
merged_ids_df.to_csv(csv_out_path)

In [None]:
# Info-dataset records that raised while being processed, as (dataset name, record dict) pairs.
id_errors

In [None]:
# Collect the submission confirmation (and optional comment) of every response
# in the completion datasets.
completion_data = []    # one dict per (record, response) pair
completion_errors = []  # (dataset name, record as dict) for responses that raised

for dataset in tqdm(completion_datasets):
    feedback = rg.FeedbackDataset.from_argilla(dataset.name, workspace=dataset.workspace.name)
    for record in feedback.records:
        record_metadata = record.metadata
        for response in record.responses:
            try:
                user_id = response.user_id.hex
                user_name = users_dict[user_id]
                submission_confirmation = response.values['submission_confirmation'].value

                # 'comments' may be missing from the response values; keep None then.
                comments = response.values.get('comments', None)
                if comments is not None:
                    comments = comments.value

                # On key clashes the response fields override the record metadata.
                completion_data.append({
                    **record_metadata,
                    'user_id': user_id,
                    'user_name': user_name,
                    'submission_confirmation': submission_confirmation,
                    'comments': comments,
                    'status': response.status,
                    'time': response.updated_at,
                })
            except Exception:
                # Best effort: remember the record of the failing response and
                # carry on. `Exception` (not a bare `except`) so that interrupting
                # the kernel (KeyboardInterrupt) or SystemExit still stops the loop.
                completion_errors.append((feedback.name, dict(record)))

completion_df = pd.DataFrame(completion_data)

In [None]:
# Output target for the completion table.
csv_out_path = osp.join(out_dir, 'completion_data.csv')

# Merge with an earlier export, if there is one: earlier rows are kept only
# for users that do not occur in the current pull.
if osp.isfile(csv_out_path):
    prev_completion_df = pd.read_csv(csv_out_path, index_col=0)
    prev_comp_df = prev_completion_df[~prev_completion_df.user_id.isin(completion_df.user_id)]
    merged_completion_df = pd.concat([prev_comp_df, completion_df]).reset_index(drop=True)
else:
    # First run: nothing to carry over. `prev_comp_df` is still defined (as an
    # empty frame with the current columns) because a later cell displays it
    # and would otherwise raise NameError on a fresh kernel.
    prev_comp_df = completion_df.iloc[0:0]
    merged_completion_df = completion_df

In [None]:
# Inspect the completion responses from the current pull.
completion_df

In [None]:
# Rows carried over from the earlier CSV export (users absent from the current pull).
# NOTE(review): this name is assigned in the merge cell; confirm it is defined
# when no earlier CSV exists.
prev_comp_df

In [None]:
# Table that will be saved: the current pull, plus rows from an earlier CSV
# (when one existed) for users absent from the current pull.
merged_completion_df

In [None]:
# Re-derive the target here instead of relying on `csv_out_path` from an
# earlier cell: that name is reused for other exports in this notebook, so
# re-running this cell out of order would otherwise write to the wrong file.
csv_out_path = osp.join(out_dir, 'completion_data.csv')

# The results folder may not exist yet on a first run.
os.makedirs(out_dir, exist_ok=True)

# save csv
merged_completion_df.to_csv(csv_out_path)

In [None]:
# Completion-dataset records with a response that raised, as (dataset name, record dict) pairs.
completion_errors