In [None]:
import argilla as rg
import pandas as pd
import os.path as osp
from tqdm.autonotebook import tqdm
from collections import defaultdict, Counter

# absolute path of the directory the item JSON is read from
data_dir = osp.abspath(osp.join('..', 'generated_data'))
from collections import Counter

In [None]:
# parse credentials
# Every line of the file that contains "=" is treated as a KEY=value pair;
# all other lines are ignored.
with open('argilla_credentials.sh', 'r') as f:
    lines = f.readlines()

content_lines = [c.strip() for c in lines if "=" in c]

# Split on the FIRST "=" only, so that a value which itself contains "="
# (e.g. a padded API key or a URL with a query string) is kept intact
# instead of being cut off at its first "=".
credentials = dict(line.split('=', 1) for line in content_lines)
    
    
# authenticate against the argilla server with the owner's API key
owner_connection = {
    'api_url': credentials['ARGILLA_API_URL'],
    'api_key': credentials['OWNER_API_KEY'],
    # currently disabled:
    # 'extra_headers': {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
}
rg.init(**owner_connection)

# read the item table and index it by item id
item_path = osp.join(data_dir, 'dense10_items.json')
print(f'read items from {item_path} ...')
raw_items = pd.read_json(item_path)
item_df = raw_items.set_index('item_id')

# show the user this connection is authenticated as
rg.User.me()

In [None]:

# Collect one row per annotation record over every workspace on the server.
workspaces = rg.Workspace.list()
all_records = []

for workspace in tqdm(workspaces):
    # exactly one dataset with '02_annotation' in its name is expected per workspace
    annotation_datasets = [
        dataset
        for dataset in rg.FeedbackDataset.list(workspace=workspace.name)
        if '02_annotation' in dataset.name
    ]
    assert len(annotation_datasets) == 1
    (annotation_dataset,) = annotation_datasets

    annotation_records = list(annotation_dataset.records)

    # keep the record's metadata and add the number of responses it has
    all_records.extend(
        {**record.metadata, 'n_responses': len(record.responses)}
        for record in annotation_records
    )

# NOTE(review): the frame built from `all_records` has a positional 0..n-1 index,
# while item_df is indexed by 'item_id' -- so this joins row position to item id.
# Confirm that this is intended; if the record metadata carries an item id
# column, merging on that column is presumably what is wanted.
all_records_df = pd.DataFrame(all_records).merge(
    item_df, left_index=True, right_index=True
)

In [None]:
def get_count_range(x):
    """Return the smallest and the largest value of ``x`` as a ``(min, max)`` tuple.

    ``x`` only needs to provide ``.min()`` and ``.max()``; in this notebook it is
    the result of a ``groupby(...).size()`` call.
    """
    lowest, highest = x.min(), x.max()
    return (lowest, highest)

# (min, max) number of records per tangram, then per scene, over all workspaces
for group_column in ('tangram_id', 'scene'):
    group_sizes = all_records_df.groupby(group_column).size()
    print(get_count_range(group_sizes))

In [None]:
# For every tangram: how many records it has, and over how many distinct
# workspaces and scenes those records are spread.
tangram_distribution = []

for t in all_records_df.tangram_id.unique():
    t_df = all_records_df[all_records_df.tangram_id == t]

    tangram_distribution.append(dict(
        tangram_id=t,
        n_entries=t_df.shape[0],
        # dropna=False keeps a missing value counted as its own entry
        n_workspaces=t_df.workspace_name.nunique(dropna=False),
        n_scenes=t_df.scene.nunique(dropna=False),
    ))

tangram_distribution_df = pd.DataFrame(tangram_distribution)
tangram_distribution_df
    

In [None]:
# For every workspace: number of records, number of distinct tangrams and scenes,
# and the (min, max) record count per tangram and per scene.
tangrams_scenes_in_workspaces = []

for ws in all_records_df.workspace_name.unique():
    ws_df = all_records_df[all_records_df.workspace_name == ws]

    records_per_tangram = ws_df.groupby('tangram_id').size()
    records_per_scene = ws_df.groupby('scene').size()

    tangrams_scenes_in_workspaces.append(dict(
        workspace=ws,
        n_entries=ws_df.shape[0],
        # dropna=False keeps a missing value counted as its own entry
        n_tangrams=ws_df.tangram_id.nunique(dropna=False),
        n_scenes=ws_df.scene.nunique(dropna=False),
        tangram_counts_range=get_count_range(records_per_tangram),
        scene_counts_range=get_count_range(records_per_scene),
    ))

pd.DataFrame(tangrams_scenes_in_workspaces).set_index('workspace')

In [None]:
# Display the records of the last workspace handled by the per-workspace loop:
# `ws_df` is that loop's leftover variable, so this cell only works after the
# loop cell has run (and has iterated at least once).
ws_df