In [None]:
from argilla_utils import make_password, make_name
import pandas as pd
import numpy as np
import os.path as osp
from random import sample, seed, shuffle
from collections import Counter
from itertools import product

# --- Argilla & data settings -------------------------------------------------
# RESET_ARGILLA: delete workspaces, users and datasets.
# NOTE(review): not read by any cell visible in this part of the notebook —
# presumably consumed by a later Argilla setup step; confirm before relying on it.
RESET_ARGILLA = True
# TEST_MODE: restrict the dataset to a random subset of scenes and tangrams.
TEST_MODE = False
if TEST_MODE:
    # Subset sizes; only defined when TEST_MODE is on.
    # The subsetting cell treats values <= 0 as "use all".
    TEST_K_SCENES = 5
    TEST_K_TANGRAMS = 14
# MODE: one of 'inline', 'side' or 'grid' (default: 'grid'); validated right below.
# NOTE(review): the meaning of the modes is not visible here — verify against the code that uses MODE.
MODE = 'grid'
assert MODE in {'inline', 'side', 'grid'}, f'invalid mode: {MODE}'
# N_GROUPS: number of groups between which the tangrams are distributed (default: 2).
N_GROUPS = 2
# ANNOTATIONS_PER_ITEM: number of annotator workspaces created per partition (default: 10).
ANNOTATIONS_PER_ITEM = 10

# --- random seed -------------------------------------------------------------
# Seeds Python's `random` module (used by sample/shuffle below); the same value
# is also passed as `random_state` to DataFrame.sample when shuffling the scene items.
RANDOM_STATE = 123
seed(RANDOM_STATE)

# --- paths -------------------------------------------------------------------
# Resolved relative to the current working directory at execution time.
data_dir = osp.abspath('../generated_data/')

In [None]:
def validate_slices(slices, reference_df):
    """Check that `slices` form a clean partition of `reference_df`.

    Args:
        slices: iterable of DataFrames, each with a ``tangram_id`` column.
        reference_df: DataFrame the concatenated slices are compared against.

    Returns:
        True if both checks pass.

    Raises:
        Exception: if a tangram id occurs more than once within a single
            slice, or if the concatenated slices (sorted by index) are not
            equal to ``reference_df`` (sorted by index). In the second case
            the underlying comparison error is attached as ``__cause__``.
    """
    # materialise once, the slices are traversed twice below
    slices = list(slices)

    # ensure tangram ids are unique within slices
    for i, tangram_slice in enumerate(slices):
        # duplicated() is well-defined for empty slices as well, whereas
        # groupby(...).size().max() is NaN there and would report a
        # uniqueness violation that does not exist
        if tangram_slice['tangram_id'].duplicated().any():
            raise Exception(f'tangram ids not unique in slice {i}')

    # ensure all data is contained in slices
    try:
        pd.testing.assert_frame_equal(
            pd.concat(slices).sort_index(),
            reference_df.sort_index()
        )
    except Exception as e:
        raise Exception(
            f'scene partitions differ from reference df with error message: {e}'
        ) from e

    return True


def tangram_group_idx(tangram_id, partition_idx):
    """Return the position of the group in `partition_idx` that contains `tangram_id`.

    `partition_idx` is a sequence of containers of tangram ids. The id has to
    be a member of exactly one of them, otherwise an AssertionError is raised.
    """
    # one 0/1 flag per group: does this group contain the tangram?
    membership = np.fromiter(
        (tangram_id in group for group in partition_idx),
        dtype=int,
    )
    n_hits = membership.sum()
    assert n_hits == 1
    # with exactly one flag set, argmax is the index of that flag
    return membership.argmax()

In [None]:
# load data: read all items and separate baseline items (scene == 'none')
# from items that are shown within a scene
item_path = osp.join(data_dir, 'dense10_items.json')
print(f'read items from {item_path} ...')
item_df = pd.read_json(item_path)

is_baseline = item_df.scene == 'none'
baseline_items = item_df.loc[is_baseline]
scene_items = item_df.loc[~is_baseline]

for label, frame in (('total', item_df), ('scene', scene_items), ('baseline', baseline_items)):
    print(f'{label} items: {len(frame)}')

# the two subsets have to account for every item
assert len(item_df) == len(scene_items) + len(baseline_items)

In [None]:
if TEST_MODE:
    # Shrink the data to a random subset of scenes and tangrams for quick test runs.

    unique_scenes = pd.unique(scene_items.scene).tolist()
    unique_tangrams = pd.unique(scene_items.tangram_id).tolist()
    n_scenes = len(unique_scenes)
    n_tangram_ids = len(unique_tangrams)

    # Non-positive values mean "use everything". Requests larger than what is
    # available are clamped, because random.sample raises a ValueError when k
    # exceeds the population size.
    TEST_K_SCENES = min(TEST_K_SCENES, n_scenes) if TEST_K_SCENES > 0 else n_scenes
    TEST_K_TANGRAMS = min(TEST_K_TANGRAMS, n_tangram_ids) if TEST_K_TANGRAMS > 0 else n_tangram_ids

    print(f'restrict to {TEST_K_SCENES} scenes and {TEST_K_TANGRAMS} tangrams for testing')

    # scenes are drawn before tangrams; keep this order, it determines the
    # selection for a given random seed
    scene_selection = sample(unique_scenes, k=TEST_K_SCENES)
    tangram_selection = sample(unique_tangrams, k=TEST_K_TANGRAMS)

    # scene items have to match both selections, baseline items only the tangram selection
    scene_items = scene_items.loc[
        scene_items.scene.isin(scene_selection)
        & scene_items.tangram_id.isin(tangram_selection)
    ]
    baseline_items = baseline_items.loc[baseline_items.tangram_id.isin(tangram_selection)]

    print(f'number of selected items: {len(scene_items)} scene items, {len(baseline_items)} baseline items')

## Make Workspaces and Credentials

In [None]:
# make workspaces
#
# Workspace names have the form '<prefix><partition index>_<annotator index>',
# with prefix 'bws' for baseline partitions and 'sws' for scene partitions.

annotator_idx = list(range(ANNOTATIONS_PER_ITEM))

# baseline: one partition per tangram group
n_baseline_workspaces = N_GROUPS * ANNOTATIONS_PER_ITEM
print(f'create {n_baseline_workspaces} baseline workspaces (for {N_GROUPS} groups, {ANNOTATIONS_PER_ITEM} anns per item)...')
partition_idx = list(range(N_GROUPS))
ann_workspaces = [
    f'bws{p_idx}_{a_idx}'
    for p_idx in partition_idx
    for a_idx in annotator_idx
]

# scenes: one partition per (scene slice, tangram group) combination
n_scenes = len(scene_items.scene.unique())
n_scene_workspaces = n_scenes * N_GROUPS * ANNOTATIONS_PER_ITEM
print(f'create {n_scene_workspaces} scene workspaces (for {n_scenes} scenes and {N_GROUPS} groups, {ANNOTATIONS_PER_ITEM} anns per item)...')
partition_idx = list(range(n_scenes * N_GROUPS))
ann_workspaces.extend(
    f'sws{p_idx}_{a_idx}'
    for p_idx in partition_idx
    for a_idx in annotator_idx
)

print(f'total number of annotation workspaces: {len(ann_workspaces)}')

In [None]:
# make annotator credentials: one user (name + password) per annotation workspace

all_users = []
# names handed out so far; a set keeps the uniqueness check O(1) per draw
# instead of rebuilding a list of all previous names on every attempt
taken_names = set()

print(f'create users and credentials for {len(ann_workspaces)} workspaces ({ANNOTATIONS_PER_ITEM} annotations per item)...')

# make credentials
for workspace in ann_workspaces:

    # draw names until we get one that is non-empty and not in use yet
    user_name = make_name()
    while user_name == '' or user_name in taken_names:
        user_name = make_name()
    taken_names.add(user_name)
    password = make_password()

    # workspace names look like '<partition>_<annotator index>'
    partition_name, partition_annotator_idx = workspace.split('_')

    all_users.append({
        'username': user_name,
        'password': password,
        'workspace': workspace,
        'partition': partition_name,
        'partition_annotator_idx': int(partition_annotator_idx),
        'valid': True
    })

user_df = pd.DataFrame(all_users)
# keeps the running user number as an explicit 'index' column
# (to_csv below additionally writes the frame index as an unnamed first column)
user_df = user_df.reset_index()

user_filepath = osp.join(data_dir, 'argilla_users.csv')

print(f'save credentials to {user_filepath} ...')
user_df.to_csv(user_filepath)

In [None]:
# preview the generated accounts; the password column is left out of the
# display so that credentials do not end up in the saved notebook output
user_df.drop(columns='password', errors='ignore').head()

## Create Splits

In [None]:
unique_tangrams = pd.unique(scene_items.tangram_id)
unique_scenes = pd.unique(scene_items.scene)

n_scenes = len(unique_tangrams)
n_tangrams = len(unique_scenes)

shuffled_tangrams = unique_tangrams.tolist()
shuffle(shuffled_tangrams)

group_tangrams = np.array_split(shuffled_tangrams, N_GROUPS)

In [None]:
print(f'creating baseline splits for {n_tangrams} tangrams and {N_GROUPS} groups...')

baseline_partitions = []
for tangram_ids in group_tangrams:
    baseline_partition = baseline_items.loc[baseline_items.tangram_id.isin(tangram_ids)]
    baseline_partitions.append(baseline_partition)

In [None]:
print(f'creating splits for {n_tangrams} tangrams, {n_scenes} scenes and {N_GROUPS} groups...')

# shuffle items and sort by tangram_id -> tangram_ids clustered together, but with permuted scenes
shuffled_scene_items  = scene_items.sample(frac=1, random_state=RANDOM_STATE).sort_values(by='tangram_id')

# create slices containing each tangram once with a random scene
tangram_slices = [
    shuffled_scene_items[i::n_tangrams]  
    # step size is n_tangrams, 
    # i.e. select entry per tangram (starting with offset i)
    for i in range(n_tangrams)
]

# split up the tangram slices for annotation groups
scene_partitions = []
for tangram_slice in tangram_slices:
    partitioned_slices = [tangram_slice.loc[tangram_slice.tangram_id.isin(g_idx)] for g_idx in group_tangrams]
    scene_partitions += partitioned_slices

# ensure everything is OK
assert validate_slices(scene_partitions, scene_items)

In [None]:
# key every partition by its name: 'bws<i>' for baseline, 'sws<i>' for scene partitions
# (this is the part of a workspace name in front of the '_')
baseline_partition_dict = dict(
    (f'bws{idx}', partition) for idx, partition in enumerate(baseline_partitions)
)
scene_partition_dict = dict(
    (f'sws{idx}', partition) for idx, partition in enumerate(scene_partitions)
)

In [None]:
# separate the workspace names by kind
baseline_workspaces = list(filter(lambda name: 'bws' in name, ann_workspaces))
scene_workspaces = list(filter(lambda name: 'sws' in name, ann_workspaces))

# every partition needs exactly ANNOTATIONS_PER_ITEM workspaces
assert len(baseline_workspaces) == len(baseline_partitions) * ANNOTATIONS_PER_ITEM
assert len(scene_workspaces) == len(scene_partitions) * ANNOTATIONS_PER_ITEM

all_workspaces = baseline_workspaces + scene_workspaces
all_partitions = baseline_partitions + scene_partitions

# workspace name -> partition name (the part in front of the '_')
workspace_partition_map = dict((name, name.split('_')[0]) for name in all_workspaces)

print(f'# baseline workspaces/partitions: {len(baseline_workspaces)} / {len(baseline_partitions)}')
print(f'# scene workspaces/partitions: {len(scene_workspaces)} / {len(scene_partitions)}')
print(f'# all workspaces/partitions: {len(all_workspaces)} / {len(all_partitions)}')

# partition name -> list of its workspaces, in workspace order
partition_workspaces = {}
for workspace_name, partition_name in workspace_partition_map.items():
    partition_workspaces.setdefault(partition_name, []).append(workspace_name)

# annotate every partition with its name and the workspaces it is served in
output_partitions = []
named_partitions = {**baseline_partition_dict, **scene_partition_dict}
for partition_name, partition_items in named_partitions.items():
    part = partition_items.copy()
    part['partition'] = partition_name
    # every row of the partition carries the same list of workspaces
    part['workspaces'] = [partition_workspaces.get(partition_name, [])] * len(part.index)
    output_partitions.append(part)

partition_df = pd.concat(output_partitions).reset_index(drop=True)
partition_df['tangram_group'] = partition_df.tangram_id.map(
    lambda tangram_id: tangram_group_idx(tangram_id, group_tangrams)
)

partition_path = osp.join(data_dir, 'argilla_partitions.csv')
partition_df.to_csv(partition_path)

In [None]:
# seeded preview so that re-running the notebook shows the same rows;
# min() keeps the cell from failing when fewer than 5 rows exist (e.g. small test runs)
partition_df.sample(min(5, len(partition_df)), random_state=RANDOM_STATE)

In [None]:
# Per-partition summary: tangram group, kilogram_snd statistics, tangram ids and scene counts.

# rows follow the order in which the partitions appear in partition_df;
# the aggregates below are aligned to it by partition name
partition_stats = pd.DataFrame(index=pd.unique(partition_df.partition))

by_partition = partition_df.groupby('partition')

# Every partition has to draw its tangrams from exactly one tangram group.
# The check works on the values; `False in <DataFrame>` would only look at the
# column labels and therefore never fail.
partition_group_map = by_partition.agg({'tangram_group': set}).tangram_group
mixed_partitions = partition_group_map[partition_group_map.map(len) != 1]
assert mixed_partitions.empty, (
    f'partitions spanning several tangram groups: {mixed_partitions.index.tolist()}'
)
# unpack the single-element sets
partition_group_map = partition_group_map.explode()

# String aliases select the pandas groupby reductions explicitly ('std' is the
# sample standard deviation, ddof=1); passing np.min/np.std etc. is deprecated.
snd_stats = by_partition.kilogram_snd.agg(['min', 'max', 'mean', 'std'])
min_vals = snd_stats['min']
max_vals = snd_stats['max']
mean_vals = snd_stats['mean']
std_vals = snd_stats['std']
scene_counts = by_partition.agg({'scene': lambda x: dict(Counter(x))}).scene
tangram_ids = by_partition.agg({'tangram_id': set}).tangram_id

partition_stats['partition_group'] = partition_group_map
partition_stats['snd_min'] = min_vals
partition_stats['snd_max'] = max_vals
partition_stats['snd_mean'] = mean_vals
partition_stats['snd_std'] = std_vals
partition_stats['tangram_ids'] = tangram_ids
partition_stats['scenes'] = scene_counts

stats_path = osp.join(data_dir, 'argilla_partition_stats.csv')
partition_stats.to_csv(stats_path)

partition_stats