In [None]:
import pandas as pd
import os
import os.path as osp

# Locations of the per-slice annotation results and of the per-slice user tables.
input_dir = '../generated_data/experiment_slices/results'
user_dir = '../generated_data/experiment_slices'

# os.listdir() returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store). Keep only sub-directories and sort them so that every
# re-run reads the slices in the same order.
slices = sorted(
    entry for entry in os.listdir(input_dir)
    if osp.isdir(osp.join(input_dir, entry))
)
print(f'slices found: {slices}')

In [None]:
# read data: one annotation table per slice
input_files = [osp.join(input_dir, s, 'collected_annotations.csv') for s in slices]
input_dfs = [pd.read_csv(file, index_col=0) for file in input_files]

# merge dfs
annotations_all = pd.concat(input_dfs)

# sort out invalid entries
user_files = [osp.join(user_dir, f'argilla_users_{s}.csv') for s in slices]
user_dfs = [pd.read_csv(file, index_col=0) for file in user_files]
merged_user_df = pd.concat(user_dfs)

# One validity flag per user. Identical (username, valid) rows coming from
# several user tables are collapsed so the join cannot duplicate annotations;
# validate='many_to_one' raises if a username still maps to conflicting flags.
user_validity = merged_user_df[['username', 'valid']].drop_duplicates()

# pd.merge defaults to an inner join, so annotations whose user_name is missing
# from the user tables are dropped here as well.
annotations_flagged = pd.merge(
    annotations_all,
    user_validity,
    left_on='user_name',
    right_on='username',
    validate='many_to_one',
)

# .copy() makes the filtered frame independent of annotations_flagged, so the
# column assignments below do not trigger SettingWithCopyWarning.
merged_df = annotations_flagged[annotations_flagged['valid'].to_numpy()].copy()
assert merged_df['valid'].all()

# sort df: 'none' first, then the remaining scenes alphabetically
sorted_scenes = ['none'] + sorted(['kitchen', 'bathroom', 'bedroom', 'office', 'forest', 'mountain', 'beach', 'street', 'sky', 'sea_bottom'])
merged_df['scene'] = pd.Categorical(merged_df['scene'], categories=sorted_scenes, ordered=True)
merged_df = merged_df.sort_values(by=['tangram', 'scene', 'workspace_name']).set_index('item_identifyer')

# convert image urls to names (last path component of the url)
merged_df['image_name'] = merged_df['image_url'].map(osp.basename)

# select columns
columns = [
    'tangram', 'scene', 'raw_annotation',
    'tangram_id', 'item_id', 'tangram_pos', 'image_name',
    'partition_name', 'workspace_name', 'user_name', 'time']

merged_df = merged_df[columns]

merged_df.sample(5)


In [None]:
# annotations per item: count the rows of each item_id, then tally how many
# items share each count
item_annotation_counts = merged_df.groupby('item_id').size()
item_annotation_counts.value_counts()

In [None]:
# annotations per user: count the rows of each user_name, then tally how many
# users share each count
user_annotation_counts = merged_df.groupby('user_name').size()
user_annotation_counts.value_counts()

In [None]:
out_dir = '../collected_data'
# to_csv() does not create missing directories, so make sure the target exists
os.makedirs(out_dir, exist_ok=True)
out_path = osp.abspath(osp.join(out_dir, 'raw_collected_data.csv'))
print(f'saving data to {out_path}')

merged_df.to_csv(out_path)

In [None]:
def _is_missing(value):
    """Return True when ``value`` is a scalar that pandas treats as missing."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def collapse_identical(x):
    """Collapse a group of identical values into a single value.

    Parameters
    ----------
    x : iterable of hashable values (e.g. one column of a groupby group)

    Returns
    -------
    The first value of ``x``.

    Raises
    ------
    AssertionError
        If ``x`` is empty or contains more than one distinct value. A group
        that consists only of missing values counts as identical.
    """
    values = list(x)  # materialise once; x may be a Series or any iterable
    if not values:
        raise AssertionError('collapse_identical() received no values')

    distinct = set(values)
    # NaN != NaN, so set() counts every NaN separately; an all-missing group
    # must not be reported as a conflict.
    if len(distinct) > 1 and not all(_is_missing(v) for v in values):
        preview = list(distinct)[:5]
        raise AssertionError(
            f'expected identical values, found {len(distinct)} distinct: {preview!r}'
        )
    return values[0]

# Columns that differ between the annotations of one item are gathered into
# lists; every other column must be constant per item and is reduced to a
# single value by collapse_identical.
per_annotation_columns = ('raw_annotation', 'workspace_name', 'user_name', 'time')

# The order of this tuple is the column order of the aggregated frame.
output_columns = (
    'tangram', 'scene', 'raw_annotation',
    'tangram_id', 'item_id', 'tangram_pos', 'image_name',
    'partition_name', 'workspace_name', 'user_name', 'time',
)

aggregators = {
    name: (list if name in per_annotation_columns else collapse_identical)
    for name in output_columns
}

# one row per item, indexed by (tangram, scene)
collapsed_df = (
    merged_df
    .groupby('item_id')
    .agg(aggregators)
    .set_index(['tangram', 'scene'])
)

collapsed_df.sample(5)