In [None]:
import pandas as pd
import os
import os.path as osp

# Input and output directories, resolved to absolute paths relative to the
# current working directory.
in_data_dir = osp.abspath('../generated_data/')
out_data_dir = osp.abspath('../generated_data/experiment_slices/')

# Create the output directory (and any missing parents) only when it is not
# there yet, announcing the creation; an existing directory is left untouched.
_out_dir_missing = not osp.isdir(out_data_dir)
if _out_dir_missing:
    print(f'create output path {out_data_dir}')
    os.makedirs(out_data_dir)

print(f'save outputs to {out_data_dir}')

In [None]:
# Locate each input table inside `in_data_dir` and read it right away.
# `index_col=0` makes the first CSV column the row index of the frame.

# user table
user_path = osp.join(in_data_dir, 'argilla_users.csv')
user_df = pd.read_csv(user_path, index_col=0)

# partition table
partition_path = osp.join(in_data_dir, 'argilla_partitions.csv')
partition_df = pd.read_csv(partition_path, index_col=0)

# per-partition statistics; the cells below read its `partition_group`
# column and use its index as the partition names
stats_path = osp.join(in_data_dir, 'argilla_partition_stats.csv')
stats_df = pd.read_csv(stats_path, index_col=0)

In [None]:
# Partition names per group: the index labels of the `stats_df` rows whose
# `partition_group` equals 0 and 1 respectively. The following cells match
# these labels against the `partition` column of the other tables.

slice_partition_names_0 = stats_df[stats_df['partition_group'].eq(0)].index
slice_partition_names_1 = stats_df[stats_df['partition_group'].eq(1)].index

In [None]:
# Group 0: rows of partition_df whose `partition` is one of the group-0 names
slice_partition_df_0 = partition_df[partition_df['partition'].isin(slice_partition_names_0)]

# Group 0: rows of user_df whose `partition` is one of the group-0 names
slice_user_df_0 = user_df[user_df['partition'].isin(slice_partition_names_0)]

# Split the group-0 users at partition_annotator_idx == 5
# (presumably the first five vs. the remaining annotators of each partition;
# confirm against the script that generates this column).
# rows with partition_annotator_idx below 5
slice_user_df_0_0 = slice_user_df_0[slice_user_df_0['partition_annotator_idx'].lt(5)]
# rows with partition_annotator_idx of 5 or more
slice_user_df_0_1 = slice_user_df_0[slice_user_df_0['partition_annotator_idx'].ge(5)]

In [None]:
# Group 1: rows of partition_df whose `partition` is one of the group-1 names
slice_partition_df_1 = partition_df[partition_df['partition'].isin(slice_partition_names_1)]

# Group 1: rows of user_df whose `partition` is one of the group-1 names
slice_user_df_1 = user_df[user_df['partition'].isin(slice_partition_names_1)]

# Split the group-1 users at partition_annotator_idx == 5
# (presumably the first five vs. the remaining annotators of each partition;
# confirm against the script that generates this column).
# rows with partition_annotator_idx below 5
slice_user_df_1_0 = slice_user_df_1[slice_user_df_1['partition_annotator_idx'].lt(5)]
# rows with partition_annotator_idx of 5 or more
slice_user_df_1_1 = slice_user_df_1[slice_user_df_1['partition_annotator_idx'].ge(5)]

In [None]:
# validation
#
# Sanity checks on the slices built above. Any failure raises AssertionError
# and stops the notebook before anything is written to disk.

# --- completeness -----------------------------------------------------------
# Concatenating the slices of one level and sorting by index must reproduce
# the source table exactly, i.e. no row is lost, duplicated or altered.
# `obj` only labels the error message so a failure names the offending check.

pd.testing.assert_frame_equal(
    pd.concat([slice_partition_df_0, slice_partition_df_1]).sort_index(),
    partition_df.sort_index(),
    obj='partition slices (group 0 + group 1)'
)

pd.testing.assert_frame_equal(
    pd.concat([slice_user_df_0, slice_user_df_1]).sort_index(),
    user_df.sort_index(),
    obj='user slices (group 0 + group 1)'
)

pd.testing.assert_frame_equal(
    pd.concat([slice_user_df_0_0, slice_user_df_0_1, slice_user_df_1_0, slice_user_df_1_1]).sort_index(),
    user_df.sort_index(),
    obj='user slices (group x annotator split)'
)

# --- balance ----------------------------------------------------------------
# All slices of one level must have the same number of rows. The observed
# sizes are included in the failure message to make an imbalance easy to spot.

for _label, _frames in [
    ('partition (group 0, group 1)',
     [slice_partition_df_0, slice_partition_df_1]),
    ('user (group 0, group 1)',
     [slice_user_df_0, slice_user_df_1]),
    ('user (0_0, 0_1, 1_0, 1_1)',
     [slice_user_df_0_0, slice_user_df_0_1, slice_user_df_1_0, slice_user_df_1_1]),
]:
    _sizes = [len(_frame) for _frame in _frames]
    assert len(set(_sizes)) == 1, f'{_label} slices differ in size: {_sizes}'

In [None]:
# save to files
#
# Write every slice as CSV into `out_data_dir`. The list keeps the file names
# next to the frames they receive and fixes the order in which files are written.

_slices_to_save = [
    ('argilla_partitions_0.csv', slice_partition_df_0),
    ('argilla_partitions_1.csv', slice_partition_df_1),
    ('argilla_users_0.csv', slice_user_df_0),
    ('argilla_users_1.csv', slice_user_df_1),
    ('argilla_users_0_0.csv', slice_user_df_0_0),
    ('argilla_users_0_1.csv', slice_user_df_0_1),
    ('argilla_users_1_0.csv', slice_user_df_1_0),
    ('argilla_users_1_1.csv', slice_user_df_1_1),
]

for _file_name, _slice_df in _slices_to_save:
    _slice_df.to_csv(osp.join(out_data_dir, _file_name))