# Prepare hand annotations dataset
The hand annotations data is reformatted to adapt it to the requirements of the code.

- Rename columns
- Remove exceeding annotations to create balance dataset

NOTE: Creating a balanced dataset is not necessary since the code will balance the data if it is specified.

In [1]:
import os
import random
import pandas as pd

In [2]:
RES_DIR = '../res'

%cd {RES_DIR}



In [3]:
REDUCE_BALANCED = False

In [4]:
SRC_CSV = 'annotations/annotations_merged.csv'

if REDUCE_BALANCED:
    DST_CSV = 'annotations/annotations_merged_renamed_balanced.csv'
else:
    DST_CSV = 'annotations/annotations_merged_renamed.csv'

## Read csv

In [5]:
df = pd.read_csv(SRC_CSV, index_col='id')
df

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
192,old
241,old
294,old
328,young
338,old
...,...
580613,old
580847,young
581040,old
581496,old


## Reduce (optional) -> balanced ds

In [6]:
df.label.value_counts()

old      4603
young    2435
Name: label, dtype: int64

In [7]:
old_idx = df[df.label == 'old'].index
young_idx = df[df.label == 'young'].index
print(old_idx)
print(young_idx)

Int64Index([   192,    241,    294,    338,    395,    415,    459,    544,
               564,    623,
            ...
            578545, 578861, 579226, 579231, 579240, 579362, 580248, 580613,
            581040, 581496],
           dtype='int64', name='id', length=4603)
Int64Index([   328,    428,    474,    536,    589,    692,    693,    831,
              1000,   1164,
            ...
            572510, 573179, 575964, 576045, 576566, 576629, 577403, 579438,
            580847, 581736],
           dtype='int64', name='id', length=2435)


In [8]:
limiting_idx = young_idx if len(young_idx) < len(old_idx) else old_idx
excess_idx = old_idx if len(young_idx) < len(old_idx) else young_idx
excess_idx

Int64Index([   192,    241,    294,    338,    395,    415,    459,    544,
               564,    623,
            ...
            578545, 578861, 579226, 579231, 579240, 579362, 580248, 580613,
            581040, 581496],
           dtype='int64', name='id', length=4603)

In [9]:
print('limiting set size:', len(limiting_idx))
print('target balanced ds size:', len(limiting_idx)*2)

limiting set size: 2435
target balanced ds size: 4870


In [10]:
reduced_idx = excess_idx[0:len(limiting_idx)]
reduced_idx

Int64Index([   192,    241,    294,    338,    395,    415,    459,    544,
               564,    623,
            ...
            254994, 255274, 255315, 255322, 255627, 255649, 255662, 255863,
            256031, 256067],
           dtype='int64', name='id', length=2435)

In [11]:
df_balanced = df.loc[limiting_idx.union(reduced_idx)]
df_balanced

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
192,old
241,old
294,old
328,young
338,old
...,...
576629,young
577403,young
579438,young
580847,young


In [12]:
df2 = df_balanced if REDUCE_BALANCED else df
len(df2)

7038

# Rename columns

In [13]:
df_rn = df2.rename(columns={'label': 'bb_age'})
df_rn.index.name = 'img_id'
df_rn

Unnamed: 0_level_0,bb_age
img_id,Unnamed: 1_level_1
192,old
241,old
294,old
328,young
338,old
...,...
580613,old
580847,young
581040,old
581496,old


## Save csv

In [14]:
overwrite = True

if os.path.exists(DST_CSV):
    while True:
        ans = str(input('A csv file already exists. Overwrite? [y]/n: ')).lower()
        if ans == 'y' or ans == 'yes':
            overwrite = True
            break
        elif ans == 'n' or ans == 'no':
            overwrite = False
            break

if overwrite:
    if not os.path.exists(os.path.dirname(DST_CSV)):
        os.makedirs(os.path.dirname(DST_CSV))
    df_rn.to_csv(DST_CSV, index_label='img_id')
    print(f'Saved at {DST_CSV}')

A csv file already exists. Overwrite? [y]/n: y
Saved at annotations/annotations_merged_renamed.csv
