In [None]:
import os
import shutil

import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Load the annotation table; the first CSV column becomes the index.
df_mod = pd.read_csv(filepath_or_buffer="modified.csv", index_col=0)
# Show the first five rows as the cell output.
df_mod.head(n=5)

### Verified

In [None]:
# Distinct split names and verified labels, each in sorted order.
splits = sorted(set(df_mod.split))
labels = sorted(set(df_mod.verified))

# Copy every row's file into data/verified/<split>/<label>/.
for split in splits:
    in_split = df_mod.split == split
    for label in tqdm(labels):
        dst = os.path.join('data', 'verified', split, label)
        os.makedirs(dst, exist_ok=True)
        subset = df_mod[in_split & (df_mod.verified == label)]
        for src, name in zip(subset['path'], subset['file']):
            target = os.path.join(dst, name)
            # A file that is already in place is left untouched.
            if os.path.exists(target):
                continue
            shutil.copy(src, target)

### Clean

In [None]:
# Distinct split names and verified labels, each in sorted order.
splits = sorted(set(df_mod.split))
labels = sorted(set(df_mod.verified))

# Only rows flagged legible == 1 are part of this variant.
is_legible = df_mod.legible == 1

# Copy the selected files into data/clean/<split>/<label>/.
for split in splits:
    in_split = df_mod.split == split
    for label in tqdm(labels):
        dst = os.path.join('data', 'clean', split, label)
        os.makedirs(dst, exist_ok=True)
        subset = df_mod[in_split & (df_mod.verified == label) & is_legible]
        for src, name in zip(subset['path'], subset['file']):
            target = os.path.join(dst, name)
            # A file that is already in place is left untouched.
            if os.path.exists(target):
                continue
            shutil.copy(src, target)

### Easy

In [None]:
# Distinct split names and verified labels, each in sorted order.
splits = sorted(set(df_mod.split))
labels = sorted(set(df_mod.verified))

# Only rows with legible == 1 and difficult == 0 are part of this variant.
is_easy = (df_mod.legible == 1) & (df_mod.difficult == 0)

# Copy the selected files into data/easy/<split>/<label>/.
for split in splits:
    in_split = df_mod.split == split
    for label in tqdm(labels):
        dst = os.path.join('data', 'easy', split, label)
        os.makedirs(dst, exist_ok=True)
        subset = df_mod[in_split & (df_mod.verified == label) & is_easy]
        for src, name in zip(subset['path'], subset['file']):
            target = os.path.join(dst, name)
            # A file that is already in place is left untouched.
            if os.path.exists(target):
                continue
            shutil.copy(src, target)

### Balanced - Oversample

In [None]:
# Distinct split names and verified labels, each in sorted order.
splits = sorted(set(df_mod.split))
labels = sorted(set(df_mod.verified))

# Legible training rows, grouped by verified label.
train_legible = df_mod[(df_mod.split == 'train') & (df_mod.legible == 1)]
train_by_label = train_legible.groupby('verified')

# Every label is resampled (with replacement) up to the size of the largest
# label group, measured as the count of non-null 'file' values.
train_target = max(train_by_label['file'].count())
df_train = train_by_label.sample(n=train_target, random_state=42, replace=True)

# Per-(split, label) row counts after oversampling.
df_train.groupby(['split', 'verified'])[['file']].count()

In [None]:
# Legible validation rows, grouped by verified label.
val_legible = df_mod[(df_mod.split == 'val') & (df_mod.legible == 1)]
val_by_label = val_legible.groupby('verified')

# Every label is resampled (with replacement) up to the size of the largest
# label group, measured as the count of non-null 'file' values.
val_target = max(val_by_label['file'].count())
df_val = val_by_label.sample(n=val_target, random_state=42, replace=True)

# Per-(split, label) row counts after oversampling.
df_val.groupby(['split', 'verified'])[['file']].count()

In [None]:
# Oversampled frames keyed by split name; only these splits get a balanced
# copy on disk.
data = {
    'train': df_train,
    'val': df_val,
}

# Iterate over the frames that exist instead of every split found in df_mod:
# a split without an oversampled frame (for example 'test') would otherwise
# raise KeyError on data[split].
for split, frame in data.items():
    for label in tqdm(labels):
        dst = os.path.join('data','balanced',split,label)
        os.makedirs(dst, exist_ok=True)
        buf = frame[(frame['split'] == split) & (frame['verified'] == label)]

        # Destination names handed out during this pass. Oversampling repeats
        # rows, so a repeated file gets a '_<i>' suffix. Names are decided from
        # this set rather than from what is on disk, so re-running the cell
        # reproduces the same names and skips them instead of adding yet
        # another batch of suffixed copies.
        taken = set()
        for row in buf.itertuples():
            name = row.file
            if name in taken:
                base, extension = os.path.splitext(row.file)
                i = 1
                while '{}_{}{}'.format(base, i, extension) in taken:
                    i += 1
                name = '{}_{}{}'.format(base, i, extension)
            taken.add(name)

            target = os.path.join(dst, name)
            # Leave files from an earlier run in place.
            if not os.path.exists(target):
                shutil.copy(row.path, target)