In [27]:
import os
import random
import shutil
import glob
from tqdm import tqdm

In [40]:
# --- per-set shard budget ------------------------------------------------
# Each set's shuffled train shards are sliced in this order by the split cell:
#   remain_count   -> kept in that set's own "set{i}_{remain_count}" directory
#   step1_count    -> copied into the shared "step1_*" directory
#   step2_count    -> copied into the shared "step2_*" directory
#   finetune_count -> copied into the shared "finetune_*" directory
# The four counts must add up to the number of train shards found in a set.
remain_count, step1_count, step2_count, finetune_count = 500, 300, 470, 10

# --- reproducibility -----------------------------------------------------
# Passed to random.seed() before every per-set shuffle.
seed = 33

# --- dataset locations ---------------------------------------------------
# base_path holds the source "set{i}" directories; dst_path receives the copies.
base_path = "/home/wk247/data/enwiki_books_128_20_ver2"
dst_path = "/home/wk247/data/enwiki_books_128_20_ver3"

In [43]:
# Split every source set's train shards into a per-set "remain" directory and
# three shared pools (step1 / step2 / finetune), copying files from base_path
# into freshly created directories under dst_path.
#
# For each set the sorted train shards are shuffled with a fixed seed and then
# sliced in order: remain_count, step1_count, step2_count, finetune_count.
# Validation ("test_*.hdf5") shards are copied into every per-set directory,
# and into each shared pool from set 0 only.


def _copy_no_clobber(src_files, dst_dir):
    """Copy each file in ``src_files`` into ``dst_dir`` with a progress bar.

    ``shutil.copy`` silently overwrites an existing destination file. The shared
    pool directories receive shards from several sets, so a basename collision
    between sets would quietly drop a shard. This helper checks first instead.

    Raises:
        FileExistsError: if a file with the same basename already exists in
            ``dst_dir``. Files copied before the collision are left in place.
    """
    for src_file in tqdm(src_files):
        target = os.path.join(dst_dir, os.path.basename(src_file))
        if os.path.exists(target):
            raise FileExistsError(f"Refusing to overwrite existing shard: {target}")
        shutil.copy(src_file, dst_dir)


# Number of source sets ("set0" .. "set{num_sets - 1}"). It is also the multiplier
# used in the shared directory names, e.g. "step1_{step1_count * num_sets}".
num_sets = 4

# (directory-name prefix, shards taken from each set), in slicing order.
shared_stages = [
    ("step1", step1_count),
    ("step2", step2_count),
    ("finetune", finetune_count),
]

for i in range(num_sets):
    # get file lists (sorted so the seeded shuffle below is reproducible)
    path = os.path.join(base_path, f"set{i}")
    train_file_list = sorted(glob.glob(os.path.join(path, "train_*.hdf5")))
    val_file_list = sorted(glob.glob(os.path.join(path, "test_*.hdf5")))
    print(f"Set {i} - total training files: {len(train_file_list)}, val: {len(val_file_list)}")

    # the four slices below must consume the set exactly
    assert len(train_file_list) == remain_count + step1_count + step2_count + finetune_count, f"train_file_list: {len(train_file_list)}"

    # check duplicates: the third "_"-separated field of the basename is the
    # shard index, and it must be unique within a set
    seen_indices = set()
    repeated_indices = set()
    for train_filename in train_file_list:
        index = int(os.path.basename(train_filename).replace(".hdf5", "").split("_")[2])
        if index in seen_indices:
            repeated_indices.add(index)
        seen_indices.add(index)
    duplicates = sorted(repeated_indices)
    assert len(duplicates) == 0, f"Duplicate files exist: {duplicates}"

    # random shuffle; re-seeding per set applies the same permutation to every set
    random.seed(seed)
    random.shuffle(train_file_list)
    remain_train_shards = train_file_list[:remain_count]
    move_train_shards = train_file_list[remain_count:]

    # remain dst path: one directory per set, must not exist yet
    remain_dst_path = os.path.join(dst_path, f"set{i}_{remain_count}")
    os.makedirs(remain_dst_path, exist_ok=False)
    print(f"copy shards to {remain_dst_path}")

    _copy_no_clobber(remain_train_shards, remain_dst_path)
    _copy_no_clobber(val_file_list, remain_dst_path)

    # shared pools: created while processing set 0, then filled by every set
    for stage_name, stage_count in shared_stages:
        move_dst_path = os.path.join(dst_path, f"{stage_name}_{stage_count * num_sets}")
        if i == 0:
            os.makedirs(move_dst_path, exist_ok=False)
        print(f"copy shards to {move_dst_path}")

        _copy_no_clobber(move_train_shards[:stage_count], move_dst_path)
        if i == 0:
            # only set 0's validation shards go into the shared pools
            _copy_no_clobber(val_file_list, move_dst_path)

        move_train_shards = move_train_shards[stage_count:]

    # the remaining train shards should be empty once every stage took its slice
    assert len(move_train_shards) == 0, f"move_train_shards: {len(move_train_shards)}"

Set 0 - total training files: 1280, val: 64
copy shards to /home/wk247/data/enwiki_books_128_20_ver3/set0_500


100%|██████████| 500/500 [01:00<00:00,  8.23it/s]
100%|██████████| 64/64 [00:01<00:00, 34.03it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step1_1200


100%|██████████| 300/300 [01:50<00:00,  2.71it/s]
100%|██████████| 64/64 [00:02<00:00, 31.54it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step2_1880


100%|██████████| 470/470 [02:44<00:00,  2.86it/s]
100%|██████████| 64/64 [00:02<00:00, 29.65it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/finetune


100%|██████████| 10/10 [00:06<00:00,  1.46it/s]
100%|██████████| 64/64 [00:01<00:00, 33.04it/s]


Set 1 - total training files: 1280, val: 64
copy shards to /home/wk247/data/enwiki_books_128_20_ver3/set1_500


100%|██████████| 500/500 [02:02<00:00,  4.09it/s]
100%|██████████| 64/64 [00:05<00:00, 11.55it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step1_1200


100%|██████████| 300/300 [01:51<00:00,  2.69it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step2_1880


100%|██████████| 470/470 [02:57<00:00,  2.65it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/finetune


100%|██████████| 10/10 [00:01<00:00,  6.02it/s]


Set 2 - total training files: 1280, val: 64
copy shards to /home/wk247/data/enwiki_books_128_20_ver3/set2_500


100%|██████████| 500/500 [03:04<00:00,  2.70it/s]
100%|██████████| 64/64 [00:05<00:00, 10.72it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step1_1200


100%|██████████| 300/300 [02:06<00:00,  2.38it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step2_1880


100%|██████████| 470/470 [02:44<00:00,  2.85it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/finetune


100%|██████████| 10/10 [00:02<00:00,  4.60it/s]


Set 3 - total training files: 1280, val: 64
copy shards to /home/wk247/data/enwiki_books_128_20_ver3/set3_500


100%|██████████| 500/500 [02:40<00:00,  3.12it/s]
100%|██████████| 64/64 [00:04<00:00, 13.51it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step1_1200


100%|██████████| 300/300 [02:00<00:00,  2.50it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/step2_1880


100%|██████████| 470/470 [02:53<00:00,  2.70it/s]


copy shards to /home/wk247/data/enwiki_books_128_20_ver3/finetune


100%|██████████| 10/10 [00:04<00:00,  2.41it/s]


In [35]:
# Scratch arithmetic: 470 equals step2_count and 4 equals the number of sets
# looped over, so this is presumably the shard total behind the "step2_1880"
# directory name — TODO confirm, or delete this cell before sharing.
470*4

1880