In [None]:
#| default_exp split_out_val_datasets

In [None]:
#| exporti
import os
import webdataset as wds
from pathlib import Path
from fastprogress import progress_bar
from fastcore.script import call_parse
import numpy as np
import random
from collections import Counter
from whisperspeech import utils

In [None]:
# Open a single shard as a WebDataset so the structure of one sample can be inspected interactively.
ds = wds.WebDataset(utils.shard_glob('../wolnelektury-wds2/wolnelektury-eqvad-000000.tar.gz'))

In [None]:
# Pull just the first sample out of the dataset and stop iterating.
for s in ds:
    break
# Show which fields a sample carries.
s.keys()

dict_keys(['__key__', '__url__', 'spk_emb.npy', 'vad.npy'])

In [None]:
#| exporti
@call_parse
def split_dataset(
    shard_spec:str,  # shard path/pattern of the source dataset, expanded with `utils.shard_glob`
    splits:str,      # whitespace-separated paths of files listing the sample keys of each split
):
    """Extract the listed samples from a sharded WebDataset into one tar file per split.

    Every entry of `splits` is read with `utils.readlines`; each line is treated as
    the `__key__` of a sample that belongs to that split. All shards matching
    `shard_spec` are scanned and the matching samples are written, sorted by
    `__key__`, to `<parent of shard_spec>/<basename of split>.tar.gz`.

    Exits with status 1 (after writing whatever was found) if some of the listed
    keys were not present in the shards.
    """
    shards = utils.shard_glob(shard_spec)
    splits = splits.split()

    # One buffer per split; `needles` maps each wanted sample key to the buffer
    # of the split it belongs to (a key listed in several splits goes to the last one).
    bufs = {k:[] for k in splits}
    needles = {k:bufs[split] for split in splits for k in utils.readlines(split)}

    print(f"Generating splits: {' '.join(bufs.keys())}")
    print(f"Looking for {len(needles)} samples...")

    ds = wds.WebDataset(shards).compose(
        wds.select(lambda x: x['__key__'] in needles),
    )

    dl = wds.WebLoader(ds, num_workers=32, batch_size=None)

    for s in progress_bar(dl, total='noinfer'):
        # pop() instead of index + del: a key that shows up twice in the stream must
        # not raise KeyError. (Presumably the loader workers filter against their own
        # copy of `needles`, so removals done here do not stop duplicates -- TODO confirm.)
        buf = needles.pop(s['__key__'], None)
        if buf is not None:
            buf.append(s)

    out_dir = Path(shard_spec).parent
    for split,buf in bufs.items():
        # The writer has to be closed so the tar/gzip trailer is flushed; the `with`
        # block guarantees that even if writing a sample fails.
        with wds.TarWriter(str(out_dir/(Path(split).name+".tar.gz"))) as out:
            for s in sorted(buf, key=lambda x: x['__key__']):
                out.write(s)

    # Whatever is left in `needles` was requested but never seen in the shards.
    if len(needles) > 0:
        print(f"Missed {len(needles)} samples!")
        # `os.exit` does not exist; SystemExit is what `sys.exit` raises.
        raise SystemExit(1)

In [None]:
# Debugging aid: drop into pdb automatically on an uncaught exception.
# The explicit `on` keeps the cell idempotent; a bare `%pdb` toggles the setting on every run.
%pdb on

In [None]:
# Extract the samples whose keys are listed in the `validation-eqvad` file from the
# eqvad-stoks shards; the result goes to `../wolnelektury-wds2/validation-eqvad.tar.gz`.
# NOTE(review): the recorded output below (a shard list and a dict repr) is not printed by
# the visible `split_dataset` -- presumably left over from an earlier version; confirm.
split_dataset('../wolnelektury-wds2/wolnelektury-eqvad-stoks-*.tar.gz', '../wolnelektury-wds2/validation-eqvad')

['../wolnelektury-wds2/wolnelektury-eqvad-stoks-000014.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000008.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000010.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000004.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000011.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000003.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000002.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000007.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000013.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000005.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000009.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000000.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000006.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000012.tar.gz', '../wolnelektury-wds2/wolnelektury-eqvad-stoks-000001.tar.gz']
{'../wolnelektury-wds2/validation-eqvad': <webdataset.

In [None]:
# Run the split on the s2a shards, using `s2a-dim64-ttsr-valfix` as the split specification.
# NOTE(review): the recorded output below reports `train-*.tar.gz` shards being written inside
# `s2a-dim64-ttsr-valfix/`, which the visible `split_dataset` does not do (it writes a single
# `<split>.tar.gz`) -- presumably stale output from an earlier version; confirm before relying on it.
split_dataset('whisperspeech-s2a-512c-tts-r/*.tar.gz', 's2a-dim64-ttsr-valfix')

# writing s2a-dim64-ttsr-valfix/train-000000.tar.gz 0 0.0 GB 00350 00:13<00:00]
# writing s2a-dim64-ttsr-valfix/train-000001.tar.gz 400 0.0 GB 4000<04:44]]
# writing s2a-dim64-ttsr-valfix/train-000002.tar.gz 400 0.0 GB 8000<02:47]
# writing s2a-dim64-ttsr-valfix/train-000003.tar.gz 400 0.0 GB 12000<02:12]
# writing s2a-dim64-ttsr-valfix/train-000004.tar.gz 400 0.0 GB 16001<02:01]
# writing s2a-dim64-ttsr-valfix/train-000005.tar.gz 400 0.0 GB 20001<01:45]
# writing s2a-dim64-ttsr-valfix/train-000006.tar.gz 400 0.0 GB 24001<01:39]
# writing s2a-dim64-ttsr-valfix/train-000007.tar.gz 400 0.0 GB 28001<01:35]
# writing s2a-dim64-ttsr-valfix/train-000008.tar.gz 400 0.0 GB 32001<01:32]
# writing s2a-dim64-ttsr-valfix/train-000009.tar.gz 400 0.0 GB 36001<01:26]
# writing s2a-dim64-ttsr-valfix/train-000010.tar.gz 400 0.0 GB 40002<01:24]
# writing s2a-dim64-ttsr-valfix/train-000011.tar.gz 400 0.0 GB 44002<01:22]
# writing s2a-dim64-ttsr-valfix/train-000012.tar.gz 400 0.0 GB 48002<01:21]
# writing

# writing s2a-dim64-ttsr-valfix/train-000211.tar.gz 400 0.0 GB 8440036<00:32]
# writing s2a-dim64-ttsr-valfix/train-000212.tar.gz 400 0.0 GB 8480036<00:32]
# writing s2a-dim64-ttsr-valfix/train-000213.tar.gz 400 0.0 GB 8520036<00:31]
# writing s2a-dim64-ttsr-valfix/train-000214.tar.gz 400 0.0 GB 8560036<00:31]
# writing s2a-dim64-ttsr-valfix/train-000215.tar.gz 400 0.0 GB 86000
# writing s2a-dim64-ttsr-valfix/train-000216.tar.gz 400 0.0 GB 8640036<00:31]
# writing s2a-dim64-ttsr-valfix/train-000217.tar.gz 400 0.0 GB 8680036<00:31]
# writing s2a-dim64-ttsr-valfix/train-000218.tar.gz 400 0.0 GB 8720037<00:30]
# writing s2a-dim64-ttsr-valfix/train-000219.tar.gz 400 0.0 GB 8760037<00:30]
# writing s2a-dim64-ttsr-valfix/train-000220.tar.gz 400 0.0 GB 8800037<00:30]
# writing s2a-dim64-ttsr-valfix/train-000221.tar.gz 400 0.0 GB 88400
# writing s2a-dim64-ttsr-valfix/train-000222.tar.gz 400 0.0 GB 8880037<00:30]
# writing s2a-dim64-ttsr-valfix/train-000223.tar.gz 400 0.0 GB 8920037<00:29]
# wr