In [1]:
from dlcliche.notebook import *
from dlcliche.torch_utils import *

## Goal

Check if webdataset is useful for downstream datasets which are typically small.

### Preparing webdataset shards

Used `create_wds_fsd50k.py` to make tar-shards encupslating local 16kHz FSD50K files.
Resulted in making four tar files: `fsd50k-eval-16k-{000000..000003}.tar`.

### Test result

The result show that webdataset is not effective small data regime.

In [24]:
%%timeit

import webdataset  as wds
import io
import librosa

url = '/data/A/fsd50k/fsd50k-eval-16k-{000000..000003}.tar'
ds = (
    wds.WebDataset(url)
    .shuffle(1000)
    .to_tuple('wav', 'labels')
)
for i, (wav, labels) in enumerate(ds):
    wav = librosa.load(io.BytesIO(wav))
    labels = labels.decode()
    if i > 100:
        break

9.86 s ± 534 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit

import io
import librosa

def IterativeDataset(root, files, label_set):
    root = Path(root)
    for fname, labels in zip(files, label_set):
        data = librosa.load(root/fname)
        labels = labels
        yield data, labels

df = pd.read_csv('/lab/AR2021/evar/metadata/fsd50k.csv')
df = df[df.split == 'test']

for i, (binary, labels) in enumerate(IterativeDataset('work/16k/fsd50k', df.file_name.values, df.label.values)):
    wav = binary
    labels = labels
    if i > 100:
        break
#print(wav, labels)

9.06 s ± 8.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Note: create tar shard files by codes

In [33]:
def fsd50k_metadata(FSD50K_root):
    FSD = Path(FSD50K_root)
    df = pd.read_csv(FSD/f'FSD50K.ground_truth/dev.csv')
    df['key'] = df.split + '_' + df.fname.apply(lambda s: str(s))
    df['fname'] = df.fname.apply(lambda s: f'FSD50K.dev_audio/{s}.wav')
    dftest = pd.read_csv(FSD/f'FSD50K.ground_truth/eval.csv')
    dftest['key'] = 'eval_' + dftest.fname.apply(lambda s: str(s))
    dftest['split'] = 'eval'
    dftest['fname'] = dftest.fname.apply(lambda s: f'FSD50K.eval_audio/{s}.wav')
    df = pd.concat([df, dftest], ignore_index=True)
    return df


df = fsd50k_metadata(FSD50K_root='/data/A/fsd50k/')
df[:3]

Unnamed: 0,fname,labels,mids,split,key
0,FSD50K.dev_audio/64760.wav,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train,train_64760
1,FSD50K.dev_audio/16399.wav,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train,train_16399
2,FSD50K.dev_audio/16401.wav,"Electric_guitar,Guitar,Plucked_string_instrume...","/m/02sgy,/m/0342h,/m/0fx80y,/m/04szw,/m/04rlf",train,train_16401


In [56]:
import librosa


def load_resampled_mono_wav(fpath, sr):
    y, org_sr = librosa.load('/data/A/fsd50k/FSD50K.dev_audio/382455.wav', sr=None, mono=True)
    if org_sr != sr:
        y = librosa.resample(y, orig_sr=org_sr, target_sr=sr)
    return y


def fsd50k_generator(root, split, sr):
    root = Path(root)
    df = fsd50k_metadata(FSD50K_root=root)
    df = df[df.split == split]
    print(f'Processing {len(df)} {split} samples.')
    for file_name, labels, key in df[['fname', 'labels', 'key']].values:
        fpath = root/file_name
        print(fpath, labels, key)

        sample = {
            '__key__': key,
            'npy': load_resampled_mono_wav(fpath, sr),
            'labels': labels,
        }
        yield sample

gen = fsd50k_generator('/data/A/fsd50k/', 'train', 16000)
next(iter(gen))

Processing 36796 train samples.
/data/A/fsd50k/FSD50K.dev_audio/64760.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_64760


{'__key__': 'train_64760',
 'npy': array([-0.00026427, -0.00128246,  0.00068087, ..., -0.00253225,
        -0.00244647,  0.        ], dtype=float32),
 'labels': 'Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music'}

In [57]:
import webdataset as wds
from itertools import islice


source_dir = '/data/A/fsd50k/'
split = 'train'
sr = 16000
output_name = f'/data/A/fsd50k/{split}-%06d.tar'
max_count = 10000

with wds.ShardWriter(output_name, max_count) as sink:
    for sample in islice(fsd50k_generator(source_dir, split, sr), 0, 100):
        sink.write(sample)

# writing /data/A/fsd50k/train-000000.tar 0 0.0 GB 0
Processing 36796 train samples.
/data/A/fsd50k/FSD50K.dev_audio/64760.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_64760
/data/A/fsd50k/FSD50K.dev_audio/16399.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_16399
/data/A/fsd50k/FSD50K.dev_audio/16401.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_16401
/data/A/fsd50k/FSD50K.dev_audio/16402.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_16402
/data/A/fsd50k/FSD50K.dev_audio/16404.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_16404
/data/A/fsd50k/FSD50K.dev_audio/64761.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_64761
/data/A/fsd50k/FSD50K.dev_audio/268259.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_268259
/data/A/fsd50k/FS

/data/A/fsd50k/FSD50K.dev_audio/64721.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_64721
/data/A/fsd50k/FSD50K.dev_audio/64722.wav Electric_guitar,Guitar,Plucked_string_instrument,Musical_instrument,Music train_64722


KeyboardInterrupt: 

## Note: creating dataset tar archives with command-line


### Install go and tarp commands

https://github.com/webdataset/tarp

- `sudo apt install golang-go`
- `go get -v github.com/tmbdev/tarp/tarp`

### Create tar archive

- `tar --sort=name -cf your_archive.tar your_folders`
- `find your_folder - type f -print| sort | tar -cf your_archive.tar - T -'

### Shuffle and split

- `tar --sorted -cf - your_folders | tarp