In [2]:
import pandas as pd
import tqdm

In [3]:
def _filter_by_list(entries, filter_list, filter_mode):
    """Keep or drop ``(file, root)`` entries according to a text list file.

    Each non-blank line of ``filter_list`` is a '/'-separated path suffix.
    The number of components is taken from one of the lines, and every file
    is keyed by that many trailing path components. If several files share
    a key, the last one discovered wins.
    """
    if filter_mode not in ('include', 'discard'):
        raise ValueError("Unrecognized filter_mode {}".format(filter_mode))

    with open(filter_list, 'r') as f:
        listed = {line for line in f.read().splitlines() if line.strip()}
    if not listed:
        # Nothing listed: 'include' keeps nothing, 'discard' removes nothing.
        return [] if filter_mode == 'include' else entries

    n_parts = len(next(iter(listed)).split('/'))
    by_key = {
        '/'.join(path.parts[-n_parts:]): (path, root) for path, root in entries
    }

    if filter_mode == 'include':
        missing = sorted(listed - by_key.keys())
        if missing:
            raise KeyError(
                f'{len(missing)} entries of {filter_list} match no audio file, '
                f'e.g. {missing[0]}'
            )
        # Iterate the dict (discovery order) rather than the set so that the
        # resulting row order is deterministic.
        return [entry for key, entry in by_key.items() if key in listed]
    return [entry for key, entry in by_key.items() if key not in listed]


def _load_partitions(root, partition_lists):
    """Read partition list files located under ``root``.

    Returns ``(mapping, remainder)``: ``mapping`` goes from each line of a
    list file to its partition name, ``remainder`` is the partition name
    whose list file is ``None`` (the last such one), or ``None``.
    """
    partition_of = {}
    remainder = None
    for name, list_file in partition_lists.items():
        if list_file is None:
            remainder = name
            continue
        with open(Path(root, list_file), 'r') as f:
            for line in f.read().splitlines():
                partition_of[line] = name
    return partition_of, remainder


def read_audiodir(
    dataset_path,
    subsample=None,
    dataset=None,
    regex_groups=None,
    filter_list=None,
    partition_lists=None,
    filter_mode='include',
):
    """Scan directories for audio files and return their metadata as a DataFrame.

    Every ``*.wav`` and ``*.flac`` file found recursively under
    ``dataset_path`` is probed with ``sf.info`` (``sf`` is presumably the
    ``soundfile`` module -- confirm against the file's imports). Files that
    cannot be read are reported with ``print`` and skipped.

    Args:
        dataset_path: A root directory, or a list of root directories.
        subsample: If given, number of files drawn without replacement via
            ``np.random.choice`` before reading. A value larger than the
            number of files raises ``ValueError`` (from NumPy).
        dataset: If given, stored in a ``dataset`` column on every row.
        regex_groups: If given, a pattern with named groups matched (with
            ``re.match`` semantics) against each file's path relative to the
            root it was found under; the named groups become columns. Files
            that do not match are reported and skipped.
        filter_list: If given, path of a text file listing one '/'-separated
            path suffix per line, used to select files (see ``filter_mode``).
        partition_lists: If given, a mapping ``partition name -> list file``.
            List files are resolved relative to the first root and contain
            one relative path per line. A name mapped to ``None`` is assigned
            to every file not present in any list.
        filter_mode: ``'include'`` keeps only the listed files, ``'discard'``
            removes them.

    Returns:
        A DataFrame with columns ``filename`` (resolved absolute path),
        ``sr``, ``channels``, ``frames``, ``duration``, the regex group
        columns, and optionally ``dataset``. When ``partition_lists`` is
        ``None`` a ``rel_path`` column is included; otherwise it is replaced
        by a ``partition`` column. An empty DataFrame with the base columns
        is returned when no file could be read.

    Raises:
        ValueError: If ``filter_mode`` is not recognised.
        KeyError: If ``filter_mode='include'`` and a listed entry matches no
            discovered file.
    """
    # The file-level ``import tqdm`` binds the module, which is not callable;
    # fetch the progress-bar class here. The bar is purely cosmetic, so fall
    # back to plain iteration if the package is unavailable.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    if not isinstance(dataset_path, list):
        dataset_path = [dataset_path]

    # Remember the root each file was found under: relative paths are
    # computed against it, which is always valid (unlike relating a resolved
    # absolute filename to a possibly relative or symlinked first root).
    entries = []
    for root in dataset_path:
        root = Path(root)
        for pattern in ('*.wav', '*.flac'):
            entries.extend((path, root) for path in root.rglob(pattern))

    if filter_list is not None:
        entries = _filter_by_list(entries, filter_list, filter_mode)

    if subsample is not None:
        chosen = np.random.choice(
            np.arange(len(entries)), size=subsample, replace=False
        )
        entries = [entries[i] for i in chosen]

    matcher = re.compile(regex_groups) if regex_groups is not None else None

    rows = []
    rel_paths = []
    for path, root in progress(entries):
        rel_path = str(path.relative_to(root))
        try:
            finfo = sf.info(path)
            metadata = {
                'filename': str(path.resolve()),
                'sr': finfo.samplerate,
                'channels': finfo.channels,
                'frames': finfo.frames,
                'duration': finfo.duration,
            }
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f'Failed reading {path}. {e}')
            continue

        if matcher is not None:
            match = matcher.match(rel_path)
            if match is None:
                print(f'Failed reading {path}. Path does not match regex_groups')
                continue
            metadata.update(match.groupdict())

        rows.append(metadata)
        rel_paths.append(rel_path)

    base_columns = ['filename', 'sr', 'channels', 'frames', 'duration']
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=base_columns)
    if dataset is not None:
        df['dataset'] = dataset
    df['rel_path'] = rel_paths

    if partition_lists is not None:
        partition_of, remainder = _load_partitions(
            dataset_path[0], partition_lists
        )
        df['partition'] = [partition_of.get(p, remainder) for p in rel_paths]
        df = df.drop('rel_path', axis=1)
    return df