# Find 0-byte (empty) files

In [None]:
import os
from tqdm import tqdm

# Root directory that is scanned; the loops below walk exactly three levels:
# PATH/<id>/<folder>/<file>.
PATH = 'data/voxceleb2/'

# Full paths of every file whose size on disk is 0 bytes.
problems = []

for id_name in tqdm(os.listdir(PATH)):
    id_dir = f'{PATH}{id_name}/'
    for folder_name in os.listdir(id_dir):
        folder_dir = f'{id_dir}{folder_name}/'
        # Build each file path lazily and keep only the empty ones.
        file_paths = (f'{folder_dir}{name}' for name in os.listdir(folder_dir))
        problems.extend(
            path for path in file_paths if os.stat(path).st_size == 0
        )

In [None]:
# Report how many empty files were collected in `problems`, followed by the
# distinct values of the third '/'-separated path component (index 2).
print('Problems found:', len(problems), 'Folders:', sep='\n')
print({path.split('/')[2] for path in problems})

# Slow loading (timing the data loading)

In [None]:
import os
from tqdm import tqdm

# Root directory that is scanned; layout walked is PATH/<id>/<folder>/<file>.
PATH = 'data/voxceleb2/'

# NOTE(review): `problems` is reset here but nothing is appended to it in
# this cell.
problems = []

# NOTE(review): `loadWAV` is not imported in any visible cell, so it must
# already be defined in the session for this loop to run -- confirm.
# NOTE(review): only files whose size is 0 bytes reach `loadWAV`; the size
# check looks carried over from the empty-file scan -- confirm the intent.
for id_name in tqdm(os.listdir(PATH)):
    id_dir = f'{PATH}{id_name}/'
    for folder_name in os.listdir(id_dir):
        folder_dir = f'{id_dir}{folder_name}/'
        for file_name in os.listdir(folder_dir):
            file_path = f'{folder_dir}{file_name}'
            if os.stat(file_path).st_size != 0:
                continue
            loadWAV(file_path)

In [None]:
from datasets.Sampler import Sampler
import torch
import importlib
from easydict import EasyDict

# Settings for this experiment. The whole mapping is forwarded as keyword
# arguments to both the dataset class and the sampler below.
config = EasyDict({
    "train_dataset": "VoxCeleb2",
    "train_list": "data/train_list.txt",
    "train_path": "data/voxceleb2/",
    "max_frames": 200,
    "max_epoch": 500,
    "batch_size": 400,
    "nDataLoaderThread": 5,
    "max_seg_per_spk": 500,
    "sampler": True,
    "nPerSpeaker": 2,
    "seed": 1337,

    "distributed": False,
})

# The dataset class is looked up by name: module `datasets.<name>` is
# expected to expose a class that is also called `<name>`.
dataset_module = importlib.import_module(f'datasets.{config.train_dataset}')
TrainDataset = getattr(dataset_module, config.train_dataset)

train_dataset = TrainDataset(**vars(config))

# A custom sampler is only built when the config asks for one.
if config.sampler:
    sampler = Sampler(train_dataset, **vars(config))
else:
    sampler = None

loader_options = dict(
    batch_size=config.batch_size,
    num_workers=config.nDataLoaderThread,
    sampler=sampler,
    pin_memory=False,
    # worker_init_fn is deliberately not passed (it was commented out).
    drop_last=True,
)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_options)

# Filled by the timing cell, if that cell chooses to record measurements.
times = []

In [None]:
import time

# Number of batches the loader yields per pass (not used by the loop below).
total_iterations = len(train_loader)

# Measure how long each batch takes to arrive from the loader.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring intervals, whereas the
# wall clock can jump when the system time is adjusted.
loop_start = time.perf_counter()
for x, y, f in train_loader:
    loop_time = time.perf_counter() - loop_start
    #times.append((loop_time, 0))
    print(f'Loop: {loop_time}s')
    # Restart the clock after printing so the next measurement covers only
    # the wait for the following batch.
    loop_start = time.perf_counter()

# Custom train list

In [None]:
import os
from tqdm import tqdm

# Root directory that is scanned; layout walked is PATH/<id>/<folder>/<file>.
PATH = 'data/voxceleb2/'

# Parallel lists: ids[k] is the top-level directory name of files[k], and
# files[k] is the file's path relative to PATH.
ids = []
files = []

for id_name in tqdm(os.listdir(PATH)):
    id_dir = f'{PATH}{id_name}/'
    for folder_name in os.listdir(id_dir):
        file_names = os.listdir(f'{id_dir}{folder_name}/')
        files.extend(f'{id_name}/{folder_name}/{name}' for name in file_names)
        # One id entry per file keeps the two lists aligned.
        ids.extend([id_name] * len(file_names))

In [None]:
# Destination of the generated list: one "<id> <relative path>" line per file.
LIST_FILE = 'data/train_list.txt'

with open(LIST_FILE, 'w') as out:
    out.writelines(
        f'{file_id} {rel_path}\n' for file_id, rel_path in zip(ids, files)
    )

# Custom test list

In [None]:
import os
from tqdm import tqdm

# Root directory that is scanned; layout walked is PATH/<id>/<folder>/<file>.
PATH = 'data/voxceleb1/'

# Parallel lists: existing_ids[k] is the top-level directory name of
# existing_files[k], which is the file's path relative to PATH.
existing_ids = []
existing_files = []

for id_name in tqdm(os.listdir(PATH)):
    id_dir = f'{PATH}{id_name}/'
    for folder_name in os.listdir(id_dir):
        file_names = os.listdir(f'{id_dir}{folder_name}/')
        existing_files.extend(
            f'{id_name}/{folder_name}/{name}' for name in file_names
        )
        # One id entry per file keeps the two lists aligned.
        existing_ids.extend([id_name] * len(file_names))

In [None]:
from collections import defaultdict
from random import sample

# Kept so that any other cell reading `unique_ids` still finds it; the loop
# below iterates the grouped dict instead, which has a stable first-seen
# order (iterating a set of strings can change order between runs).
unique_ids = set(existing_ids)

# Group the parallel lists once: id -> its files, in their original order.
# This replaces two full scans of `existing_ids` per id.
files_by_id = defaultdict(list)
for file_id, file_path in zip(existing_ids, existing_files):
    files_by_id[file_id].append(file_path)

total_files = len(existing_ids)

# Lines of the form "<label> <anchor file> <other file>", where label 1 means
# both files share an id and label 0 means they do not. For each id the
# positive and negative lines alternate, starting with a positive.
to_write = []

for idx, own_files in tqdm(files_by_id.items()):
    # The first file of the id is paired with each of its remaining files.
    anchor, positives = own_files[0], own_files[1:]
    if not positives:
        continue

    # Draw distinct indices over ALL files, then drop the ones belonging to
    # this id. At most len(own_files) drawn indices can be dropped, so
    # drawing len(positives) + len(own_files) always leaves enough; keeping
    # the first len(positives) survivors is a uniform sample without
    # replacement from the other ids' files. Like sampling from an explicit
    # list of other-id files, this raises ValueError when there are fewer
    # other-id files than positives.
    drawn = sample(range(total_files), len(positives) + len(own_files))
    negatives = [
        existing_files[i] for i in drawn if existing_ids[i] != idx
    ][:len(positives)]

    for positive, negative in zip(positives, negatives):
        to_write.append(f'1 {anchor} {positive}')
        to_write.append(f'0 {anchor} {negative}')

In [None]:
# Destination of the generated list: each entry of `to_write` on its own line.
LIST_FILE = 'data/test_list.txt'

with open(LIST_FILE, 'w') as out:
    out.writelines(f'{line}\n' for line in to_write)