In [1]:
# Base
import itertools
from glob import glob
from tqdm import tqdm
import math
import textgrid

# ML
import torch
import torch.nn.functional as F
from torch.utils.data import DistributedSampler, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Local
from utils.misc import dict_to_object, plot_specgram, plot_waveform
from utils.audio import spectogram, load_mono_audio
from utils.alignment import init_alignment, alignment
from voicebox.model_duration import DurationPredictor
from voicebox.tokenizer import Tokenizer

In [2]:
# Load text grid files
files = glob("datasets/vctk-aligned/**/*.TextGrid")
files = [textgrid.TextGrid.fromFile(f) for f in files]

In [4]:
# Model and tokenizer
tokenizer = Tokenizer()
model = DurationPredictor(tokenizer.n_tokens)

# Extract data
def extract_data(src):

    # Prepare
    token_duration = 0.01
    tokens = src[1]
    time = 0
    output_tokens = []
    output_durations = []

    # Iterate over tokens
    for t in tokens:

        # Resolve durations
        ends = t.maxTime
        duration = math.floor((ends - time) / token_duration)
        time = ends

        # Resolve token
        tok = t.mark
        if tok == '':
            tok = tokenizer.silence_token

        # Apply
        output_tokens.append(tok)
        output_durations.append(duration)

    # Trim start silence
    if output_tokens[0] == 'SIL' and output_durations[0] > 1:
        output_durations[0] = 1

    # Outputs
    return output_tokens, output_durations

sample_tokens, sample_durations = extract_data(files[0])
sample_tokens = tokenizer(sample_tokens)
model(sample_tokens.unsqueeze(0), torch.Tensor(sample_durations).unsqueeze(0), torch.ones(len(sample_durations)).unsqueeze(0).bool(), torch.Tensor(sample_durations).unsqueeze(0))
    


(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0]]),
 tensor(8.7147, grad_fn=<MeanBackward0>))