In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from tts.datasets.ljspeech_dataset import LJSpeechDataset
from torch.utils.data import DataLoader
from tts.collate_fn.collate import collate_fn

# Build the LJSpeech dataset from precomputed mel spectrograms ('mels') and
# alignments ('alignments').
# NOTE(review): train_size=1.0 presumably keeps the whole corpus in the
# training split -- confirm against LJSpeechDataset.
dataset = LJSpeechDataset(
    dataset_path='data/LJSpeech-1.1',
    is_train=True,
    train_size=1.0,
    text_cleaners=['english_cleaners'],
    mel_spec_path='mels',
    alignment_path='alignments',
    sr=16000,
)

# Batches of three utterances, assembled by the project's collate function.
train_loader = DataLoader(
    dataset,
    batch_size=3,
    collate_fn=collate_fn,
)

In [3]:
# Pull the first batch from the loader; the cells below inspect it.
batch = next(iter(train_loader))

In [4]:
# Dimensions of the collated mel-target tensor.
batch['mel_target'].shape

torch.Size([3, 80, 833])

In [5]:
from tts.model.fastspeech1 import FastSpeechV1
from tts.text.symbols import symbols

# Instantiate FastSpeechV1 with a small configuration; the vocabulary size is
# taken from the project's symbol table.
model = FastSpeechV1(
    max_len=5000,
    vocab_size=len(symbols),
    pad_idx=0,
    n_blocks=3,
    n_heads=2,
    fft_kernel=3,
    lr_kernel=3,
    embed_dim=32,
    n_mels=80,
)



In [6]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# Move the three tensor entries of the batch, then the model, onto `device`.
# The remaining batch entries are left where they are.
for tensor_key in ('text', 'duration', 'mel_target'):
    batch[tensor_key] = batch[tensor_key].to(device)
model = model.to(device)

In [8]:
# Alignment file paths carried along with this batch.
batch['alignment_path']

['alignments/0.npy', 'alignments/1.npy', 'alignments/2.npy']

In [9]:
import numpy as np

# Print the sum of each stored alignment array, one value per line, so it can
# be compared with the per-item duration sums of the batch.
for alignment_file in ('alignments/0.npy', 'alignments/1.npy', 'alignments/2.npy'):
    print(np.load(alignment_file).sum())

832
164
833


In [10]:
# Sum the durations along the last dimension (one total per batch item); these
# should agree with the sums of the corresponding alignment files.
batch['duration'].sum(dim=-1)

tensor([832, 164, 833], device='cuda:0')

In [11]:
# Forward pass: every entry of the collated batch is passed to the model as a
# keyword argument; the result is indexed by name below (e.g. o['pred_mel']).
o=model(**batch)

torch.Size([3, 155, 32])
torch.Size([3, 155])
tensor([[ 1,  4,  7,  4,  6, 18,  4,  9, 18,  4,  7,  3,  0,  3,  0,  7,  0, 16,
          3,  4,  8,  0, 14, 12,  7,  2,  5,  0,  5,  6,  0,  4,  0,  6,  0,  8,
          4,  2,  0,  7,  9,  0, 10,  0,  8,  0,  6,  3,  0,  7,  4,  8,  7,  3,
          3,  2,  0,  7,  6,  5, 12,  9, 14,  4,  0,  6, 23, 16,  2, 10,  3,  3,
          4, 12, 11, 12,  2,  2,  4,  5,  0,  8, 15,  9,  3,  0,  7,  7,  0,  8,
         10,  5,  0,  6,  2,  5,  7,  9,  9,  5,  6,  0,  4,  0,  7,  0,  8,  8,
          6,  6,  0,  2,  2,  4,  0,  6,  8, 16,  6,  6,  7,  0, 10,  8,  5,  3,
          4,  6,  8,  2,  3,  7,  4,  6,  9,  6,  0,  0,  0,  7,  0,  8,  7,  8,
          3,  6,  9,  5,  5,  7, 19,  0,  0,  0,  0],
        [ 6,  5,  0,  4,  9,  8,  0,  2,  0,  4,  6,  4,  5, 11,  7,  6,  5,  8,
          4,  0,  8,  9,  0,  8, 15,  3,  3,  6,  9,  9,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         

In [13]:
# Dimensions of the predicted mel spectrogram returned by the model.
o['pred_mel'].shape

torch.Size([1, 80, 833])

In [14]:
# Dimensions of the duration tensor in the batch.
batch['duration'].shape

torch.Size([1, 155])

In [15]:
# Sizes of the tensor entries, in the order text / duration / mel_target.
for tensor_key in ('text', 'duration', 'mel_target'):
    print(batch[tensor_key].size())

# Total of all durations in the batch.
print(batch['duration'].sum())

# Non-tensor metadata: source file paths and the raw transcript.
for meta_key in ('mel_spec_path', 'alignment_path', 'raw_text'):
    print(batch[meta_key])

torch.Size([1, 155])
torch.Size([1, 155])
torch.Size([1, 80, 833])
tensor(833, device='cuda:0')
mels/ljspeech-mel-00003.npy
alignments/2.npy
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar proces


In [16]:
import numpy as np

# Load the stored alignment and mel arrays for one utterance straight from
# disk, bypassing the dataset and collate function.
alignment = np.load('alignments/2.npy')
mel = np.load('mels/ljspeech-mel-00003.npy')

# One value per line: alignment shape, alignment total, mel shape.
print(alignment.shape, alignment.sum(), mel.shape, sep='\n')

(155,)
833
(833, 80)


In [17]:
from tts import vocode_utils

# Load the WaveGlow vocoder and place it on the same `device` that the batch
# and the FastSpeech model were moved to in an earlier cell, instead of
# unconditionally calling .cuda() (which fails on a machine without a GPU).
# NOTE(review): the tts vocoder helpers may still assume CUDA internally --
# confirm before relying on CPU execution.
WaveGlow = vocode_utils.get_WaveGlow()
WaveGlow = WaveGlow.to(device)



In [18]:
# Display the mel-target tensor itself (the notebook abbreviates the values).
batch['mel_target']

tensor([[[-8.5495, -8.2685, -8.6421,  ..., -7.4538, -7.5008, -6.5141],
         [-7.5030, -7.4359, -7.5968,  ..., -7.1632, -7.3558, -6.5252],
         [-6.1300, -6.4027, -6.7162,  ..., -6.7940, -7.0461, -7.2345],
         ...,
         [-5.3136, -4.8807, -5.2312,  ..., -6.4740, -6.9076, -7.3200],
         [-4.8725, -4.5233, -4.9753,  ..., -6.4573, -6.8114, -7.0957],
         [-5.1428, -4.6071, -4.8014,  ..., -7.8065, -7.8546, -8.0541]]],
       device='cuda:0')

In [26]:
# Dimensions of the mel-target tensor, for comparison with the prediction.
batch['mel_target'].shape

torch.Size([1, 80, 833])

In [28]:
# Dimensions of the predicted mel spectrogram, for comparison with the target.
o['pred_mel'].shape

torch.Size([1, 819, 80])

: 

In [22]:
from tts import waveglow

# Vocode the ground-truth mel target (not the model prediction) with the
# loaded WaveGlow model.
vocoder_input = batch['mel_target']
wav = waveglow.inference.inference_audio(vocoder_input, WaveGlow)

In [25]:
import IPython.display as ipd

# Render an audio player for the vocoded waveform; the playback rate matches
# the sr=16000 used when the dataset was built.
ipd.Audio(wav, rate=16000)