Основано на другом ноутбуке: https://github.com/tugstugi/pytorch-dc-tts/blob/master/notebooks/EnglishTTS.ipynb

DeepWiki: https://deepwiki.com/tugstugi/pytorch-dc-tts

## Установка зависимостей

In [None]:
import os
from os.path import exists

project_name = "pytorch-dc-tts"
if not exists(project_name):
  ! git clone --quiet https://github.com/tugstugi/{project_name}
  ! pip install -q -r requirements.txt

## Скачивание предобученных моделей

In [3]:
# скачиваем text2mel
if not exists("ljspeech-text2mel.pth"):
  ! curl -s -L -o ljspeech-text2mel.pth https://www.dropbox.com/s/4t13ugxzzgnocbj/step-300K.pth

# скачиваем SSRN
if not exists("ljspeech-ssrn.pth"):
  ! curl -s -L -o ljspeech-ssrn.pth https://www.dropbox.com/s/gw4aqrgcvccmg0g/step-100K.pth

In [None]:
import sys
# Put the checkout directory on the module search path so that the imports
# further down in this cell (hparams, audio, models, datasets) can be resolved
# from it - presumably those are modules of the cloned project; confirm.
# The membership check keeps the cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
if project_name not in sys.path:
  sys.path.append(project_name)

import warnings
# Suppress every Python warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings that may be useful while debugging.
warnings.filterwarnings("ignore")

import numpy as np
import torch

from tqdm import *
import IPython
from IPython.display import Audio

from hparams import HParams as hp
from audio import save_to_wav
from models import Text2Mel, SSRN
from datasets.lj_speech import vocab, get_test_data

In [9]:
# Inference only: switch autograd off globally for the rest of the notebook.
torch.set_grad_enabled(False)

# NOTE(security): weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Each file is loaded as a whole object whose
# .state_dict() is then taken, so full unpickling is what this code relies on;
# only use checkpoint files from a source you trust.
#
# map_location="cpu" remaps every stored tensor to the CPU while loading, so a
# checkpoint that was saved with CUDA tensors still loads on a machine without
# a GPU. The freshly constructed models below live on the CPU either way.
text2mel = Text2Mel(vocab)
text2mel.load_state_dict(
  torch.load("ljspeech-text2mel.pth", map_location="cpu", weights_only=False).state_dict()
)
# .eval() returns the module itself; assigning it also keeps the cell from echoing the model repr.
text2mel = text2mel.eval()

ssrn = SSRN()
ssrn.load_state_dict(
  torch.load("ljspeech-ssrn.pth", map_location="cpu", weights_only=False).state_dict()
)
ssrn = ssrn.eval()

In [16]:
# Input texts for synthesis; the generation cell produces one wav file per entry, in this order.
SENTENCES = [
    "Set the world on fire!",
    "I'll do anything to get what I want",
    "Aim even higher",
    "I'll do anything to be the one",
]

In [20]:
# Generate audio for one sentence at a time.

# Index of the 'E' symbol in the vocabulary. The decoding loop stops once the
# attention lands on this symbol, i.e. it is treated as the end-of-text marker.
# It does not change between iterations, so it is looked up once.
eos_index = vocab.index('E')

for i, sentence in enumerate(SENTENCES):
  # Drop every character whose lowercase form is not in the vocabulary
  # (e.g. punctuation the vocabulary does not contain).
  normalized_sentence = "".join(c for c in sentence if c.lower() in vocab)
  print(normalized_sentence)

  sentences = [normalized_sentence]
  max_N = len(normalized_sentence)
  # Encoded text for a batch of one sentence (encoding is done by the project's get_test_data helper).
  L = torch.from_numpy(get_test_data(sentences, max_N))
  # A single all-zero frame with hp.n_mels channels: the initial decoder input,
  # and the frame re-prepended to the model output on every step.
  zeros = torch.zeros((1, hp.n_mels, 1), dtype=torch.float32)
  Y = zeros

  # Autoregressive decoding, capped at hp.max_T steps.
  for _step in range(hp.max_T):
    _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
    # Feed the output back in, shifted by one frame via the leading zero frame.
    Y = torch.cat((zeros, Y_t), -1)
    # Input position with the highest attention weight at the latest output step.
    _, attention = torch.max(A[0, :, -1], 0)
    attention = attention.item()
    if L[0, attention] == eos_index:  # attention reached the end-of-text symbol
      break

  # Second stage: SSRN turns the text2mel output into Z, which is saved as audio below
  # (presumably a full-resolution spectrogram - confirm in models.SSRN).
  _, Z = ssrn(Y)

  Z = Z.cpu().detach().numpy()

  # Write the result to "<n>.wav" (1-based) and show an inline audio player for it.
  wav_path = '%d.wav' % (i + 1)
  save_to_wav(Z[0, :, :].T, wav_path)
  IPython.display.display(Audio(wav_path, rate=hp.sr))

Set the world on fire


I'll do anything to get what I want


Aim even higher


I'll do anything to be the one
