In [1]:
# Refresh the apt package index, then (only if that succeeds) install git-lfs,
# festival, espeak-ng and mbrola system-wide.
!apt-get update && apt-get install -y git-lfs festival espeak-ng mbrola
# The repository clone is left disabled; the following cells `%cd HierSpeech_TTS`,
# so a checkout is expected to already exist under that directory name.
# NOTE(review): cloning this URL without an explicit target directory would
# presumably create `HierSpeech_TTS-hf`, not `HierSpeech_TTS` — confirm before enabling.
# !git-lfs clone https://github.com/camenduru/HierSpeech_TTS-hf

Hit:1 http://archive.ubuntu.com/ubuntu focal InRelease
Get:2 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]        
Get:3 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]      
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease [1581 B]
Get:5 https://deb.nodesource.com/node_16.x focal InRelease [4583 B]            
Get:6 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]      
Get:7 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease [18.1 kB]
Get:8 http://archive.ubuntu.com/ubuntu focal-updates/multiverse amd64 Packages [32.0 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal-updates/restricted amd64 Packages [3279 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1444 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3761 kB]
Get:12 http://archive.ubuntu.com/ubuntu focal-backports/universe amd64 Packages [28.6 kB]
Get:1

## Go into the app.py file and edit the `.launch()` call on line 237 to pass `share=True`, e.g. `.launch(share=True)`. This will make any Gradio app from Hugging Face work when run from this notebook.

In [2]:
%cd HierSpeech_TTS
!pip install -r requirements.txt
!pip install gradio 
!pip install utils

/notebooks/HierSpeech_TTS
Collecting AMFM_decompy==1.0.11
  Downloading AMFM_decompy-1.0.11.tar.gz (751 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m751.5/751.5 kB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Cython==3.0.3
  Downloading Cython-3.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting einops==0.7.0
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib==1.3.2
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matplotlib==3.8.1
  Downloading

In [5]:
import os
import torch
import argparse
import numpy as np
from scipy.io.wavfile import write
import torchaudio
import utils
from Mels_preprocess import MelSpectrogramFixed
from hierspeechpp_speechsynthesizer import SynthesizerTrn
from ttv_v1.text import text_to_sequence
from ttv_v1.t2w2v_transformer import SynthesizerTrn as Text2W2V
from speechsr24k.speechsr import SynthesizerTrn as AudioSR
from speechsr48k.speechsr import SynthesizerTrn as AudioSR48
from denoiser.generator import MPNet
from denoiser.infer import denoise
from hierspeechpp_speechsynthesizer import (
    SynthesizerTrn
)


In [7]:
def tts(text, a, hierspeech):
    """Synthesize ``text`` in the voice of a prompt recording and write it as a WAV file.

    The prompt audio at ``a.input_prompt`` is loaded, reduced to its first channel,
    resampled to 16 kHz if needed, optionally denoised, and turned into a mel
    spectrogram. That prompt conditions the text-to-w2v model and the speech
    synthesizer. The result is optionally passed through the super-resolution
    model and written to ``<a.output_dir>/<prompt file stem>.wav`` as int16 PCM.

    Args:
        text: Text to speak; converted with ``str()`` before tokenization.
        a: Options object. The attributes read here are ``output_dir``,
            ``input_prompt``, ``scale_norm``, ``denoise_ratio``,
            ``noise_scale_ttv``, ``noise_scale_vc`` and ``output_sr``.
        hierspeech: 5-tuple ``(net_g, text2w2v, audiosr, denoiser, mel_fn)``.

    Returns:
        None. The only result is the WAV file written to disk.

    Notes:
        * Tensors are moved with ``.cuda()``, so a CUDA device is required.
        * Relies on ``add_blank_token`` and ``hps_denoiser`` being defined at
          module level; neither is defined in this cell.
        * ``audiosr`` is applied for both 24 kHz and 48 kHz output.
          NOTE(review): presumably the caller passes the super-resolution model
          matching ``a.output_sr`` — confirm at the call site.
    """
    net_g, text2w2v, audiosr, denoiser, mel_fn = hierspeech

    os.makedirs(a.output_dir, exist_ok=True)
    text = text_to_sequence(str(text), ["english_cleaners2"])
    token = add_blank_token(text).unsqueeze(0).cuda()
    token_length = torch.LongTensor([token.size(-1)]).cuda()

    # Prompt load
    audio, sample_rate = torchaudio.load(a.input_prompt)

    # Only a single channel is supported: keep the first one.
    audio = audio[:1, :]
    # Resampling
    if sample_rate != 16000:
        audio = torchaudio.functional.resample(audio, sample_rate, 16000, resampling_method="kaiser_window")
    if a.scale_norm == 'prompt':
        # Remember the prompt's peak so the output can be scaled to the same level.
        prompt_audio_max = torch.max(audio.abs())

    # We utilize a hop size of 320 but denoiser uses a hop size of 400 so we utilize a hop size of 1600
    ori_prompt_len = audio.shape[-1]
    p = (ori_prompt_len // 1600 + 1) * 1600 - ori_prompt_len
    audio = torch.nn.functional.pad(audio, (0, p), mode='constant').data

    file_name = os.path.splitext(os.path.basename(a.input_prompt))[0]

    # If you have a memory issue while denoising the prompt, try to denoise the prompt on CPU before TTS.
    # A more memory-efficient denoiser is planned as a replacement.
    if a.denoise_ratio == 0:
        # No denoising requested: the second row is just a copy of the prompt.
        audio = torch.cat([audio.cuda(), audio.cuda()], dim=0)
    else:
        with torch.no_grad():
            denoised_audio = denoise(audio.squeeze(0).cuda(), denoiser, hps_denoiser)
        audio = torch.cat([audio.cuda(), denoised_audio[:, :audio.shape[-1]]], dim=0)

    # 20231108: a large amount of padding was found to hurt performance, so the
    # padding is removed again after denoising.
    audio = audio[:, :ori_prompt_len]

    src_mel = mel_fn(audio.cuda())

    # Build the length tensor on the mel's own device instead of relying on a
    # module-level `device` name that this cell does not define.
    src_length = torch.LongTensor([src_mel.size(2)]).to(src_mel.device)
    src_length2 = torch.cat([src_length, src_length], dim=0)

    ## TTV (Text --> W2V, F0)
    with torch.no_grad():
        w2v_x, pitch = text2w2v.infer_noise_control(token, token_length, src_mel, src_length2, noise_scale=a.noise_scale_ttv, denoise_ratio=a.denoise_ratio)

        src_length = torch.LongTensor([w2v_x.size(2)]).cuda()

        ## Pitch Clipping
        pitch[pitch < torch.log(torch.tensor([55]).cuda())] = 0

        ## Hierarchical Speech Synthesizer (W2V, F0 --> 16k Audio)
        converted_audio = \
            net_g.voice_conversion_noise_control(w2v_x, src_length, src_mel, src_length2, pitch, noise_scale=a.noise_scale_vc, denoise_ratio=a.denoise_ratio)

        ## SpeechSR (Optional) (16k Audio --> 24k or 48k Audio)
        # `a.output_sr == 48000 or 24000` was always truthy (the bare 24000 is
        # truthy), so super-resolution ran even when 16 kHz output was requested
        # and the result was then written with a 16000 Hz header.
        if a.output_sr in (48000, 24000):
            converted_audio = audiosr(converted_audio)

    converted_audio = converted_audio.squeeze()

    # Peak-normalize to the int16 range, either to the prompt's peak level or
    # to just below full scale.
    if a.scale_norm == 'prompt':
        converted_audio = converted_audio / (torch.abs(converted_audio).max()) * 32767.0 * prompt_audio_max
    else:
        converted_audio = converted_audio / (torch.abs(converted_audio).max()) * 32767.0 * 0.999

    converted_audio = converted_audio.cpu().numpy().astype('int16')

    file_name2 = "{}.wav".format(file_name)
    output_file = os.path.join(a.output_dir, file_name2)

    # Any value other than 48000 / 24000 falls back to the 16 kHz output.
    if a.output_sr == 48000:
        write(output_file, 48000, converted_audio)
    elif a.output_sr == 24000:
        write(output_file, 24000, converted_audio)
    else:
        write(output_file, 16000, converted_audio)

# Run the Gradio application

In [None]:
#run the application from the notebook
%cd HierSpeech_TTS
!python app.py

[Errno 2] No such file or directory: 'HierSpeech_TTS'
/notebooks/HierSpeech_TTS
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/notebooks/HierSpeech_TTS, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/notebooks/HierSpeech_TTS, universal_newlines=False, shell=None, istream=None)
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:jaxlib.mlir._mlir_libs:Initializing MLIR with module: _site_initialize_0
DEBUG:jaxlib.mlir._mlir_libs:Registering dialects from initializer <module 'jaxlib.mlir._mlir_libs._site_initialize_0' from '/usr/local/lib/python3.9/dist-packages/jaxlib/mlir/_mlir_libs/_site_initialize_0.so'>
DEBUG:jax._src.path:etils.epath found. Using etils.epath for file I/O.
DEBUG:matplotlib:matplotlib data path: /usr/local/lib/python3.9/dist-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIG