<a href="https://colab.research.google.com/github/cris-her/sb_audio_dataset/blob/master/Audio_dataset_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.
Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, uncomment the next two lines before running this cell.
# !apt-get install sox libsndfile1 ffmpeg
# !pip install wget unidecode
# NeMo git ref to install; it is interpolated as $BRANCH into the pip command below.
BRANCH = 'v1.0.0b3'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[tts]
!pip install omegaconf



In [2]:
# Collect the sentences to synthesize: one entry per line of the text file,
# with newline characters stripped from both ends of each line.
with open('sentences2audio.txt') as my_doc:
    new_list = [line.strip('\n') for line in my_doc]

#sizes = [(len(new_list)//6),(len(new_list)//6)*2,(len(new_list)//6)*3,(len(new_list)//6)*4,(len(new_list)//6)*5]
#print(sizes)

In [3]:
from omegaconf import OmegaConf, open_dict
import torch
from nemo.collections.asr.parts import parsers
from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder

# Model names that load_spectrogram_model() and load_vocoder_model() know how to handle.
supported_spec_gen = ["tacotron2", "glow_tts"]
supported_audio_gen = ["waveglow", "squeezewave", "two_stages"]

# Selected text-to-spectrogram and spectrogram-to-audio models; the asserts
# below fail early if either selection is not one of the supported names.
spectrogram_generator = "tacotron2"
audio_generator = "waveglow"
assert spectrogram_generator in supported_spec_gen
assert audio_generator in supported_audio_gen

# Fallback audio parameters. load_spectrogram_model() overwrites each of these
# with the loaded model's config value whenever that value is truthy.
SAMPLE_RATE = 22050
NFFT = 1024
NMEL = 80
FMAX = None


def load_spectrogram_model():
    """Load the pretrained spectrogram generator named by ``spectrogram_generator``.

    Side effect: the module-level SAMPLE_RATE, NFFT, NMEL and FMAX are replaced
    by the loaded model's config values; a falsy config value leaves the
    corresponding global unchanged.

    Returns:
        The model returned by ``SpectrogramGenerator.from_pretrained``.

    Raises:
        NotImplementedError: if ``spectrogram_generator`` is neither
            "tacotron2" nor "glow_tts".
    """
    global SAMPLE_RATE, NFFT, NMEL, FMAX

    # The concrete model class is imported lazily, only for the selected generator.
    if spectrogram_generator == "glow_tts":
        from nemo.collections.tts.models import GlowTTSModel
        checkpoint_name = "GlowTTS-22050Hz"
    elif spectrogram_generator == "tacotron2":
        from nemo.collections.tts.models import Tacotron2Model
        checkpoint_name = "Tacotron2-22050Hz"
    else:
        raise NotImplementedError

    spec_model = SpectrogramGenerator.from_pretrained(checkpoint_name)
    with open_dict(spec_model._cfg):
        model_cfg = spec_model._cfg
        # Adopt the checkpoint's audio parameters so later stages can be checked against them.
        SAMPLE_RATE = model_cfg.sample_rate or SAMPLE_RATE
        NFFT = model_cfg.n_fft or NFFT
        NMEL = model_cfg.n_mels or NMEL
        FMAX = model_cfg.fmax or FMAX
    return spec_model


def validate_parameters(sample_rate, n_fft, n_mels, fmax):
    """Assert that the given audio parameters agree with the module-level ones.

    Each argument is compared with its global counterpart (SAMPLE_RATE, NFFT,
    NMEL, FMAX). A pair is skipped when either side is None.

    Raises:
        AssertionError: on the first pair whose two non-None values differ.
    """
    # The globals are only read, so no ``global`` declaration is needed. Each
    # global is looked up only when the matching argument is not None.
    if not (sample_rate is None or SAMPLE_RATE is None):
        assert sample_rate == SAMPLE_RATE
    if not (n_fft is None or NFFT is None):
        assert n_fft == NFFT
    if not (n_mels is None or NMEL is None):
        assert n_mels == NMEL
    if not (fmax is None or FMAX is None):
        assert fmax == FMAX
        
def load_vocoder_model(mel2spec="psuedo_inverse", linvocoder="griffin_lim"):
    """Load the vocoder selected by the module-level ``audio_generator``.

    For "waveglow" and "squeezewave" a pretrained checkpoint is loaded through
    ``Vocoder.from_pretrained`` and its audio parameters are checked against
    the module-level SAMPLE_RATE / NFFT / NMEL / FMAX via
    ``validate_parameters``. For "two_stages" a ``TwoStagesModel`` is built
    from those globals instead, and no parameter validation is performed.

    Args:
        mel2spec: Only used for "two_stages". "encoder_decoder" swaps in the
            pretrained ``EDMel2SpecModel``; any other value keeps the
            ``MelPsuedoInverseModel`` named in the config.
        linvocoder: Only used for "two_stages". "degli" swaps in the
            pretrained ``DegliModel``; any other value keeps the
            ``GriffinLimModel`` named in the config.

    Returns:
        The loaded vocoder model.

    Raises:
        NotImplementedError: if ``audio_generator`` is not one of
            "waveglow", "squeezewave" or "two_stages".
    """
    # ``mel2spec`` and ``linvocoder`` used to be read as undefined names, which
    # raised NameError as soon as "two_stages" was selected. They are now
    # keyword parameters whose defaults keep the stages declared in ``cfg``.
    if audio_generator == "two_stages":
        from nemo.collections.tts.models import TwoStagesModel
        cfg = {'linvocoder':  {'_target_': 'nemo.collections.tts.models.two_stages.GriffinLimModel',
                             'cfg': {'n_iters': 64, 'n_fft': NFFT, 'l_hop': 256}},
               'mel2spec': {'_target_': 'nemo.collections.tts.models.two_stages.MelPsuedoInverseModel',
                           'cfg': {'sampling_rate': SAMPLE_RATE, 'n_fft': NFFT,
                                   'mel_fmin': 0, 'mel_fmax': FMAX, 'mel_freq': NMEL}}}
        model = TwoStagesModel(cfg)

        if mel2spec == "encoder_decoder":
            from nemo.collections.tts.models.ed_mel2spec import EDMel2SpecModel
            pretrained_mel2spec_model = "EncoderDecoderMelToSpec-22050Hz"
            mel2spec_model = EDMel2SpecModel.from_pretrained(pretrained_mel2spec_model)
            model.set_mel_to_spec_model(mel2spec_model)

        if linvocoder == "degli":
            from nemo.collections.tts.models.degli import DegliModel
            pretrained_linvocoder_model = "DeepGriffinLim-22050Hz"
            linvocoder_model = DegliModel.from_pretrained(pretrained_linvocoder_model)
            model.set_linear_vocoder(linvocoder_model)

        return model

    # Single-stage vocoders: the model class is imported lazily for the
    # selected generator, then the checkpoint is loaded by name.
    if audio_generator == "waveglow":
        from nemo.collections.tts.models import WaveGlowModel
        pretrained_model = "WaveGlow-22050Hz"
    elif audio_generator == "squeezewave":
        from nemo.collections.tts.models import SqueezeWaveModel
        pretrained_model = "SqueezeWave-22050Hz"
    else:
        raise NotImplementedError

    model = Vocoder.from_pretrained(pretrained_model)
    with open_dict(model._cfg):
        # Fail fast if the vocoder's audio parameters differ from the module-level ones.
        validate_parameters(model._cfg.sample_rate, model._cfg.n_fft, model._cfg.n_mels, model._cfg.fmax)
    return model

# Order matters: load_spectrogram_model() updates the SAMPLE_RATE / NFFT / NMEL /
# FMAX globals that load_vocoder_model() then validates against (or uses to
# build the two-stage config), so the spectrogram model must be loaded first.
# Both models are moved to the GPU with .cuda(), so a CUDA device is required.
spec_gen = load_spectrogram_model().cuda()
vocoder = load_vocoder_model().cuda()


# Infer function
def infer(spec_gen_model, vocder_model, str_input):
    """Synthesize a spectrogram and audio for one piece of text.

    Args:
        spec_gen_model: Object providing ``parse(text)`` and
            ``generate_spectrogram(tokens=...)``.
        vocder_model: Object providing
            ``convert_spectrogram_to_audio(spec=...)``. The misspelled
            parameter name is kept so existing keyword callers still work.
        str_input: Text to synthesize.

    Returns:
        A ``(spectrogram, audio)`` tuple. Tensor results are moved to the CPU
        and converted to NumPy arrays; a 3-D spectrogram is reduced to its
        first element along axis 0.
    """
    # Use the models passed in. The previous body ignored both parameters and
    # always read the module-level ``spec_gen`` and ``vocoder`` instead.
    with torch.no_grad():
        parsed = spec_gen_model.parse(str_input)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed)
        audio = vocder_model.convert_spectrogram_to_audio(spec=spectrogram)
    if isinstance(spectrogram, torch.Tensor):
        spectrogram = spectrogram.to('cpu').numpy()
    if len(spectrogram.shape) == 3:
        spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


[NeMo W 2021-02-24 16:00:19 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2021-02-24 16:00:19 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2021-02-24 16:00:19 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2021-02-24 16:00:19 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2021-02-24 16:00:19 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset'> is experimental,

[NeMo I 2021-02-24 16:00:20 cloud:66] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemottsmodels/versions/1.0.0a5/files/Tacotron2-22050Hz.nemo to /root/.cache/torch/NeMo/NeMo_1.0.0b3/Tacotron2-22050Hz/59f6603364e11524c03c95920e7afaa0/Tacotron2-22050Hz.nemo
[NeMo I 2021-02-24 16:00:23 common:399] Instantiating model from pre-trained checkpoint


[NeMo W 2021-02-24 16:00:24 tacotron2:80] Your config is using an old NeMo yaml configuration. Please ensure that the yaml matches the current version in the main branch for future compatibility.
    Config key 'cls' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Use '_target_' instead of 'cls'.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    
    Field 'params' is deprecated since Hydra 1.0 and will be removed in Hydra 1.1.
    Inline the content of params directly at the containing node.
    See https://hydra.cc/docs/next/upgrades/0.11_to_1.0/object_instantiation_changes
    


[NeMo I 2021-02-24 16:00:24 features:241] PADDING: 16
[NeMo I 2021-02-24 16:00:24 features:254] STFT using conv
[NeMo I 2021-02-24 16:00:39 modelPT:257] Model Tacotron2Model was successfully restored from /root/.cache/torch/NeMo/NeMo_1.0.0b3/Tacotron2-22050Hz/59f6603364e11524c03c95920e7afaa0/Tacotron2-22050Hz.nemo.
[NeMo I 2021-02-24 16:00:39 cloud:66] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemottsmodels/versions/1.0.0a5/files/WaveGlow-22050Hz.nemo to /root/.cache/torch/NeMo/NeMo_1.0.0b3/WaveGlow-22050Hz/63a329dc3e8b44ec2e07cd4209eeab2a/WaveGlow-22050Hz.nemo
[NeMo I 2021-02-24 16:00:58 common:399] Instantiating model from pre-trained checkpoint
[NeMo I 2021-02-24 16:01:07 features:241] PADDING: 16
[NeMo I 2021-02-24 16:01:07 features:254] STFT using conv
[NeMo I 2021-02-24 16:01:12 modelPT:257] Model WaveGlowModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.0.0b3/WaveGlow-22050Hz/63a329dc3e8b44ec2e07cd4209eeab2a/WaveGlow-22050Hz.nemo.


In [5]:
import os

import numpy as np
import soundfile as sf

# sf.write() does not create missing parent directories, so make sure the
# output folder exists before the first file is written.
os.makedirs("audios", exist_ok=True)

# Files are numbered from 1, in the same order as the input sentences.
for file_number, text_to_generate in enumerate(new_list, start=1):
    spec, audio = infer(spec_gen, vocoder, text_to_generate)

    # Reshape in place to two columns and write at half the sample rate;
    # soundfile treats the second axis of a 2-D array as channels.
    # NOTE(review): this turns consecutive samples into a 2-channel frame rather
    # than duplicating the signal — confirm this is the intended output format.
    audio.resize((audio.shape[1] // 2, 2))
    sf.write(f"audios/file-{file_number}.wav", audio, SAMPLE_RATE // 2, subtype='PCM_16')

In [6]:
# Bundle every generated WAV file into a single archive.
# NOTE(review): the generation loop writes to the relative "audios/" directory, so this
# absolute path only matches when the working directory is /content — confirm when not on Colab.
!zip -r /content/my_audios.zip /content/audios

  adding: content/audios/ (stored 0%)
  adding: content/audios/file-174.wav (deflated 15%)
  adding: content/audios/file-193.wav (deflated 15%)
  adding: content/audios/file-1000.wav (deflated 18%)
  adding: content/audios/file-209.wav (deflated 16%)
  adding: content/audios/file-1019.wav (deflated 14%)
  adding: content/audios/file-1187.wav (deflated 14%)
  adding: content/audios/file-1031.wav (deflated 14%)
  adding: content/audios/file-341.wav (deflated 15%)
  adding: content/audios/file-311.wav (deflated 15%)
  adding: content/audios/file-480.wav (deflated 15%)
  adding: content/audios/file-894.wav (deflated 15%)
  adding: content/audios/file-302.wav (deflated 15%)
  adding: content/audios/file-1076.wav (deflated 14%)
  adding: content/audios/file-941.wav (deflated 14%)
  adding: content/audios/file-26.wav (deflated 16%)
  adding: content/audios/file-404.wav (deflated 15%)
  adding: content/audios/file-377.wav (deflated 14%)
  adding: content/audios/file-1166.wav (deflated 15%)
  a

In [7]:
# Hand the zipped audio archive to the browser as a download.
# NOTE(review): google.colab is presumably available only inside Colab — skip this cell when running locally.
from google.colab import files
files.download("/content/my_audios.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>