In [1]:
# Import NeMo and its ASR, NLP and TTS collections
from typing import Any

import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts

#We will use this to listen to audio
from IPython.display import Audio

# Show the names of every pretrained CTC-based ASR checkpoint. The list is printed
# explicitly because a bare expression in the middle of a cell is evaluated and
# then silently discarded (only a cell's last expression is displayed).
print([info.pretrained_model_name for info in nemo_asr.models.EncDecCTCModel.list_available_models()])
# More ASR Models are available - see: nemo_asr.models.ASRModel.list_available_models()

# Speech Recognition model - Russian QuartzNet 15x5 CTC checkpoint
# (its character labels are the Russian alphabet plus the space character).
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="stt_ru_quartznet15x5")

# Neural Machine Translation model - Russian to English
nmt_model = nemo_nlp.models.MTEncDecModel.from_pretrained(model_name='nmt_ru_en_transformer6x6')

# Spectrogram generator which takes text as an input and produces spectrogram
spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_en_fastpitch")
# Set the model to evaluation mode
spectrogram_generator.eval()

# Vocoder model which takes spectrogram and produces actual audio
vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_en_hifigan")
# Set the model to evaluation mode
vocoder.eval()

# Audio file with Russian speech (path is relative to the notebook's working directory)
audio_sample = 'assets/common_voice_ru_1.wav'

# Speech -> Russian text; the result is a list with one transcript per input file
transcribed_text = asr_model.transcribe([audio_sample])
print(transcribed_text)

# Russian text -> English text; the list of transcripts is translated element-wise
english_text = nmt_model.translate(transcribed_text)
print(english_text)

# A helper function which combines FastPitch and HifiGan to go directly from
# text to audio

def text_to_audio(text: str) -> Any:
  """Synthesize speech for ``text`` and return the waveform as a NumPy array.

  The text is tokenized by the module-level ``spectrogram_generator``, turned
  into a spectrogram by the same model, and then converted to audio by the
  module-level ``vocoder``.

  Args:
    text: The sentence to synthesize.

  Returns:
    The vocoder output moved to the CPU and converted with ``.numpy()``.
  """
  # Local import: the notebook does not import torch at the top level.
  import torch

  # This is inference only, so disable autograd: no graph is built and the
  # intermediate activations are not kept in memory.
  with torch.no_grad():
    parsed: Any = spectrogram_generator.parse(text)
    spectrogram: Any = spectrogram_generator.generate_spectrogram(tokens=parsed)
    audio: Any = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
  return audio.detach().cpu().numpy()


# Synthesize the first English translation and play it inline.
# 22050 is the playback sample rate passed to the audio widget.
Audio(
  data=text_to_audio(english_text[0]),
  rate=22050,
  autoplay=True,
)

[NeMo I 2023-11-29 16:08:43 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_ru_quartznet15x5/92506570b7206ea395e295b3fbbf07e3/stt_ru_quartznet15x5.nemo.
[NeMo I 2023-11-29 16:08:43 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_ru_quartznet15x5/92506570b7206ea395e295b3fbbf07e3/stt_ru_quartznet15x5.nemo
[NeMo I 2023-11-29 16:08:43 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-29 16:08:44 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /raid/noneval.json
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ё
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    num_workers: 8
    pin_memory: true
    
[NeMo W 2023-11-29 16:08:44 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation da

[NeMo I 2023-11-29 16:08:44 features:289] PADDING: 16
[NeMo I 2023-11-29 16:08:45 save_restore_connector:249] Model EncDecCTCModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_ru_quartznet15x5/92506570b7206ea395e295b3fbbf07e3/stt_ru_quartznet15x5.nemo.
[NeMo I 2023-11-29 16:08:45 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/nmt_ru_en_transformer6x6/3db82426b17db1ae7cc7ae4ee3e3679b/nmt_ru_en_transformer6x6.nemo.
[NeMo I 2023-11-29 16:08:45 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/nmt_ru_en_transformer6x6/3db82426b17db1ae7cc7ae4ee3e3679b/nmt_ru_en_transformer6x6.nemo
[NeMo I 2023-11-29 16:08:45 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-11-29 16:08:50 tokenizer_utils:179] Getting YouTokenToMeTokenizer with model: /tmp/tmpn9k026w8/tokenizer.all.32000.BPE.model with r2l: False.
[NeMo I 2023-11-29 16:08:50 tokenizer_utils:179] Getting YouTokenToMeTokenizer with model: /tmp/

[NeMo W 2023-11-29 16:08:50 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    src_file_name: /home/sandeepsub/Datasets/wmt/wmt20_en_zh/processed/batches.tokens.cmwt.septokenizer.16000.pkl
    tgt_file_name: /home/sandeepsub/Datasets/wmt/wmt20_en_zh/processed/batches.tokens.cmwt.septokenizer.16000.pkl
    tokens_in_batch: 16000
    clean: true
    max_seq_length: 512
    cache_ids: false
    cache_data_per_node: false
    use_cache: false
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
    load_from_cached_dataset: true
    reverse_lang_direction: true
    
[NeMo W 2023-11-29 16:08:50 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the valid

[NeMo I 2023-11-29 16:08:52 save_restore_connector:249] Model MTEncDecModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/nmt_ru_en_transformer6x6/3db82426b17db1ae7cc7ae4ee3e3679b/nmt_ru_en_transformer6x6.nemo.
[NeMo I 2023-11-29 16:08:52 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
[NeMo I 2023-11-29 16:08:53 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo
[NeMo I 2023-11-29 16:08:53 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-29 16:09:15 en_us_arpabet:66] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2023-11-29 16:09:15 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/LJSpeech/nvidia_ljspeech_train_clean_ngc.json
      sample_rate: 22050
      sup_data_path: /raid/LJSpeech/supplementary
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      

[NeMo I 2023-11-29 16:09:15 features:289] PADDING: 1
[NeMo I 2023-11-29 16:09:16 save_restore_connector:249] Model FastPitchModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
[NeMo I 2023-11-29 16:09:16 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2023-11-29 16:09:16 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2023-11-29 16:09:16 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-29 16:09:18 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-11-29 16:09:18 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-11-29 16:09:18 features:289] PADDING: 0


[NeMo W 2023-11-29 16:09:18 features:266] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-11-29 16:09:18 features:289] PADDING: 0


    


[NeMo I 2023-11-29 16:09:18 save_restore_connector:249] Model HifiGanModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

['я предлагаю пока оставить этот пункт до его прояснения']
['I suggest that this paragraph be left for the time being until it is clarified .']
