In [9]:
# Ignore pre-production warnings
import warnings
from typing import Any

warnings.filterwarnings('ignore')
import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
# We'll use this to listen to audio
from IPython.display import Audio

# Path to the audio sample we'll run through the pipeline.
# This is a sample from LibriSpeech Dev Clean dataset - the model hasn't seen it before
audio_sample = '../assets/engl.wav'

# Speech Recognition model -- QuartzNet
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(
    model_name="stt_en_quartznet15x5"
)

# Punctuation and Capitalization model
punctuation = nemo_nlp.models.PunctuationCapitalizationModel.from_pretrained(
    model_name="punctuation_en_distilbert"
)

# Spectrogram generator: takes text as an input and produces a spectrogram.
spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(
    model_name="tts_en_fastpitch"
)
# Inference only, so switch the module to evaluation mode.
spectrogram_generator.eval()

# Vocoder module: takes a spectrogram and produces the actual audio.
vocoder = nemo_tts.models.HifiGanModel.from_pretrained(
    model_name="tts_en_hifigan"
)
vocoder.eval()

# Convert our audio sample to text.
files = [audio_sample]
# Only one file is passed in, so only the first transcription is needed.
# Fall back to an empty string when no transcription is returned.
transcriptions = quartznet.transcribe(paths2audio_files=files)
raw_text = transcriptions[0] if transcriptions else ''

# Add capitalization and punctuation. The model is given a list of queries;
# we read back the entry for our single query.
res = punctuation.add_punctuation_capitalization(queries=[raw_text])
text = res[0]
print(f"\nRaw recognized text: {raw_text}. \nText with capitalization and punctuation: {text}")

# Helper function which chains the TTS models to go directly from text to audio
def text_to_audio(txt: str) -> Any:
    """Synthesize speech for ``txt`` using the module-level TTS models.

    The text is parsed by ``spectrogram_generator``, turned into a
    spectrogram, and then converted to audio by ``vocoder``.

    Args:
        txt: The text to speak.

    Returns:
        The vocoder output moved to the CPU, detached, and converted
        with ``.numpy()``.
    """
    tokens = spectrogram_generator.parse(txt)
    mel = spectrogram_generator.generate_spectrogram(tokens=tokens)
    waveform = vocoder.convert_spectrogram_to_audio(spec=mel)
    # Bring the result to the CPU and detach it before the NumPy conversion.
    on_cpu = waveform.to('cpu')
    return on_cpu.detach().numpy()

# Without punctuation: synthesize and play the raw recognized text.
# NOTE(review): rate=22050 presumably matches the TTS models' sample rate
# (the FastPitch train config printed in the log shows sample_rate: 22050) - confirm.
Audio(text_to_audio(raw_text), autoplay=True, rate=22050)

# Final result - with punctuation (uncomment to play the punctuated text instead)
# Audio(text_to_audio(text), autoplay=True, rate=22050)


[NeMo I 2023-11-30 11:56:49 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo.
[NeMo I 2023-11-30 11:56:49 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo
[NeMo I 2023-11-30 11:56:49 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-30 11:56:50 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data2/voices/train_1k.json
    sample_rate: 16000
    labels:
    - ' '
    - a
    - b
    - c
    - d
    - e
    - f
    - g
    - h
    - i
    - j
    - k
    - l
    - m
    - 'n'
    - o
    - p
    - q
    - r
    - s
    - t
    - u
    - v
    - w
    - x
    - 'y'
    - z
    - ''''
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: /asr_set_1.2/train/train_{0..1023}.tar
    num_workers: 20
    
[NeMo W 2023-11-30 11:56:50 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
   

[NeMo I 2023-11-30 11:56:50 features:289] PADDING: 16
[NeMo I 2023-11-30 11:56:51 save_restore_connector:249] Model EncDecCTCModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/stt_en_quartznet15x5/16661021d16e679bdfd97a2a03944c49/stt_en_quartznet15x5.nemo.
[NeMo I 2023-11-30 11:56:51 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/punctuation_en_distilbert/6bdea9786c4395fbbe02e4143d2e1cee/punctuation_en_distilbert.nemo.
[NeMo I 2023-11-30 11:56:51 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/punctuation_en_distilbert/6bdea9786c4395fbbe02e4143d2e1cee/punctuation_en_distilbert.nemo
[NeMo I 2023-11-30 11:56:51 common:913] Instantiating model from pre-trained checkpoint
[NeMo I 2023-11-30 11:56:53 tokenizer_utils:130] Getting HuggingFace AutoTokenizer with pretrained_model_name: distilbert-base-uncased, vocab_file: /tmp/tmp4rgvge34/tokenizer.vocab_file, merges_files: None, special_tokens_dict: {}, and use_fast: F

Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.
[NeMo W 2023-11-30 11:56:53 modelPT:251] You tried to register an artifact under config key=tokenizer.vocab_file but an artifact for it has already been registered.
[NeMo W 2023-11-30 11:56:53 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_audio: false
    audio_file: null
    sample_rate: 16000
    use_bucketing: true
    batch_size: 32
    preload_audios: true
    use_tarred_dataset: false
    label_info_save_dir: null
    text_file: text_train.txt
    labels_file: labels_train.txt
    tokens_in_batch: null
    max_seq_length: 128
    num_samples: -1
    use_cache: true
    cache_dir: null
    get_label_frequences: false
    verbose: true
    n_jobs: 0
    tar_metadata_file: null
    tar_shuffle_n: 1
    shard_strategy: scatter
    shuffle: true

[NeMo I 2023-11-30 11:56:57 save_restore_connector:249] Model PunctuationCapitalizationModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/punctuation_en_distilbert/6bdea9786c4395fbbe02e4143d2e1cee/punctuation_en_distilbert.nemo.
[NeMo I 2023-11-30 11:56:57 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
[NeMo I 2023-11-30 11:56:57 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo
[NeMo I 2023-11-30 11:56:57 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-30 11:57:28 en_us_arpabet:66] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2023-11-30 11:57:28 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/LJSpeech/nvidia_ljspeech_train_clean_ngc.json
      sample_rate: 22050
      sup_data_path: /raid/LJSpeech/supplementary
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null
      

[NeMo I 2023-11-30 11:57:28 features:289] PADDING: 1
[NeMo I 2023-11-30 11:57:29 save_restore_connector:249] Model FastPitchModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
[NeMo I 2023-11-30 11:57:29 cloud:58] Found existing object /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2023-11-30 11:57:29 cloud:64] Re-using file from: /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2023-11-30 11:57:29 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-11-30 11:57:36 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-11-30 11:57:36 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-11-30 11:57:37 features:289] PADDING: 0


[NeMo W 2023-11-30 11:57:37 features:266] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-11-30 11:57:37 features:289] PADDING: 0
[NeMo I 2023-11-30 11:57:38 save_restore_connector:249] Model HifiGanModel was successfully restored from /home/gena/.cache/torch/NeMo/NeMo_1.21.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-11-30 11:57:39 punctuation_capitalization_model:1167] Using batch size 1 for inference
[NeMo I 2023-11-30 11:57:39 punctuation_capitalization_infer_dataset:127] Max length: 28
[NeMo I 2023-11-30 11:57:39 data_preprocessing:404] Some stats of the lengths of the sequences:
[NeMo I 2023-11-30 11:57:39 data_preprocessing:406] Min: 26 |                  Max: 26 |                  Mean: 26.0 |                  Median: 26.0
[NeMo I 2023-11-30 11:57:39 data_preprocessing:412] 75 percentile: 26.00
[NeMo I 2023-11-30 11:57:39 data_preprocessing:413] 99 percentile: 26.00


100%|██████████| 1/1 [00:00<00:00, 18.11batch/s]


Raw recognized text: well i don't wish to see it any more observed phoebe turning away her eyes it is certainly very likt the old portrait. 
Text with capitalization and punctuation: Well, I don't wish to see it any more, observed Phoebe, turning away her eyes. It is certainly very likt the old portrait.



