diff --git a/README.md b/README.md index 5b967b6a94..71ab91121e 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,6 @@ Please leave us [your feedback](https://forms.gle/i64fowQmiVhMMC7f9) on how we c [Automatic Spelling Correction](http://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html) | [ELMo training and fine-tuning](http://docs.deeppavlov.ai/en/master/apiref/models/elmo.html) -[Speech recognition and synthesis (ASR and TTS)](http://docs.deeppavlov.ai/en/master/features/models/nemo.html) based on [NVIDIA NeMo](https://nvidia.github.io/NeMo/index.html) - [Entity Linking](http://docs.deeppavlov.ai/en/master/features/models/entity_linking.html) | [Multitask BERT](http://docs.deeppavlov.ai/en/master/features/models/multitask_bert.html) **Skills** diff --git a/deeppavlov/configs/nemo/asr.json b/deeppavlov/configs/nemo/asr.json deleted file mode 100644 index 410e0ac560..0000000000 --- a/deeppavlov/configs/nemo/asr.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chainer": { - "in": "speech", - "pipe": [ - { - "class_name": "nemo_asr", - "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", - "load_path": "{NEMO_PATH}/quartznet15x5", - "in": ["speech"], - "out": ["text"] - } - ], - "out": ["text"] - }, - "metadata": { - "variables": { - "NEMO_PATH": "~/.deeppavlov/models/nemo" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", - "subdir": "{NEMO_PATH}" - } - ] - } -} diff --git a/deeppavlov/configs/nemo/asr_tts.json b/deeppavlov/configs/nemo/asr_tts.json deleted file mode 100644 index 8ecc10c304..0000000000 --- a/deeppavlov/configs/nemo/asr_tts.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "chainer": { - "in": "speech_in_encoded", - "pipe": [ - { - "class_name": "base64_decode_bytesIO", - "in": ["speech_in_encoded"], - "out": ["speech_in"] - }, - { - "class_name": "nemo_asr", - "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", - "load_path": "{NEMO_PATH}/quartznet15x5", - "in": ["speech_in"], - "out": ["text"] - }, - { - "class_name": "nemo_tts", - "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", - "load_path": "{TTS_PATH}", - "in": ["text"], - "out": ["speech_out"] - }, - { - "class_name": "bytesIO_encode_base64", - "in": ["speech_out"], - "out": ["speech_out_encoded"] - } - ], - "out": ["text", "speech_out_encoded"] - }, - "metadata": { - "variables": { - "NEMO_PATH": "~/.deeppavlov/models/nemo", - "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", - "subdir": "{NEMO_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", - "subdir": "{NEMO_PATH}" - } - ] - } -} diff --git a/deeppavlov/configs/nemo/tts.json b/deeppavlov/configs/nemo/tts.json deleted file mode 100644 index 6cbac9a043..0000000000 --- a/deeppavlov/configs/nemo/tts.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "chainer": { - "in": ["text", "filepath"], - "pipe": [ - { - "class_name": "nemo_tts", - "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", - "load_path": "{TTS_PATH}", - "in": ["text", "filepath"], - "out": ["saved_path"] - } - ], - "out": ["saved_path"] - }, - "metadata": { - "variables": { - "NEMO_PATH": "~/.deeppavlov/models/nemo", - "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", - "subdir": "{NEMO_PATH}" - } - ] - } -} diff --git 
a/deeppavlov/core/common/base.py b/deeppavlov/core/common/base.py index 91067cae29..e18d548d05 100644 --- a/deeppavlov/core/common/base.py +++ b/deeppavlov/core/common/base.py @@ -55,21 +55,6 @@ def __init__(self, x: Optional[Union[str, list]] = None, out: Names of pipeline inference outputs. y: Names of additional inputs (targets) for pipeline training and evaluation. pipe: List of pipeline elements. - - - Example: - .. code:: python - - >>> from deeppavlov.models.nemo.asr import NeMoASR - >>> from deeppavlov import Element, Model - >>> asr = NeMoASR(nemo_params_path="~/.deeppavlov/models/nemo/quartznet15x5/quartznet15x5.yaml", - load_path="~/.deeppavlov/models/nemo/quartznet15x5") - >>> upper = lambda batch: list(map(str.upper, batch)) - >>> model = Model(x=["speech"], - out=["upper_text"], - pipe=[Element(asr, "speech", "text"), Element(upper, "text", "upper_text")]) - >>> model(["8088-284756-0037.wav"]) - ['I WALKED ALONG BRISKLY FOR PERHAPS FIVE MINUTES'] """ super().__init__(in_x=x, out_params=out, in_y=y) if pipe is not None: diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index a870e00a75..d30baf5da7 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -2,7 +2,6 @@ "UD_pymorphy_lemmatizer": "deeppavlov.models.morpho_tagger.lemmatizer:UDPymorphyLemmatizer", "api_requester": "deeppavlov.models.api_requester.api_requester:ApiRequester", "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter", - "base64_decode_bytesIO": "deeppavlov.models.nemo.common:ascii_to_bytes_io", "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator", "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader", "bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel", @@ -20,7 +19,6 @@ "bilstm_nn": "deeppavlov.models.ranking.bilstm_siamese_network:BiLSTMSiameseNetwork", "boolqa_reader": "deeppavlov.dataset_readers.boolqa_reader:BoolqaReader", "bow": "deeppavlov.models.embedders.bow_embedder:BoWEmbedder", - "bytesIO_encode_base64": "deeppavlov.models.nemo.common:bytes_io_to_ascii", "capitalization_featurizer": "deeppavlov.models.preprocessors.capitalization:CapitalizationPreprocessor", "char_splitter": "deeppavlov.models.preprocessors.char_splitter:CharSplitter", "char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor", @@ -89,8 +87,6 @@ "multi_squad_retr_iterator": "deeppavlov.dataset_iterators.squad_iterator:MultiSquadRetrIterator", "multitask_iterator": "deeppavlov.dataset_iterators.multitask_iterator:MultiTaskIterator", "multitask_reader": "deeppavlov.dataset_readers.multitask_reader:MultiTaskReader", - "nemo_asr": "deeppavlov.models.nemo.asr:NeMoASR", - "nemo_tts": "deeppavlov.models.nemo.tts:NeMoTTS", "ner": "deeppavlov.models.ner.network:NerNetwork", "ner_bio_converter": "deeppavlov.models.ner.bio:BIOMarkupRestorer", "ner_chunker": "deeppavlov.models.kbqa.entity_linking:NerChunker", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index 05b2350d75..6d092e721f 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -167,18 +167,12 @@ "static_dictionary": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], - "base64_decode_bytesIO": [ - 
"{DEEPPAVLOV_PATH}/requirements/nemo.txt" - ], "wikitionary_100K_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], "huggingface_dataset_iterator": [ "{DEEPPAVLOV_PATH}/requirements/datasets.txt" ], - "bytesIO_encode_base64": [ - "{DEEPPAVLOV_PATH}/requirements/nemo.txt" - ], "typos_custom_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], @@ -243,18 +237,6 @@ "typos_kartaslov_reader": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], - "nemo_asr": [ - "{DEEPPAVLOV_PATH}/requirements/pytorch14.txt", - "{DEEPPAVLOV_PATH}/requirements/nemo.txt", - "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt" - ], - "nemo_tts": [ - "{DEEPPAVLOV_PATH}/requirements/pytorch14.txt", - "{DEEPPAVLOV_PATH}/requirements/nemo.txt", - "{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt", - "{DEEPPAVLOV_PATH}/requirements/transformers28.txt", - "{DEEPPAVLOV_PATH}/requirements/nemo-tts.txt" - ], "spelling_error_model": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], diff --git a/deeppavlov/models/nemo/__init__.py b/deeppavlov/models/nemo/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/deeppavlov/models/nemo/asr.py b/deeppavlov/models/nemo/asr.py deleted file mode 100644 index 70527adea3..0000000000 --- a/deeppavlov/models/nemo/asr.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2020 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from io import BytesIO -from pathlib import Path -from typing import List, Optional, Tuple, Union, Dict - -import torch -from nemo.collections.asr import AudioToMelSpectrogramPreprocessor, JasperEncoder, JasperDecoderForCTC, GreedyCTCDecoder -from nemo.collections.asr.helpers import post_process_predictions -from nemo.collections.asr.parts.features import WaveformFeaturizer -from nemo.core.neural_types import AudioSignal, NeuralType, LengthsType -from nemo.utils.decorators import add_port_docs -from torch import Tensor -from torch.utils.data import Dataset, DataLoader - -from deeppavlov.core.common.registry import register -from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase - -log = logging.getLogger(__name__) - - -class AudioInferDataset(Dataset): - def __init__(self, audio_batch: List[Union[str, BytesIO]], sample_rate: int, int_values: bool, trim=False) -> None: - """Dataset reader for AudioInferDataLayer. - - Args: - audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects. - sample_rate: Audio files sample rate. - int_values: If true, load samples as 32-bit integers. - trim: Trim leading and trailing silence from an audio signal if True. - - """ - self.audio_batch = audio_batch - self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values) - self.trim = trim - - def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]: - """Processes audio batch item and extracts features. - - Args: - index: Audio batch item index. - - Returns: - features: Audio file's extracted features tensor. 
- features_length: Features length tensor. - - """ - sample = self.audio_batch[index] - features = self.featurizer.process(sample, trim=self.trim) - features_length = torch.tensor(features.shape[0]).long() - - return features, features_length - - def __len__(self) -> int: - return len(self.audio_batch) - - -class AudioInferDataLayer(CustomDataLayerBase): - """Data Layer for ASR pipeline inference.""" - - @property - @add_port_docs() - def output_ports(self) -> Dict[str, NeuralType]: - return { - "audio_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)), - "a_sig_length": NeuralType(tuple('B'), LengthsType()) - } - - def __init__(self, *, - audio_batch: List[Union[str, BytesIO]], - batch_size: int = 32, - sample_rate: int = 16000, - int_values: bool = False, - trim_silence: bool = False, - **kwargs) -> None: - """Initializes Data Loader. - - Args: - audio_batch: Batch to be read. Elements could be either paths to audio files or Binary I/O objects. - batch_size: How many samples per batch to load. - sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate if - it is not already. - int_values: If true, load data as 32-bit integers. - trim_silence: Trim leading and trailing silence from an audio signal if True. - - """ - self._sample_rate = sample_rate - - dataset = AudioInferDataset(audio_batch=audio_batch, sample_rate=sample_rate, int_values=int_values, - trim=trim_silence) - - dataloader = DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=self.seq_collate_fn) - super(AudioInferDataLayer, self).__init__(dataset, dataloader, **kwargs) - - @staticmethod - def seq_collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]]) -> Tuple[Optional[Tensor], Optional[Tensor]]: - """Collates batch of audio signal and audio length, zero pads audio signal. - - Args: - batch: A tuple of tuples of audio signals and signal lengths. This collate function assumes the signals - are 1d torch tensors (i.e. mono audio). - - Returns: - audio_signal: Zero padded audio signal tensor. - audio_length: Audio signal length tensor. - - """ - _, audio_lengths = zip(*batch) - max_audio_len = 0 - has_audio = audio_lengths[0] is not None - if has_audio: - max_audio_len = max(audio_lengths).item() - - audio_signal = [] - for sig, sig_len in batch: - if has_audio: - sig_len = sig_len.item() - if sig_len < max_audio_len: - pad = (0, max_audio_len - sig_len) - sig = torch.nn.functional.pad(sig, pad) - audio_signal.append(sig) - - if has_audio: - audio_signal = torch.stack(audio_signal) - audio_lengths = torch.stack(audio_lengths) - else: - audio_signal, audio_lengths = None, None - - return audio_signal, audio_lengths - - -@register('nemo_asr') -class NeMoASR(NeMoBase): - """ASR model on NeMo modules.""" - - def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None: - """Initializes NeuralModules for ASR. - - Args: - load_path: Path to a directory with pretrained checkpoints for JasperEncoder and JasperDecoderForCTC. - nemo_params_path: Path to a file containing labels and params for AudioToMelSpectrogramPreprocessor, - JasperEncoder, JasperDecoderForCTC and AudioInferDataLayer. 
- - """ - super(NeMoASR, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs) - - self.labels = self.nemo_params['labels'] - - self.data_preprocessor = AudioToMelSpectrogramPreprocessor( - **self.nemo_params['AudioToMelSpectrogramPreprocessor'] - ) - self.jasper_encoder = JasperEncoder(**self.nemo_params['JasperEncoder']) - self.jasper_decoder = JasperDecoderForCTC(num_classes=len(self.labels), **self.nemo_params['JasperDecoder']) - self.greedy_decoder = GreedyCTCDecoder() - self.modules_to_restore = [self.jasper_encoder, self.jasper_decoder] - - self.load() - - def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]: - """Transcripts audio batch to text. - - Args: - audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects. - - Returns: - text_batch: Batch of transcripts. - - """ - data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer']) - audio_signal, audio_signal_len = data_layer() - processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal, - length=audio_signal_len) - encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len) - log_probs = self.jasper_decoder(encoder_output=encoded) - predictions = self.greedy_decoder(log_probs=log_probs) - eval_tensors = [predictions] - tensors = self.neural_factory.infer(tensors=eval_tensors) - text_batch = post_process_predictions(tensors[0], self.labels) - - return text_batch diff --git a/deeppavlov/models/nemo/common.py b/deeppavlov/models/nemo/common.py deleted file mode 100644 index 883483c5d6..0000000000 --- a/deeppavlov/models/nemo/common.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2020 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import base64 -from io import BytesIO -from logging import getLogger -from pathlib import Path -from typing import Union - -import nemo -import torch -from nemo.backends.pytorch import DataLayerNM -from torch.utils.data import Dataset, DataLoader - -from deeppavlov.core.commands.utils import expand_path -from deeppavlov.core.common.file import read_yaml -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component -from deeppavlov.core.models.serializable import Serializable - -log = getLogger(__name__) - - -@register('base64_decode_bytesIO') -def ascii_to_bytes_io(batch: Union[str, list]) -> Union[BytesIO, list]: - """Recursively searches for strings in the input batch and converts them into the base64-encoded bytes wrapped in - Binary I/O objects. - - Args: - batch: A string or an iterable container with strings at some level of nesting. - - Returns: - The same structure where all strings are converted into the base64-encoded bytes wrapped in Binary I/O objects. 
- - """ - if isinstance(batch, str): - return BytesIO(base64.decodebytes(batch.encode())) - - return list(map(ascii_to_bytes_io, batch)) - - -@register('bytesIO_encode_base64') -def bytes_io_to_ascii(batch: Union[BytesIO, list]) -> Union[str, list]: - """Recursively searches for Binary I/O objects in the input batch and converts them into ASCII-strings. - - Args: - batch: A BinaryIO object or an iterable container with BinaryIO objects at some level of nesting. - - Returns: - The same structure where all BinaryIO objects are converted into strings. - - """ - if isinstance(batch, BytesIO): - return base64.encodebytes(batch.read()).decode('ascii') - - return list(map(bytes_io_to_ascii, batch)) - - -class NeMoBase(Component, Serializable): - """Base class for NeMo Chainer's pipeline components.""" - - def __init__(self, load_path: Union[str, Path], nemo_params_path: Union[str, Path], **kwargs) -> None: - """Initializes NeuralModuleFactory on CPU or GPU and reads nemo modules params from yaml. - - Args: - load_path: Path to a directory with pretrained checkpoints for NeMo modules. - nemo_params_path: Path to a file containig NeMo modules params. - - """ - super(NeMoBase, self).__init__(save_path=None, load_path=load_path, **kwargs) - placement = nemo.core.DeviceType.GPU if torch.cuda.is_available() else nemo.core.DeviceType.CPU - self.neural_factory = nemo.core.NeuralModuleFactory(placement=placement) - self.modules_to_restore = [] - self.nemo_params = read_yaml(expand_path(nemo_params_path)) - - def __call__(self, *args, **kwargs): - raise NotImplementedError - - def load(self) -> None: - """Loads pretrained checkpoints for modules from self.modules_to_restore list.""" - module_names = [str(module) for module in self.modules_to_restore] - checkpoints = nemo.utils.get_checkpoint_from_dir(module_names, self.load_path) - for module, checkpoint in zip(self.modules_to_restore, checkpoints): - log.info(f'Restoring {module} from {checkpoint}') - module.restore_from(checkpoint) - - def save(self, *args, **kwargs) -> None: - pass - - -class CustomDataLayerBase(DataLayerNM): - def __init__(self, dataset: Dataset, dataloader: DataLoader, **kwargs) -> None: - super(CustomDataLayerBase, self).__init__() - self._dataset = dataset - self._dataloader = dataloader - - def __len__(self) -> int: - return len(self._dataset) - - @property - def dataset(self) -> None: - return None - - @property - def data_iterator(self) -> torch.utils.data.DataLoader: - return self._dataloader diff --git a/deeppavlov/models/nemo/tts.py b/deeppavlov/models/nemo/tts.py deleted file mode 100644 index d31fa0bcfb..0000000000 --- a/deeppavlov/models/nemo/tts.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2020 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from functools import partial -from io import BytesIO -from logging import getLogger -from pathlib import Path -from typing import List, Optional, Tuple, Union, Dict - -import torch -from nemo.collections.asr.parts import collections, parsers -from nemo.collections.asr.parts.dataset import TranscriptDataset -from nemo.collections.tts import TextEmbedding, Tacotron2Encoder, Tacotron2DecoderInfer, Tacotron2Postnet -from nemo.core.neural_types import NeuralType, LabelsType, LengthsType -from nemo.utils.decorators import add_port_docs -from nemo.utils.misc import pad_to -from scipy.io import wavfile -from torch import Tensor - -from deeppavlov.core.commands.utils import expand_path -from deeppavlov.core.common.registry import register -from deeppavlov.models.nemo.common import CustomDataLayerBase, NeMoBase -from deeppavlov.models.nemo.vocoder import WaveGlow, GriffinLim - -log = getLogger(__name__) - - -class TextDataset(TranscriptDataset): - def __init__(self, - text_batch: List[str], - labels: List[str], - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - lowercase: bool = True) -> None: - """Text dataset reader for TextDataLayer. - - Args: - text_batch: Texts to be used for speech synthesis. - labels: List of string labels to use for str2int translation. - bos_id: Label position of beginning of string symbol. - eos_id: Label position of end of string symbol. - lowercase: Whether to convert all uppercase characters in a text batch into lowercase characters. - - """ - parser = parsers.make_parser(labels, do_lowercase=lowercase) - self.texts = collections.Text(text_batch, parser) - self.bos_id = bos_id - self.eos_id = eos_id - - -class TextDataLayer(CustomDataLayerBase): - @property - @add_port_docs() - def output_ports(self) -> Dict[str, NeuralType]: - return { - 'texts': NeuralType(('B', 'T'), LabelsType()), - "texts_length": NeuralType(tuple('B'), LengthsType()) - } - - def __init__(self, *, - text_batch: List[str], - labels: List[str], - batch_size: int = 32, - bos_id: Optional[int] = None, - eos_id: Optional[int] = None, - pad_id: Optional[int] = None, - **kwargs) -> None: - """A simple Neural Module for loading text data. - - Args: - text_batch: Texts to be used for speech synthesis. - labels: List of string labels to use for str2int translation. - batch_size: How many strings per batch to load. - bos_id: Label position of beginning of string symbol. If None, initialized as `len(labels)`. - eos_id: Label position of end of string symbol. If None, initialized as `len(labels) + 1`. - pad_id: Label position of pad symbol. If None, initialized as `len(labels) + 2`. - - """ - len_labels = len(labels) - if bos_id is None: - bos_id = len_labels - if eos_id is None: - eos_id = len_labels + 1 - if pad_id is None: - pad_id = len_labels + 2 - - dataset = TextDataset(text_batch=text_batch, labels=labels, bos_id=bos_id, eos_id=eos_id) - - dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, - collate_fn=partial(self._collate_fn, pad_id=pad_id)) - super(TextDataLayer, self).__init__(dataset, dataloader, **kwargs) - - @staticmethod - def _collate_fn(batch: Tuple[Tuple[Tensor], Tuple[Tensor]], pad_id: int) -> Tuple[Tensor, Tensor]: - """Collates batch of texts. - - Args: - batch: A tuple of tuples of texts and text lengths. - pad_id: Label position of pad symbol. - - Returns: - texts: Padded texts tensor. - texts_len: Text lengths tensor. 
- - """ - texts_list, texts_len = zip(*batch) - max_len = max(texts_len) - max_len = pad_to(max_len, 8) - - texts = torch.empty(len(texts_list), max_len, dtype=torch.long) - texts.fill_(pad_id) - - for i, text in enumerate(texts_list): - texts[i].narrow(0, 0, text.size(0)).copy_(text) - - if len(texts.shape) != 2: - raise ValueError(f'Texts in collate function have shape {texts.shape}, should have 2 dimensions.') - - return texts, torch.stack(texts_len) - - -@register('nemo_tts') -class NeMoTTS(NeMoBase): - """TTS model on NeMo modules.""" - def __init__(self, - load_path: Union[str, Path], - nemo_params_path: Union[str, Path], - vocoder: str = 'waveglow', - **kwargs) -> None: - """Initializes NeuralModules for TTS. - - Args: - load_path: Path to a directory with pretrained checkpoints for TextEmbedding, Tacotron2Encoder, - Tacotron2DecoderInfer, Tacotron2Postnet and, if Waveglow vocoder is selected, WaveGlowInferNM. - nemo_params_path: Path to a file containig sample_rate, labels and params for TextEmbedding, - Tacotron2Encoder, Tacotron2Decoder, Tacotron2Postnet and TranscriptDataLayer. - vocoder: Vocoder used to convert from spectrograms to audio. Available options: `waveglow` (needs pretrained - checkpoint) and `griffin-lim`. - - """ - super(NeMoTTS, self).__init__(load_path=load_path, nemo_params_path=nemo_params_path, **kwargs) - - self.sample_rate = self.nemo_params['sample_rate'] - self.text_embedding = TextEmbedding( - len(self.nemo_params['labels']) + 3, # + 3 special chars - **self.nemo_params['TextEmbedding'] - ) - self.t2_enc = Tacotron2Encoder(**self.nemo_params['Tacotron2Encoder']) - self.t2_dec = Tacotron2DecoderInfer(**self.nemo_params['Tacotron2Decoder']) - self.t2_postnet = Tacotron2Postnet(**self.nemo_params['Tacotron2Postnet']) - self.modules_to_restore = [self.text_embedding, self.t2_enc, self.t2_dec, self.t2_postnet] - - if vocoder == 'waveglow': - self.vocoder = WaveGlow(**self.nemo_params['WaveGlowNM']) - self.modules_to_restore.append(self.vocoder) - elif vocoder == 'griffin-lim': - self.vocoder = GriffinLim(**self.nemo_params['GriffinLim']) - else: - raise ValueError(f'{vocoder} vocoder is not supported.') - - self.load() - - def __call__(self, - text_batch: List[str], - path_batch: Optional[List[str]] = None) -> Union[List[BytesIO], List[str]]: - """Creates wav files or file objects with speech. - - Args: - text_batch: Text from which human audible speech should be generated. - path_batch: i-th element of `path_batch` is the path to save i-th generated speech file. If argument isn't - specified, the synthesized speech will be stored to Binary I/O objects. - - Returns: - List of Binary I/O objects with generated speech if `path_batch` was not specified, list of paths to files - with synthesized speech otherwise. 
- - """ - if path_batch is None: - path_batch = [BytesIO() for _ in text_batch] - elif len(text_batch) != len(path_batch): - raise ValueError('Text batch length differs from path batch length.') - else: - path_batch = [expand_path(path) for path in path_batch] - - data_layer = TextDataLayer(text_batch=text_batch, **self.nemo_params['TranscriptDataLayer']) - transcript, transcript_len = data_layer() - transcript_embedded = self.text_embedding(char_phone=transcript) - transcript_encoded = self.t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len) - mel_decoder, gate, alignments, mel_len = self.t2_dec(char_phone_encoded=transcript_encoded, - encoded_length=transcript_len) - mel_postnet = self.t2_postnet(mel_input=mel_decoder) - infer_tensors = [self.vocoder(mel_postnet), mel_len] - evaluated_tensors = self.neural_factory.infer(tensors=infer_tensors) - synthesized_batch = self.vocoder.get_audio(*evaluated_tensors) - - for fout, synthesized_audio in zip(path_batch, synthesized_batch): - wavfile.write(fout, self.sample_rate, synthesized_audio) - - return path_batch diff --git a/deeppavlov/models/nemo/vocoder.py b/deeppavlov/models/nemo/vocoder.py deleted file mode 100644 index 3ec918d266..0000000000 --- a/deeppavlov/models/nemo/vocoder.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2020 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from logging import getLogger -from typing import List - -import librosa -import numpy as np -from nemo.core.neural_types import NmTensor -from nemo.collections.tts import WaveGlowInferNM -from numpy import ndarray - -log = getLogger(__name__) - - -class BaseVocoder: - """Class is used to maintain consistency in the construction of the TTS pipeline based on NeMo modules.""" - - def __call__(self, tensor: NmTensor) -> NmTensor: - """Should return the tensor after the evaluation of which speech could be synthesized with `get_audio` method""" - raise NotImplementedError - - def get_audio(self, evaluated_tensor: list, mel_len: list): - """Synthesizes audio from the evaluated tensor constructed by `__call__` method.""" - raise NotImplementedError - - -class WaveGlow(BaseVocoder): - def __init__(self, *, denoiser_strength: float = 0.0, n_window_stride: int = 160, **kwargs) -> None: - """Wraps WaveGlowInferNM module. - - Args: - denoiser_strength: Denoiser strength for waveglow. - n_window_stride: Stride of window for FFT in samples used in model training. - kwargs: Named arguments for WaveGlowInferNM constructor. 
- - """ - self.waveglow = WaveGlowInferNM(**kwargs) - self.denoiser_strength = denoiser_strength - self.n_window_stride = n_window_stride - - def __call__(self, mel_postnet: NmTensor) -> NmTensor: - return self.waveglow(mel_spectrogram=mel_postnet) - - def __str__(self): - return str(self.waveglow) - - def restore_from(self, path: str) -> None: - """Wraps WaveGlowInferNM restore_from method.""" - self.waveglow.restore_from(path) - if self.denoiser_strength > 0: - log.info('Setup denoiser for WaveGlow') - self.waveglow.setup_denoiser() - - def get_audio(self, evaluated_audio: list, mel_len: list) -> List[ndarray]: - """Unpacks audio data from evaluated tensor and denoises it if `denoiser_strength` > 0.""" - audios = [] - for i, batch in enumerate(evaluated_audio): - audio = batch.cpu().numpy() - for j, sample in enumerate(audio): - sample_len = mel_len[i][j] * self.n_window_stride - sample = sample[:sample_len] - if self.denoiser_strength > 0: - sample, _ = self.waveglow.denoise(sample, strength=self.denoiser_strength) - audios.append(sample) - return audios - - -class GriffinLim(BaseVocoder): - def __init__(self, *, - sample_rate: float = 16000.0, - n_fft: int = 1024, - mag_scale: float = 2048.0, - power: float = 1.2, - n_iters: int = 50, - **kwargs) -> None: - """Uses Griffin Lim algorithm to generate speech from spectrograms. - - Args: - sample_rate: Generated audio data sample rate. - n_fft: The number of points to use for the FFT. - mag_scale: Multiplied with the linear spectrogram to avoid audio sounding muted due to mel filter - normalization. - power: The linear spectrogram is raised to this power prior to running the Griffin Lim algorithm. A power - of greater than 1 has been shown to improve audio quality. - n_iters: Number of iterations of convertion magnitude spectrograms to audio signal. 
- - """ - self.mag_scale = mag_scale - self.power = power - self.n_iters = n_iters - self.n_fft = n_fft - self.filterbank = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, **kwargs) - - def __call__(self, mel_postnet: NmTensor) -> NmTensor: - return mel_postnet - - def get_audio(self, mel_spec: list, mel_len: list) -> List[ndarray]: - audios = [] - for i, batch in enumerate(mel_spec): - log_mel = batch.cpu().numpy().transpose(0, 2, 1) - mel = np.exp(log_mel) - magnitudes = np.dot(mel, self.filterbank) * self.mag_scale - for j, sample in enumerate(magnitudes): - sample = sample[:mel_len[i][j], :] - audio = self.griffin_lim(sample.T ** self.power) - audios.append(audio) - return audios - - def griffin_lim(self, magnitudes): - """Griffin-Lim algorithm to convert magnitude spectrograms to audio signals.""" - phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape)) - complex_spec = magnitudes * phase - signal = librosa.istft(complex_spec) - - for _ in range(self.n_iters): - _, phase = librosa.magphase(librosa.stft(signal, n_fft=self.n_fft)) - complex_spec = magnitudes * phase - signal = librosa.istft(complex_spec) - return signal diff --git a/deeppavlov/requirements/nemo-asr.txt b/deeppavlov/requirements/nemo-asr.txt deleted file mode 100644 index 1a072b36b7..0000000000 --- a/deeppavlov/requirements/nemo-asr.txt +++ /dev/null @@ -1,7 +0,0 @@ -frozendict==1.2 -kaldi-io==0.9.4 -inflect==4.1.0 -unidecode==1.1.1 -librosa==0.7.2 -torch-stft==0.1.4 -numba==0.48 \ No newline at end of file diff --git a/deeppavlov/requirements/nemo-tts.txt b/deeppavlov/requirements/nemo-tts.txt deleted file mode 100644 index a0f3139b34..0000000000 --- a/deeppavlov/requirements/nemo-tts.txt +++ /dev/null @@ -1,3 +0,0 @@ -matplotlib==3.2.1 -sentencepiece==0.1.85 -youtokentome==1.0.6 \ No newline at end of file diff --git a/deeppavlov/requirements/nemo.txt b/deeppavlov/requirements/nemo.txt deleted file mode 100644 index e6f8ff402a..0000000000 --- a/deeppavlov/requirements/nemo.txt +++ /dev/null @@ -1 +0,0 @@ -nemo-toolkit==0.10.0 \ No newline at end of file diff --git a/deeppavlov/requirements/pytorch14.txt b/deeppavlov/requirements/pytorch14.txt deleted file mode 100644 index f940e921a8..0000000000 --- a/deeppavlov/requirements/pytorch14.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch==1.4.0 -torchvision==0.5.0 \ No newline at end of file diff --git a/docs/apiref/models/nemo.rst b/docs/apiref/models/nemo.rst deleted file mode 100644 index 27c2054336..0000000000 --- a/docs/apiref/models/nemo.rst +++ /dev/null @@ -1,32 +0,0 @@ -deeppavlov.models.nemo -====================== - -.. autoclass:: deeppavlov.models.nemo.asr.NeMoASR - - .. automethod:: __init__ - .. automethod:: __call__ - -.. autoclass:: deeppavlov.models.nemo.tts.NeMoTTS - - .. automethod:: __init__ - .. automethod:: __call__ - -.. autofunction:: deeppavlov.models.nemo.common.ascii_to_bytes_io - -.. autofunction:: deeppavlov.models.nemo.common.bytes_io_to_ascii - -.. autoclass:: deeppavlov.models.nemo.asr.AudioInferDataLayer - - .. automethod:: __init__ - -.. autoclass:: deeppavlov.models.nemo.tts.TextDataLayer - - .. automethod:: __init__ - -.. autoclass:: deeppavlov.models.nemo.vocoder.WaveGlow - - .. automethod:: __init__ - -.. autoclass:: deeppavlov.models.nemo.vocoder.GriffinLim - - .. 
automethod:: __init__ diff --git a/docs/conf.py b/docs/conf.py index b3a4f11237..bf2c5039b1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -190,10 +190,10 @@ # -- Extension configuration ------------------------------------------------- -autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', - 'lxml', 'nemo', 'nemo_asr', 'nemo_tts', 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', - 'russian_tagsets', 'sacremoses', 'sortedcontainers', 'spacy', 'tensorflow', 'tensorflow_hub', - 'torch', 'transformers', 'udapi', 'ufal_udpipe', 'whapi', 'xeger'] +autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', 'lxml', + 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers', + 'spacy', 'tensorflow', 'tensorflow_hub', 'torch', 'transformers', 'udapi', 'ufal_udpipe', + 'whapi', 'xeger'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/docs/features/models/nemo.rst b/docs/features/models/nemo.rst deleted file mode 100644 index bfa3bd4421..0000000000 --- a/docs/features/models/nemo.rst +++ /dev/null @@ -1,164 +0,0 @@ -Speech recognition and synthesis (ASR and TTS) -============================================== - -DeepPavlov contains models for automatic speech recognition (ASR) and text synthesis (TTS) based on pre-built modules -from `NeMo <https://nvidia.github.io/NeMo/index.html>`__ (v0.10.0), NVIDIA's toolkit for defining and building -Conversational AI applications. Named arguments for module initialization are taken from the NeMo config file (not -to be confused with the DeepPavlov config file that defines the model pipeline). - -Speech recognition ------------------- - -The ASR pipeline is based on Jasper, a CTC-based end-to-end model. The model transcribes speech samples without -any additional alignment information. :class:`~deeppavlov.models.nemo.asr.NeMoASR` contains the following modules: - -- `AudioToMelSpectrogramPreprocessor `_ - uses arguments from ``AudioToMelSpectrogramPreprocessor`` section of the NeMo config file. -- `JasperEncoder `__ - uses arguments from ``JasperEncoder`` section of the NeMo config file. Needs pretrained checkpoint. -- `JasperDecoderForCTC `__ - uses arguments from ``JasperDecoder`` section of the NeMo config file. Needs pretrained checkpoint. -- `GreedyCTCDecoder `__ - doesn't use any arguments. -- :class:`~deeppavlov.models.nemo.asr.AudioInferDataLayer` - uses arguments from ``AudioToTextDataLayer`` section of the NeMo config file. - -The NeMo config file for ASR should contain a ``labels`` argument in addition to the named arguments for the modules above. ``labels`` is -the list of characters that can be output by the ASR model, as used in model training. - -Speech synthesis ---------------- - -The TTS pipeline that creates human-audible speech from text is based on Tacotron 2 and WaveGlow models. -:class:`~deeppavlov.models.nemo.tts.NeMoTTS` contains the following modules: - -- `TextEmbedding `__ - uses arguments from ``TextEmbedding`` section of the NeMo config file. Needs pretrained checkpoint. -- `Tacotron2Encoder `__ - uses arguments from ``Tacotron2Encoder`` section of the NeMo config file. Needs pretrained checkpoint. -- `Tacotron2DecoderInfer `__ - uses arguments from ``Tacotron2Decoder`` section of the NeMo config file. Needs pretrained checkpoint. -- `Tacotron2Postnet `__ - uses arguments from ``Tacotron2Postnet`` section of the NeMo config file. Needs pretrained checkpoint. 
-- :class:`~deeppavlov.models.nemo.vocoder.WaveGlow` - uses arguments from ``WaveGlowNM`` section of the NeMo config file. Needs pretrained checkpoint. -- :class:`~deeppavlov.models.nemo.vocoder.GriffinLim` - uses arguments from ``GriffinLim`` section of the NeMo config file. -- :class:`~deeppavlov.models.nemo.tts.TextDataLayer` - uses arguments from ``TranscriptDataLayer`` section of the NeMo config file. - -The NeMo config file for TTS should contain ``labels`` and ``sample_rate`` arguments in addition to the named arguments for the modules -above. ``labels`` is a list of characters used in TTS model training. - -Audio encoding and decoding ---------------------------- - -:func:`~deeppavlov.models.nemo.common.ascii_to_bytes_io` and :func:`~deeppavlov.models.nemo.common.bytes_io_to_ascii` -were added to the library to achieve uniformity when working with both text and audio data. These components can be used to encode -binary data to an ASCII string and decode it back. - -Quick Start ----------- - -Preparation -~~~~~~~~~~~ - -Install requirements and download model files. - -.. code:: bash - - python -m deeppavlov install asr_tts - python -m deeppavlov download asr_tts - -The examples below use the `sounddevice `_ library. Install -it with ``pip install sounddevice==0.3.15``. You may need to install the ``libportaudio2`` package with -``sudo apt-get install libportaudio2`` to make ``sounddevice`` work. - -.. note:: - ASR reads and TTS generates single-channel WAV files. Files transferred to ASR are resampled to the frequency - specified in the NeMo config file (16 kHz for models from DeepPavlov configs). - -Speech recognition -~~~~~~~~~~~~~~~~~~ - -The DeepPavlov :config:`asr <nemo/asr.json>` config contains a minimal pipeline for English speech recognition using the -`QuartzNet15x5En `_ pretrained model. -To record speech on your computer and print its transcription, run the following script: - -.. code:: python - - from io import BytesIO - - import sounddevice as sd - from scipy.io.wavfile import write - - from deeppavlov import build_model, configs - - sr = 16000 - duration = 3 - - print('Recording...') - myrecording = sd.rec(duration*sr, samplerate=sr, channels=1) - sd.wait() - print('done') - - out = BytesIO() - write(out, sr, myrecording) - - model = build_model(configs.nemo.asr) - text_batch = model([out]) - - print(text_batch[0]) - -Speech synthesis -~~~~~~~~~~~~~~~~ - -The DeepPavlov :config:`tts <nemo/tts.json>` config contains a minimal pipeline for speech synthesis using the -`Tacotron2 `_ and -`WaveGlow `_ pretrained models. -To generate an audio file and save it to disk, run the following script: - -.. code:: python - - from deeppavlov import build_model, configs - - model = build_model(configs.nemo.tts) - filepath_batch = model(['Hello world'], ['~/hello_world.wav']) - - print(f'Generated speech has been successfully saved at {filepath_batch[0]}') - -Speech to speech -~~~~~~~~~~~~~~~~ - -The previous examples assume that the files with speech to recognize and the files to be generated are on the same system where -DeepPavlov is running. The DeepPavlov :config:`asr_tts <nemo/asr_tts.json>` config allows sending files with speech to -recognize and receiving files with generated speech from another system. This config recognizes received speech and -synthesizes it back. - -Run ``asr_tts`` in REST API mode: - -.. code:: bash - - python -m deeppavlov riseapi asr_tts - -The following Python script assumes that you already have a file with speech to recognize. You can use code from the speech -recognition example to record speech on your system. ``127.0.0.1`` should be replaced with the address of the system where -DeepPavlov is running. - -.. 
code:: python - - from base64 import encodebytes, decodebytes - - from requests import post - - with open('/path/to/wav/file/with/speech', 'rb') as fin: - input_speech = fin.read() - - input_ascii = encodebytes(input_speech).decode('ascii') - - resp = post('http://127.0.0.1:5000/model', json={"speech_in_encoded": [input_ascii]}) - text, generated_speech_ascii = resp.json()[0] - generated_speech = decodebytes(generated_speech_ascii.encode()) - - with open('/path/where/to/save/generated/wav/file', 'wb') as fout: - fout.write(generated_speech) - - print(f'Speech transcription is: {text}') - -.. warning:: - The NeMo library v0.10.0 doesn't allow inference on batches of more than one sample without a compatible NVIDIA GPU. - -Model training --------------- - -To get your own pre-trained checkpoints for NeMo modules, see the `Speech recognition `_ -and `Speech Synthesis `_ tutorials. A list of pre-trained models can be found -`here `_. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index d6ca9489b3..2557de9120 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,7 +35,6 @@ Welcome to DeepPavlov's documentation! Morphological Tagger Named Entity Recognition Neural Ranking - Speech recognition and synthesis Spelling Correction Syntactic Parser TF-IDF Ranking diff --git a/tests/test_configs/nemo/tts2asr_test.json b/tests/test_configs/nemo/tts2asr_test.json deleted file mode 100644 index fbc46bd6f0..0000000000 --- a/tests/test_configs/nemo/tts2asr_test.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "chainer": { - "in": ["text"], - "pipe": [ - { - "class_name": "nemo_tts", - "nemo_params_path": "{TTS_PATH}/tacotron2_waveglow.yaml", - "load_path": "{TTS_PATH}", - "in": ["text"], - "out": ["speech"] - }, - { - "class_name": "bytesIO_encode_base64", - "in": ["speech"], - "out": ["ascii"] - }, - { - "class_name": "base64_decode_bytesIO", - "in": ["ascii"], - "out": ["speech_restored"] - }, - { - "class_name": "nemo_asr", - "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml", - "load_path": "{NEMO_PATH}/quartznet15x5", - "in": ["speech_restored"], - "out": ["transcription"] - } - ], - "out": ["transcription"] - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "NEMO_PATH": "{ROOT_PATH}/models/nemo", - "TTS_PATH": "{NEMO_PATH}/tacotron2_waveglow" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/tacotron2_waveglow.tar.gz", - "subdir": "{NEMO_PATH}" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz", - "subdir": "{NEMO_PATH}" - } - ] - } -} \ No newline at end of file diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 4f5315bb09..d6be223549 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -271,9 +271,6 @@ "syntax_tagger": { ("syntax/syntax_ru_syntagrus_bert.json", "syntax_ru_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("syntax/ru_syntagrus_joint_parsing.json", "syntax_ru_bert", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] - }, - "nemo": { - ("nemo/tts2asr_test.json", "nemo", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] } }
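For clients that talk to the removed ``asr_tts`` HTTP endpoint, the base64 round trip that the deleted ``base64_decode_bytesIO`` and ``bytesIO_encode_base64`` components performed can be kept client-side with the Python standard library alone. The sketch below copies the helper bodies from the deleted ``deeppavlov/models/nemo/common.py``; the sample payload bytes are a placeholder for illustration, not a playable WAV file.

.. code:: python

    import base64
    from io import BytesIO
    from typing import Union


    def ascii_to_bytes_io(batch: Union[str, list]) -> Union[BytesIO, list]:
        # Recursively decode base64 ASCII strings into Binary I/O objects.
        if isinstance(batch, str):
            return BytesIO(base64.decodebytes(batch.encode()))
        return list(map(ascii_to_bytes_io, batch))


    def bytes_io_to_ascii(batch: Union[BytesIO, list]) -> Union[str, list]:
        # Recursively encode Binary I/O objects into base64 ASCII strings.
        if isinstance(batch, BytesIO):
            return base64.encodebytes(batch.read()).decode('ascii')
        return list(map(bytes_io_to_ascii, batch))


    # Round trip: raw WAV bytes -> ASCII-safe payload -> raw bytes again.
    wav_bytes = b'RIFF$\x00\x00\x00WAVEfmt '  # placeholder bytes
    payload = bytes_io_to_ascii([BytesIO(wav_bytes)])
    restored = ascii_to_bytes_io(payload)
    assert restored[0].read() == wav_bytes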