
This notebook contains examples of four different types of pipelines:
* Text to Embedding
* Text to Text translation
* Speech to Embedding
* Speech to Text translation

### Setup common config

In [1]:
import torch
# Single place to choose where the models are loaded; this notebook uses the CPU.
DEVICE_NAME = "cpu"
device = torch.device(DEVICE_NAME)

### Loading Models (speech encoder + text encoder + text decoder + tokenizer)

In [3]:
from sonar.models.sonar_speech.loader import load_sonar_speech_model
from sonar.models.sonar_text import (
    load_sonar_text_decoder_model,
    load_sonar_text_encoder_model,
    load_sonar_tokenizer,
)

In [4]:
# Load the "sonar_speech_encoder_eng" checkpoint onto `device`, then call
# .eval() on the result before it is handed to the pipelines.
speech_encoder_model = load_sonar_speech_model(
    "sonar_speech_encoder_eng",
    device=device,
)
speech_encoder_model = speech_encoder_model.eval()

Using the cached checkpoint of sonar_speech_encoder_eng. Set `force` to `True` to download again.


In [6]:
# Load the "text_sonar_basic_encoder" checkpoint onto `device`, then call
# .eval() on the result before it is handed to the pipelines.
text_encoder_model = load_sonar_text_encoder_model(
    "text_sonar_basic_encoder",
    device=device,
)
text_encoder_model = text_encoder_model.eval()

Ignoring the cached checkpoint of text_sonar_basic_encoder. `force` is set to `True`.
Downloading the checkpoint of text_sonar_basic_encoder...
python(41160) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
100%|██████████| 2.85G/2.85G [01:59<00:00, 25.7MB/s] 


In [7]:
# Load the "text_sonar_basic_decoder" checkpoint onto `device`, then call
# .eval() on the result before it is handed to the pipelines.
text_decoder_model = load_sonar_text_decoder_model(
    "text_sonar_basic_decoder",
    device=device,
)
text_decoder_model = text_decoder_model.eval()

Downloading the checkpoint of text_sonar_basic_decoder...
100%|██████████| 4.21G/4.21G [01:07<00:00, 67.4MB/s]


In [8]:
# The tokenizer is already compatible with the NLLB tokenizer logic.
# It is loaded under the same name as the text encoder.
text_tokenizer = load_sonar_tokenizer(
    "text_sonar_basic_encoder",
)

Downloading the tokenizer of text_sonar_basic_encoder...
100%|██████████| 4.63M/4.63M [00:04<00:00, 1.21MB/s]


### Running Speech Data Pipelines

In [9]:
from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline, SpeechToTextModelPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Two sample recordings from the integration-test data directory.
audio_files = [
    "../tests/integration_tests/data/audio_files/audio_1.wav",
    "../tests/integration_tests/data/audio_files/audio_2.wav",
]

# Speech -> embedding: only the speech encoder is needed.
s2vec_model = SpeechToEmbeddingModelPipeline(encoder=speech_encoder_model)
s2vec_model.predict(audio_files)

tensor([[ 3.6588e-03,  1.7128e-03,  9.0594e-03,  ...,  5.7862e-05,
          4.5227e-03, -2.3103e-03],
        [ 7.0922e-04,  6.1351e-03, -2.6858e-03,  ...,  2.6961e-03,
         -6.0307e-03,  1.7454e-03]])

In [11]:
# Speech -> text: the speech encoder feeds the text decoder, and the tokenizer
# is passed alongside them.
# NOTE(review): the saved output of this cell is a DataPipelineError
# ("The map operation has failed"); the nested exception is not shown, so the
# cause cannot be determined from the notebook alone -- re-run to diagnose.
s2t_audio_files = [
    "../tests/integration_tests/data/audio_files/audio_1.wav",
    "../tests/integration_tests/data/audio_files/audio_2.wav",
]

s2t_model = SpeechToTextModelPipeline(
    encoder=speech_encoder_model,
    decoder=text_decoder_model,
    tokenizer=text_tokenizer,
)
s2t_model.predict(s2t_audio_files, target_lang="eng_Latn")

DataPipelineError: The map operation has failed. See nested exception for details.

### Running Text Data Pipelines

In [None]:
from sonar.inference_pipelines.text import TextToTextModelPipeline, TextToEmbeddingModelPipeline

: 

In [None]:
# Both text pipelines are built from the same encoder and tokenizer; the
# text-to-text pipeline additionally receives the text decoder.
text_embedding_pipeline = TextToEmbeddingModelPipeline(
    text_encoder_model,
    text_tokenizer,
)
text_to_text_pipeline = TextToTextModelPipeline(
    text_encoder_model,
    text_decoder_model,
    text_tokenizer,
)

: 

#### Applying models

In [None]:
# Path of the sample file handed to the text pipelines (relative to this notebook).
data_source = "./eng_flores200_dev_sample.tsv"

# Text -> embedding, with the input language given as "eng_Latn".
text_emb = text_embedding_pipeline.predict(
    data_source,
    source_lang="eng_Latn",
)
text_emb

: 

In [None]:
# Text -> text: same input as above, source "eng_Latn", target "fra_Latn".
text_translation = text_to_text_pipeline.predict(
    data_source,
    source_lang="eng_Latn",
    target_lang="fra_Latn",
)
text_translation

: 

: 