# Inference examples

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import torch.nn.functional as F

import pandas as pd
from huggingface_hub import hf_hub_download

from IPython.display import Markdown, HTML

## The whole pipeline

In [None]:
from whisperspeech.pipeline import Pipeline

  def backtrace(trace: np.ndarray):


In [None]:
# Build the end-to-end pipeline object with its default arguments.
pipe = Pipeline()

In [None]:
# Prompt for the demo; the leading and trailing newlines are part of the string.
demo_text = """
I am Whisper Speech, a new open-source text to speech model. We choose to train a text
to speech system this year not because this is easy,
but because this is hard, because that goal will serve to organize and measure
the best of our energies and skills, because that challenge is one that we are willing
to accept, one we are unwilling to postpone, and one we intend to win!
"""

# Hand the prompt to the pipeline together with the output file name.
pipe.generate_to_file('generated-audio.wav', demo_text)

## Semantic token quantization model

In [None]:
from whisperspeech.extract_stoks import RQBottleneckTransformer

In [None]:
# Load the quantization model with load_model()'s defaults, then call .cuda()
# on it (this cell therefore needs a CUDA device).
vqmodel = RQBottleneckTransformer.load_model()
vqmodel = vqmodel.cuda()

In [None]:
# Fetch the feather file from the 'collabora/whisperspeech' dataset repo via
# hf_hub_download, then read the returned path into a DataFrame.
stoks_txt_path = hf_hub_download(
    'collabora/whisperspeech',
    'stoks-txt/whisperspeech-librilight-stoks-txt-small-medium-A.feather',
    repo_type='dataset',
)
stoks_txt = pd.read_feather(stoks_txt_path)

In [None]:
# Show the first row's stored text ('txt_tiny') next to the text decoded from
# that row's 'stoks' column by the quantization model.
first_row = stoks_txt.iloc[0]
original_txt = first_row['txt_tiny']
quantized_txt = vqmodel.decode_text(first_row['stoks'])[0].text
display(Markdown(f"""
**Original:** {original_txt}

**Quantized:** {quantized_txt}
"""))


**Original:**  Preface 2. Murder in the Gunroom. This is a Libervox recording. All Libervox recordings are in the public domain. For more information or to volunteer, please visit Libervox.org. Recording by Anthony Wilson.

**Quantized:** 2. Murder in the Gunroom This is a Libra Vox recording. All Libra Vox recordings are in the public domain. For more information or to volunteer, please visit LibraVox.org. Recording by Anthony Wilson.


## Text to semantic model

In [None]:
from whisperspeech.t2s_up import TSARTransformer

  def backtrace(trace: np.ndarray):


In [None]:
# Load the text-to-semantic model from a local checkpoint file, then call
# .cuda() on it (this cell therefore needs a CUDA device).
t2s = TSARTransformer.load_model(local_filename='nbs/t2s_up.model')
t2s = t2s.cuda()

In [None]:
# Prompt for the text-to-semantic model. The literal is wrapped over several
# lines for readability; each line break is replaced with a single space so the
# words on either side are not fused together (e.g. "easy," + "but").
text = """We choose to train an Open Source text to speech system this year, not because this is easy,
but because this is hard! Because that goal will serve to organize and measure
the best of our energies and skills! Because that challenge is one that we are willing
to accept! one we are unwilling to postpone! and one we intend to win!""".replace('\n', ' ')
# Run the text-to-semantic model on the prompt; the result is kept in `stoks`
# for the later cells. T=.7 is passed straight to generate()
# (presumably the sampling temperature — TODO confirm).
stoks = t2s.generate(text, T=.7)

In [None]:
# Decode the generated tokens (moved to the CPU first) with the quantization
# model and show the text of the first result as the cell output.
decoded = vqmodel.decode_text(stoks.cpu())
decoded[0].text

'We choose to train an open source text to speak system this year, not because this is easy, but because this is hard. Because that goal will serve to organize and measure the best of our energies and skills, because that challenges one that we are willing to accept,'

## Semantic to acoustic model (multispeaker)

In [None]:
from whisperspeech.s2a_delar_mup import SADelARTransformer
from whisperspeech.a2wav import Vocoder

In [None]:
# Load the semantic-to-acoustic model from a local checkpoint file, then call
# .cuda() on it (this cell therefore needs a CUDA device).
model = SADelARTransformer.load_model(local_filename='nbs/s2a_up.model')
model = model.cuda()

# Vocoder with default arguments; used below to write acoustic tokens to files.
vocoder = Vocoder()

In [None]:
# Slow cell: key/query caching has not been implemented in the decoder yet.
# For each speaker entry, generate acoustic tokens from `stoks` and write them
# to a file named after that speaker.
speakers = ["4078", "2156", "3645", "6454"]
for spk in speakers:
    atoks = model.generate(stoks, [spk], T=.6, top_k=512)
    out_path = f"generated-audio-S{spk}.wav"
    vocoder.decode_to_file(out_path, atoks)

In [None]:
# Slow cell: key/query caching has not been implemented in the decoder yet.
# Same procedure as for the other speakers, but with T=.5 and no top_k argument.
more_speakers = ["8713", "3157"]
for spk in more_speakers:
    atoks = model.generate(stoks, [spk], T=.5)
    out_path = f"generated-audio-S{spk}.wav"
    vocoder.decode_to_file(out_path, atoks)