https://huggingface.co/parler-tts/parler_tts_mini_v0.1

In [1]:
def install_lib(libname):
    print(f">>> {libname}")
    get_ipython().system(f"pip install -qqq {libname}")
    

In [2]:
%%time

libs = ["bitsandbytes", "transformers", "peft", "accelerate", "datasets",\
        "trl", "flash_attn", "huggingface_hub", "absl-py",
        "git+https://github.com/huggingface/parler-tts.git",
        "protobuf==3.20.3"
       ]

for lib in libs:
    install_lib(lib)

>>> bitsandbytes
>>> transformers
>>> peft
>>> accelerate
>>> datasets
>>> trl
>>> flash_attn
>>> huggingface_hub
>>> absl-py
>>> git+https://github.com/huggingface/parler-tts.git
>>> protobuf==3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0mCPU times: user 1.24 s, sys: 261 ms, total: 1.5 s
Wall time: 1min 14s


In [3]:
import accelerate
import bitsandbytes
import datasets
import peft
import torch
import trl

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("torch version:", torch.__version__)
print("bitsandbytes version:", bitsandbytes.__version__)
print("peft version:", peft.__version__)
print("accelerate version:", accelerate.__version__)
print("datasets version:", datasets.__version__)
print("trl version:", trl.__version__)
print(f"Device name: '{torch.cuda.get_device_name()}'")
print("Device:", device)
print(
    f"Device properties: '{torch.cuda.get_device_properties(torch.cuda.current_device())}'"
)
print(
    "Suporta bfloat16." if torch.cuda.is_bf16_supported() else "Não suporta bfloat16."
)
get_ipython().system("nvidia-smi")



torch version: 2.3.1
bitsandbytes version: 0.43.3
peft version: 0.12.0
accelerate version: 0.34.2
datasets version: 2.21.0
trl version: 0.10.1
Device name: 'NVIDIA GeForce RTX 4060 Ti'
Device: cuda
Device properties: '_CudaDeviceProperties(name='NVIDIA GeForce RTX 4060 Ti', major=8, minor=9, total_memory=16059MB, multi_processor_count=34)'
Suporta bfloat16.
Sun Dec 22 22:18:19 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     Off |   00000000:0

# Mini

In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

In [11]:
import IPython

filename = "parler_tts_out.wav"

prompt = (
    "You have no need to light a night light on a light night like tonight."
)
description = (
    "Laura's voice is not monotone yet slightly fast in delivery, "
    "with a very close recording that almost has no background noise. "
)

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
sf.write(filename, audio_arr, model.config.sampling_rate)
IPython.display.Audio(filename, autoplay=True)

# Large


In [1]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import IPython


filename = "parler_tts_out.wav"

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

prompt = "Hey, how are you doing today?"
description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
sf.write(filename, audio_arr, model.config.sampling_rate)
IPython.display.Audio(filename, autoplay=True)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


# SUNO

In [8]:
from transformers import pipeline
import scipy

# synthesiser = pipeline("text-to-speech", "suno/bark")

# speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"do_sample": True})

# scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])


In [9]:
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

# inputs = processor(
#     text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
#     return_tensors="pt",
# )

# speech_values = model.generate(**inputs, do_sample=True)


  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [15]:
%%time
import IPython
from IPython.display import Audio

inputs = processor(
    text=["Ola, mãe. Tudo bem?"],
    return_tensors="pt",
)

speech_values = model.generate(**inputs, do_sample=True)

sampling_rate = model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate, autoplay=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


CPU times: user 2min 41s, sys: 90 ms, total: 2min 41s
Wall time: 20.4 s


In [None]:
%%time
import IPython
from IPython.display import Audio

inputs = processor(
    text=["Ola pessoal, tudo joia?"],
    return_tensors="pt",
)

speech_values = model.generate(**inputs, do_sample=True)

sampling_rate = model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate, autoplay=True)
