In [None]:
import os
import torch
import scipy
from datetime import datetime
from scipy.io.wavfile import write as write_wav

from transformers import BarkModel
from transformers import AutoProcessor, AutoModel

# Show which GPU this kernel can use: device name, compute capability and
# temperature. The query is guarded so the notebook still runs top-to-bottom
# on a machine without CUDA, where the unguarded calls would raise.
cuda_info = (
    (
        torch.cuda.get_device_name(),
        torch.cuda.get_device_capability(),
        torch.cuda.temperature(),
    )
    if torch.cuda.is_available()
    else "CUDA not available"
)
cuda_info

## Config

## Working, CPU only

In [16]:
import scipy
from transformers import AutoProcessor, AutoModel

In [17]:
# Load the "suno/bark-small" checkpoint twice from the same local cache
# directory: first as a processor, then as a model.
processor, model = (
    auto_cls.from_pretrained("suno/bark-small", cache_dir="model")
    for auto_cls in (AutoProcessor, AutoModel)
)



In [19]:
import os

# Text to synthesise and the Bark voice preset to speak it with.
sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA"
voice_preset = "v2/fr_speaker_2"

# Tokenise the text and attach the preset's voice prompt as PyTorch tensors.
inputs = processor(
    text=[sequence],
    voice_preset=voice_preset,
    return_tensors="pt",
)

# Sampling is enabled, so the generated audio differs from run to run.
speech_values = model.generate(**inputs, do_sample=True)

# The wav writer does not create missing directories, so make sure the target
# folder exists before writing (mirrors what the /app/output cell does).
os.makedirs("output", exist_ok=True)
speaker_name = voice_preset.split("/")[-1]  # "v2/fr_speaker_2" -> "fr_speaker_2"
scipy.io.wavfile.write(
    f"output/bark_{speaker_name}.wav",
    rate=model.generation_config.sample_rate,
    data=speech_values.cpu().numpy().squeeze(),
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [None]:
# Recursively hand ownership of everything under output/ to UID:GID 1000:1000.
# NOTE(review): presumably the kernel runs as root (e.g. in a container) and
# 1000 is the host user who needs to read the files — confirm before reuse.
!chown -R 1000:1000 output/

## Dev, on GPU

In [1]:
import os

# Switches for the Bark library, exported as environment variables.
# NOTE(review): presumably bark reads these when it is imported, so this cell
# has to run before the `bark` imports — verify against the bark package.
offload_models, use_small_models = False, True
os.environ.update(
    SUNO_OFFLOAD_CPU=str(offload_models),
    SUNO_USE_SMALL_MODELS=str(use_small_models),
)

In [2]:
import os

from bark.generation import (
    generate_text_semantic,
    preload_models,
    models,
)
import bark.generation

from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

import torch
import time

In [3]:
# Reset the CUDA cache and the peak-memory counter, then print the allocation
# before and after loading so the cost of the models is visible.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
print("Cuda mem", torch.cuda.memory_allocated())

# The same small/large switch is applied to the text, coarse and fine models.
preload_models(
    force_reload=True,
    **dict.fromkeys(
        ("text_use_small", "coarse_use_small", "fine_use_small"),
        use_small_models,
    ),
)
print("Cuda mem", torch.cuda.memory_allocated())

Cuda mem 0




In [5]:
# Synthesise a short test phrase with the "v2/en_speaker_5" preset.
audio_array = generate_audio("madam I'm adam", history_prompt="v2/en_speaker_5", silent=True)

# Peak CUDA allocation recorded by PyTorch, printed in units of 1024 * 1024 bytes
# together with the configuration flags that produced it.
max_utilization = torch.cuda.max_memory_allocated()
print(
    f"Small models {use_small_models}, offloading to CPU: {offload_models}",
    f"\tmax memory usage = {max_utilization / 1024 / 1024:.0f}MB",
    sep="\n",
)

Small models True, offloading to CPU: False
	max memory usage = 2949MB


In [15]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# expose `scipy.io.wavfile` on every SciPy release.
import scipy.io.wavfile

# Persist the clip generated above as /app/output/bark_test.wav, creating the
# folder first because the wav writer does not create missing directories.
os.makedirs("/app/output", exist_ok=True)
scipy.io.wavfile.write(
    "/app/output/bark_test.wav",
    rate=SAMPLE_RATE,
    data=audio_array,
)

# Sample V2

In [None]:
import os

from IPython.display import Audio
import nltk  # we'll use this to split into sentences
import numpy as np

from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

In [None]:
# Sampling temperature and speaker preset shared by both generation stages.
GEN_TEMP, SPEAKER = 0.6, "v2/fr_speaker_6"
# A quarter of a second of zeros, appended after the spoken piece.
silence = np.zeros(int(0.25 * SAMPLE_RATE))

sentence = "madam I'm adam"

# Stage 1: text -> semantic tokens.
# min_eos_p controls how likely the generation is to end.
semantic_tokens = generate_text_semantic(sentence, history_prompt=SPEAKER, temp=GEN_TEMP, min_eos_p=0.05)

# Stage 2: semantic tokens -> waveform, using the same speaker preset.
audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER)

# The audio followed by a private copy of the pause.
pieces = [audio_array, silence.copy()]


# WIP

In [None]:
import os

# NOTE(review): `device` is currently unused — moving the model and the inputs
# to it is still disabled (WIP), so generation runs wherever the model loads.
device = "cuda"
# Load the weights in half precision.
model = AutoModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, cache_dir="/app/model")  # TODO: .to(device)
# TODO: try model.enable_cpu_offload()
processor = AutoProcessor.from_pretrained("suno/bark-small", cache_dir="/app/model")

voice_preset = "v2/fr_speaker_2"
sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe. Original pour un ours !"

inputs = processor(sequence, voice_preset=voice_preset)  # TODO: .to(device)

# The model runs in float16, so cast the waveform to float32 before converting
# it: otherwise the wav writer receives a half-precision array, and 16-bit
# float WAV is not a widely supported format.
audio_array = model.generate(**inputs).float().cpu().numpy().squeeze()
sample_rate = model.generation_config.sample_rate

# The wav writer does not create missing directories.
os.makedirs("output", exist_ok=True)
scipy.io.wavfile.write("output/bark_generation.wav", rate=sample_rate, data=audio_array)

In [None]:
# Display the model's generation settings next to its model config; the cell
# output is the tuple of both objects.
model.generation_config, model.config

In [None]:
# Round-trip the prompt through the processor's tokenizer to inspect how the
# text is split: keep the token strings and the ids, then display the ids
# decoded back to text.
sequence = (
    "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. "
    "Et il s'appelait Poulpe. Original pour un ours !"
)
encoded = processor.tokenizer.encode(sequence)
tokens = processor.tokenizer.tokenize(sequence)
processor.tokenizer.decode(encoded)

In [None]:
# Pull the three entries of the processor output into their own variables so
# they can be inspected individually.
input_ids, attention_mask, history_prompt = (
    inputs[key] for key in ("input_ids", "attention_mask", "history_prompt")
)

In [None]:
from datetime import datetime, timezone

# One output sub-folder per UTC day, e.g. output/2024-01-31.
# datetime.utcnow() is deprecated; a timezone-aware "now" yields the same date.
output_folder = os.path.join("output", datetime.now(timezone.utc).date().isoformat())
os.makedirs(output_folder, exist_ok=True)

sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe. Original pour un ours !"
# Ten placeholder slots each; this cell does not fill them.
attention_masks = [None] * 10
history_prompts = [None] * 10

# NOTE(review): only preset 0 is generated; widen the range to render more
# of the v2/fr_speaker_<n> voices.
for preset in range(0, 1):
    print("generate preset", preset)
    inputs = processor(
        text=sequence,
        voice_preset=f"v2/fr_speaker_{preset}",
        return_tensors="pt",
    )

    print("inputs :", inputs.keys())

    # Keep the individual entries of the last processed preset around for
    # inspection after the loop.
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    history_prompt = inputs["history_prompt"]

    # Sampling is enabled, so each run produces different audio.
    speech_values = model.generate(**inputs, do_sample=True)

    scipy.io.wavfile.write(
        os.path.join(output_folder, f"bark_fr{preset}.wav"),
        rate=model.generation_config.sample_rate,
        # `model` may have been loaded in float16; cast to float32 so the wav
        # writer never receives a half-precision array.
        data=speech_values.float().cpu().numpy().squeeze(),
    )

In [None]:
# Recursively hand ownership of everything under output/ to UID:GID 1000:1000.
# NOTE(review): presumably needed because the kernel writes the wav files as a
# different user (e.g. root in a container) — confirm before reuse.
!chown -R 1000:1000 output/