In [None]:
import os
from datetime import datetime, timezone

import scipy
import torch
from scipy.io.wavfile import write as write_wav

from transformers import BarkModel
from transformers import AutoProcessor, AutoModel

# Report the active CUDA device as (name, compute capability, temperature).
# The torch.cuda queries raise on a host without a usable CUDA device, which
# would abort this cell before the CPU-only section can run, so they are only
# evaluated when CUDA is available; otherwise the cell evaluates to None.
cuda_info = (
    (
        torch.cuda.get_device_name(),
        torch.cuda.get_device_capability(),
        torch.cuda.temperature(),
    )
    if torch.cuda.is_available()
    else None
)
cuda_info

## Config

## Working example (CPU only)

In [None]:
import scipy
from transformers import AutoProcessor, AutoModel

In [None]:
# Load the Bark processor and model from the same checkpoint, using the
# local "model" directory as the download cache.
checkpoint = "suno/bark-small"
processor = AutoProcessor.from_pretrained(checkpoint, cache_dir="model")
model = AutoModel.from_pretrained(checkpoint, cache_dir="model")

In [None]:
# Synthesize one French sentence with a fixed Bark voice preset and save it
# as output/bark_<speaker>.wav.
sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe."
voice_preset = "v2/fr_speaker_2"

inputs = processor(
    text=[sequence],
    voice_preset=voice_preset,
    return_tensors="pt",
)

# Sampling is enabled, so repeated runs are not expected to produce
# identical audio.
speech_values = model.generate(**inputs, do_sample=True)

# scipy does not create missing directories; make sure the target exists so
# the write below cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs("output", exist_ok=True)

# Last path component of the preset, e.g. "v2/fr_speaker_2" -> "fr_speaker_2".
speaker = voice_preset.split("/")[-1]
scipy.io.wavfile.write(
    f"output/bark_{speaker}.wav",
    rate=model.generation_config.sample_rate,
    data=speech_values.cpu().numpy().squeeze(),
)

In [None]:
# Recursively set the owner and group of everything under output/ to 1000:1000.
# NOTE(review): presumably the kernel runs as root (e.g. inside a container)
# and 1000 is the host user's UID/GID — confirm.
!chown -R 1000:1000 output/

## In development: GPU inference

In [None]:
# Load Bark in half precision, synthesize one sentence and save it as
# output/bark_generation.wav.
#
# NOTE(review): `device` is defined but neither the model nor the inputs are
# moved to it (the `.to(device)` calls and `model.enable_cpu_offload()` were
# disabled in the original cell), so this cell does not explicitly run on the
# GPU. Placement is left unchanged here because later cells feed the same
# `model` with inputs that are not moved to a device either.
device = "cuda"
model = AutoModel.from_pretrained(
    "suno/bark-small",
    torch_dtype=torch.float16,
    cache_dir="/app/model",
)
processor = AutoProcessor.from_pretrained("suno/bark-small", cache_dir="/app/model")

voice_preset = "v2/fr_speaker_2"
sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe. Original pour un ours !"

inputs = processor(sequence, voice_preset=voice_preset)

audio_array = model.generate(**inputs)
# The model is loaded with torch_dtype=torch.float16, so the waveform may come
# back as float16, which is not one of the sample formats scipy.io.wavfile
# documents. Cast to float32 so the file is a standard 32-bit float WAV.
audio_array = audio_array.cpu().float().numpy().squeeze()
sample_rate = model.generation_config.sample_rate

# scipy does not create missing directories.
os.makedirs("output", exist_ok=True)
scipy.io.wavfile.write("output/bark_generation.wav", rate=sample_rate, data=audio_array)

In [None]:
# Display the model's generation settings and its configuration object as the
# cell output, for inspection.
model.generation_config, model.config

In [None]:
# Round-trip the sample sentence through the processor's tokenizer: tokenize
# it, encode it, then decode the encoded form back (shown as the cell output).
sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe. Original pour un ours !"
tokens = processor.tokenizer.tokenize(sequence)  # kept for inspection; not used further in this cell
encoded = processor.tokenizer.encode(sequence)
processor.tokenizer.decode(encoded)

In [None]:
# Expose the individual fields of the most recent processor output as
# separate variables for inspection.
input_ids, attention_mask, history_prompt = (
    inputs["input_ids"],
    inputs["attention_mask"],
    inputs["history_prompt"],
)

In [None]:
# Generate one WAV file per French voice preset into a folder named after the
# current UTC date: output/<YYYY-MM-DD>/bark_fr<preset>.wav.
# datetime.utcnow() is deprecated; an aware "now" in UTC yields the same date.
output_folder = os.path.join("output", datetime.now(timezone.utc).date().isoformat())
os.makedirs(output_folder, exist_ok=True)

sequence = "Le petit Prince. Il était une fois un ours blanc qui vivait aux USA. Et il s'appelait Poulpe. Original pour un ours !"

# Placeholders sized for ten presets; nothing in this cell fills them.
attention_masks = [None] * 10
history_prompts = [None] * 10

# Only preset 0 is generated; widen the range to render more speakers.
for preset in range(0, 1):
    print("generate preset", preset)
    inputs = processor(
        text=sequence,
        voice_preset=f"v2/fr_speaker_{preset}",
        return_tensors="pt",
    )

    print("inputs :", inputs.keys())

    # Keep the individual fields of the last processed preset available for
    # inspection after the loop.
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    history_prompt = inputs["history_prompt"]

    speech_values = model.generate(**inputs, do_sample=True)

    scipy.io.wavfile.write(
        os.path.join(output_folder, f"bark_fr{preset}.wav"),
        rate=model.generation_config.sample_rate,
        # If the model was loaded in half precision the waveform may be
        # float16, which is not a sample format scipy.io.wavfile documents;
        # cast to float32 so a standard 32-bit float WAV is written.
        data=speech_values.cpu().float().numpy().squeeze(),
    )

In [None]:
# Recursively set the owner and group of everything under output/ to 1000:1000.
# NOTE(review): presumably the kernel runs as root (e.g. inside a container)
# and 1000 is the host user's UID/GID — confirm.
!chown -R 1000:1000 output/