In [20]:
# Prepare the PyTorch environment for running Bark TTS on Apple Silicon.
# This cell only configures the MPS fallback and reports MPS availability;
# the Bark model itself is loaded in a later cell.
import os

# Enable MPS fallback for compatibility.
# This is set BEFORE torch is imported (directly, or indirectly through
# transformers): PyTorch is understood to read this variable when the library
# is loaded, so assigning it after `import torch` may have no effect.
# NOTE(review): if torch was already imported earlier in this kernel session,
# restart the kernel so the setting is picked up.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

import torch
from transformers import pipeline

print(f"MPS available: {torch.backends.mps.is_available()}")
print("Loading Bark TTS on Apple Silicon GPU...")

MPS available: True
Loading Bark TTS on Apple Silicon GPU...


In [21]:
# Instantiate the Bark processor and model from the small Suno checkpoint.
# No `.to(...)` call is made in this cell, so the model is left on the default
# device (CPU, per the original note about avoiding MPS dtype issues during
# generation). The status message below mentions MPS, but nothing in this cell
# moves the model to MPS.
import torch
from transformers import BarkModel, BarkProcessor

# Single source of truth for the checkpoint, so processor and model always match
BARK_CHECKPOINT = "suno/bark-small"

processor = BarkProcessor.from_pretrained(BARK_CHECKPOINT)
model = BarkModel.from_pretrained(BARK_CHECKPOINT)

print("Bark model loaded (will use MPS for forward pass, CPU for generation logic)")

Bark model loaded (will use MPS for forward pass, CPU for generation logic)


In [22]:
# Generate speech on CPU (MPS has dtype compatibility issues with Bark).
# Depends on `processor`, `model` and `torch` defined/imported by earlier cells.
# Produces `speech`, a dict with the waveform ("audio", NumPy array) and its
# "sampling_rate", which the save cell consumes.
import time

text = "Hello, this is Bark TTS. Due to MPS dtype issues, we're using CPU for now."

# Process input: turn the prompt into PyTorch tensors for the model
inputs = processor(text, return_tensors="pt")

print("Generating audio on CPU (MPS has known issues with Bark)...")

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with torch.no_grad():  # inference only; no gradients needed
    audio_array = model.generate(**inputs)
end_time = time.perf_counter()

print(f"Audio generated in {end_time - start_time:.2f} seconds on CPU")
print(f"Audio shape: {audio_array.shape}")

# Convert to numpy for saving; squeeze() drops size-1 dimensions
# (the recorded run printed a shape of [1, 181120]).
audio_np = audio_array.cpu().numpy().squeeze()

# Read the sample rate from the model's generation config instead of
# hard-coding it, so the value stays correct if the checkpoint changes.
# Fall back to the previously hard-coded 24000 Hz if the attribute is absent.
sampling_rate = getattr(model.generation_config, "sample_rate", None) or 24000

# Store for next cell
speech = {"audio": audio_np, "sampling_rate": sampling_rate}

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Generating audio on CPU (MPS has known issues with Bark)...
Audio generated in 46.40 seconds on CPU
Audio shape: torch.Size([1, 181120])


In [ ]:
# Save audio as MP3 in output directory.
# Depends on `speech` (dict with "audio" and "sampling_rate") from the
# generation cell. Writes a WAV file first, then transcodes it to MP3.
import soundfile as sf
from pydub import AudioSegment
import numpy as np
import os

# Create output directory if it doesn't exist
os.makedirs("output", exist_ok=True)

# Save as WAV first (soundfile doesn't support MP3 directly)
wav_file = "output/bark_output.wav"
mp3_file = "output/bark_output.mp3"

audio = np.asarray(speech["audio"])
if audio.size == 0:
    # np.max() on an empty array raises an opaque ValueError; fail clearly instead
    raise ValueError("speech['audio'] is empty; run the generation cell first")

# Peak-normalize so the loudest sample has magnitude 1.0 (prevents clipping).
# A silent (all-zero) signal has peak 0; dividing by it would fill the array
# with NaNs, so in that case the audio is written unchanged.
peak = float(np.max(np.abs(audio)))
audio_normalized = audio / peak if peak > 0 else audio

# Save as WAV
sf.write(wav_file, audio_normalized, speech["sampling_rate"])

# Convert WAV to MP3 using pydub.
# export() returns the opened output file object; close it explicitly so the
# handle is not left dangling in the long-lived notebook kernel.
audio_segment = AudioSegment.from_wav(wav_file)
audio_segment.export(mp3_file, format="mp3").close()

print(f"Audio saved as {mp3_file}")
# len() of an AudioSegment is its duration in milliseconds
print(f"Duration: {len(audio_segment)/1000:.2f} seconds")