In [5]:
import torch, torchaudio, soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

device = "cuda" if torch.cuda.is_available() else "cpu"


In [7]:

# ----- 1. Load your WAV (British female reference) -----
ref_wav_path = "/Users/saketm10/Downloads/videoplayback (mp3cut.net).wav"   # <-- your voice sample
wav, sr = torchaudio.load(ref_wav_path)

# Ensure mono
if wav.shape[0] > 1:
    wav = torch.mean(wav, dim=0, keepdim=True)

# Resample to 16k (required by speaker encoder)
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)

# Normalize amplitude
wav = wav / (wav.abs().max() + 1e-9)



In [9]:
# ----- 2. Get 512-dim speaker embedding -----
spk_encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device}
)
with torch.no_grad():
    emb = spk_encoder.encode_batch(wav.to(device)).squeeze(0)  # shape [512]
speaker_embedding = emb.unsqueeze(0).to(device)  # shape [1, 512]



In [10]:
# ----- 3. Load SpeechT5 + vocoder -----
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)


SyntaxError: invalid syntax (1146026170.py, line 6)

In [11]:
# ----- 3. Load SpeechT5 + vocoder -----
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# ----- 4. Generate TTS -----
text = "Good evening, Saket. This is your assistant speaking with a British accent."
inputs = processor(text=text, return_tensors="pt").to(device)

with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings=speaker_embedding,
        vocoder=vocoder
    )

sf.write("british_female_tts.wav", speech.cpu().numpy(), 16000)
print("✅ Saved british_female_tts.wav")

RuntimeError: expand(torch.FloatTensor{[1, 1, 1, 512]}, size=[-1, 1, -1]): the number of sizes provided (3) must be greater or equal to the number of dimensions in the tensor (4)

In [13]:
import torch, torchaudio, soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Load reference WAV (British female)
ref_wav_path = "/Users/saketm10/Downloads/videoplayback (mp3cut.net).wav"   # <-- your voice sample
wav, sr = torchaudio.load(ref_wav_path)
if wav.shape[0] > 1:
    wav = wav.mean(dim=0, keepdim=True)
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)
wav = wav / (wav.abs().max() + 1e-9)

# 2) Speaker embedding -> [1, 512]
spk_encoder = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device}
)
with torch.no_grad():
    enc = spk_encoder.encode_batch(wav.to(device))  # e.g., [1,1,1,512] or [1,512]
enc = enc.squeeze()
if enc.dim() == 1:
    speaker_embedding = enc.unsqueeze(0)
elif enc.dim() == 2 and enc.size(0) == 1:
    speaker_embedding = enc
else:
    if enc.numel() == 512:
        speaker_embedding = enc.view(1, 512)
    else:
        while enc.dim() > 1 and enc.size(0) > 1:
            enc = enc.mean(dim=0, keepdim=False)
        speaker_embedding = enc.view(1, 512)
speaker_embedding = speaker_embedding.to(device).float()

# 3) SpeechT5 + HiFiGAN
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# 4) Synthesize
text = "Good evening, this is a British female voice generated using Speech T five."
inputs = processor(text=text, return_tensors="pt").to(device)

with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings=speaker_embedding,
        vocoder=vocoder
    )

sf.write("british_female_tts.wav", speech.cpu().numpy(), 16000)
print("Saved: british_female_tts.wav")

Saved: british_female_tts.wav
