In [None]:
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from pdfminer.high_level import extract_text
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

In [None]:
# Extract text from pdf
def extract_text_from_pdf(pdf_path):
    """Return the text content of the PDF located at ``pdf_path``.

    Thin wrapper that delegates to ``extract_text`` (imported from
    ``pdfminer.high_level`` at the top of the notebook).

    Parameters:
    pdf_path: Location of the PDF; forwarded unchanged to ``extract_text``.

    Returns:
    Whatever ``extract_text`` returns for that path.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text

# Pull the raw text out of the sample document; `text` is what gets tokenized
# and synthesized further down.
text = extract_text_from_pdf("inputs/cognita_test_lite.pdf")
print("Text extraction completed.")
# Uncomment to inspect the extracted text:
# print(text)

In [None]:
# Load the pretrained SpeechT5 text-to-speech components from the Hugging Face hub:
# the processor (turns text into model inputs), the TTS model itself, and the
# HiFi-GAN vocoder that is passed to `model.generate_speech` in the generation cell.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Tokenize the entire extracted PDF text in a single call, returning PyTorch
# tensors; `inputs["input_ids"]` is what the generation cell consumes.
# NOTE(review): no truncation is requested here, so a long PDF yields one long
# token sequence — confirm the model's input-length limit is respected downstream.
inputs = processor(text=text, return_tensors="pt")

In [None]:
# TOGGLE 1: LOAD ARCTIC DATASET EMBEDDINGS FOR SPEAKER CHARACTERISTICS
#
# Disabled by default. The code is kept as real comments rather than inside a
# triple-quoted string: a bare string is an expression, so Jupyter would echo it
# as this cell's output.
#
# To use a speaker x-vector from the dataset instead of a local recording,
# uncomment the lines below and skip the TOGGLE 2 cells (the cell that loads the
# local .npy file assigns `speaker_embeddings` too and would overwrite this one).
#
# # load xvector containing speaker's voice characteristics from a dataset
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[10]["xvector"]).unsqueeze(0)
# print(speaker_embeddings.shape)

In [None]:
# TOGGLE 2: EXTRACT VOICE CHARACTERISTICS FROM A LOCAL AUDIO FILE
def extract_voice_characteristics(audio_file, output_file, sr=22050, embedding_dim=512):
    """
    Compute a speaker embedding from an audio file and save it as a .npy file.

    Parameters:
    audio_file (str): Path of the recording; loaded with ``librosa.load``.
    output_file (str): Destination of the saved array. Missing parent
        directories are created when this is a path.
    sr (int): Sample rate passed to ``librosa.load``.
    embedding_dim (int): Embedding size requested from ``extract_embedding``.

    Returns:
    torch.Tensor: The float32 embedding with a leading batch axis
        (``[1, embedding_dim]`` when ``extract_embedding`` honors the requested
        size) — the same values that are written to ``output_file``.
    """
    y, sr = librosa.load(audio_file, sr=sr)

    # Placeholder feature extractor; swap in a real speaker-embedding model here.
    embedding = extract_embedding(y, sr, embedding_dim)

    # Force float32 (the model's weights are float32, so a float64 embedding would
    # not mix with them) and add the batch axis.
    embedding_tensor = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0)

    # np.save does not create directories, so make sure the target folder exists.
    # Non-path targets (e.g. open file objects) are passed through untouched.
    if isinstance(output_file, (str, Path)):
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    np.save(output_file, embedding_tensor.numpy())

    return embedding_tensor

def extract_embedding(y, sr, embedding_dim):
    """
    Placeholder function to extract a fixed-size embedding from audio data.
    Replace this with your actual method to get the desired embedding
    (flattened MFCCs are not a real speaker embedding).

    Parameters:
    y (np.ndarray): Audio signal.
    sr (int): Sample rate.
    embedding_dim (int): Dimensionality of the embedding to extract.

    Returns:
    np.ndarray: A float32 vector of exactly ``embedding_dim`` values: the
        flattened MFCC matrix, truncated when it is longer and zero-padded
        when the recording is too short to fill it.
    """
    n_mfcc = 13  # Example number of MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Row-major flattening, so the leading values come from the first row of the
    # MFCC matrix. Cast to float32 so the result can be fed to the model directly.
    embedding = np.asarray(mfccs, dtype=np.float32).flatten()[:embedding_dim]

    # A short clip yields fewer than `embedding_dim` values; pad with zeros so
    # the returned vector always has the promised length.
    missing = embedding_dim - embedding.size
    if missing > 0:
        embedding = np.pad(embedding, (0, missing))

    return embedding

In [None]:
# TOGGLE 2: derive the voice characteristics from the supplied recording and
# store them in the .npy file that the following cell loads.
audio_file = 'inputs/Rec.wav'
output_file = 'Voice_npy/voice_characteristics_4.npy'

extract_voice_characteristics(audio_file=audio_file, output_file=output_file)

In [None]:
# Load the local voice characteristics from the .npy file (the same path the
# TOGGLE 2 extraction cell writes to).
local_embeddings = np.load('Voice_npy/voice_characteristics_4.npy')

# Convert the NumPy array to a PyTorch tensor; this is the speaker embedding
# passed to `model.generate_speech` in the generation cell.
# NOTE(review): dtype and shape are taken from the file as-is — presumably
# float32 with shape [1, 512]; confirm before relying on it.
speaker_embeddings = torch.tensor(local_embeddings)

# Print the tensor shape as a quick sanity check.
print(speaker_embeddings.shape)

In [None]:
# Generate speech from the text using the speaker characteristics.

def _split_text_into_chunks(text, max_chars):
    """Greedily pack whitespace-separated words into chunks of at most ``max_chars`` characters.

    Words are never split unless a single word is itself longer than
    ``max_chars``, in which case it is cut into ``max_chars``-sized pieces.

    Parameters:
    text (str): Text to split.
    max_chars (int): Upper bound on the length of each chunk (at least 1).

    Returns:
    list[str]: Non-empty chunks in reading order; empty list for blank text.
    """
    chunks, current = [], ""
    for word in text.split():
        for start in range(0, len(word), max_chars):
            piece = word[start:start + max_chars]
            candidate = f"{current} {piece}" if current else piece
            if len(candidate) <= max_chars:
                current = candidate
            else:
                # `current` is non-empty here: a lone piece always fits the budget.
                chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks


# SpeechT5 only has positional encodings for a bounded number of text tokens
# (`max_text_positions`, 600 by default), so a whole PDF cannot be synthesized
# in one call.
max_tokens = getattr(model.config, "max_text_positions", 600)
input_ids = inputs["input_ids"]

if input_ids.shape[1] <= max_tokens:
    # Short input: identical to the single-call path.
    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)
else:
    # Long input: synthesize word-aligned chunks and concatenate the waveforms.
    # The character budget leaves headroom under the token limit (tokenization is
    # roughly one token per character, but normalization can expand some
    # characters); `truncation` is only a hard safety net.
    chunk_budget = max(1, int(max_tokens * 0.75))
    segments = []
    for chunk in _split_text_into_chunks(text, chunk_budget):
        chunk_ids = processor(
            text=chunk, return_tensors="pt", truncation=True, max_length=max_tokens
        )["input_ids"]
        segments.append(model.generate_speech(chunk_ids, speaker_embeddings, vocoder=vocoder))
    speech = torch.cat(segments)

# soundfile does not create directories, so make sure the output folder exists.
output_wav = "outputs/speechFromTTSOnly8.wav"
Path(output_wav).parent.mkdir(parents=True, exist_ok=True)
sf.write(output_wav, speech.numpy(), samplerate=16000)