In [1]:
# Get audio files

import os
import requests
import tarfile

url = "https://huggingface.co/datasets/MLCommons/peoples_speech/resolve/main/train/clean/clean_000000.tar"

download_folder = os.path.join(os.path.dirname(os.getcwd()), "data")
os.makedirs(download_folder, exist_ok=True)
tar_path = os.path.join(download_folder, "clean_000000.tar")

response = requests.get(url, stream=True)
if response.status_code == 200:
    with open(tar_path, 'wb') as f:
        f.write(response.raw.read())
    print(f"Downloaded tar file to {tar_path}")
else:
    print(f"Failed to download file: {response.status_code}")

try:
    with tarfile.open(tar_path, "r") as tar:
        tar.extractall(path=download_folder)
    print(f"Extracted tar file to {download_folder}")
except tarfile.TarError as e:
    print(f"Error extracting tar file: {e}")

try:
    os.remove(tar_path)
    print(f"Removed tar file: {tar_path}")
except OSError as e:
    print(f"Error removing tar file: {e}")


Downloaded tar file to /home/denis/Desktop/data/clean_000000.tar
Extracted tar file to /home/denis/Desktop/data
Removed tar file: /home/denis/Desktop/data/clean_000000.tar


In [24]:
# Make dirs

import os
resultFolderName = "audioResults"
results = os.path.join(os.path.dirname(os.getcwd()), resultFolderName)
os.makedirs(results, exist_ok=True)

model1Name = "SpeechT5"
model1Dir = os.path.join(results, model1Name)
os.makedirs(model1Dir, exist_ok=True)

model2Name = "VitsModel"
model2Dir = os.path.join(results, model2Name)
os.makedirs(model2Dir, exist_ok=True)

model3Name = "Tacotron2"
model3Dir = os.path.join(results, model3Name)
os.makedirs(model3Dir, exist_ok=True)

In [20]:
# Load dataset information

from datasets import load_dataset

dataset = load_dataset("json", data_files="clean.json")

In [21]:
printfirstDataElement = False

if printfirstDataElement:
    print(dataset['train']['training_data'][0]['name'][0])
    print(dataset['train']['training_data'][0]['label'][0])

In [22]:
# Input arrays

textInput = [] 
audioPathInput = [] 

for data in dataset['train']['training_data']:
    textInput.extend(data['label'])
    audioPathInput.extend(data['name'])

printInputArrays = False

if printInputArrays:
    for i in range(len(textInput)):
        print(textInput[i])

    for i in range(len(audioPathInput)):
        print(audioPathInput[i])

In [12]:
# SpeechT5

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from scipy.io.wavfile import write
from datasets import load_dataset
import torch
from transformers import SpeechT5HifiGan
import time

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device)
# vocoder.to(device)

# print(device)

start_time = time.time()

# for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
inputs = processor(text=textInput[0], return_tensors="pt")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

output_path = os.path.join(model1Dir, f"{audioPathInput[0]}.wav")
output_waveform = speech.squeeze().cpu().numpy()
write(output_path, 16000, output_waveform)


end_time = time.time()

total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21.3M/21.3M [00:02<00:00, 10.2MB/s]
Generating validation split: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7931/7931 [00:00<00:00, 62904.01 examples/s]


Total processing time: 26.11 seconds


In [25]:
import torch
import time
from tqdm import tqdm
import os
from scipy.io.wavfile import write
config = {
    'mask_padding': False,
    'n_mels': 80,
    'n_symbol': 148,
    'n_frames_per_step': 1,
    'symbol_embedding_dim': 512,
    'encoder_embedding_dim': 512,
    'encoder_n_convolution': 3,
    'encoder_kernel_size': 5,
    'decoder_rnn_dim': 1024,
    'decoder_max_step': 5000,
    'decoder_dropout': 0.1,
    'decoder_early_stopping': True,
    'attention_rnn_dim': 1024,
    'attention_hidden_dim': 128,
    'attention_location_n_filter': 32,
    'attention_location_kernel_size': 31,
    'attention_dropout': 0.1,
    'prenet_dim': 256,
    'postnet_n_convolution': 5,
    'postnet_kernel_size': 5,
    'postnet_embedding_dim': 512,
    'gate_threshold': 0.5
}


tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16', config)
tacotron2 = tacotron2.to('cuda')
tacotron2.eval()
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow.eval()


utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')

#outputs_TacotronModel = []
tacotron2.max_decoder_steps = 5000
tacotron2.decoder_max_step = 5000

start_time = time.time()
for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
    
    sequences, lengths = utils.prepare_input_sequence([textInput[i]])
    
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences, lengths)
        audio = waveglow.infer(mel)
    output_path = os.path.join(model3Dir, f"{audioPathInput[i]}")
    audio_numpy = audio[0].data.cpu().numpy()
    write(output_path, 22050, audio_numpy)

    #outputs_TacotronModel.append(output)

end_time = time.time()

total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Processing:   0%|                                                                                                                                                   | 0/336 [00:00<?, ?file/s]



Processing:   0%|▍                                                                                                                                        | 1/336 [00:10<1:00:26, 10.82s/file]



Processing:   0%|▍                                                                                                                                        | 1/336 [00:21<2:00:20, 21.55s/file]


KeyboardInterrupt: 

In [21]:
# # SpeechT5

# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# from scipy.io.wavfile import write
# from datasets import load_dataset
# import torch
# import os
# from tqdm import tqdm
# import time

# # Initialize processor and models
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# # Check if a GPU is available and move the model to GPU if possible
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device)
# vocoder.to(device)

# print(f"Using device: {device}")

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[0 % len(embeddings_dataset)]["xvector"]).unsqueeze(0).to(device)

# outputs_SpeechT5 = []

# start_time = time.time()
# # for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
# text = textInput[0]
# inputs = processor(text=text, return_tensors="pt").to(device)

# with torch.no_grad():
#     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# # output_path = os.path.join(model1Dir, f"{audioPathInput[i]}.wav")
# # output_waveform = speech.squeeze().cpu().numpy()
# # write(output_path, 16000, output_waveform)

# outputs_SpeechT5.append(speech)

# end_time = time.time()

# total_time = end_time - start_time
# print(f"Total processing time: {total_time:.2f} seconds")

In [42]:
# VitsModel generating speech audio

from transformers import VitsModel, AutoTokenizer
import torch
from scipy.io.wavfile import write
import os
from tqdm import tqdm
import time

model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

print(f"Using device: {device}")

outputs_VitsModel = []

start_time = time.time()
for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
    text = textInput[i]
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(**inputs).waveform
    output_path = os.path.join(model2Dir, f"{audioPathInput[i]}")
    output_waveform = output.squeeze().cpu().numpy()
    write(output_path, 16000, output_waveform)

    outputs_VitsModel.append(output)

end_time = time.time()

total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

Some weights of the model checkpoint at facebook/mms-tts-eng were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.wei

Using device: cuda


Processing:   5%|▍         | 19/397 [00:13<04:22,  1.44file/s]
