In [36]:
# Get audio files

import os
import requests
import tarfile

# Download one shard of the People's Speech "clean" training split, unpack it
# into `download_folder`, and delete the archive afterwards.
url = "https://huggingface.co/datasets/MLCommons/peoples_speech/resolve/main/train/clean/clean_000000.tar"

download_folder = "data"
tar_path = os.path.join(download_folder, "clean_000000.tar")

# These aliases are kept because the original cell defined them; the
# "Make dirs" cell rebinds both names to the audio results folder.
resultFolderName = download_folder
results = resultFolderName
os.makedirs(download_folder, exist_ok=True)

# Stream the archive to disk in chunks instead of reading the whole body into
# memory; iter_content also applies any transfer decoding that raw.read() skips.
# The `with` block releases the connection; the timeout is (connect, read) seconds.
downloaded = False
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    if response.status_code == 200:
        with open(tar_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        downloaded = True
        print(f"Downloaded tar file to {tar_path}")
    else:
        print(f"Failed to download file: {response.status_code}")

# Only unpack and clean up when this run actually produced the archive;
# otherwise tarfile.open would raise on a missing file or unpack a stale one.
if downloaded:
    try:
        with tarfile.open(tar_path, "r") as tar:
            # Use the "data" extraction filter when this Python provides it so
            # archive members cannot be written outside download_folder.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=download_folder, filter="data")
            else:
                tar.extractall(path=download_folder)
        print(f"Extracted tar file to {download_folder}")
    except tarfile.TarError as e:
        print(f"Error extracting tar file: {e}")

    try:
        os.remove(tar_path)
        print(f"Removed tar file: {tar_path}")
    except OSError as e:
        print(f"Error removing tar file: {e}")


Downloaded tar file to data\clean_000000.tar
Extracted tar file to data
Removed tar file: data\clean_000000.tar


In [25]:
# Make dirs

import os
# Root folder for all generated audio; `results` is an alias of the same path.
resultFolderName = "audioResults"
results = resultFolderName

# SpeechT5 output folder (currently disabled):
# model1Name = "SpeechT5"
# model1Dir = os.path.join(results, model1Name)
# os.makedirs(model1Dir, exist_ok=True)

# Per-model subfolder for the VITS outputs.
model2Name = "VitsModel"
model2Dir = os.path.join(results, model2Name)

# Create the root first, then the model subfolder; existing folders are left alone.
for output_dir in (results, model2Dir):
    os.makedirs(output_dir, exist_ok=True)

In [26]:
# Load dataset information
#
# Reads "clean.json" (path relative to the working directory) with the
# `datasets` library's "json" loader. Later cells read
# dataset['train']['training_data'] and the 'name' / 'label' fields of each
# entry, so the JSON file must provide that structure.

from datasets import load_dataset

dataset = load_dataset("json", data_files="clean.json")

In [27]:
# Debug switch: set to True to print the first name and the first label of
# the first 'training_data' entry.
printfirstDataElement = False

if printfirstDataElement:
    first_entry = dataset['train']['training_data'][0]
    for field in ('name', 'label'):
        print(first_entry[field][0])

In [28]:
# Input arrays
#
# Flatten every 'training_data' entry into two parallel lists: the label
# texts and the audio names, in the order they appear in the dataset.

textInput = []
audioPathInput = []

for entry in dataset['train']['training_data']:
    textInput += entry['label']
    audioPathInput += entry['name']

# Debug switch: set to True to print all texts followed by all audio names.
printInputArrays = False

if printInputArrays:
    for text_item in textInput:
        print(text_item)

    for path_item in audioPathInput:
        print(path_item)

In [29]:
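# SpeechT5 (disabled): single-sample run that synthesizes textInput[0] only; re-enabling it also requires the commented-out model1Dir setup in the "Make dirs" cell.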

# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
# from scipy.io.wavfile import write
# from datasets import load_dataset
# import torch
# from transformers import SpeechT5HifiGan
# import time

# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# # device = 'cuda' if torch.cuda.is_available() else 'cpu'
# # model.to(device)
# # vocoder.to(device)

# # print(device)

# start_time = time.time()

# # for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
# inputs = processor(text=textInput[0], return_tensors="pt")

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# # spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# output_path = os.path.join(model1Dir, f"{audioPathInput[0]}.wav")
# output_waveform = speech.squeeze().cpu().numpy()
# write(output_path, 16000, output_waveform)


# end_time = time.time()

# total_time = end_time - start_time
# print(f"Total processing time: {total_time:.2f} seconds")

In [30]:
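# SpeechT5 (disabled): device-aware variant; synthesizes textInput[0] only and collects the result in outputs_SpeechT5 without writing a file.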

# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# from scipy.io.wavfile import write
# from datasets import load_dataset
# import torch
# import os
# from tqdm import tqdm
# import time

# # Initialize processor and models
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# # Check if a GPU is available and move the model to GPU if possible
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device)
# vocoder.to(device)

# print(f"Using device: {device}")

# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[0 % len(embeddings_dataset)]["xvector"]).unsqueeze(0).to(device)

# outputs_SpeechT5 = []

# start_time = time.time()
# # for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
# text = textInput[0]
# inputs = processor(text=text, return_tensors="pt").to(device)

# with torch.no_grad():
#     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# # output_path = os.path.join(model1Dir, f"{audioPathInput[i]}.wav")
# # output_waveform = speech.squeeze().cpu().numpy()
# # write(output_path, 16000, output_waveform)

# outputs_SpeechT5.append(speech)

# end_time = time.time()

# total_time = end_time - start_time
# print(f"Total processing time: {total_time:.2f} seconds")

In [31]:
# VitsModel generating speech audio

from transformers import VitsModel, AutoTokenizer
import torch
from scipy.io.wavfile import write
import os
from tqdm import tqdm
import time

# Synthesize one audio file per entry of textInput with the MMS English VITS
# checkpoint and write it to model2Dir under the matching audioPathInput name.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

print(f"Using device: {device}")

# VITS inference is stochastic; fixing the seed makes a re-run of this cell
# produce the same waveforms.
VITS_SEED = 0
torch.manual_seed(VITS_SEED)

# Take the sample rate from the checkpoint's config rather than hard-coding it,
# so the WAV header stays correct if the checkpoint is swapped.
sampling_rate = model.config.sampling_rate

outputs_VitsModel = []

start_time = time.time()
for i in tqdm(range(len(textInput)), desc="Processing", unit="file"):
    text = textInput[i]
    inputs = tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        # Move the result off the GPU right away so the list of kept outputs
        # does not accumulate device memory across the whole dataset.
        output = model(**inputs).waveform.cpu()

    # NOTE(review): the file name is used exactly as stored in audioPathInput;
    # confirm it carries the extension expected for WAV data.
    output_path = os.path.join(model2Dir, f"{audioPathInput[i]}")
    output_waveform = output.squeeze().numpy()
    write(output_path, sampling_rate, output_waveform)

    outputs_VitsModel.append(output)

end_time = time.time()

total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")

Some weights of the model checkpoint at facebook/mms-tts-eng were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.wei

Using device: cuda


Processing: 100%|██████████| 335/335 [01:34<00:00,  3.56file/s]

Total processing time: 94.11 seconds



