In [1]:
from datasets import load_dataset, Audio
from transformers import EncodecModel, AutoProcessor
import numpy as np
import torch
import torchaudio
from TTS.api import TTS

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Instantiate the speedy-speech (LJSpeech) text-to-speech model on the CPU.
# It is used further down to synthesize the reference clip from the transcript.
model_name = 'tts_models/en/ljspeech/speedy-speech'
tts = TTS(model_name=model_name, gpu=False)

 > tts_models/en/ljspeech/speedy-speech is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: speedy_speech
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rm

### The audio-processor log above reports `sample_rate:22050`, so the TTS output sampling rate is 22050 Hz

In [None]:
# Sampling rate (Hz) of the TTS output, taken from the `sample_rate:22050`
# line printed when the TTS audio processor was set up.
sampling_rate = 22_050

In [8]:
# Display the path returned by the TTS object for its models file
# (a `.models.json` inside the installed TTS package in the recorded output).
tts.get_models_file_path()

PosixPath('/home/gmongaras/miniconda3/lib/python3.10/site-packages/TTS/.models.json')

In [3]:
# Load the pretrained 24 kHz EnCodec model in eval mode, plus its processor
# (used to pre-process the audio before encoding).
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell no longer raises on machines without CUDA.
encodec_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EncodecModel.from_pretrained("facebook/encodec_24khz").eval().to(encodec_device)
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [34]:
# Transcript spoken in the sample recording
text = "Mr. Quilter is the Apostle of the Middle Class and we are glad to welcome his gospel."

# Read the stylized sample clip from disk together with its native sampling rate
stylized_audio, stylized_sr = torchaudio.load("sample.wav")

In [35]:
# Generate an unstylized audio clip from the same transcript with the TTS model.
unstylized_audio = tts.tts(text)

# The TTS output rate is already recorded in `sampling_rate` (22050 Hz);
# reuse that constant instead of repeating the literal here.
sr = sampling_rate

 > Text splitted to sentences.
['Mr. Quilter is the Apostle of the Middle Class and we are glad to welcome his gospel.']
 > Processing time: 0.16689443588256836
 > Real-time factor: 0.0296355359426188


In [36]:
# Resample both audio clips to 24 kHz, the rate used for every EnCodec step below.
# NOTE: both names are overwritten in place, so re-running this cell without
# re-running the cells that load/generate the audio would resample (or fail on)
# already-processed data.
stylized_audio = torchaudio.transforms.Resample(stylized_sr, 24000)(stylized_audio)

# Convert the TTS samples through a float32 NumPy array first: this pins the
# dtype explicitly (rather than relying on inference from the list elements)
# and avoids an element-by-element conversion of a long Python list.
unstylized_waveform = torch.as_tensor(np.asarray(unstylized_audio, dtype=np.float32))
# Resample the 1-D waveform, then add a leading dimension so it has the same
# 2-D layout as the clip returned by `torchaudio.load`.
unstylized_audio = torchaudio.transforms.Resample(sr, 24000)(unstylized_waveform).unsqueeze(0)

In [37]:
# Write both resampled clips to disk as 24 kHz WAV files.
for clip_path, clip in (
    ("stylized.wav", stylized_audio),
    ("unstylized.wav", unstylized_audio),
):
    torchaudio.save(clip_path, clip, 24000)

In [38]:
# Preprocess the audio segments with the EnCodec processor.
# Each result is indexed later by "input_values" and "padding_mask".
stylized_audio = processor(
    raw_audio=stylized_audio.tolist(),
    sampling_rate=24000,
    return_tensors="pt",
)
unstylized_audio = processor(
    raw_audio=unstylized_audio.tolist(),
    sampling_rate=24000,
    return_tensors="pt",
)

In [39]:
# Encode both clips with EnCodec at a bandwidth setting of 24.0.
# This is inference only, so run it without autograd tracking, and move the
# inputs to whichever device the model lives on instead of assuming CUDA.
with torch.no_grad():
    encoder_outputs_stylized = model.encode(
        stylized_audio["input_values"].to(model.device),
        stylized_audio["padding_mask"].to(model.device),
        bandwidth=24.0,
    )
    encoder_outputs_unstylized = model.encode(
        unstylized_audio["input_values"].to(model.device),
        unstylized_audio["padding_mask"].to(model.device),
        bandwidth=24.0,
    )

In [45]:
# Decode the EnCodec codes back into waveforms.
# Inference only: disable autograd so the reconstructions carry no graph, and
# place the padding masks on the model's own device instead of assuming CUDA.
with torch.no_grad():
    # Index [0] selects the first element of the decoder output (the audio tensor).
    stylized_audio_recon = model.decode(
        encoder_outputs_stylized.audio_codes,
        encoder_outputs_stylized.audio_scales,
        stylized_audio["padding_mask"].to(model.device),
    )[0]
    unstylized_audio_recon = model.decode(
        encoder_outputs_unstylized.audio_codes,
        encoder_outputs_unstylized.audio_scales,
        unstylized_audio["padding_mask"].to(model.device),
    )[0]

In [46]:
# Inspect the shape of the reconstructed stylized waveform
# (presumably batch x channels x samples; confirm against the EnCodec docs).
stylized_audio_recon.shape

torch.Size([1, 1, 140520])

In [47]:
# Inspect the shape of the reconstructed unstylized waveform
# (presumably batch x channels x samples; confirm against the EnCodec docs).
unstylized_audio_recon.shape

torch.Size([1, 1, 135158])

In [48]:
# Save the EnCodec reconstructions as 24 kHz WAV files.
# `.detach()` drops any autograd graph attached to the decoder output before the
# tensor is handed to the audio writer; `[0]` removes the leading dimension so
# a 2-D tensor is saved.
torchaudio.save("stylized_recon.wav", stylized_audio_recon.detach().cpu()[0], 24000)
torchaudio.save("unstylized_recon.wav", unstylized_audio_recon.detach().cpu()[0], 24000)