# StyleTTS 2 Demo (LibriTTS)

Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `Demo` folder, so that the files are available as `Demo/reference_audio/*.wav`.

### Utils

#### Load Packages

In [None]:
import os

# phonemizer loads the eSpeak NG shared library named by PHONEMIZER_ESPEAK_LIBRARY.
# The path below is a Windows install location, so it is only applied on Windows
# and never overrides a value the user has already exported. On other platforms
# the variable is left alone so phonemizer can locate the library by itself.
if os.name == "nt":
    os.environ.setdefault(
        "PHONEMIZER_ESPEAK_LIBRARY",
        r"C:\Program Files\eSpeak NG\libespeak-ng.dll",  # <-- adjust if different
    )

# Seed every random number generator the notebook touches so repeated runs
# start from the same state.
import torch # Deep learning framework
torch.manual_seed(0) # Seed torch's global RNG (inference() draws its diffusion noise with torch.randn)
torch.backends.cudnn.benchmark = False # Turn off cuDNN's autotuning of convolution algorithms
torch.backends.cudnn.deterministic = True # Ask cuDNN to use deterministic algorithms only

import random # Python's built-in RNG
random.seed(0) # Seed Python's RNG

import numpy as np # Numerical computing library
np.random.seed(0) # Seed NumPy's global RNG

%cd ..

# load packages
import time
import random
import numpy as np
import torch
import torchaudio # Loading/saving waveforms, resampling, transforms
import librosa # Python library for audio analysis
from munch import Munch # Turns dictionaries into objects with attribute-style access
from nltk.tokenize import word_tokenize # Tokenizers divide strings into lists of substrings
import time # Used for timing operations
import yaml # Required for config.yml to load model hyperparameters and paths

import soundfile as sf
import numpy as np
import os

from models import *
from utils import *
from text_utils import TextCleaner
textcleaner = TextCleaner() # Called by inference() on the phonemized string; its result is used there as a list of integer token ids. NOTE(review): the exact mapping lives in text_utils, not visible here

import phonemizer # Splits words into phonemes (symbols that represent how words are pronounced)
# Shared eSpeak-based phonemizer instance; inference() calls its phonemize() method.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True, # Keeps Punctuation such as , . ? !
    with_stress=True # Adds stress marks to vowels
)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tell matplotlib to, if used, render inline in the notebook outputs
%matplotlib inline

#### Helper Functions

In [None]:
# Mel-spectrogram transform used by preprocess().
# NOTE(review): sample_rate is not passed, so torchaudio's default applies, while
# compute_style() loads audio at 24 kHz. Presumably this mirrors the preprocessing
# the checkpoint was trained with; confirm before changing it.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, # Number of Mel-frequency bins
    n_fft=2048, # Length of FFT
    win_length=1200, # Size of window in samples
    hop_length=300 # Step size between windows
)

mean, std = -4, 4 # Constants preprocess() subtracts from / divides the log-mel values by

def length_to_mask(lengths):
    """Build a boolean padding mask from per-sequence lengths.

    Args:
        lengths: 1-D tensor with one valid length per sequence in the batch.

    Returns:
        Bool tensor of shape ``(len(lengths), lengths.max())``. ``mask[i, j]``
        is ``True`` where position ``j`` is padding (``j >= lengths[i]``) and
        ``False`` where it is a valid position (``j < lengths[i]``). Callers
        invert it (``~mask``) when they need an attention mask.
    """
    # Position indices 0..max_len-1, created on the same device as `lengths`
    # instead of on the CPU, then cast to the same tensor type.
    positions = torch.arange(lengths.max(), device=lengths.device).type_as(lengths)
    # Broadcast the index row to one row per sequence: (1, max_len) -> (batch, max_len)
    positions = positions.unsqueeze(0).expand(lengths.shape[0], -1)
    # j + 1 > length  <=>  position j lies beyond the end of the sequence
    return torch.gt(positions + 1, lengths.unsqueeze(1))

def preprocess(wave):
    """Turn a waveform (NumPy array) into a normalized log-mel tensor.

    The waveform is converted to a float tensor and passed through ``to_mel``.
    A leading dimension is added, the log is taken with a small offset to
    avoid ``log(0)``, and the result is shifted by ``mean`` and divided by
    ``std``.
    """
    mel = to_mel(torch.from_numpy(wave).float())
    log_mel = torch.log(mel.unsqueeze(0) + 1e-5)
    return (log_mel - mean) / std

def compute_style(path):
    """Compute the reference style vector for an audio file.

    Args:
        path: Path of the reference recording.

    Returns:
        Tensor holding the output of ``model.style_encoder`` followed by the
        output of ``model.predictor_encoder``, concatenated along dim 1.
        ``inference()`` reads the two parts back as columns ``[:128]`` and
        ``[128:]``.
    """
    # librosa.load resamples to the requested rate while loading, so the audio
    # is already at 24 kHz. The former `if sr != 24000` branch could never run,
    # and its positional librosa.resample call is rejected by current librosa.
    wave, _ = librosa.load(path, sr=24000)
    audio, _ = librosa.effects.trim(wave, top_db=30)  # drop leading/trailing silence
    mel_tensor = preprocess(audio).to(device).unsqueeze(1)

    # Pure feature extraction: no gradients needed
    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor)
        ref_p = model.predictor_encoder(mel_tensor)

    return torch.cat([ref_s, ref_p], dim=1)

### Load models

In [None]:
# Read the model configuration; the context manager closes the file handle,
# which the previous bare open() call left open.
with open("Models/LibriTTS/config.yml") as config_file:
    config = yaml.safe_load(config_file)

# load pretrained ASR model
ASR_config = config.get('ASR_config', False)
ASR_path = config.get('ASR_path', False)
text_aligner = load_ASR_models(ASR_path, ASR_config)

# load pretrained F0 model
F0_path = config.get('F0_path', False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert
BERT_path = config.get('PLBERT_dir', False)
plbert = load_plbert(BERT_path)

In [None]:
# Wrap the config section so its keys can be read as attributes
# (e.g. model_params.decoder.type in inference()).
model_params = recursive_munch(config['model_params'])

# Assemble the sub-models from the config and the three pretrained components.
model = build_model(
    model_params,
    text_aligner,     # pretrained ASR model
    pitch_extractor,  # pretrained F0 model
    plbert,           # pretrained BERT model
)

# Put every sub-model in eval mode and move it to the target device.
# A plain loop is used because these calls are made only for their side effects.
for key in model:
    model[key].eval()
    model[key].to(device)

In [None]:
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints obtained from a source you trust.
# map_location='cpu' places the loaded tensors on the CPU.
params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
params = params_whole['net'] # state dicts looked up by the same keys as `model` in the loading loop

In [None]:
from collections import OrderedDict

# Prefix that wrappers such as nn.DataParallel add to every parameter name.
_WRAPPER_PREFIX = 'module.'

for key in model:
    if key not in params:
        continue
    state_dict = params[key]
    try:
        model[key].load_state_dict(state_dict)
    except RuntimeError:
        # load_state_dict reports mismatched keys as RuntimeError. Retry with
        # the wrapper prefix removed, but only from names that carry it;
        # slicing off seven characters blindly would corrupt unprefixed names
        # and strict=False would then silently load nothing.
        stripped = OrderedDict(
            (name[len(_WRAPPER_PREFIX):] if name.startswith(_WRAPPER_PREFIX) else name, value)
            for name, value in state_dict.items()
        )
        result = model[key].load_state_dict(stripped, strict=False)
        # strict=False keeps the original best-effort behaviour; make any
        # skipped parameters visible instead of ignoring them silently.
        if result.missing_keys or result.unexpected_keys:
            print('%s: %d missing and %d unexpected keys were skipped'
                  % (key, len(result.missing_keys), len(result.unexpected_keys)))
    # Report success only after the weights have actually been loaded
    print('%s loaded' % key)

# Make sure every sub-model is in eval mode after loading
for key in model:
    model[key].eval()

In [None]:
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

In [None]:
# Noise schedule for the style diffusion sampler (empirical parameters)
karras_schedule = KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0)

# Sampler that inference() calls to draw a style vector
sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=karras_schedule,
    clamp=False,
)

### Synthesize speech

In [None]:
def _shift_right_one_frame(features):
    """Delay ``features`` by one frame along the last axis, repeating frame 0."""
    return torch.cat([features[:, :, :1], features[:, :, :-1]], dim=-1)


def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):
    """Synthesize ``text`` in the style of a reference recording.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        ref_s: Reference style from ``compute_style()``. Columns ``[:128]``
            come from ``model.style_encoder`` and columns ``[128:]`` from
            ``model.predictor_encoder``.
        alpha: Weight of the sampled style in the first half, which is fed to
            the decoder (the notebook text calls this the timbre). 0 uses the
            reference only, 1 uses the sampled style only.
        beta: Weight of the sampled style in the second half, which is fed to
            the duration/F0 predictor (the notebook text calls this the
            prosody). 0 uses the reference only, 1 uses the sampled style only.
        diffusion_steps: Number of sampler steps.
        embedding_scale: Passed to the sampler as ``embedding_scale`` (the
            classifier-free guidance scale according to the notebook text).

    Returns:
        NumPy array with the synthesized waveform, with the last 50 samples
        removed. The notebook cells play it back at 24 kHz.
    """
    text = text.strip()
    phonemes = global_phonemizer.phonemize([text])
    phonemes = ' '.join(word_tokenize(phonemes[0]))

    token_ids = textcleaner(phonemes)
    token_ids.insert(0, 0)  # prepend token id 0
    tokens = torch.LongTensor(token_ids).to(device).unsqueeze(0)
    n_tokens = tokens.shape[-1]

    with torch.no_grad():
        input_lengths = torch.LongTensor([n_tokens]).to(device)
        text_mask = length_to_mask(input_lengths).to(device)

        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        # Sample a style vector conditioned on the text embedding
        s_pred = sampler(
            noise=torch.randn(1, 1, 256).to(device),
            embedding=bert_dur,
            embedding_scale=embedding_scale,
            features=ref_s,  # reference from the same speaker as the embedding
            num_steps=diffusion_steps,
        ).squeeze(1)

        # Blend sampled and reference styles half by half.
        # First half (style_encoder part)      -> `ref`, consumed by the decoder.
        # Second half (predictor_encoder part) -> `s`, consumed by the predictor.
        ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]
        s = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

        x, _ = model.predictor.lstm(d)
        duration = model.predictor.duration_proj(x)
        duration = torch.sigmoid(duration).sum(axis=-1)
        # reshape(-1) keeps a 1-D vector even for a single token, where a bare
        # squeeze() would produce a 0-d tensor that cannot be indexed.
        pred_dur = torch.round(duration.reshape(-1)).clamp(min=1)
        frames_per_token = [int(n) for n in pred_dur.tolist()]

        # Hard alignment matrix (tokens x frames): token i owns the next
        # frames_per_token[i] consecutive frames.
        alignment = torch.zeros(n_tokens, sum(frames_per_token))
        first_frame = 0
        for row, n_frames in enumerate(frames_per_token):
            alignment[row, first_frame:first_frame + n_frames] = 1
            first_frame += n_frames
        alignment = alignment.unsqueeze(0).to(device)  # moved to the device once

        # encode prosody
        en = d.transpose(-1, -2) @ alignment
        if model_params.decoder.type == "hifigan":
            en = _shift_right_one_frame(en)

        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

        asr = t_en @ alignment
        if model_params.decoder.type == "hifigan":
            asr = _shift_right_one_frame(asr)

        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    # weird pulse at the end of the model, need to be fixed later
    return out.squeeze().cpu().numpy()[..., :-50]

#### Basic synthesis (5 diffusion steps, seen speakers)

In [None]:
# Sentence synthesized in the next cells; inference() strips the surrounding spaces.
text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''

In [None]:
# Reference recordings for this section, keyed by a short label
reference_dicts = {
    '696_92939': "Demo/reference_audio/696_92939_000016_000006.wav",
    '1789_142896': "Demo/reference_audio/1789_142896_000022_000005.wav",
}

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# For each reference: extract its style, synthesize `text`, then play the
# synthesized audio followed by the reference recording.
for k, path in reference_dicts.items():
    ref_s = compute_style(path)

    wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)

    print(k + ' Synthesized:')
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print('Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

#### With higher diffusion steps (more diverse)

Since the sampler is ancestral, the higher the number of steps, the more diverse the samples are, at the cost of slower synthesis speed.

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# NOTE: `noise` is never read; inference() draws its own noise internally.
# The draw is kept because removing it would shift torch's seeded RNG stream
# and change the samples produced from here on.
noise = torch.randn(1,1,256).to(device)
for k, path in reference_dicts.items():
    ref_s = compute_style(path)
    start = time.time()
    wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)
    # Real-time factor: synthesis time divided by the duration of the audio at 24 kHz
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(k + ' Synthesized:')
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print(k + ' Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

#### Basic synthesis (5 diffusion steps, unseen speakers)
The following samples reproduce the samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All speakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2.

In [None]:
# Reference recordings for this section; each value is (path, text to synthesize)
reference_dicts = {
    '1221-135767': (
        "Demo/reference_audio/1221-135767-0014.wav",
        "Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.",
    ),
    '5639-40744': (
        "Demo/reference_audio/5639-40744-0020.wav",
        "Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.",
    ),
    '908-157963': (
        "Demo/reference_audio/908-157963-0027.wav",
        "And lay me down in my cold bed and leave my shining lot.",
    ),
    '4077-13754': (
        "Demo/reference_audio/4077-13754-0000.wav",
        "The army found the people in poverty and left them in comparative wealth.",
    ),
}

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# NOTE: `noise` is never read; inference() draws its own noise internally.
# The draw is kept because removing it would shift torch's seeded RNG stream
# and change the samples produced from here on.
noise = torch.randn(1,1,256).to(device)
for k, v in reference_dicts.items():
    path, text = v
    ref_s = compute_style(path)
    start = time.time()
    wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)
    # Real-time factor: synthesis time divided by the duration of the audio at 24 kHz
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(k + ' Synthesized: ' + text)
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print(k + ' Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

### Speech expressiveness

The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. 

#### With `embedding_scale=1`
This is the classifier-free guidance scale. The higher the scale, the more strongly the style is conditioned on the input text, and hence the more emotional the speech.



In [None]:
# Style of the reference speaker, computed once and reused by the expressiveness cells below
ref_s = compute_style("Demo/reference_audio/1221-135767-0014.wav")

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# One sentence per target emotion
texts = {
    'Happy': "We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.",
    'Sad': "I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.",
    'Angry': "The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!",
    'Surprised': "I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?",
}

for k, v in texts.items():
    wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)
    print(k + ": ")
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### With `embedding_scale=2`

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# One sentence per target emotion
texts = {
    'Happy': "We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.",
    'Sad': "I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.",
    'Angry': "The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!",
    'Surprised': "I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?",
}

for k, v in texts.items():
    # NOTE: `noise` is never read; inference() draws its own noise internally.
    # The draw is kept so torch's seeded RNG stream, and therefore the
    # generated samples, stay unchanged.
    noise = torch.randn(1,1,256).to(device)
    wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)
    print(k + ": ")
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`
`alpha` and `beta` are the factors that determine how much we use the style sampled based on the text instead of the reference. The higher the values of `alpha` and `beta`, the more suitable the style is to the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` determines the timbre of the speaker while `beta` determines the prosody.

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# One sentence per target emotion
texts = {
    'Happy': "We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.",
    'Sad': "I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.",
    'Angry': "The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!",
    'Surprised': "I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?",
}

for k, v in texts.items():
    # NOTE: `noise` is never read; inference() draws its own noise internally.
    # The draw is kept so torch's seeded RNG stream, and therefore the
    # generated samples, stay unchanged.
    noise = torch.randn(1,1,256).to(device)
    wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)
    print(k + ": ")
    display(ipd.Audio(wav, rate=24000, normalize=False))

### Zero-shot speaker adaptation
This section recreates the "Acoustic Environment Maintenance" and "Speaker’s Emotion Maintenance" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, which is about 250 times less data than was used to train Vall-E, yet it achieves a similar or better effect on these maintenance tasks.

#### Acoustic Environment Maintenance

Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to make the speaker as close to the reference as possible while only changing the prosody according to the text.

In [None]:
# Reference recordings for this section; each value is (path, text to synthesize)
reference_dicts = {
    '3': (
        "Demo/reference_audio/3.wav",
        "As friends thing I definitely I've got more male friends.",
    ),
    '4': (
        "Demo/reference_audio/4.wav",
        "Everything is run by computer but you got to know how to think before you can do a computer.",
    ),
    '5': (
        "Demo/reference_audio/5.wav",
        "Then out in LA you guys got a whole another ball game within California to worry about.",
    ),
}

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# NOTE: `noise` is never read; inference() draws its own noise internally.
# The draw is kept because removing it would shift torch's seeded RNG stream
# and change the samples produced from here on.
noise = torch.randn(1,1,256).to(device)
for k, v in reference_dicts.items():
    path, text = v
    ref_s = compute_style(path)
    start = time.time()
    # alpha=0.0: the decoder-side style half comes entirely from the reference
    wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)
    # Real-time factor: synthesis time divided by the duration of the audio at 24 kHz
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print('Synthesized: ' + text)
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print('Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

#### Speaker’s Emotion Maintenance

Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to make the speaker as close to the reference as possible while having some diversity through the slight timbre change.

In [None]:
# Every emotional reference is paired with the same sentence; each value is
# (path, text to synthesize) and the file name is the lower-cased label.
emotion_prompt = "We have to reduce the number of plastic bags."
reference_dicts = {
    emotion: ("Demo/reference_audio/" + emotion.lower() + ".wav", emotion_prompt)
    for emotion in ('Anger', 'Sleepy', 'Amused', 'Disgusted')
}

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# NOTE: `noise` is never read; inference() draws its own noise internally.
# The draw is kept because removing it would shift torch's seeded RNG stream
# and change the samples produced from here on.
noise = torch.randn(1,1,256).to(device)
for k, v in reference_dicts.items():
    path, text = v
    ref_s = compute_style(path)
    start = time.time()
    # beta=0.1: the predictor-side style half is taken almost entirely from the reference
    wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)
    # Real-time factor: synthesis time divided by the duration of the audio at 24 kHz
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(k + ' Synthesized: ' + text)
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print(k + ' Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

### Speech diversity

This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. 

`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:
- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize a speech it will be totally different). 
- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most similar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis).


#### Default setting (`alpha = 0.3, beta=0.7`)
This setting uses 70% of the reference timbre and 30% of the reference prosody, and uses the diffusion model to sample the remainder based on the text.

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# unseen speaker
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)

text = "How much variation is there?"
# Synthesize the same sentence five times to hear how much the samples differ
for _ in range(5):
    wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### Less diverse setting (`alpha = 0.1, beta=0.3`)
This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at cost of less diverse samples. 

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# unseen speaker
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)

text = "How much variation is there?"
# Synthesize the same sentence five times to hear how much the samples differ
for _ in range(5):
    wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### More diverse setting (`alpha = 0.5, beta=0.95`)
This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 95% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker.

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# unseen speaker
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)

text = "How much variation is there?"
# Synthesize the same sentence five times to hear how much the samples differ
for _ in range(5):
    wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### Extreme setting (`alpha = 1, beta=1`)
This setting uses 0% of the reference timbre and prosody and uses the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker.

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# unseen speaker
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)

text = "How much variation is there?"
# Synthesize the same sentence five times to hear how much the samples differ
for _ in range(5):
    wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))

#### No variation (`alpha = 0, beta=0`)
This setting uses 100% of the reference timbre and prosody and does not use the style sampled by the diffusion model. This makes the speaker very similar to the reference speaker, but there is no variation.

In [None]:
# This cell used `ipd` without importing it and failed with NameError unless an
# earlier demo cell had been run first; import it here so the cell stands alone.
import IPython.display as ipd

# unseen speaker
path = "Demo/reference_audio/1221-135767-0014.wav"
ref_s = compute_style(path)

text = "How much variation is there?"
# With alpha=0 and beta=0 only the reference style is used, so the five
# syntheses are expected to sound alike.
for _ in range(5):
    wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)
    display(ipd.Audio(wav, rate=24000, normalize=False))

### Extra fun!

Here we clone the voices of some of the authors of the StyleTTS 2 paper from a few seconds of in-the-wild recordings. None of the voices is in the dataset and all authors agreed to have their voices cloned here.

In [None]:
def save(wave, name=None, sample_rate=24000, output_dir="outputs"):
    """Write a waveform to ``<output_dir>/<name>_synthesized.wav``.

    Args:
        wave: Waveform as a torch tensor or NumPy array. Tensors are detached
            and moved to the CPU; arrays with more than one dimension are
            squeezed.
        name: File name stem. When omitted, the notebook-global loop variable
            ``k`` is used, which is what this function always did before the
            parameter existed.
        sample_rate: Sample rate written to the file (24 kHz by default).
        output_dir: Target directory, created if it does not exist.

    Raises:
        NameError: If ``name`` is omitted and no global ``k`` exists.
    """
    if name is None:
        # Backward-compatible fallback to the implicit global the old code read
        try:
            name = k
        except NameError:
            raise NameError(
                "save() needs name=... when no loop variable `k` is defined"
            ) from None

    # Convert to numpy if it's a tensor
    if isinstance(wave, torch.Tensor):
        wave = wave.detach().cpu().numpy()

    # Remove extra dimensions if necessary
    if wave.ndim > 1:
        wave = wave.squeeze()

    # Create the output directory (only the first time)
    os.makedirs(output_dir, exist_ok=True)

    output_path = f"{output_dir}/{name}_synthesized.wav"
    sf.write(output_path, wave, sample_rate)

    print(f"✅ Saved synthesized audio to: {output_path}")

In [None]:
# Reference recording for this section, keyed by speaker label
reference_dicts = {'Trump': "Demo/reference_audio/Trump1.wav"}

In [None]:
# Sentence synthesized in the next cell; inference() strips the surrounding spaces.
text = ''' We’re making America strong, and we’re doing it faster than anyone ever thought possible — believe ME!! '''

In [None]:
# Imported once, outside the loop body, rather than on every iteration
import IPython.display as ipd

# NOTE: `noise` is never read; inference() draws its own noise internally.
# The draw is kept so torch's seeded RNG stream, and therefore the generated
# sample, stays unchanged.
noise = torch.randn(1,1,256).to(device)
for k, path in reference_dicts.items():
    ref_s = compute_style(path)

    # Start the timer per speaker, immediately before synthesis. It used to be
    # started once before the loop, so it also counted compute_style() and kept
    # growing with every additional speaker.
    start = time.time()
    wav = inference(text, ref_s, alpha=0.25, beta=0.5, diffusion_steps=10, embedding_scale=1)
    rtf = (time.time() - start) / (len(wav) / 24000)
    # The real-time factor was computed but never shown; report it like the other cells
    print(f"RTF = {rtf:5f}")
    print('Speaker: ' + k)
    print('Synthesized:')
    display(ipd.Audio(wav, rate=24000, normalize=False))
    print('Reference:')
    display(ipd.Audio(path, rate=24000, normalize=False))

In [None]:
# Write the waveform from the last loop iteration to disk. save() builds the
# file name from the loop variable `k` that the synthesis cell left in the
# notebook namespace, so run that cell first.
save(wav)