In [1]:
import torch
from TTS.api import TTS
import pandas as pd
import time
import os

In [2]:
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# List available 🐸TTS models
print("\n".join(TTS().list_models().list_models()))

tts_models/multilingual/multi-dataset/xtts_v2
tts_models/multilingual/multi-dataset/xtts_v1.1
tts_models/multilingual/multi-dataset/your_tts
tts_models/multilingual/multi-dataset/bark
tts_models/bg/cv/vits
tts_models/cs/cv/vits
tts_models/da/cv/vits
tts_models/et/cv/vits
tts_models/ga/cv/vits
tts_models/en/ek1/tacotron2
tts_models/en/ljspeech/tacotron2-DDC
tts_models/en/ljspeech/tacotron2-DDC_ph
tts_models/en/ljspeech/glow-tts
tts_models/en/ljspeech/speedy-speech
tts_models/en/ljspeech/tacotron2-DCA
tts_models/en/ljspeech/vits
tts_models/en/ljspeech/vits--neon
tts_models/en/ljspeech/fast_pitch
tts_models/en/ljspeech/overflow
tts_models/en/ljspeech/neural_hmm
tts_models/en/vctk/vits
tts_models/en/vctk/fast_pitch
tts_models/en/sam/tacotron-DDC
tts_models/en/blizzard2013/capacitron-t2-c50
tts_models/en/blizzard2013/capacitron-t2-c150_v2
tts_models/en/multi-dataset/tortoise-v2
tts_models/en/jenny/jenny
tts_models/es/mai/tacotron2-DDC
tts_models/es/css10/vits
tts_models/fr/mai/tacotron2-DDC

In [3]:
def test_model(model_name, text, speaker_path, save_csv=True, display=True):
    start = time.time()
    # Load model
    tts = TTS(model_name).to(device)
    time_init = time.time() - start

    output_name = get_output_name(model_name, speaker_path)

    # Synthesize
    if "multilingual" in model_name:
        try:
            start = time.time()
            tts.tts_to_file(text=text, speaker_wav=speaker_path, file_path=output_name + '.wav', language="fr-fr")
            time_infer = time.time() - start
        except:
            try:
                start = time.time()
                tts.tts_to_file(text=text, speaker_wav=speaker_path, file_path=output_name + '.wav', language="fr")
                time_infer = time.time() - start
            except:
                try:
                    start = time.time()
                    tts.tts_to_file(text=text, speaker_wav=speaker_path, file_path=output_name + '.wav', language="fra")
                    time_infer = time.time() - start
                except Exception as e:
                    raise e
    else:
        start = time.time()
        tts.tts_to_file(text=text, speaker_wav=speaker_path, file_path=output_name + '.wav')
        time_infer = time.time() - start
    
    if display:
        print(f"\n==============================================\nModel: {model_name}")
        print(f"Time to initialize the model: {time_init:.2f} s")
        print(f"Time to synthesize the text: {time_infer:.2f} s")

    if save_csv:
        save_to_csv(output_name, time_init, time_infer)

def get_output_name(model_name, speaker_path):
    if len(model_name.split('/')) != 4:
        raise ValueError('model_name must be in the form of "tts_model/language/model_tag/model_name"')
    _, language, model_tag, model_main = model_name.split('/')
    return model_main + '-' + model_tag + '-' + language + '-' + speaker_path.split('/')[-1].split('.')[0]

def save_to_csv(output_name, time_init, time_infer):
    df = pd.read_csv('time_analysis.csv', header=0)
    if output_name in df['model'].values:
        df = df[df['model'] != output_name]
    df = df.append({'model': output_name, 'time_init': time_init, 'time_infer': time_infer}, ignore_index=True)
    df.to_csv('time_analysis.csv', index=False)

In [10]:
tts_to_test = [
    # "tts_models/multilingual/multi-dataset/xtts_v2",
    # "tts_models/multilingual/multi-dataset/your_tts",
    "tts_models/fr/mai/tacotron2-DDC",
    "tts_models/fr/css10/vits",
    "tts_models/fra/fairseq/vits",
    "tts_models/acf/fairseq/vits",
]

voice_folder = "../../../data/stored/assistant/voices/"

speakers_to_test = [voice_folder + speaker for speaker in os.listdir(voice_folder) if speaker.endswith('.wav') or speaker.endswith('.flac') or speaker.endswith('.mp3')]
print(speakers_to_test)

text = "Connaissez-vous Wemby? Face aux New-Yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de San Antonio cent-trente à cent-vingt-six. A la fin du match, le Français a envoyé le ballon dans les tribunes! Résultat, une amende de vingt-cinq-mille dollars."


for model in tts_to_test:
    if "multilingual" in model:
        speakers = speakers_to_test
    else:
        speakers = [speakers_to_test[0]]
    for speaker in speakers:
        print(f"Testing {model} with speaker {speaker}")
        test_model(model, text, speaker, save_csv=True, display=True)
        print("==============================================")


['../../../data/stored/assistant/voices/eliot_christon.mp3', '../../../data/stored/assistant/voices/eugenie_declaron.mp3', '../../../data/stored/assistant/voices/example_reference.mp3', '../../../data/stored/assistant/voices/jean_pierre_pernaut.mp3', '../../../data/stored/assistant/voices/perrine_laffont.mp3', '../../../data/stored/assistant/voices/pierre_faury.mp3', '../../../data/stored/assistant/voices/ptisham.mp3', '../../../data/stored/assistant/voices/roberto_caurand.mp3', '../../../data/stored/assistant/voices/teddy_riner.mp3', '../../../data/stored/assistant/voices/thomas_oxisoglou.mp3']
Testing tts_models/fr/mai/tacotron2-DDC with speaker ../../../data/stored/assistant/voices/eliot_christon.mp3
 > tts_models/fr/mai/tacotron2-DDC is already downloaded.
 > vocoder_models/universal/libri-tts/fullband-melgan is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_l

  df = df.append({'model': output_name, 'time_init': time_init, 'time_infer': time_infer}, ignore_index=True)


 > Text splitted to sentences.
['Connaissez-vous Wemby?', 'Face aux New-Yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de San Antonio cent-trente à cent-vingt-six.', 'A la fin du match, le Français a envoyé le ballon dans les tribunes!', 'Résultat, une amende de vingt-cinq-mille dollars.']
 > Processing time: 1.478450059890747
 > Real-time factor: 0.08792892235400206

Model: tts_models/fr/css10/vits
Time to initialize the model: 0.59 s
Time to synthesize the text: 1.54 s
Testing tts_models/fra/fairseq/vits with speaker ../../../data/stored/assistant/voices/eliot_christon.mp3
 > tts_models/fra/fairseq/vits is already downloaded.
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 |

  df = df.append({'model': output_name, 'time_init': time_init, 'time_infer': time_infer}, ignore_index=True)


 > Text splitted to sentences.
['Connaissez-vous Wemby?', 'Face aux New-Yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de San Antonio cent-trente à cent-vingt-six.', 'A la fin du match, le Français a envoyé le ballon dans les tribunes!', 'Résultat, une amende de vingt-cinq-mille dollars.']
connaissez-vous wemby?
 [!] Character 'w' not found in the vocabulary. Discarding it.
connaissez-vous wemby?
 [!] Character '?' not found in the vocabulary. Discarding it.
face aux new-yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de san antonio cent-trente à cent-vingt-six.
 [!] Character ',' not found in the vocabulary. Discarding it.
face aux new-yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de san antonio cent-trente à cent-vingt-six.
 [!] Character '.' not found in the vocabulary. Discarding i

  df = df.append({'model': output_name, 'time_init': time_init, 'time_infer': time_infer}, ignore_index=True)


 > Text splitted to sentences.
['Connaissez-vous Wemby?', 'Face aux New-Yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de San Antonio cent-trente à cent-vingt-six.', 'A la fin du match, le Français a envoyé le ballon dans les tribunes!', 'Résultat, une amende de vingt-cinq-mille dollars.']
connaissez-vous wemby?
 [!] Character '?' not found in the vocabulary. Discarding it.
face aux new-yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de san antonio cent-trente à cent-vingt-six.
 [!] Character 'x' not found in the vocabulary. Discarding it.
face aux new-yorkais, il avait inscrit quarante points, pris vingt rebonds et délivré sept passes décisives lors de la victoire de san antonio cent-trente à cent-vingt-six.
 [!] Character ',' not found in the vocabulary. Discarding it.
face aux new-yorkais, il avait inscrit quarante points, pris vingt rebonds et déliv

  df = df.append({'model': output_name, 'time_init': time_init, 'time_infer': time_infer}, ignore_index=True)
