## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

### Initialization

In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrates better robustness in some cases.

In [None]:
# Locations of the OpenVoiceV2 converter checkpoint and of the generated audio.
ckpt_converter = 'checkpoints_v2/converter'
output_dir = 'outputs_v2'

# Use the first CUDA device when torch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build the converter from its config file, then load the checkpoint file into it.
tone_color_converter = ToneColorConverter(
    f'{ckpt_converter}/config.json',
    device=device,
)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Create the output folder; an already existing folder is left as is.
os.makedirs(output_dir, exist_ok=True)

### Obtain Tone Color Embedding
We only extract the tone color embedding for the target speaker. The source tone color embeddings can be loaded directly from the `checkpoints_v2/base_speakers/ses` folder.

In [None]:

# Recording of the voice you want to clone; its tone color embedding is the conversion target.
reference_speaker = 'resources/example_reference.mp3'
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    vad=True,
)

In [None]:
from IPython.display import Audio
# Show an inline player for the reference recording as this cell's output.
Audio(reference_speaker)

#### Use MeloTTS as Base Speakers

MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, and Korean. In the following example, we will use the models in MeloTTS as the base speakers.

In [None]:
import nltk
# Fetch the NLTK resource named 'averaged_perceptron_tagger_eng'.
# NOTE(review): presumably required by MeloTTS's English text processing — confirm.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
from melo.api import TTS

# One sample sentence per MeloTTS language code.
texts = {
    'EN_NEWEST': "Did you ever hear a folk tale about a giant turtle?",  # The newest English base speaker model
    'EN': "Did you ever hear a folk tale about a giant turtle?",
    'ES': "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
    'FR': "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.",
    'ZH': "在这次vacation中，我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。",
    'JP': "彼は毎朝ジョギングをして体を健康に保っています。",
    'KR': "안녕하세요! 오늘은 날씨가 정말 좋네요.",
}

# Scratch file for the base-speaker rendering; every iteration overwrites it.
src_path = f'{output_dir}/tmp.wav'

# Speed is adjustable
speed = 1.0

# Paths of the converted clips, in generation order.
output_files = []

for language, text in texts.items():
    # A separate MeloTTS model is created for each language code.
    model = TTS(language=language, device=device)
    speaker_ids = model.hps.data.spk2id

    for speaker_key in speaker_ids.keys():
        speaker_id = speaker_ids[speaker_key]
        # The embedding file and the output file are named after the
        # lower-cased, hyphenated speaker key.
        speaker_key = speaker_key.lower().replace('_', '-')

        source_se = torch.load(
            f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
            map_location=device,
        )

        # On a CPU run where torch reports MPS as available, replace the check
        # with a stub that returns False.
        # NOTE(review): presumably this keeps MeloTTS from selecting MPS — confirm.
        if torch.backends.mps.is_available() and device == 'cpu':
            torch.backends.mps.is_available = lambda: False

        # Render the text with the base speaker into the scratch file.
        model.tts_to_file(text, speaker_id, src_path, speed=speed)

        save_path = f'{output_dir}/output_v2_{speaker_key}.wav'
        output_files.append(save_path)

        # Run the tone color converter
        encode_message = "@MyShell"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
            message=encode_message,
        )

In [None]:
from IPython.display import Audio

In [None]:
from IPython.display import Audio, display

# Show one labelled player per converted clip. Looping over output_files replaces
# the hand-written indices 0..10, so the cell keeps working when a different
# number of base speakers was generated (no IndexError, no clip left out).
for converted_path in output_files:
    print(converted_path)
    display(Audio(converted_path))

# A Small Incident

In [None]:
# "A Small Incident", a short story by Lu Xun, kept in the original Chinese.
# Blank lines inside the string separate the paragraphs.
# From https://www.comp.nus.edu.sg/~tanhw/chinese/literature/lu-xun/yi-jian-xiao-shi.html?utm_source=chatgpt.com
text_str = """
　　我从乡下跑到京城里，一转眼已经六年了。其间耳闻目睹的所谓国家大事
，算起来也很不少；但在我心里，都不留什么痕迹，倘要我寻出这些事的影响
来说，便只是增长了我的坏脾气，——老实说，便是教我一天比一天的看不起
人。

　　但有一件小事，却于我有意义，将我从坏脾气里拖开，使我至今忘记不得
。

　　这是民国六年的冬天，大北风刮得正猛，我因为生计关系，不得不一早在
路上走。一路几乎遇不见人，好容易才雇定了一辆人力车，教他拉到Ｓ门去。
不一会，北风小了，路上浮尘早已刮净，剩下一条洁白的大道来，车夫也跑得
更快。刚近Ｓ门，忽而车把上带着一个人，慢慢地倒了。

　　跌倒的是一个女人，花白头发，衣服都很破烂。伊从马路上突然向车前横
截过来；车夫已经让开道，但伊的破棉背心没有上扣，微风吹着，向外展开，
所以终于兜着车把。幸而车夫早有点停步，否则伊定要栽一个大斤斗，跌到头
破血出了。

　　伊伏在地上；车夫便也立住脚。我料定这老女人并没有伤，又没有别人看
见，便很怪他多事，要自己惹出是非，也误了我的路。

　　我便对他说，“没有什么的。走你的罢！”

　　车夫毫不理会，——或者并没有听到，——却放下车子，扶那老女人慢慢
起来，搀着臂膊立定，问伊说：

　　“你怎么啦？”

　　“我摔坏了。”

　　我想，我眼见你慢慢倒地，怎么会摔坏呢，装腔作势罢了，这真可憎恶。
车夫多事，也正是自讨苦吃，现在你自己想法去。

　　车夫听了这老女人的话，却毫不踌躇，仍然搀着伊的臂膊，便一步一步的
向前走。我有些诧异，忙看前面，是一所巡警分驻所，大风之后，外面也不见
人。这车夫扶着那老女人，便正是向那大门走去。

　　我这时突然感到一种异样的感觉，觉得他满身灰尘的后影，刹时高大了，
而且愈走愈大，须仰视才见。而且他对于我，渐渐的又几乎变成一种威压，甚
而至于要榨出皮袍下面藏着的“小”来。

　　我的活力这时大约有些凝滞了，坐着没有动，也没有想，直到看见分驻所
里走出一个巡警，才下了车。

　　巡警走近我说，“你自己雇车罢，他不能拉你了。”

　　我没有思索的从外套袋里抓出一大把铜元，交给巡警，说，“请你给他…
…”

　　风全住了，路上还很静。我走着，一面想，几乎怕敢想到自己。以前的事
姑且搁起，这一大把铜元又是什么意思？奖他么？我还能裁判车夫么？我不能
回答自己。

　　这事到了现在，还是时时记起。我因此也时时煞了苦痛，努力的要想到我
自己。几年来的文治武力，在我早如幼小时候所读过的“子曰诗云”⑵一般，
背不上半句了。独有这一件小事，却总是浮在我眼前，有时反更分明，教我惭
愧，催我自新，并且增长我的勇气和希望。

"""

In [None]:
import re

def split_chinese_text_into_paragraphs_and_sentences(text):
    """
    Split Chinese text into paragraphs, then into sentences within each paragraph.

    Paragraphs are separated by one or more blank lines. Line breaks inside a
    paragraph are treated as hard wraps and removed. A sentence ends at a run
    of 。/！/？ together with any closing quotes or brackets that directly
    follow it, so a quotation such as “你怎么啦？” stays in one piece instead
    of leaving a lone ” behind as its own "sentence".

    Args:
        text (str): The input Chinese text.

    Returns:
        list: A list of lists, one inner list of sentence strings per
            non-empty paragraph. Empty or whitespace-only input yields [].
    """
    # Drop full-width (ideographic) spaces, then trim the surrounding whitespace.
    text = text.replace('\u3000', '').strip()

    paragraph_sentences = []
    # A paragraph break is a newline, optional whitespace, and another newline.
    for paragraph in re.split(r'\n\s*\n', text):
        # Undo hard line wraps inside the paragraph.
        paragraph = paragraph.replace('\r', '').replace('\n', '')
        # Insert a newline after every sentence end (terminator run plus any
        # closing quotes/brackets). The paragraph holds no newlines at this
        # point, so '\n' is a safe delimiter to split on afterwards.
        marked = re.sub(r'([。！？]+[”’」』）》】]*)', r'\1\n', paragraph)
        sentences = [piece.strip() for piece in marked.split('\n') if piece.strip()]
        # Skip paragraphs that contain no sentence at all (e.g. empty input).
        if sentences:
            paragraph_sentences.append(sentences)

    return paragraph_sentences

# Example usage: split the story held in text_str.
sentences_by_paragraph = split_chinese_text_into_paragraphs_and_sentences(text_str)

# Print a numbered listing of every sentence, grouped by paragraph.
separator = "-" * 20
for paragraph_number, paragraph in enumerate(sentences_by_paragraph, start=1):
    print(f"Paragraph {paragraph_number}:")
    for sentence_number, sentence in enumerate(paragraph, start=1):
        print(f"  Sentence {sentence_number}: {sentence}")
    print(separator)


In [None]:
# Language code passed to TTS(...) below; "ZH" is the key paired with the Chinese sample in `texts`.
language = "ZH"

In [None]:
# NOTE(review): this value is only read by the next cell's src_path; the synthesis
# cell further down rebinds output_dir to "sentences" — confirm this cell is still needed.
output_dir = 'outputs'

In [None]:
# NOTE(review): generate_audio() builds its own tmp.wav path from its output_dir
# argument, so this notebook-level src_path does not appear to be read again below.
src_path = f'{output_dir}/tmp.wav'

In [None]:
# Switch the voice to clone to a different reference recording; the earlier
# demo used 'resources/example_reference.mp3'.
reference_speaker = 'resources/fiona_zh.m4a'
# Recompute the target tone color embedding from the new reference.
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)

In [None]:
# Source tone color embedding for the 'zh' base speaker, mapped onto `device`.
source_se = torch.load(
    'checkpoints_v2/base_speakers/ses/zh.pth',  # plain literal: nothing to interpolate
    map_location=device,
)

In [None]:
# Echo the selected language code as the cell output.
language

In [None]:
# Create the MeloTTS model for the selected language on `device`.
model = TTS(language=language, device=device)
# Speaker table of the model; it is indexed by speaker key below to obtain a speaker id.
speaker_ids = model.hps.data.spk2id

In [None]:
# Show the speaker keys offered by the loaded model.
list(speaker_ids.keys())

In [None]:
# Use the first speaker key listed by the model as the base voice.
speaker_key = list(speaker_ids.keys())[0]
speaker_id = speaker_ids[speaker_key]

In [None]:
def generate_audio(save_path, text, output_dir, source_se, target_se, speaker_id):
    """
    Synthesize `text` with the base speaker and convert it to the target voice.

    The text is first rendered by the notebook-level `model` into
    `{output_dir}/tmp.wav` (overwritten on every call), then passed through the
    notebook-level `tone_color_converter`, which is asked to write the result
    to `save_path`.

    Args:
        save_path (str): Destination of the converted wav file.
        text (str): Text to synthesize.
        output_dir (str): Folder that holds the intermediate `tmp.wav`.
        source_se: Tone color embedding of the base speaker.
        target_se: Tone color embedding of the voice to clone.
        speaker_id: Speaker id handed to `model.tts_to_file`.

    Returns:
        str: `save_path`, returned after the conversion call has completed.

    Side effects:
        Reads the notebook globals `model`, `device`, `speed` and
        `tone_color_converter`; appends `save_path` to the global
        `output_files` list; may replace `torch.backends.mps.is_available`.
    """
    # Make sure the target folders exist so that writing the intermediate and
    # final files cannot fail on a missing directory.
    os.makedirs(output_dir, exist_ok=True)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    src_path = f'{output_dir}/tmp.wav'

    print("###save_path: ", save_path)

    # On a CPU run where torch reports MPS as available, replace the check
    # with a stub that returns False.
    # NOTE(review): presumably this keeps MeloTTS from selecting MPS — confirm.
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False
    model.tts_to_file(text, speaker_id, src_path, speed=speed)

    # Run the tone color converter
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    # Record the clip only once conversion has finished, so output_files never
    # lists a path whose conversion raised.
    output_files.append(save_path)
    return save_path

In [None]:
import os
import torchaudio as ta
from pydub import AudioSegment

# Folder that receives one converted wav file per sentence.
output_dir = "sentences"
os.makedirs(output_dir, exist_ok=True)

# Synthesize every non-blank sentence, numbering the files in reading order.
sentence_files = []
sentence_count = 0
for paragraph_sentences in sentences_by_paragraph:
    for raw_sentence in paragraph_sentences:
        sentence = raw_sentence.strip()
        # Nothing to synthesize for a blank sentence.
        if not sentence:
            continue

        sentence_count += 1
        print(f"Synthesizing sentence {sentence_count}: {raw_sentence}")

        file_path = os.path.join(output_dir, f"sentence_{sentence_count}.wav")
        generate_audio(file_path, sentence, output_dir, source_se, target_se, speaker_id)
        sentence_files.append(file_path)

# Starting state and settings for the merge step in a later cell.
merged_audio = None
pause_duration_ms = 500  # Adjust the pause duration as needed (in milliseconds)
paragraph_end_pause_ms = 1000  # Pause duration after each paragraph

file_index = 0
sentence_counter_for_paragraph = 0



In [None]:
# List the contents of the sentences folder to check which files were written.
!ls -la sentences

In [None]:
# Reset the merge state here so this cell can be re-run on its own: with the
# state left over from a previous run, file_index would already point past the
# end of sentence_files and the lookup below would raise IndexError.
merged_audio = None
file_index = 0

# NOTE(review): pause_duration_ms is defined next to paragraph_end_pause_ms but
# no pause is inserted between sentences — confirm whether one was intended.

last_paragraph_index = len(sentences_by_paragraph) - 1
for paragraph_index, paragraph_sentences in enumerate(sentences_by_paragraph):
    for sentence in paragraph_sentences:
        # Blank sentences were skipped during synthesis and have no file.
        if not sentence.strip():
            continue

        file_path = sentence_files[file_index]
        file_index += 1
        print(file_path)
        audio_segment = AudioSegment.from_wav(file_path)

        if merged_audio is None:
            merged_audio = audio_segment
        else:
            merged_audio += audio_segment

    # Add a pause after each paragraph except the last one, and only once
    # there is audio to append it to.
    if paragraph_index < last_paragraph_index and merged_audio is not None:
        merged_audio += AudioSegment.silent(duration=paragraph_end_pause_ms)

# Save the final merged audio
if merged_audio is not None:
    output_filename = "A_Small_Incident.wav"
    merged_audio.export(output_filename, format="wav")
    print(f"Merged audio saved as {output_filename}")

In [None]:
from IPython.display import Audio

# Show an inline player for the merged narration as this cell's output.
Audio('A_Small_Incident.wav')