## Cross-Lingual Voice Clone Demo

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

In [None]:
# Ask for the Hugging Face access token without echoing it to the screen,
# then expose it to the rest of the process through the HF_TOKEN variable.

import getpass
import os

hf_prompt = 'Enter your Hugging Face access token: '
hf_token = getpass.getpass(hf_prompt)

# Export the token via the process environment.
os.environ.update({'HF_TOKEN': hf_token})

In [None]:
# Ask for the OpenAI API key (the value is exported as OPENAI_API_KEY and is
# read back when the OpenAI client is constructed further down).
api_key = getpass.getpass('Enter your OpenAI API key (leave blank to use the .env file): ')

# Only export a non-empty key. Writing an empty string here would leave
# OPENAI_API_KEY defined-but-blank, and the later `load_dotenv()` call is not
# asked to override existing variables, so a key stored in `.env` would be
# ignored.
if api_key:
    os.environ['OPENAI_API_KEY'] = api_key

### Initialization

In [None]:
# Locations of the converter checkpoint and of everything this notebook writes.
ckpt_converter = 'checkpoints/converter'
output_dir = 'outputs'

# Make sure the output folder exists before any cell writes into it.
os.makedirs(output_dir, exist_ok=True)

# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build the converter from its config file, then load the checkpoint weights.
converter_config_path = f'{ckpt_converter}/config.json'
converter_weights_path = f'{ckpt_converter}/checkpoint.pth'

tone_color_converter = ToneColorConverter(converter_config_path, device=device)
tone_color_converter.load_ckpt(converter_weights_path)

In this demo, we use OpenAI TTS as the base speaker to produce multilingual speech audio. Users can swap in a different base speaker to suit their own needs. Please create a file named `.env` containing your OpenAI key as `OPENAI_API_KEY=xxx`. We also provide a Chinese base speaker model (see `demo_part1.ipynb`).

In [None]:
from openai import OpenAI
from dotenv import load_dotenv

# Pull variables from a local `.env` file (expected to hold
# OPENAI_API_KEY=xxx) into the process environment.
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# The synthesized base-speaker sample is written here.
output_file = f"{output_dir}/openai_source_output.mp3"

# Text spoken by the base speaker; the resulting clip is what the tone color
# embedding of the base speaker is extracted from.
source_prompt = (
    "This audio will be used to extract the base speaker tone color embedding. "
    "Typically a very short audio should be sufficient, but increasing the audio "
    "length will also improve the output audio quality."
)

# Stream the TTS response straight to disk.
speech_request = client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="nova",
    input=source_prompt,
)
with speech_request as response:
    response.stream_to_file(output_file)

In [None]:
from IPython.display import Audio

# Show where the base-speaker clip was written ...
print(output_file)

# ... and embed a player for that file as the cell's output.
source_preview = Audio(output_file)
source_preview

### Obtain Tone Color Embedding

The `source_se` is the tone color embedding of the base speaker.
Ideally it is an average over multiple sentences with multiple emotions
of the base speaker. In this notebook it is extracted from the OpenAI TTS
clip generated above; readers are free to extract `source_se` from other audio themselves.

In [None]:
# Audio inputs: the synthesized base-speaker clip and the voice to be cloned.
base_speaker = f"{output_dir}/openai_source_output.mp3"
reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone

# Tone color embedding of the base speaker (VAD enabled).
source_se, audio_name = se_extractor.get_se(
    base_speaker,
    tone_color_converter,
    vad=True,
)

# Tone color embedding of the reference speaker; `audio_name` ends up holding
# the value returned for this second call.
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    vad=True,
)

### Inference

In [None]:
# Run the base speaker tts
text = [
    "MyShell is a decentralized and comprehensive platform for discovering, creating, and staking AI-native apps.",
    "MyShell es una plataforma descentralizada y completa para descubrir, crear y apostar por aplicaciones nativas de IA.",
    "MyShell est une plateforme d√©centralis√©e et compl√®te pour d√©couvrir, cr√©er et miser sur des applications natives d'IA.",
    "MyShell ist eine dezentralisierte und umfassende Plattform zum Entdecken, Erstellen und Staken von KI-nativen Apps.",
    "MyShell √® una piattaforma decentralizzata e completa per scoprire, creare e scommettere su app native di intelligenza artificiale.",
    "MyShell„ÅØ„ÄÅAI„Éç„Ç§„ÉÜ„Ç£„Éñ„Ç¢„Éó„É™„ÅÆÁô∫Ë¶ã„ÄÅ‰ΩúÊàê„ÄÅ„Åä„Çà„Å≥„Çπ„ÉÜ„Éº„Ç≠„É≥„Ç∞„ÅÆ„Åü„ÇÅ„ÅÆÂàÜÊï£Âûã„Åã„Å§ÂåÖÊã¨ÁöÑ„Å™„Éó„É©„ÉÉ„Éà„Éï„Ç©„Éº„É†„Åß„Åô„ÄÇ",
    "MyShell ‚Äî —ç—Ç–æ –¥–µ—Ü–µ–Ω—Ç—Ä–∞–ª–∏–∑–æ–≤–∞–Ω–Ω–∞—è –∏ –≤—Å–µ–æ–±—ä–µ–º–ª—é—â–∞—è –ø–ª–∞—Ç—Ñ–æ—Ä–º–∞ –¥–ª—è –æ–±–Ω–∞—Ä—É–∂–µ–Ω–∏—è, —Å–æ–∑–¥–∞–Ω–∏—è –∏ —Å—Ç–µ–π–∫–∏–Ω–≥–∞ AI-–æ—Ä–∏–µ–Ω—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π.",
    "MyShell ŸáŸä ŸÖŸÜÿµÿ© ŸÑÿßŸÖÿ±ŸÉÿ≤Ÿäÿ© Ÿàÿ¥ÿßŸÖŸÑÿ© ŸÑÿßŸÉÿ™ÿ¥ÿßŸÅ Ÿàÿ•ŸÜÿ¥ÿßÿ° Ÿàÿ±ŸáÿßŸÜ ÿ™ÿ∑ÿ®ŸäŸÇÿßÿ™ ÿßŸÑÿ∞ŸÉÿßÿ° ÿßŸÑÿßÿµÿ∑ŸÜÿßÿπŸä ÿßŸÑÿ£ÿµŸÑŸäÿ©.",
    "MyShellÊòØ‰∏Ä‰∏™Âéª‰∏≠ÂøÉÂåñ‰∏îÂÖ®Èù¢ÁöÑÂπ≥Âè∞ÔºåÁî®‰∫éÂèëÁé∞„ÄÅÂàõÂª∫ÂíåÊäïËµÑAIÂéüÁîüÂ∫îÁî®Á®ãÂ∫è„ÄÇ",
    "MyShell ‡§è‡§ï ‡§µ‡§ø‡§ï‡•á‡§Ç‡§¶‡•ç‡§∞‡•Ä‡§ï‡•É‡§§ ‡§î‡§∞ ‡§µ‡•ç‡§Ø‡§æ‡§™‡§ï ‡§Æ‡§Ç‡§ö ‡§π‡•à, ‡§ú‡•ã AI-‡§Æ‡•Ç‡§≤ ‡§ê‡§™‡•ç‡§∏ ‡§ï‡•Ä ‡§ñ‡•ã‡§ú, ‡§∏‡•É‡§ú‡§® ‡§î‡§∞ ‡§∏‡•ç‡§ü‡•á‡§ï‡§ø‡§Ç‡§ó ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡•à‡•§",
    "MyShell √© uma plataforma descentralizada e abrangente para descobrir, criar e apostar em aplicativos nativos de IA."
]
src_path = f'{output_dir}/tmp.wav'
output_files = []

for i, t in enumerate(text):
    # ‚úÖ Use streaming version of the TTS request
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="nova",
        input=t,
    ) as response:
        response.stream_to_file(src_path)

    save_path = f'{output_dir}/output_crosslingual_{i}.wav'
    output_files.append(save_path)

    # üéØ Run the tone color converter
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path, 
        src_se=source_se, 
        tgt_se=target_se, 
        output_path=save_path,
        message=encode_message
    )

In [None]:
from IPython.display import Audio, display

# Render one player per converted clip, in the order the clips were produced.
# Iterating over `output_files` (rather than hard-coding indices 0-10) keeps
# this cell correct when sentences are added to or removed from `text`.
for clip_path in output_files:
    print(clip_path)
    display(Audio(clip_path))