# OpenVoice v2
Based on https://github.com/myshell-ai/OpenVoice/blob/main/demo_part3.ipynb

## Download voice samples repo

In [None]:
# Fetch and unpack the sample voices into /content/voice_samples-master.
# `wget -O` pins the archive name (no stale `master.zip.1` on a re-run) and
# `unzip -o` overwrites without prompting, so the cell can be re-run safely.
!cd /content && wget -O master.zip https://github.com/ekkus93/voice_samples/archive/refs/heads/master.zip && unzip -o master.zip && rm master.zip

## Download and Install OpenVoice

In [None]:
# Fetch and unpack the OpenVoice sources into /content/OpenVoice-main.
# `wget -O` pins the archive name (no stale `main.zip.1` on a re-run) and
# `unzip -o` overwrites without prompting, so the cell can be re-run safely.
!cd /content && wget -O main.zip https://github.com/myshell-ai/OpenVoice/archive/refs/heads/main.zip && unzip -o main.zip && rm main.zip

In [None]:
# Install the OpenVoice package from the source tree in /content/OpenVoice-main.
!cd /content/OpenVoice-main && pip install .

In [None]:
# Install faster_whisper (version not pinned).
# NOTE(review): presumably required by openvoice.se_extractor — confirm.
!pip install faster_whisper

### Fix numpy version

In [None]:
# Force numpy back to 1.26.4 after the installs above.
# NOTE(review): presumably those installs pull a numpy the OpenVoice stack cannot use — confirm.
!pip install numpy==1.26.4

In [None]:
# Install MeloTTS from the GitHub repository's default branch (no commit pinned);
# it provides `melo.api.TTS`, used for the base speakers later in the notebook.
!pip install git+https://github.com/myshell-ai/MeloTTS.git

In [None]:
# Pin huggingface-hub to 0.23.5.
# NOTE(review): the reason for this exact version is not visible in the notebook — confirm.
!pip install huggingface-hub==0.23.5

In [None]:
# Ask for a Hugging Face access token and expose it to this kernel as HF_TOKEN.

import getpass
import os

# getpass keeps the token out of the cell output. Pasted tokens often carry
# stray whitespace or a trailing newline, so strip it before storing.
hf_token = getpass.getpass('Enter your Hugging Face access token: ').strip()

if hf_token:
    # Save the token as an environment variable
    os.environ['HF_TOKEN'] = hf_token
else:
    # Do not overwrite an HF_TOKEN that may already be set with an empty string.
    print('No token entered; HF_TOKEN left unchanged.')

## Voice Style Control Demo

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

## Download models

In [None]:
# Download and unpack the V1 checkpoints into /content/OpenVoice-main.
# `wget -O` pins the archive name so a re-run replaces the file instead of
# saving `checkpoints_1226.zip.1` and unzipping the stale copy.
!cd /content/OpenVoice-main && wget -O checkpoints_1226.zip https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip && unzip -o checkpoints_1226.zip

In [None]:
# Download and unpack the V2 checkpoints into /content/OpenVoice-main.
# `wget -O` pins the archive name so a re-run replaces the file instead of
# saving `checkpoints_v2_0417.zip.1` and unzipping the stale copy.
!cd /content/OpenVoice-main && wget -O checkpoints_v2_0417.zip https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip && unzip -o checkpoints_v2_0417.zip

### Initialization

In [None]:
import os

# Make the OpenVoice tree the working directory: later cells use paths relative
# to it (e.g. 'checkpoints/...', 'checkpoints_v2/...', 'resources/...').
os.chdir("/content/OpenVoice-main")
# Print the working directory to confirm the change.
!pwd

In [None]:
# V1 checkpoint locations, relative to the current working directory.
ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
# Use the first CUDA device when one is available, otherwise the CPU.
device="cuda:0" if torch.cuda.is_available() else "cpu"
output_dir = 'outputs'

# English base speaker TTS model: built from its config, then weights loaded.
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

# V1 tone color converter. Note that `ckpt_converter`, `device`, `output_dir`
# and `tone_color_converter` are all reassigned by the V2 initialization cell
# further down, so the V2 values are the ones later cells see when run in order.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Create the output directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

### Initialization

In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrates better robustness in some cases.

In [None]:
# V2 converter checkpoint location, relative to the current working directory.
ckpt_converter = 'checkpoints_v2/converter'
# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Converted audio from the MeloTTS demo is written under this directory.
output_dir = 'outputs_v2'

# Build the tone color converter from the V2 config and load its weights;
# this object is used for embedding extraction and conversion in the cells below.
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Create the output directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

### Obtain Tone Color Embedding
We only extract the tone color embedding for the target speaker. The source tone color embeddings can be loaded directly from the `checkpoints_v2/base_speakers/ses` folder.

#### Option 1: Default reference speaker

In [None]:
# Path is relative to the working directory.
reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone
# Extract the target tone color embedding from the reference audio; `target_se`
# is what the conversion loop later passes as `tgt_se`.
# NOTE(review): `vad=True` presumably enables voice-activity-based segmentation — confirm in se_extractor.
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)

#### Option 2: Pick a voice from voice samples

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display

# Folder holding the sample voice recordings.
voice_samples_dir = "/content/voice_samples-master/voice_samples"

# Collect the names of regular files in that folder (sub-directories are
# skipped), then order them alphabetically for the menu.
voice_files = []
for entry_name in os.listdir(voice_samples_dir):
    if os.path.isfile(os.path.join(voice_samples_dir, entry_name)):
        voice_files.append(entry_name)
voice_files.sort()

# Drop-down selector; the "None" placeholder is the initial selection and is
# listed ahead of the actual file names.
dropdown = widgets.Dropdown(
    description='Select voice file:',
    options=["None", *voice_files],
    value="None",
    disabled=False,
)

display(dropdown)

In [None]:
# "None" is the dropdown's placeholder, not a file: building a path from it
# would point at a non-existent ".../None". In that case keep whatever
# reference speaker is already set instead of failing inside get_se.
if dropdown.value == "None":
    print("No voice file selected; keeping the current reference speaker.")
else:
    reference_speaker = f"/content/voice_samples-master/voice_samples/{dropdown.value}"
    print(f"Selected voice file: {reference_speaker}")
    # Extract the target tone color embedding from the chosen sample.
    target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)

In [None]:
from IPython.display import Audio
# Render an inline audio player for the currently selected reference file.
Audio(reference_speaker)

#### Option 3: Use your own voice

In [None]:
from google.colab import output
from IPython.display import display, HTML, Javascript
from base64 import b64decode

# Passage printed above the recording controls.
# NOTE(review): presumably a script for the user to read aloud while recording.
sample_text = """
The year I began to say vahz instead of vase, a man I barely
knew nearly accidentally killed me.

The man was not hurt when the other car hit ours. The man I
had known for one week held me in the street in a way that
meant I couldn’t see my legs. I remember knowing that I shouldn’t
look, and knowing that I would look if it wasn’t that I couldn’t.

My blood was on the front of this man’s clothes.

He said, “You’ll be okay, but this sweater is ruined.”

I screamed from the fear of pain. But I did not feel any pain. In
the hospital, after injections, I knew there was pain in the room —
I just didn’t know whose pain it was.

What happened to one of my legs required four hundred stitches,
which, when I told it, became five hundred stitches, because nothing
is ever quite as bad as it could be.

The five days they didn’t know if they could save my leg or not I
stretched to ten.
"""
print(sample_text)

# Callback target for the browser-side recorder: receives the recording as a
# base64 string and stores it on disk.
def save_audio(data, filename="/content/myvoice.wav"):
    """Decode a base64 payload and write the resulting bytes to ``filename``.

    Args:
        data: Base64-encoded audio content.
        filename: Destination path; an existing file is overwritten.
    """
    # Decode first so a malformed payload fails before the file is touched.
    decoded_bytes = b64decode(data)
    with open(filename, "wb") as destination:
        destination.write(decoded_bytes)
    print(f"Audio saved as {filename}")

# Expose save_audio to the browser under the name the JavaScript below invokes.
output.register_callback('notebook.save_audio', save_audio)

# Display recording controls in Colab
display(HTML('''
  <div>
    <button id="start-record">Start Recording</button>
    <button id="stop-record" disabled>Stop Recording</button>
  </div>
'''))

# Wire the two buttons to a MediaRecorder: Start asks for microphone access
# and begins recording; Stop ends the recording, releases the microphone and
# sends the captured audio (base64-encoded) to the registered Python callback.
display(Javascript('''
let mediaRecorder;
let audioChunks;

document.getElementById("start-record").onclick = async () => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  mediaRecorder = new MediaRecorder(stream);
  audioChunks = [];

  mediaRecorder.ondataavailable = event => {
    if (event.data.size > 0) {
      audioChunks.push(event.data);
    }
  };

  mediaRecorder.onstop = async () => {
    // Release the microphone; without this the capture stays active
    // after the recording has finished.
    stream.getTracks().forEach(track => track.stop());

    const audioBlob = new Blob(audioChunks);
    const reader = new FileReader();
    reader.onloadend = () => {
      // reader.result is a data URL; keep only the base64 part after the comma.
      const base64data = reader.result.split(',')[1];
      google.colab.kernel.invokeFunction('notebook.save_audio', [base64data], {});
    };
    reader.readAsDataURL(audioBlob);
  };

  mediaRecorder.start();
  document.getElementById("start-record").disabled = true;
  document.getElementById("stop-record").disabled = false;
};

document.getElementById("stop-record").onclick = () => {
  // Only stop a recorder that is actually running.
  if (mediaRecorder && mediaRecorder.state !== "inactive") {
    mediaRecorder.stop();
  }
  document.getElementById("start-record").disabled = false;
  document.getElementById("stop-record").disabled = true;
};
'''))


In [None]:
# Re-encode the saved recording with ffmpeg as 16-bit PCM at 44.1 kHz, then
# move the result over /content/myvoice.wav. `-y` overwrites the temporary
# output without asking.
# NOTE(review): the browser recording is presumably not real WAV data despite
# its .wav name (MediaRecorder default container) — confirm.
!ffmpeg -y -i /content/myvoice.wav -acodec pcm_s16le -ar 44100 /content/myvoice_fixed.wav && mv /content/myvoice_fixed.wav /content/myvoice.wav

In [None]:
# Use the recording written by save_audio (and re-encoded by ffmpeg) as the voice to clone.
reference_speaker = "/content/myvoice.wav"
print(f"Selected voice file: {reference_speaker}")
# Extract the target tone color embedding; this replaces any `target_se`
# produced by the other options.
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)

In [None]:
from IPython.display import Audio
# Render an inline audio player for the current reference recording.
Audio(reference_speaker)

#### Use MeloTTS as Base Speakers

MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers.

In [None]:
import importlib.util
import os


def _package_dir(package_name):
    """Return the install directory of ``package_name`` without importing it."""
    spec = importlib.util.find_spec(package_name)
    if spec is None or spec.origin is None:
        raise ModuleNotFoundError(f"{package_name} is not installed in this environment")
    return os.path.dirname(spec.origin)


# Point unidic's `dicdir` at the dictionary shipped with unidic_lite.
# The package locations are resolved from the running interpreter instead of a
# hard-coded python3.11 dist-packages path, so the cell works on any runtime version.
unidic_dicdir = os.path.join(_package_dir("unidic"), "dicdir")
unidic_lite_dicdir = os.path.join(_package_dir("unidic_lite"), "dicdir")

# Skip when something already exists at the target so the cell can be re-run.
if os.path.lexists(unidic_dicdir):
    print(f"{unidic_dicdir} already exists; leaving it untouched.")
else:
    os.symlink(unidic_lite_dicdir, unidic_dicdir)
    print(f"Linked {unidic_dicdir} -> {unidic_lite_dicdir}")

In [None]:
import nltk
# Download the NLTK 'averaged_perceptron_tagger_eng' resource.
# NOTE(review): presumably needed by MeloTTS's English text processing — confirm.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Delete everything inside the output directory so files from earlier runs do
# not linger. `{output_dir}` is interpolated from the Python variable
# ('outputs_v2' after the V2 initialization cell); the path is relative to the
# current working directory.
!rm -rf {output_dir}/*

In [None]:
from melo.api import TTS

# One sample sentence per MeloTTS language code. Each sentence is synthesized
# with every speaker that the corresponding model exposes.
texts = {
    'EN_NEWEST': "Did you ever hear a folk tale about a giant turtle?",  # The newest English base speaker model
    'EN': "Did you ever hear a folk tale about a giant turtle?",
    'ES': "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
    'FR': "La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.",
    'ZH': "在这次vacation中，我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。",
    'JP': "彼は毎朝ジョギングをして体を健康に保っています。",
    'KR': "안녕하세요! 오늘은 날씨가 정말 좋네요.",
}


# Intermediate base-speaker audio; the same path is overwritten for every speaker.
src_path = f'{output_dir}/tmp.wav'

# Speed is adjustable
speed = 1.0

# Paths of the converted files, in generation order; used for playback afterwards.
output_files = []

for language, text in texts.items():
    # A separate TTS model is constructed for each language code.
    model = TTS(language=language, device=device)
    speaker_ids = model.hps.data.spk2id

    for speaker_key in speaker_ids.keys():
        speaker_id = speaker_ids[speaker_key]
        # Normalize the key (lower-case, '_' -> '-'); the result is used in the
        # embedding file name and the output file name below.
        speaker_key = speaker_key.lower().replace('_', '-')

        # Source tone color embedding of this base speaker.
        source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
        # When `device` is 'cpu' on a machine where MPS is available, replace the
        # MPS availability check with one that always returns False.
        # NOTE(review): presumably this keeps MeloTTS from selecting MPS — confirm.
        if torch.backends.mps.is_available() and device == 'cpu':
            torch.backends.mps.is_available = lambda: False
        # Synthesize the sentence with the base speaker into the temporary file.
        model.tts_to_file(text, speaker_id, src_path, speed=speed)
        save_path = f'{output_dir}/output_v2_{speaker_key}.wav'

        output_files.append(save_path)

        # Run the tone color converter
        # NOTE(review): `message` is presumably embedded in the output as a watermark — confirm.
        encode_message = "@MyShell"
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
            message=encode_message)

In [None]:
from IPython.display import Audio, display

# Show a labelled player for every generated file. Iterating over
# `output_files` avoids hard-coded indices, which raise IndexError when fewer
# files were produced and silently skip files when more were produced.
for output_path in output_files:
    print(output_path)
    display(Audio(output_path))