# OpenVoice v1
Based on https://github.com/myshell-ai/OpenVoice/blob/main/demo_part1.ipynb

## Download voice samples repo

In [None]:
!cd /content && wget https://github.com/ekkus93/voice_samples/archive/refs/heads/master.zip && unzip master.zip && rm master.zip

## Download and Install OpenVoice

In [None]:
!cd /content && wget https://github.com/myshell-ai/OpenVoice/archive/refs/heads/main.zip && unzip main.zip && rm main.zip

In [None]:
!cd /content/OpenVoice-main && pip install .

In [None]:
!pip install faster_whisper

### Fix numpy version

In [None]:
!pip install numpy==1.26.4

## Voice Style Control Demo

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

In [None]:
# Ask for a Hugging Face access token (input is not echoed) and export it as
# the HF_TOKEN environment variable for the rest of the session.
import getpass
import os

# Pasted tokens often carry stray spaces or a trailing newline; strip them.
hf_token = getpass.getpass('Enter your Hugging Face access token: ').strip()

if hf_token:
    os.environ['HF_TOKEN'] = hf_token
else:
    # Do not export an empty token; leave any existing HF_TOKEN untouched.
    print('No token entered; HF_TOKEN was left unchanged.')

## Download models

In [None]:
!cd /content/OpenVoice-main && wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_1226.zip && unzip checkpoints_1226.zip

In [None]:
!cd /content/OpenVoice-main && wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip && unzip checkpoints_v2_0417.zip

### Initialization

In [None]:
# Work from the OpenVoice checkout so the relative checkpoint and resource
# paths used below resolve, then show the resulting working directory.
os.chdir("/content/OpenVoice-main")
print(os.getcwd())

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Locations of the English base speaker, the converter, and generated audio.
ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Base text-to-speech model: build from its config, then load its checkpoint.
base_speaker_tts = BaseSpeakerTTS(
    f'{ckpt_base}/config.json',
    device=device,
)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

# Tone color converter: build from its config, then load its checkpoint.
tone_color_converter = ToneColorConverter(
    f'{ckpt_converter}/config.json',
    device=device,
)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

### Obtain Tone Color Embedding

The `source_se` is the tone color embedding of the base speaker.
It is an average of multiple sentences generated by the base speaker. We provide the precomputed result here, but
readers are free to extract `source_se` themselves.

In [None]:
# Load the precomputed tone color embedding of the English base speaker.
# `map_location` lets the file load on a CPU-only runtime even if the tensor
# was serialized from a CUDA device (NOTE(review): storage device of the
# shipped file is not visible here).
source_se = torch.load(f'{ckpt_base}/en_default_se.pth', map_location=device).to(device)

The `reference_speaker` path below points to a short audio clip of the reference speaker whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `target_se` using the filename of the audio and **will not automatically overwrite it.**

#### Option 1: Default reference speaker

In [None]:
# The voice to clone: the example clip under the checkout's resources folder.
reference_speaker = 'resources/example_reference.mp3'

# Extract the target tone color embedding from the reference clip.
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    target_dir='processed',
    vad=True,
)

#### Option 2: Pick a voice from voice samples

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display

# Directory unpacked from the voice samples archive.
voice_samples_dir = "/content/voice_samples-master/voice_samples"

# Regular files only, in alphabetical order.
voice_files = sorted(
    entry.name for entry in os.scandir(voice_samples_dir) if entry.is_file()
)

# Dropdown listing every sample; "None" is the initial placeholder entry.
dropdown = widgets.Dropdown(
    options=["None", *voice_files],
    value="None",
    description='Select voice file:',
    disabled=False,
)

display(dropdown)

In [None]:
# "None" is the dropdown's placeholder, not a file: fail with a clear message
# instead of handing a non-existent path to the extractor.
if dropdown.value == "None":
    raise ValueError("No voice file selected: pick one from the dropdown, then re-run this cell.")

# Build the path from the same directory the dropdown was populated from.
reference_speaker = os.path.join(voice_samples_dir, dropdown.value)
print(f"Selected voice file: {reference_speaker}")

# Extract the target tone color embedding from the chosen sample.
target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)

In [None]:
from IPython.display import Audio

# Play the selected reference clip to confirm it is the intended voice.
reference_player = Audio(reference_speaker)
reference_player

#### Option 3: Use your own voice

In [None]:
from google.colab import output
from IPython.display import display, HTML, Javascript
from base64 import b64decode

# Text for the user to read aloud while recording.
sample_text = """
With the police helicopters thundering towards us,
and Marla and all the support group people who couldn't save themselves,
with all of them trying to save me, I had to pull the trigger.
This was better than real life.
"""
print(sample_text)

def save_audio(data, filename="/content/myvoice.wav"):
    """Decode base64 audio sent from the browser and write it to ``filename``.

    Args:
        data: Base64-encoded bytes of the recording (no data-URL prefix).
        filename: Destination path for the decoded bytes.

    The bytes are written exactly as received; the container format is
    whatever the browser's MediaRecorder produced, regardless of the
    ``.wav`` extension.
    """
    audio_data = b64decode(data)
    with open(filename, "wb") as f:
        f.write(audio_data)
    print(f"Audio saved as {filename}")

# Expose save_audio to JavaScript under the name used by invokeFunction below.
output.register_callback('notebook.save_audio', save_audio)

# Display recording controls in Colab
display(HTML('''
  <div>
    <button id="start-record">Start Recording</button>
    <button id="stop-record" disabled>Stop Recording</button>
    <span id="record-status"></span>
  </div>
'''))

display(Javascript('''
let mediaRecorder;
let audioChunks;

const startButton = document.getElementById("start-record");
const stopButton = document.getElementById("stop-record");
const statusLabel = document.getElementById("record-status");

startButton.onclick = async () => {
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    // Permission denied or no microphone: report it instead of failing silently.
    statusLabel.textContent = "Could not access the microphone: " + err.message;
    return;
  }

  mediaRecorder = new MediaRecorder(stream);
  audioChunks = [];

  mediaRecorder.ondataavailable = event => {
    if (event.data.size > 0) {
      audioChunks.push(event.data);
    }
  };

  mediaRecorder.onstop = async () => {
    // Release the microphone so the browser stops capturing after recording.
    stream.getTracks().forEach(track => track.stop());

    const audioBlob = new Blob(audioChunks);
    const reader = new FileReader();
    reader.readAsDataURL(audioBlob);
    reader.onloadend = () => {
      // Drop the "data:...;base64," prefix and send only the payload.
      const base64data = reader.result.split(',')[1];
      google.colab.kernel.invokeFunction('notebook.save_audio', [base64data], {});
      statusLabel.textContent = "Recording sent to the notebook.";
    };
  };

  mediaRecorder.start();
  statusLabel.textContent = "Recording...";
  startButton.disabled = true;
  stopButton.disabled = false;
};

stopButton.onclick = () => {
  mediaRecorder.stop();
  startButton.disabled = false;
  stopButton.disabled = true;
};
'''))


In [None]:
# Re-encode the saved recording as a real WAV file (16-bit little-endian PCM,
# 44100 Hz) with ffmpeg. The result goes to a temporary file and replaces
# /content/myvoice.wav only if the conversion succeeds (`&&`).
!ffmpeg -y -i /content/myvoice.wav -acodec pcm_s16le -ar 44100 /content/myvoice_fixed.wav && mv /content/myvoice_fixed.wav /content/myvoice.wav

In [None]:
# Clone the voice from the recording saved at /content/myvoice.wav.
reference_speaker = "/content/myvoice.wav"
print(f"Selected voice file: {reference_speaker}")

# Extract the target tone color embedding from the recording.
target_se, audio_name = se_extractor.get_se(
    reference_speaker,
    tone_color_converter,
    target_dir='processed',
    vad=True,
)

In [None]:
from IPython.display import Audio

# Play the recorded reference clip to confirm it captured correctly.
reference_player = Audio(reference_speaker)
reference_player

### Inference

In [None]:
# Step 1: synthesize the sentence with the base speaker into a scratch file.
text = "This audio is generated by OpenVoice."
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(
    text,
    src_path,
    speaker='default',
    language='English',
    speed=1.0,
)

# Step 2: convert the base speaker's tone color to the target speaker's.
# NOTE(review): `message` is presumably the watermark text; confirm against
# the ToneColorConverter class.
save_path = f'{output_dir}/output_en_default.wav'
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message,
)

In [None]:
from IPython.display import Audio

# Show where the converted audio was written, then render a player for it.
print(save_path)
output_player = Audio(save_path)
output_player

**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding needs to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9.

In [None]:
# Display the base-speaker checkpoint directory currently bound to `ckpt_base`.
ckpt_base

In [None]:
# Styled speech uses a different source embedding than the default voice.
# `map_location` lets the file load on a CPU-only runtime even if the tensor
# was serialized from a CUDA device.
source_se = torch.load('checkpoints/base_speakers/EN/en_style_se.pth', map_location=device).to(device)
save_path = f'{output_dir}/output_whispering.wav'

# Run the base speaker tts in the 'whispering' style, slightly slowed down
text = "This audio is generated by OpenVoice."
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

In [None]:
from IPython.display import Audio

# Show where the whispering output was written, then render a player for it.
print(save_path)
output_player = Audio(save_path)
output_player

**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replacing the base speaker. We provide an example with a Chinese base speaker here, and we encourage readers to try `demo_part2.ipynb` for a detailed demo.

In [None]:

# Switch to the Chinese base speaker.
# NOTE: this rebinds `ckpt_base` and `base_speaker_tts`; re-run the
# initialization cell before going back to the English examples.
ckpt_base = 'checkpoints/base_speakers/ZH'
base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')

# `map_location` lets the file load on a CPU-only runtime even if the tensor
# was serialized from a CUDA device.
source_se = torch.load(f'{ckpt_base}/zh_default_se.pth', map_location=device).to(device)
save_path = f'{output_dir}/output_chinese.wav'

# Run the base speaker tts
text = "今天天气真好，我们一起出去吃饭吧。"
src_path = f'{output_dir}/tmp.wav'
base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)

# Run the tone color converter
encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=src_path,
    src_se=source_se,
    tgt_se=target_se,
    output_path=save_path,
    message=encode_message)

In [None]:
from IPython.display import Audio

# Show where the Chinese output was written, then render a player for it.
print(save_path)
output_player = Audio(save_path)
output_player

**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add a watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether audio was generated by OpenVoice**, regardless of whether the watermark is added.