Welcome to Tortoise! 🐢🐢🐢🐢

In case of bugs, compare against the original notebook [here](https://colab.research.google.com/drive/1wVVqUPqwiDBUVeWWOUNglpGhU3hg_cbR?usp=sharing) and the GitHub repository [here](https://github.com/neonbjb/tortoise-tts). Quality-of-life improvements and additional voices were added by [Downy](http://www.twitter.com/tooltrackers).

If you can't get a voice to work, try [Demucs](https://colab.research.google.com/drive/1qlpoIAb-nD-L29kFP976syIN4e6QiP4i?usp=sharing) and [VoiceFixer](https://colab.research.google.com/drive/1rypU23DARH3VsoJTKgDlviPDClOsXtXa?usp=sharing). Tortoise appears to work best with Standard American English "news anchor" voices, as it has trouble with cartoonish or noise-heavy (such as gravelly) ones.


In [None]:
#@title Check GPU
#@markdown - Tier List: (K80 < T4 < P100 < V100 < A100)
# List the GPUs attached to this runtime, one line per device (name and UUID).
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-02a48a7b-d8d6-03e6-66fe-e0f05b7229fd)


In [None]:
# @title Optional RAM check

from psutil import virtual_memory

# Total memory reported by psutil, converted to decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the cut-off this cell uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('\n'.join((
      'To enable a high-RAM runtime, select the Runtime > "Change runtime type"',
      'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
      're-execute this cell.',
  )))

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
#@title Install libraries

# the scipy version packaged with colab is not tolerant of misformated WAV files.
# install the latest version.

!pip3 install -U scipy

!git clone https://github.com/jnordberg/tortoise-tts.git
%cd tortoise-tts
!pip3 install -r requirements.txt
!python3 setup.py install

In [None]:
#@title Mount Google Drive

#@markdown This will also transfer saved voices and three large files.

# Mount the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Fetch three model checkpoints by Google Drive id. The trailing "#name" on each
# line is a shell comment recording which file the id refers to.
# Presumably the files land in the current working directory; the import cell
# later copies them from /content/tortoise-tts — confirm the cwd matches.
!gdown https://drive.google.com/uc?id=1SxZ3Qz9xIgCBxY7gxypg9o8E6sOORK49 #autoregressive.pth
!gdown https://drive.google.com/uc?id=1Q-uShpp_81PNV1o8LZ2bKDhJ4szGmaaa #clvp2.pth
!gdown https://drive.google.com/uc?id=1SxQNjL3VS5E1b5SMAKP69qLOEpsX7hRV #diffusion_decoder.pth

In [None]:
#@title Import functions

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
# `tts` is the shared synthesizer instance used by every generation cell below.
tts = TextToSpeech()

# Copy the three checkpoints fetched with gdown into the built package's
# models folder.
# NOTE(review): these copies run after TextToSpeech() has already been
# constructed — confirm whether they are meant to be in place before the
# models are loaded.
!cp -r "/content/tortoise-tts/autoregressive.pth" "/content/tortoise-tts/build/lib/tortoise/models"
!cp -r "/content/tortoise-tts/clvp2.pth" "/content/tortoise-tts/build/lib/tortoise/models"
!cp -r "/content/tortoise-tts/diffusion_decoder.pth" "/content/tortoise-tts/build/lib/tortoise/models"

In [None]:
#@title Download and upload voices

upload_voice = False #@param{type:'boolean'}

#Download voices from Google Drive
!gdown https://drive.google.com/uc?id=1T9AOI4lTjF3gGZr2gxU66Qj3ygfvK6jx #voices.zip
!unzip /content/tortoise-tts/voices.zip -d /content/tortoise-tts/tortoise/voices


#Upload a new voice
if (upload_voice):

  from google.colab import files

  %cd /content/tortoise-tts/tortoise/voices

  new_voice_name = "skeletor" #@param {type: 'string'}

  !mkdir $new_voice_name

  %cd $new_voice_name

  uploaded = files.upload()

  for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
    
    %cd /content/tortoise-tts

Downloading...
From: https://drive.google.com/uc?id=1T9AOI4lTjF3gGZr2gxU66Qj3ygfvK6jx
To: /content/tortoise-tts/voices.zip
100% 31.7M/31.7M [00:00<00:00, 72.5MB/s]
Archive:  /content/tortoise-tts/voices.zip
replace /content/tortoise-tts/tortoise/voices/bella/1.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
#@title Preview voices

#@markdown Tortoise will attempt to mimic voices you provide. It comes pre-packaged with some voices you might recognize. Let's list all the voices available. These are just some random clips I've gathered from the internet as well as a few voices from the training dataset.  Feel free to add your own clips to the voices/ folder.
new_voice_name = "unicole" #@param ["bella", "michelle_yeoh", "skeletor", "spaceboy", "spaceboy-alt", "unicole"] {allow-input: true}

%ls tortoise/voices

IPython.display.Audio('tortoise/voices/' + new_voice_name + '/1.wav')

[0m[01;34mangie[0m/      [01;34mgeralt[0m/         [01;34mpat[0m/           [01;34mtim_reynolds[0m/   [01;34mtrain_grace[0m/
[01;34mapplejack[0m/  [01;34mhalle[0m/          [01;34mpat2[0m/          [01;34mtom[0m/            [01;34mtrain_kennard[0m/
[01;34mbella[0m/      [01;34mjlaw[0m/           [01;34mrainbow[0m/       [01;34mtrain_atkins[0m/   [01;34mtrain_lescault[0m/
[01;34mdaniel[0m/     [01;34mlj[0m/             [01;34mskeletor[0m/      [01;34mtrain_daws[0m/     [01;34mtrain_mouse[0m/
[01;34mdeniro[0m/     [01;34mmichelle_yeoh[0m/  [01;34msnakes[0m/        [01;34mtrain_dotrice[0m/  [01;34municole[0m/
[01;34memma[0m/       [01;34mmol[0m/            [01;34mspaceboy[0m/      [01;34mtrain_dreams[0m/   [01;34mweaver[0m/
[01;34mfreeman[0m/    [01;34mmyself[0m/         [01;34mspaceboy-alt[0m/  [01;34mtrain_empire[0m/   [01;34mwilliam[0m/


In [None]:
#@title Text to speak

text = "I will tell YOU what to do!  It's time for SKELETOR to be in charge!" #@param {type:"string"}

# For longer passages, uncomment the block below and put the text between the
# triple quotes; it then replaces the form value above.
#text = """
#Space-Heaven
#"""

preset = "fast" #@param ["ultra_fast", "fast", "standard", "high_quality"]

# Voice folder to imitate; choose one of the names printed by the preview cell.
voice = "unicole" #@param ["bella", "michelle_yeoh", "skeletor", "spaceboy", "spaceboy-alt", "unicole"]

take = "take1" #@param ["take1", "take2", "take3", "take4", "take5"]

# Load the voice's conditioning data, synthesize `text` with the chosen preset,
# save the clip as "<take>.wav" with a 24000 Hz sample rate, and play it back.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)
torchaudio.save(f'{take}.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio(f'{take}.wav')

Generating autoregressive samples..


100%|██████████| 6/6 [00:34<00:00,  5.68s/it]


Computing best candidates using CLVP and CVVP


100%|██████████| 6/6 [00:04<00:00,  1.22it/s]


Transforming autoregressive outputs into audio..


100%|██████████| 80/80 [00:11<00:00,  7.15it/s]


SystemError: ignored

In [None]:
# This is the text that will be spoken. Running this cell replaces the `text`
# and `preset` values set by the "Text to speak" form cell.
# Alternative one-line sample (uncomment to use it instead):
#text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"

# Multi-line passage to synthesize; edit the text between the triple quotes.
# The literal starts with a newline and keeps its line breaks.
text = """
People have been bombarding me on social media with comparatives, scale drawings and everything, 
and coming up with all sorts of theories about whether or not there was any kind of conscious decision behind it. 
All I can say is this: 
In many ways, it’s really great because it’s always very helpful in media to have a foil. 
Which one do you like better? 
Which one was first? 
It just is another kind of narrative you can use to drive people’s engagement."""

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "high_quality"

In [None]:
# Voice folder to imitate.
# NOTE(review): 'starmagic' does not appear in the voice listing captured earlier
# in this notebook — presumably it has to be uploaded first; confirm.
voice = 'starmagic'

# Load the voice's conditioning data, synthesize `text` with the current
# `preset`, save the clip with a 24000 Hz sample rate, and play it back.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)
torchaudio.save('take21.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('take21.wav')

In [None]:
# Passing no voice samples and no conditioning latents makes Tortoise speak
# with a random voice, which changes on every run.
# (Note: random voices can be prone to strange utterances)
gen = tts.tts_with_preset(
    text,
    voice_samples=None,
    conditioning_latents=None,
    preset=preset,
)
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated.wav')

In [None]:
# Optionally, use your own voice by running this cell and the next one. I recommend
# you upload at least 2 audio clips. They must be WAV files, 6-10 seconds long.
CUSTOM_VOICE_NAME = "custom"

import os
from google.colab import files

custom_voice_folder = f"tortoise/voices/{CUSTOM_VOICE_NAME}"
# exist_ok=True: re-running this cell (e.g. to upload different clips) must not
# fail with FileExistsError because an earlier run already created the folder.
os.makedirs(custom_voice_folder, exist_ok=True)
# Each uploaded file is written as 0.wav, 1.wav, ... in the order returned by
# files.upload(), so a repeated run overwrites clips that share an index.
for i, file_data in enumerate(files.upload().values()):
  with open(os.path.join(custom_voice_folder, f'{i}.wav'), 'wb') as f:
    f.write(file_data)

In [None]:
# Generate speech with the custom voice: load the clips stored under
# CUSTOM_VOICE_NAME, synthesize `text`, then save and play the result.
voice_samples, conditioning_latents = load_voice(CUSTOM_VOICE_NAME)
gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)
torchaudio.save(f'generated-{CUSTOM_VOICE_NAME}.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio(f'generated-{CUSTOM_VOICE_NAME}.wav')

In [None]:
# Several conditioning voices can be blended: load_voices takes a list of voice
# names, and the resulting voice carries traits from all the parents.
#
# Lets see what it would sound like if Picard and Kirk had a kid with a penchant for philosophy:
voice_samples, conditioning_latents = load_voices(['pat', 'william'])

gen = tts.tts_with_preset(
    "They used to say that if man was meant to fly, he’d have wings. But he did fly. He discovered he had to.",
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)
torchaudio.save('captain_kirkard.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('captain_kirkard.wav')