# SpeechSplit 2.0 Demo

This notebook demonstrates how to use SpeechSplit 2.0. First, download the source code from the GitHub repository:

In [None]:
!git clone https://github.com/biggytruck/SpeechSplit2

Cloning into 'SpeechSplit2'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 36 (delta 10), reused 25 (delta 5), pack-reused 0[K
Unpacking objects: 100% (36/36), done.


In [None]:
import os
os.chdir('/content/SpeechSplit2')

Install requirements

In [None]:
!pip install pyyaml numpy pysoundfile tqdm wavenet_vocoder pysptk librosa pyworld 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pysoundfile
  Downloading PySoundFile-0.9.0.post1-py2.py3-none-any.whl (24 kB)
Collecting wavenet_vocoder
  Downloading wavenet_vocoder-0.1.1.tar.gz (13 kB)
Collecting pysptk
  Downloading pysptk-0.1.21.tar.gz (420 kB)
[K     |████████████████████████████████| 420 kB 6.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyworld
  Downloading pyworld-0.3.0.tar.gz (212 kB)
[K     |████████████████████████████████| 212 kB 53.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: wavenet-vocoder, pysptk, pyworld
  Building wheel for wavenet-vocoder (setup.py) ... [?25l[?25hdone
  Created

In [None]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Download Checkpoints

In [None]:
!pip install -U gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Download the WaveNet vocoder checkpoint

In [None]:
# -p makes the cell idempotent: re-running it does not fail if the directory already exists.
!mkdir -p /content/SpeechSplit2/models

In [None]:
# https://drive.google.com/file/d/1Zksy0ndlDezo9wclQNZYkGi_6i7zi4nQ/view
!gdown 1Zksy0ndlDezo9wclQNZYkGi_6i7zi4nQ

Downloading...
From: https://drive.google.com/uc?id=1Zksy0ndlDezo9wclQNZYkGi_6i7zi4nQ
To: /content/SpeechSplit2/checkpoint_step001000000_ema.pth
100% 297M/297M [00:05<00:00, 53.5MB/s]


In [None]:
!mv /content/SpeechSplit2/checkpoint_step001000000_ema.pth /content/SpeechSplit2/models/wavenet_vocoder.pth

Download small bottleneck checkpoints

In [None]:
# Small Generator
# https://drive.google.com/uc?export=download&id=1_Eo6_XxcZpk4P0jzjudkgjTKeb3Y-wMu
!gdown 1_Eo6_XxcZpk4P0jzjudkgjTKeb3Y-wMu

Downloading...
From: https://drive.google.com/uc?id=1_Eo6_XxcZpk4P0jzjudkgjTKeb3Y-wMu
To: /content/SpeechSplit2/spsp2-small-G-800000.ckpt
100% 77.8M/77.8M [00:01<00:00, 65.1MB/s]


In [None]:
# Small F0 Converter
# https://drive.google.com/uc?export=download&id=1MhWkz3UGeZSolKfw0FF0DqhHNN1e5C82
!gdown 1MhWkz3UGeZSolKfw0FF0DqhHNN1e5C82

Downloading...
From: https://drive.google.com/uc?id=1MhWkz3UGeZSolKfw0FF0DqhHNN1e5C82
To: /content/SpeechSplit2/spsp2-small-F-800000.ckpt
100% 14.4M/14.4M [00:00<00:00, 39.9MB/s]


Download large bottleneck checkpoints

In [None]:
# Large Generator
# https://drive.google.com/uc?export=download&id=1yTVy4BjonLdXW7kTxvEMfDf_RhuDCyBZ
!gdown 1yTVy4BjonLdXW7kTxvEMfDf_RhuDCyBZ

Downloading...
From: https://drive.google.com/uc?id=1yTVy4BjonLdXW7kTxvEMfDf_RhuDCyBZ
To: /content/SpeechSplit2/spsp2-large-G-800000.ckpt
100% 80.3M/80.3M [00:01<00:00, 66.2MB/s]


In [None]:
# Large F0 Converter
# https://drive.google.com/uc?export=download&id=1th0OFjM1k7y3dtNcijhUy1teKY23bHL8
! gdown 1th0OFjM1k7y3dtNcijhUy1teKY23bHL8

Downloading...
From: https://drive.google.com/uc?id=1th0OFjM1k7y3dtNcijhUy1teKY23bHL8
To: /content/SpeechSplit2/spsp2-large-F-800000.ckpt
100% 15.0M/15.0M [00:00<00:00, 35.9MB/s]


In [None]:
!mv spsp2-* /content/SpeechSplit2/models

# Executing

In [None]:
import os
import yaml
from collections import OrderedDict

import torch
import numpy as np
from soundfile import read, write

from model import Generator_3 as Generator
from model import Generator_6 as F0_Converter
from wavenet import Synthesizer
from utils import *

In [None]:
def load_ckpt(model, ckpt_path):
    """Load the weights stored under ``ckpt['model']`` into ``model``.

    The checkpoint is mapped onto CPU storage while loading. If the stored
    keys do not fit the model as-is, a leading ``'module.'`` prefix is removed
    from every key that has one and loading is retried (presumably the
    checkpoint was saved from a ``DataParallel``-wrapped model -- TODO confirm).

    Args:
        model: ``torch.nn.Module`` whose parameters are overwritten in place.
        ckpt_path: Path of a checkpoint file readable by ``torch.load``.

    Raises:
        KeyError: If the checkpoint has no ``'model'`` entry.
        RuntimeError: If the weights still do not fit after prefix stripping.
    """
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    state_dict = ckpt['model']
    try:
        model.load_state_dict(state_dict)
    except RuntimeError:
        # load_state_dict signals key/shape mismatches with RuntimeError; only
        # that case is retried, so unrelated failures are not masked.
        prefix = 'module.'
        stripped = OrderedDict(
            (k[len(prefix):] if k.startswith(prefix) else k, v)
            for k, v in state_dict.items()
        )
        model.load_state_dict(stripped)

def pad_fea(fea):
    """Zero-pad a 2-D feature array at the end of its first axis up to ``T`` rows.

    ``T`` is read from module scope. The second axis is left untouched.
    ``np.pad`` raises ``ValueError`` when ``fea`` already has more than ``T`` rows.
    """
    rows_missing = T - len(fea)
    pad_spec = ((0, rows_missing), (0, 0))
    return np.pad(fea, pad_spec, 'constant')

def create_feats(wav, gen, spk_id, config):
    """Turn a waveform into the four model inputs: rhythm, content, pitch, timbre.

    Args:
        wav: 1-D numpy array of audio samples; the module-level ``fs`` is
            passed to the feature extractors as its sampling rate.
        gen: ``'M'`` selects the F0 bounds (50, 250); any other value selects
            (100, 600). Presumably speaker gender, bounds in Hz -- TODO
            confirm against ``extract_f0``.
        spk_id: Position (anything ``int()`` accepts) set to 1.0 in the
            82-element one-hot timbre vector.
        config: Not used by this function; kept so existing callers still work.

    Returns:
        Tuple ``(rhy_input, con_input, pit_input, tim_input)`` of float
        tensors on the module-level ``device``, each with a leading batch
        axis of size 1.
    """
    if gen == 'M':
        lo, hi = 50, 250
    else:
        lo, hi = 100, 600

    # Append one near-silent sample when the length is an exact multiple of
    # 256 (presumably the hop size, to keep the frame counts of the extractors
    # aligned -- TODO confirm).
    if wav.shape[0] % 256 == 0:
        wav = np.concatenate((wav, np.array([1e-06])), axis=0)
    _, f0_norm = extract_f0(wav, fs, lo, hi)
    f0, sp, ap = get_world_params(wav, fs)
    f0 = average_f0s([f0])[0]
    # Rhythm and content features are computed from the waveform rebuilt with
    # the averaged F0, not from the raw input.
    wav_mono = get_monotonic_wav(wav, f0, sp, ap, fs)

    rhy_input = pad_fea(get_spenv(wav_mono))
    con_input = pad_fea(get_spmel(wav_mono))
    pit_input = pad_fea(quantize_f0_numpy(f0_norm)[0])
    # One-hot speaker vector; 82 is presumably the number of training
    # speakers -- verify against the model config.
    tim_input = np.zeros((82,), dtype=np.float32)
    tim_input[int(spk_id)] = 1.0

    # Return a real tuple rather than a one-shot generator so the result can
    # be indexed, stored, or iterated more than once; unpacking still works.
    return tuple(
        torch.FloatTensor(x).unsqueeze(0).to(device)
        for x in (rhy_input, con_input, pit_input, tim_input)
    )

def convert_sp(model, rhy_input, con_input, pit_input, tim_input):
    """Run the generator on the given inputs and return the decoded spectrogram.

    Content and pitch inputs are concatenated along the last axis before being
    encoded (with ``rr=False``); the decoder is asked for ``T`` frames, where
    ``T`` is read from module scope.

    Returns:
        The first batch item of the decoder output as a numpy array.
    """
    rhythm_code = model.rhythm(rhy_input)
    content_and_pitch = torch.cat((con_input, pit_input), dim=-1)
    content_code, pitch_code = model.content_pitch(content_and_pitch, rr=False)
    decoded = model.decode(content_code, rhythm_code, pitch_code, tim_input, T)
    return decoded.cpu().numpy()[0]

def convert_pit(model, rhy_input, con_input, pit_input):
    """Run the F0 converter on a rhythm input and a content+pitch input.

    Content and pitch inputs are concatenated along the last axis. Both model
    inputs are then padded at the end of axis 1 up to ``T`` (module scope)
    before the model is called.

    Returns:
        Whatever ``model`` returns for the padded inputs.
    """
    def pad_to_T(x):
        # The pad spec is listed from the last axis backwards: only the end of
        # axis 1 receives padding.
        return torch.nn.functional.pad(x, (0, 0, 0, T - x.size(1), 0, 0))

    joint_input = torch.cat([con_input, pit_input], dim=-1)
    padded_rhythm = pad_to_T(rhy_input)
    padded_joint = pad_to_T(joint_input)
    # rr=False disables random resampling at inference time.
    return model(padded_rhythm, padded_joint, rr=False)

In [None]:
config_name = 'spsp2-large' # or 'spsp2-small'
# Read the YAML inside a context manager so the file handle is closed
# instead of being left to the garbage collector.
with open(f'configs/{config_name}.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)
config = Dict2Class(config)
config.train = False

T = 192 # maximum number of frames in the output mel-spectrogram
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fs = 16000
S = Synthesizer(device)
S.load_ckpt('models/wavenet_vocoder.pth')

# The generator is built before config.dim_pit is widened below.
G = Generator(config).eval().to(device)
load_ckpt(G, f'models/{config_name}-G-800000.ckpt')

# The F0 converter is fed content and pitch features concatenated along the
# last axis (see convert_pit), so its pitch input dimension is the sum of both.
config.dim_pit = config.dim_con+config.dim_pit
F = F0_Converter(config).eval().to(device)
load_ckpt(F, f'models/{config_name}-F-800000.ckpt')

In [None]:
result_dir = 'result'
os.makedirs(result_dir, exist_ok=True)
# NOTE(review): the sampling rates returned by read() are discarded; both
# files are assumed to be sampled at fs -- confirm.
src_wav, _ = read('data/test/p225_001.wav')
tgt_wav, _ = read('data/test/p258_001.wav')

with torch.no_grad():
    # The features depend only on the two utterances, not on the conversion
    # condition, so extract them once instead of on every loop iteration.
    src_rhy, src_con, src_pit, src_tim = create_feats(src_wav, 'F', 0, config)
    tgt_rhy, tgt_con, tgt_pit, tgt_tim = create_feats(tgt_wav, 'M', 31, config)

    # Each letter selects an aspect taken from the target utterance:
    # R = rhythm input, F = pitch (via the F0 converter), U = timbre vector.
    conds = ['R', 'F', 'U', 'RF', 'RU', 'FU', 'RFU']
    for cond in conds:
        # Start from the source features and swap in the selected target aspects.
        inp_rhy, inp_con, inp_pit, inp_tim = src_rhy, src_con, src_pit, src_tim
        if 'R' in cond:
            inp_rhy = tgt_rhy
        if 'U' in cond:
            inp_tim = tgt_tim
        if 'F' in cond:
            inp_pit = convert_pit(F, src_rhy, tgt_con, tgt_pit)
        out_sp = convert_sp(G, inp_rhy, inp_con, inp_pit, inp_tim)
        out_wav = S.spect2wav(out_sp)
        write(os.path.join(result_dir, f'p225_p258_001_{cond}.wav'), out_wav, fs)


100%|██████████| 49152/49152 [07:16<00:00, 112.52it/s]
100%|██████████| 49152/49152 [07:15<00:00, 112.76it/s]
100%|██████████| 49152/49152 [07:13<00:00, 113.44it/s]
100%|██████████| 49152/49152 [07:10<00:00, 114.08it/s]
100%|██████████| 49152/49152 [07:10<00:00, 114.14it/s]
100%|██████████| 49152/49152 [07:10<00:00, 114.21it/s]
100%|██████████| 49152/49152 [07:09<00:00, 114.36it/s]


Let's download the results

In [None]:
!tar -jcvf result.tar.bz result

result/
result/p225_p258_001_F.wav
result/p225_p258_001_R.wav
result/p225_p258_001_RF.wav
result/p225_p258_001_U.wav
result/p225_p258_001_FU.wav
result/p225_p258_001_RU.wav
result/p225_p258_001_RFU.wav


In [None]:
from google.colab import files
files.download('/content/SpeechSplit2/result.tar.bz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>