In [1]:
#imports
import random
import os
import json
import librosa
from tqdm import tqdm

import torch
import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt
from pathlib import Path

from model import FastSpeech2
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.parts.utils.tts_dataset_utils import BetaBinomialInterpolator
import soundfile as sf
import numpy as np

[NeMo W 2023-07-28 00:32:44 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 00:32:44 experimental:27] Module <class 'nemo.collections.tts.parts.utils.callbacks.LoggingCallback'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 00:32:44 experimental:27] Module <class 'nemo.collections.tts.models.fastpitch_ssl.FastPitchModel_SSL'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 00:32:44 experimental:27] Module <class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 00:32:44 experimental:27] Module <class 'nemo.collections.tts.models.radtts.RadTTSModel'> 

In [2]:
import yaml
model_config = yaml.load(open('/workspace/nemo/vol/FastSpeech2/config/RAVDESS/model.yaml',
                              "r"), Loader=yaml.FullLoader)
preprocess_config = yaml.load(open('/workspace/nemo/vol/FastSpeech2/config/RAVDESS/preprocess.yaml',
                              "r"), Loader=yaml.FullLoader)

In [3]:
fp = FastSpeech2(model_config=model_config, preprocess_config=preprocess_config)
ckpt_file_path = '/workspace/nemo/vol/FastSpeech2/output/ckpt/RAVDESS/800000.pth.tar'
checkpoint = torch.load(ckpt_file_path, map_location=torch.device('cuda'))
fp.load_state_dict(checkpoint['model'])
fp = fp.to('cuda')

In [4]:
#helper functions
def load_wav(audio_file, target_sr=None):
    with sf.SoundFile(audio_file, 'r') as f:
        samples = f.read(dtype='float32')
        sample_rate = f.samplerate
        if target_sr is not None and target_sr != sample_rate:
            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples.transpose()

In [10]:
voices_path = 'raw_data/RAVDESS/'
speaker = 'josh'
base_path = os.path.join(voices_path, speaker)
speakers_path = "preprocessed_data/RAVDESS/speakers.json"
emotions_path = "preprocessed_data/RAVDESS/emotions.json"

rec = []

for emotion in os.listdir(base_path):
    emotion_path = os.path.join(base_path, emotion)
    for file in os.listdir(emotion_path):
        filename = file.split('.')[0]
        audio_filepath = os.path.join(emotion_path, '.'.join([filename, 'wav'])) 
        text_filepath = os.path.join(emotion_path, '.'.join([filename, 'lab']))
        duration = librosa.get_duration(filename=audio_filepath)
        with open(text_filepath, 'r') as f:
            text = f.read()
        with open(speakers_path, "r") as f:
            speakers = json.load(f)
            speaker_ = [v for k,v in speakers.items() if k==speaker][0]
        with open(emotions_path, "r") as f:
            emotions = json.load(f)
            emotion_ = [v for k,v in emotions.items() if k==emotion][0]   
        r = {
           "audio_filepath" : audio_filepath,
           "text" : text,
           "speaker": speaker_,
           "emotion": emotion_,
           "duration" : round(duration,1),
           "text_no_preprocessing" : text
        }
        rec.append(r)
random.shuffle(rec)
train_len = int(0.9*len(rec))
train_rec = rec[:train_len]
val_rec = rec[train_len:]

In [11]:
train_manifest = f'./fastpitch_train.json'
with open(train_manifest, "w") as f:
    for s in train_rec:
        f.write(json.dumps(s) + '\n')
        
val_manifest = f'./fastpitch_val.json'
with open(val_manifest, "w") as f:
    for s in val_rec:
        f.write(json.dumps(s) + '\n')


In [12]:
train_rec[0]

{'audio_filepath': 'raw_data/RAVDESS/josh/neutral/neutral_225-252_0238.wav',
 'text': ' it happened to him at the garland society in oaklands one afternoon.',
 'speaker': 1,
 'emotion': 4,
 'duration': 5.2,
 'text_no_preprocessing': ' it happened to him at the garland society in oaklands one afternoon.'}

In [46]:
from synthesize import preprocess_english

def create_hifigan_finetune_data(records, speaker, run= 'train', device='cuda'):    
    beta_binomial_interpolator = BetaBinomialInterpolator()

    save_dir = Path(f"/workspace/nemo/vol/FastSpeech2/mels/{speaker}_{run}")
    save_dir.mkdir(exist_ok=True, parents=True)

    # Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
    for i, r in tqdm(enumerate(records)):
        audio = load_wav(r["audio_filepath"])
        audio = torch.from_numpy(audio).unsqueeze(0).to(device)
        audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)


        with torch.no_grad():
            text = r['text']
            speaker = r['speaker']
            emotion = r['emotion']
            ids = raw_texts = [text[:100]]
            speaker = torch.tensor([speaker]).to('cuda')
            emotion = torch.tensor([emotion]).to('cuda') 
            texts = torch.tensor([preprocess_english(text, preprocess_config)]).to('cuda')
            text_lens = torch.tensor([len(texts[0])]).to('cuda')
            batchs = [(ids, raw_texts, speaker, emotion, texts, text_lens, max(text_lens))]
            predictions = fp(*(batchs[0][2:]))
            spectrogram = predictions[1].transpose(1, 2)

    #         if "normalized_text" in r:
    #             text = spec_model.parse(r["normalized_text"], normalize=False)
    #         else:
    #             text = spec_model.parse(r['text'])

    #         text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)

    #         spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

    #         # Generate attention prior and spectrogram inputs for HiFi-GAN
    #         attn_prior = torch.from_numpy(
    #           beta_binomial_interpolator(spect_len.item(), text_len.item())
    #         ).unsqueeze(0).to(text.device)

    #         spectrogram = spec_model.forward(
    #           text=text, 
    #           input_lens=text_len, 
    #           spec=spect, 
    #           mel_lens=spect_len, 
    #           attn_prior=attn_prior,
    #         )[0]

            save_path = save_dir / f"mel_{i}.npy"
            np.save(save_path, spectrogram[0].to('cpu').numpy())
            r["mel_filepath"] = str(save_path)

    hifigan_manifest_path = f"./hifigan_{run}_ft.json"
    with open(hifigan_manifest_path, "w") as f:
        for r in records:
            f.write(json.dumps(r) + '\n')

In [47]:
create_hifigan_finetune_data(train_rec, 'josh', 'train', 'cuda')

1553it [19:10,  1.35it/s]


In [48]:
create_hifigan_finetune_data(val_rec, 'josh', 'val', 'cuda')

173it [02:08,  1.35it/s]


In [21]:
! cd hifigan_conf && unzip hifigan.zip
! cd hifigan_conf && cd hifigan && wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/hifigan/hifigan.yaml && cd .. 

Archive:  hifigan.zip
   creating: hifigan/
   creating: hifigan/model/
   creating: hifigan/model/validation_ds/
 extracting: hifigan/model/validation_ds/val_ds.yaml  
 extracting: hifigan/model/validation_ds/val_ds_finetune.yaml  
   creating: hifigan/model/train_ds/
 extracting: hifigan/model/train_ds/train_ds.yaml  
 extracting: hifigan/model/train_ds/train_ds_finetune.yaml  
   creating: hifigan/model/generator/
 extracting: hifigan/model/generator/v1.yaml  
 extracting: hifigan/model/generator/v1_44100.yaml  
 extracting: hifigan/model/generator/v2.yaml  
 extracting: hifigan/model/generator/v3.yaml  
 extracting: hifigan/hifigan.yaml    
 extracting: hifigan/hifigan_44100.yaml  
--2023-07-28 01:28:08--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/conf/hifigan/hifigan.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|

In [33]:
home_path = !(echo $HOME)
home_path = home_path[0]
print(home_path)

nemo_files = [p for p in Path(f"{home_path}/.cache/torch/NeMo/").glob("**/tts_hifigan.nemo")]
print(f"Copying {nemo_files[0]} to ./")
Path("./tts_hifigan.nemo").write_bytes(nemo_files[0].read_bytes())

/root
Copying /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo to ./


315386678

In [25]:
#!pip install wandb -qU
#import wandb
#wandb.login()

In [50]:
wandb_api_key = "108bad0089a140932fbbe1c9e2ae182a1a228ffe"
#wandb.login()

In [30]:
!wget https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py

--2023-07-28 01:31:09--  https://raw.githubusercontent.com/nvidia/NeMo/main/examples/tts/hifigan_finetune.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1192 (1.2K) [text/plain]
Saving to: ‘hifigan_finetune.py’


2023-07-28 01:31:09 (261 MB/s) - ‘hifigan_finetune.py’ saved [1192/1192]



In [51]:
!(HYDRA_FULL_ERROR=1 python hifigan_finetune.py \
--config-name=hifigan.yaml \
model.train_ds.dataloader_params.batch_size=32 \
model.max_steps=100000 \
model.optim.lr=1e-4 \
~model.optim.sched \
train_dataset=hifigan_train_ft.json \
validation_datasets=hifigan_val_ft.json \
exp_manager.exp_dir=hifigan_ft \
+init_from_pretrained_model=nvidia/tts_hifigan \
trainer.check_val_every_n_epoch=5 \
trainer.log_every_n_steps=3 \
)
# exp_manager.create_wandb_logger=true \
# exp_manager.wandb_logger_kwargs.name='Josh_emotional_HG' \
# exp_manager.wandb_logger_kwargs.project="TTS_convai"  \
# model/train_ds=train_ds_finetune \
# model/validation_ds=val_ds_finetune \

[NeMo W 2023-07-28 02:15:51 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 02:15:51 experimental:27] Module <class 'nemo.collections.tts.parts.utils.callbacks.LoggingCallback'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 02:15:51 experimental:27] Module <class 'nemo.collections.tts.models.fastpitch_ssl.FastPitchModel_SSL'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 02:15:51 experimental:27] Module <class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-28 02:15:51 experimental:27] Module <class 'nemo.collections.tts.models.radtts.RadTTSModel'> 

In [42]:
from nemo.collections.tts.torch.data import VocoderDataset