# **VOICE CONVERSION (ML PROJECT PHASE 2) - GROUP 30 - PART_2 - FINAL**

## **SETTING UP THE SYSTEM**

In [2]:
from google.colab import drive

## **GETTING THE AUDIO FILES**

In [3]:
# Mount Google Drive at /content/drive so the audio files referenced through
# DIRECTORY (defined in a later cell) are reachable from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pysptk pyworld librosa tqdm

In [None]:
!pip install nnmnkwii

In [None]:
from os.path import join, expanduser
# Data root on the mounted Drive; the speaker folders are looked up below it.
# The path is absolute, so no home-directory prefix is used: os.path.join
# discards every component that precedes an absolute one, which made the
# former expanduser("~") prefix a no-op.
DIRECTORY = join("/content/drive/MyDrive/ML_PROJECT/PHASE_2/", "Audio_Testing")
!ls $DIRECTORY

In [None]:
%pylab inline
rcParams["figure.figsize"] = (16,5)

from nnmnkwii.datasets import PaddedFileSourceDataset as padding
from nnmnkwii.preprocessing.alignment import DTWAligner as dtw
from nnmnkwii.preprocessing import trim_zeros_frames as trim_frames
from nnmnkwii.preprocessing import remove_zeros_frames as remove_frames
from nnmnkwii.preprocessing import delta_features as first_der
from nnmnkwii.util import apply_each2d_trim as trim_2d
from nnmnkwii.metrics import melcd as mel_CD
from nnmnkwii.baseline.gmm import MLPG
from nnmnkwii.datasets import FileDataSource
from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource

from os.path import basename, splitext
from os import listdir
from os.path import isdir, join, splitext

import numpy as np
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import pyworld
import pysptk
from pysptk.synthesis import MLSADF, Synthesizer
import IPython
from IPython.display import Audio

In [9]:
# ---- Analysis / synthesis settings shared by the cells below ----
sampling_rate = 48000  # Hz; the derived settings below are computed from it
n_mcc = 30             # order passed to pysptk.sp2mc and to the MLSA filter
frame_quantum = 5      # frame period in milliseconds (passed to pyworld.dio)

# Frequency-warping parameter suggested by pysptk for this sampling rate.
alpha_val = pysptk.util.mcepalpha(sampling_rate)
# Frame shift in samples: sampling rate times the frame period in seconds.
hopsize = int(sampling_rate * (frame_quantum * 0.001))
# CheapTrick FFT size for this sampling rate.
fft_len = pyworld.get_cheaptrick_fft_size(sampling_rate)

# Windows as (left width, right width, coefficients) for the static, delta and
# delta-delta features; the half-width of each window is len(coefs) // 2.
windows = [
    (len(coefs) // 2, len(coefs) // 2, np.array(coefs))
    for coefs in ([1.0], [-0.5, 0.0, 0.5], [1.0, -2.0, 1.0])
]


In [10]:
# Make sure to add the speaker folders with wav files in the defined directory
available_speakers= ['c1' ,'c2','c3','c4','c5','c6']

def _name_to_dirname(name):
    # assert len(name) == 3
    return join("{}".format(name), "wav")

# Adapted from nnmnkwii's CMU ARCTIC data source:
# https://r9y9.github.io/nnmnkwii/latest/_modules/nnmnkwii/datasets/cmu_arctic.html
class WavFileDataSource(FileDataSource):
    """Wav file data source for the speakers listed in ``available_speakers``.

    Files are searched in ``<data_root>/<speaker>/wav``.

    Args:
        data_root (str): Directory that contains one folder per speaker.
        speakers (list): Speaker names to collect; each must be listed in
            ``available_speakers``.
        labelmap (dict, optional): Maps a speaker name to an integer label.
            Defaults to the position of the speaker in ``speakers``.
        max_files (int, optional): Total number of files to collect, divided
            evenly between the speakers. ``None`` collects every file.

    Attributes:
        labels (numpy.ndarray): int32 speaker label of every collected file.
            ``None`` until ``collect_files`` has been called.

    Raises:
        ValueError: If a requested speaker is not in ``available_speakers``.
    """

    # NOTE: this has to be spelled ``__init__`` (double underscores). With the
    # previous ``_init_`` spelling Python never invoked it, so construction
    # failed in the base class and none of the attributes below were set.
    def __init__(self, data_root, speakers, labelmap=None, max_files=None):
        for speaker in speakers:
            if speaker not in available_speakers:
                raise ValueError(
                    "Unknown speaker '{}'. It should be one of {}".format(
                        speaker, available_speakers
                    )
                )

        self.data_root = data_root
        self.speakers = speakers
        if labelmap is None:
            labelmap = {speaker: idx for idx, speaker in enumerate(speakers)}
        self.labelmap = labelmap
        self.max_files = max_files
        self.labels = None

    def collect_files(self):
        """Collect wav files for specific speakers.

        Also stores the matching speaker labels in ``self.labels``.

        Returns:
            list: List of collected wav files.

        Raises:
            RuntimeError: If the wav folder of a speaker does not exist.
        """
        speaker_dirs = [
            join(self.data_root, _name_to_dirname(speaker))
            for speaker in self.speakers
        ]
        print(speaker_dirs)

        if self.max_files is None:
            max_files_per_speaker = None
        else:
            max_files_per_speaker = self.max_files // len(self.speakers)

        paths = []
        labels = []
        for speaker, speaker_dir in zip(self.speakers, speaker_dirs):
            if not isdir(speaker_dir):
                raise RuntimeError("{} doesn't exist.".format(speaker_dir))
            # Sorted so the selection is deterministic; a slice bound of None
            # keeps every file.
            files = sorted(
                join(speaker_dir, name)
                for name in listdir(speaker_dir)
                if splitext(name)[1] == ".wav"
            )
            files = files[:max_files_per_speaker]
            paths.extend(files)
            labels.extend([self.labelmap[speaker]] * len(files))

        self.labels = np.array(labels, dtype=np.int32)
        return paths

In [20]:
class MyFileDataSource(WavFileDataSource):
    """Data source that holds out a small test split and yields mel-cepstra.

    Constructor arguments are forwarded unchanged to ``WavFileDataSource``.

    Attributes:
        test_paths (list): Held-out file paths; ``None`` until
            ``collect_files`` has been called.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.test_paths = None

    def collect_files(self):
        """Return the training paths and remember the held-out ones.

        3% of the collected files are set aside with a fixed random state, so
        the split is the same on every call for the same file list.
        """
        all_paths = super().collect_files()
        split = train_test_split(all_paths, test_size=0.03, random_state=1234)
        train_paths, self.test_paths = split
        return train_paths

    def collect_features(self, path):
        """Return the mel-cepstrum of the wav file at ``path``.

        The spectral envelope is estimated with WORLD (dio, stonemask,
        cheaptrick) at the file's own sampling rate, passed through
        ``trim_frames`` and then converted to mel-cepstral coefficients.
        """
        rate, samples = wavfile.read(path)
        samples = samples.astype(np.float64)
        f0, time_axis = pyworld.dio(samples, rate, frame_period=frame_quantum)
        f0 = pyworld.stonemask(samples, f0, time_axis, rate)
        envelope = trim_frames(pyworld.cheaptrick(samples, f0, time_axis, rate))
        return pysptk.sp2mc(envelope, order=n_mcc, alpha=alpha_val)


In [21]:
# Data sources for the source and the target speaker: same data root, at most
# 100 files each.
# NOTE(review): "bdl" and "slt" are not listed in `available_speakers`
# ("c1".."c6") - confirm which speaker folders are intended here.
src_attr = MyFileDataSource(data_root=DIRECTORY, speakers=["bdl"], max_files=100)
tgt_attr = MyFileDataSource(data_root=DIRECTORY, speakers=["slt"], max_files=100)

In [22]:
# Load the training utterances of each speaker into one array, every utterance
# padded to 2000 frames (10 s at the 5 ms frame period set by frame_quantum).
# NOTE(review): presumably utterances longer than 2000 frames are rejected by
# the padded dataset - verify against the recordings in use.
src = padding(src_attr, 2000).asarray()
tgt = padding(tgt_attr, 2000).asarray()

In [23]:
# 1) Align every source/target utterance pair in time with DTW, using the
#    mel-cepstral distortion as the frame distance.
aligned_src, aligned_tgt = dtw(verbose=0, dist=mel_CD).transform((src, tgt))

# 2) Drop the 0th coefficient of every frame; what is left are the static
#    features.
aligned_src = aligned_src[:, :, 1:]
aligned_tgt = aligned_tgt[:, :, 1:]
dim_static = aligned_src.shape[-1]

# 3) Append the dynamic features defined by `windows` to each utterance.
aligned_src = trim_2d(first_der, aligned_src, windows)
aligned_tgt = trim_2d(first_der, aligned_tgt, windows)

# 4) Build joint [source | target] vectors, flatten all utterances into one
#    frame matrix and pass it through remove_frames to drop the zero frames.
combined_features = np.concatenate((aligned_src, aligned_tgt), axis=-1)
combined_features = combined_features.reshape(-1, 2 * aligned_src.shape[-1])
combined_features = remove_frames(combined_features)

In [None]:

gmm_model = GaussianMixture(n_components=64, covariance_type="full", max_iter=100, verbose=1)
%time gmm_model.fit(combined_features)

In [25]:
def features_collect(source_path):
    """Read a wav file and return its mel-cepstrum plus the raw samples.

    Args:
        source_path (str): Path of the wav file to analyse.

    Returns:
        tuple: ``(mcc, audio_data)`` - the mel-cepstral coefficients computed
        from the WORLD spectral envelope, and the samples as float64.
    """
    rate, samples = wavfile.read(source_path)
    audio_data = samples.astype(np.float64)
    f0, time_axis = pyworld.dio(audio_data, rate, frame_period=frame_quantum)
    f0 = pyworld.stonemask(audio_data, f0, time_axis, rate)
    envelope = pyworld.cheaptrick(audio_data, f0, time_axis, rate)
    return pysptk.sp2mc(envelope, order=n_mcc, alpha=alpha_val), audio_data

In [26]:
def test(source_path, enable_mlpg=True, vc=True):
    """Convert one source utterance with the trained GMM.

    Args:
        source_path (str): Wav file of the source speaker.
        enable_mlpg (bool): If True, parameter generation uses the static and
            dynamic ``windows``; if False, only the static window is used.
        vc (bool): If True (default), differential conversion: the GMM output
            is applied as a filter to the source waveform with an MLSA filter.
            If False, the GMM output is taken as the target mel-cepstrum and
            the waveform is re-synthesised with the WORLD vocoder.

    Returns:
        numpy.ndarray: The converted waveform.
    """
    generation_windows = windows if enable_mlpg else [(0, 0, np.array([1.0]))]
    paramgen = MLPG(gmm_model, windows=generation_windows, diff=vc)

    mcc, audio_data = features_collect(source_path)
    # The 0th coefficient is kept aside; only the remaining ones are converted.
    mcc0, mcc = mcc[:, 0], mcc[:, 1:]
    mcc = first_der(mcc, windows)
    mcc = paramgen.transform(mcc)
    if (not enable_mlpg) and (mcc.shape[-1] != dim_static):
        mcc = mcc[:, :dim_static]
    assert mcc.shape[-1] == dim_static
    mcc = np.hstack((mcc0[:, None], mcc))

    if vc:
        # Differential filter: zero the 0th coefficient so the filter does not
        # change the power, then filter the source waveform directly.
        mcc[:, 0] = 0
        engine = Synthesizer(MLSADF(order=n_mcc, alpha=alpha_val), hopsize=hopsize)
        b = pysptk.mc2b(mcc.astype(np.float64), alpha=alpha_val)
        return engine.synthesis(audio_data, b)

    # Non-differential conversion: `mcc` is an absolute mel-cepstrum here, so
    # it must not be fed to the differential MLSA path above. Re-synthesise
    # with WORLD from the source F0 / aperiodicity and the converted envelope.
    # The file's own rate is used so the frame count matches features_collect.
    file_rate, _ = wavfile.read(source_path)
    freq, timeaxis = pyworld.dio(audio_data, file_rate, frame_period=frame_quantum)
    freq = pyworld.stonemask(audio_data, freq, timeaxis, file_rate)
    aperiodicity = pyworld.d4c(audio_data, freq, timeaxis, file_rate)
    spectrogram = pysptk.mc2sp(
        mcc.astype(np.float64),
        alpha=alpha_val,
        fftlen=pyworld.get_cheaptrick_fft_size(file_rate),
    )
    return pyworld.synthesize(freq, spectrogram, aperiodicity, file_rate, frame_quantum)

In [None]:
# Listening test: for every held-out utterance pair, play the source, the
# target and both conversions (with and without MLPG).
for i, (src_path, tgt_path) in enumerate(zip(src_attr.test_paths, tgt_attr.test_paths)):
    print("Test - {}".format(i+1))
    without_MLPG = test(src_path, enable_mlpg=False)
    with_MLPG = test(src_path, enable_mlpg=True)
    # Dedicated names on purpose: `src` / `tgt` hold the padded training
    # features, and rebinding them here would break a re-run of the alignment
    # cell. The rate stored in each file is used for playback.
    src_rate, src_wave = wavfile.read(src_path)
    tgt_rate, tgt_wave = wavfile.read(tgt_path)

    print("Source Audio:", basename(src_path))
    IPython.display.display(Audio(src_wave, rate=src_rate))
    print("Target Audio:", basename(tgt_path))
    IPython.display.display(Audio(tgt_wave, rate=tgt_rate))
    # The conversions are produced from the source recording, so they are
    # played back at the source file's rate.
    print("With MLPG Converted Audio:")
    IPython.display.display(Audio(with_MLPG, rate=src_rate))
    print("Without MLPG Converted Audio:")
    IPython.display.display(Audio(without_MLPG, rate=src_rate))

In [None]:
import os
from joblib import dump

# Persist the trained GMM so it can be reused without re-training; it can be
# read back later with joblib.load(model_path).
# NOTE(review): /content/ is outside the mounted Drive folder - confirm the
# file does not need to be copied elsewhere to outlive the runtime.
model_filename = "gmm_model_3.joblib"
model_path = os.path.join("/content/", model_filename)
dump(gmm_model, model_path)