In [None]:
import numpy as np
import pyworld
import glob
from hparams import hparams
import librosa
import os
from module import *
import soundfile as sf

In [None]:
# NOTE(review): this install cell comes after the import cell, so on a fresh
# kernel the imports run before these packages exist — TODO confirm the order.
! pip install pyworld
# NOTE(review): `hparams` and `module` look like project-local modules (a
# `hparams` class is defined further down in this notebook); the PyPI packages
# with these names may be unrelated — verify before installing.
! pip install hparams
! pip install soundfile
! pip install module

In [None]:
import torch
class hparams:
    """Plain container for every setting used by this notebook.

    All values are fixed in ``__init__``; the constructor takes no arguments.
    Attributes are grouped as: speakers and data locations, feature
    extraction, training, and output locations.
    """

    def __init__(self):
        # ---- speakers and data locations ----
        self.spk_list = ['VCC2SM1', 'VCC2SF1', 'VCC2SF2', 'VCC2SM2']
        self.path_train_wavs = 'data/train'
        self.path_test_wavs = 'data/test'
        self.path_catch_feas = 'data_catch'   # where extracted features are cached
        self.n_spk = 4                        # equals len(spk_list)

        # ---- feature extraction ----
        self.fs = 16000           # sampling rate
        self.frame_period = 5.0   # frame shift handed to pyworld (presumably ms — confirm)
        self.coded_dim = 36       # dimension of the coded spectral envelope

        # ---- model training ----
        self.n_epoch = 300

        # learning rates (presumably generator / discriminator / classifier — confirm)
        self.g_lr = self.d_lr = self.c_lr = 1e-4

        # decay values paired with the three learning rates above
        self.decay_g = self.decay_d = self.decay_c = 1e-9

        # loss weights
        self.lambda_cycle = 3
        self.lambda_classifier = 2
        self.lambda_identity = 2

        self.lr_update_step = 1e4
        self.start_decay = 1e5    # step count at which lr decay starts

        # ---- output locations ----
        self.path_save = 'save'
        self.save_step = 5000

        self.path_eval = 'eval'

In [None]:
def feature_world(wav,para):
    """Run the pyworld analysis chain on one waveform.

    Args:
        wav: numpy array of samples; it is cast to float64 before analysis.
        para: settings object providing ``fs``, ``frame_period`` and
            ``coded_dim``.

    Returns:
        Tuple ``(f0, timeaxis, sp, ap, coded_sp)``: the outputs of
        ``pyworld.harvest`` (first two), ``pyworld.cheaptrick``,
        ``pyworld.d4c`` and ``pyworld.code_spectral_envelope``.
    """
    sample_rate = para.fs
    signal = wav.astype(np.float64)

    # F0 contour and its time axis; search range fixed to 71-800.
    f0, timeaxis = pyworld.harvest(
        signal,
        sample_rate,
        frame_period=para.frame_period,
        f0_floor=71.0,
        f0_ceil=800.0,
    )

    # Spectral envelope and aperiodicity, both computed along the same f0 track.
    sp = pyworld.cheaptrick(signal, f0, timeaxis, sample_rate)
    ap = pyworld.d4c(signal, f0, timeaxis, sample_rate)

    # Reduce the spectral envelope to para.coded_dim coefficients.
    coded_sp = pyworld.code_spectral_envelope(sp, sample_rate, para.coded_dim)

    return f0, timeaxis, sp, ap, coded_sp

In [None]:
def processing_wavs(file_wavs,para):
    """Extract WORLD features for a set of wav files and normalise them.

    Each file is loaded with librosa at ``para.fs`` (mono) and analysed with
    ``feature_world``. Statistics are computed over all files together.

    Args:
        file_wavs: iterable of wav file paths; must contain at least one path.
        para: settings object; ``para.fs`` is read here and ``para`` is passed
            on to ``feature_world``.

    Returns:
        Tuple ``(log_f0s_mean, log_f0s_std, coded_sps_mean, coded_sps_std,
        coded_sps_norm)`` where the first two are the mean/std of the masked
        log-f0 values, the next two are per-dimension mean/std of the coded
        spectral envelopes (``keepdims=True``), and the last is a list with
        one normalised coded-sp matrix per input file.

    Raises:
        ValueError: if ``file_wavs`` is empty. Without this check the failure
            would surface later as an opaque ``np.concatenate`` error.
    """
    file_wavs = list(file_wavs)
    if not file_wavs:
        raise ValueError(
            "processing_wavs: no wav files were given, cannot compute feature statistics"
        )

    fs = para.fs  # loop-invariant, read once

    f0s = []
    coded_sps = []
    for file in file_wavs:
        print("processing %s"%(file))
        # Read the audio file
        wav, _ = librosa.load(file, sr=fs, mono=True)

        # Extract WORLD features; only f0 and coded_sp are kept
        f0,_,_,_,coded_sp = feature_world(wav,para)
        print(coded_sp.shape)
        f0s.append(f0)
        coded_sps.append(coded_sp)

    # Mean and std of log f0; np.ma.log masks non-positive values, so those
    # frames do not contribute to the statistics.
    log_f0s = np.ma.log(np.concatenate(f0s))
    log_f0s_mean = log_f0s.mean()
    log_f0s_std = log_f0s.std()

    # Per-dimension mean and std of coded_sp (each coded_sp is T x D)
    coded_sps_array = np.concatenate(coded_sps,axis=0)
    coded_sps_mean = np.mean(coded_sps_array,axis=0,keepdims = True)
    coded_sps_std = np.std(coded_sps_array,axis=0,keepdims = True)

    # Normalise every utterance with the global mean and std
    coded_sps_norm = [
        (coded_sp - coded_sps_mean) / coded_sps_std for coded_sp in coded_sps
    ]

    return log_f0s_mean,log_f0s_std,coded_sps_mean,coded_sps_std,coded_sps_norm
    

In [None]:
if __name__ == "__main__":
    # Extract features for every speaker's training wavs and cache them under
    # para.path_catch_feas/<speaker>/ as three .npy files.

    para = hparams()
    print(para.path_train_wavs)

    # Iterate over all speakers
    for spk in para.spk_list:
        print("processing features for %s"%(spk))

        # Directory holding this speaker's wav files
        dir_train = os.path.join(para.path_train_wavs,spk)

        # glob returns matches in arbitrary order; sort so the utterance order
        # in data.npy is reproducible. The pattern now requires the ".wav"
        # extension rather than any name merely ending in "wav".
        wavs = sorted(glob.glob(os.path.join(dir_train,'*.wav')))
        if not wavs:
            raise FileNotFoundError("no .wav files found in %s" % dir_train)

        f0_mean,f0_std,mecp_mean,mecp_std, mecps = processing_wavs(wavs,para)

        # Output directory for this speaker
        path_save = os.path.join(para.path_catch_feas,spk)
        os.makedirs(path_save,exist_ok = True)

        # Save the statistics
        np.save(os.path.join(path_save,'static_f0.npy'),np.array([f0_mean,f0_std],dtype=object))
        np.save(os.path.join(path_save,'static_mecp.npy'),np.array([mecp_mean,mecp_std],dtype=object))

        # Save the per-utterance features as a 1-D object array with one
        # matrix per element. np.array(list, dtype=object) would instead build
        # an (N, T, D) object array whenever all utterances share a length.
        mecps_arr = np.empty(len(mecps),dtype=object)
        for idx, mecp in enumerate(mecps):
            mecps_arr[idx] = mecp
        np.save(os.path.join(path_save,'data.npy'),mecps_arr)

In [None]:
# Load the cached per-utterance features of each speaker. The files are object
# arrays, hence allow_pickle=True; only load files produced by a trusted source,
# since unpickling can execute arbitrary code.
# Each variable now matches the speaker directory it reads: previously
# SM1_data was loaded from VCC2SM2 and SM2_data from VCC2SM1.
SF1_data = np.load("./data_catch/VCC2SF1/data.npy",allow_pickle=True)
print(SF1_data[0].shape)
SF2_data = np.load("./data_catch/VCC2SF2/data.npy",allow_pickle=True)
SM1_data = np.load("./data_catch/VCC2SM1/data.npy",allow_pickle=True)
print(SM1_data[0].shape)
SM2_data = np.load("./data_catch/VCC2SM2/data.npy",allow_pickle=True)

In [None]:
# Shape of the whole SM1_data container next to the shape of the first SF1 utterance.
print(SM1_data.shape,SF1_data[0].shape)

In [None]:
def load_static(catch_path):
    """Read one speaker's cached statistics into a dict.

    Args:
        catch_path: directory containing ``static_f0.npy`` and
            ``static_mecp.npy``. Both are loaded with ``allow_pickle=True``,
            so only point this at trusted files.

    Returns:
        Dict with keys ``mean_log_f0``, ``std_log_f0``, ``coded_sps_mean`` and
        ``coded_sps_std``; every value is converted with ``np.float64``.
    """
    def _read(filename):
        # Element 0 of each file is the mean, element 1 the std.
        return np.load(os.path.join(catch_path, filename), allow_pickle=True)

    f0_stats = _read('static_f0.npy')
    static = {
        'mean_log_f0': np.float64(f0_stats[0]),
        'std_log_f0': np.float64(f0_stats[1]),
    }

    sp_stats = _read('static_mecp.npy')
    static['coded_sps_mean'] = np.float64(sp_stats[0])
    static['coded_sps_std'] = np.float64(sp_stats[1])

    return static
    
def pitch_conversion(f0,static_A, static_B):
    """Map an f0 contour from speaker A's log-f0 statistics to speaker B's.

    Positive, finite f0 values are converted with
    ``exp((log(f0) - mean_A) / std_A * std_B + mean_B)``. All other frames
    (f0 <= 0 or non-finite, i.e. the frames ``np.ma.log`` would mask) are
    returned as exactly 0.

    The previous implementation returned a MaskedArray; the data stored under
    the mask is a numpy.ma implementation detail, so passing it on to code
    that reads the raw buffer could yield arbitrary f0 values for those
    frames. A plain float64 ndarray is returned instead.

    Args:
        f0: array-like of f0 values.
        static_A: dict with ``mean_log_f0`` and ``std_log_f0`` of the source.
        static_B: dict with ``mean_log_f0`` and ``std_log_f0`` of the target.

    Returns:
        float64 ndarray with the same shape as ``f0``.
    """
    mean_log_f0_A = static_A["mean_log_f0"]
    std_log_f0_A = static_A["std_log_f0"]

    mean_log_f0_B = static_B["mean_log_f0"]
    std_log_f0_B = static_B["std_log_f0"]

    f0 = np.asarray(f0, dtype=np.float64)
    voiced = np.isfinite(f0) & (f0 > 0)

    # log() of the excluded frames yields -inf/nan; silence the warnings since
    # those positions are overwritten with 0 by np.where below.
    with np.errstate(divide='ignore', invalid='ignore'):
        converted = np.exp(
            (np.log(f0) - mean_log_f0_A) / std_log_f0_A * std_log_f0_B + mean_log_f0_B
        )

    f0_converted = np.where(voiced, converted, 0.0)
    return f0_converted

def featu_normlize(data, data_mean, data_std, de_conver=False):
    """Normalise features, or undo the normalisation.

    Args:
        data: values to transform.
        data_mean: mean used for the transform.
        data_std: standard deviation used for the transform.
        de_conver: when false (default) return ``(data - mean) / std``;
            when true return the inverse, ``data * std + mean``.

    Returns:
        The transformed values.
    """
    if de_conver:
        # Inverse direction: back to the original feature scale.
        return data*data_std + data_mean

    return (data - data_mean) / data_std

def synthesis_world(coded_sp,f0,ap,para):
    """Synthesise a waveform from a coded spectral envelope, f0 and aperiodicity.

    Args:
        coded_sp: coded spectral envelope; made C-contiguous before decoding.
        f0: f0 contour passed to ``pyworld.synthesize``.
        ap: aperiodicity passed to ``pyworld.synthesize``.
        para: settings object providing ``fs`` and ``frame_period``.

    Returns:
        The synthesised waveform cast to float32.

    The shapes of the intermediate arrays are printed along the way.
    """
    sample_rate = para.fs

    # Decode coded_sp back to a full spectral envelope; the FFT size is the
    # one pyworld reports for this sample rate.
    fft_size = pyworld.get_cheaptrick_fft_size(sample_rate)
    coded_sp = np.ascontiguousarray(coded_sp)
    print(coded_sp.shape)
    decoded_sp = pyworld.decode_spectral_envelope(coded_sp, sample_rate, fft_size)

    shift = para.frame_period
    print(f0.shape,decoded_sp.shape,ap.shape,sample_rate)

    synthesised = pyworld.synthesize(f0, decoded_sp, ap, sample_rate, frame_period=shift)
    return synthesised.astype(np.float32)

In [None]:
para = hparams()
fs = para.fs

# Statistics of speaker A (VCC2SF1)
catch_path = os.path.join("../input/voiceconversion/data_catch", 'VCC2SF1')
static_A = load_static(catch_path)

# Statistics of speaker B (VCC2SM1)
catch_path = os.path.join("../input/voiceconversion/data_catch", "VCC2SM1")
static_B = load_static(catch_path)

# Normalised coded spectral envelope stored in sf1-sm1.npy
# (presumably the converted SF1 -> SM1 features — confirm against its producer).
normlize_coded_sp_B = np.float64(
    np.load("../input/conversion-npy/sf1-sm1.npy", allow_pickle=True)
)
print(normlize_coded_sp_B.shape)

# Undo the normalisation with speaker B's statistics.
coded_sp_B = featu_normlize(
    normlize_coded_sp_B,
    static_B['coded_sps_mean'],
    static_B['coded_sps_std'],
    de_conver=True,
)

# f0 and aperiodicity are taken from the original speaker-A utterance.
wav, _ = librosa.load("../input/voiceconvert/VCC2SF1/10001.wav", sr=fs, mono=True)
f0, _, _, ap, _ = feature_world(wav, para)

# Shift the pitch from A's statistics to B's, then synthesise.
f0_B = pitch_conversion(f0, static_A, static_B)
wav_B = synthesis_world(coded_sp_B, f0_B, ap, para)

In [None]:
# Write the converted waveform at the rate it was synthesised with (para.fs).
# The previous hard-coded 22500 did not match para.fs, so the file header
# declared a different rate than the samples actually have.
name = "sf1_sm1_10001(4).wav"
sf.write(name,wav_B,para.fs)

In [None]:
# Read the written file back at the project sample rate and print its samples.
# librosa.load resamples to `sr` when the file header stores a different rate.
fs = para.fs
wav, _ = librosa.load("./sf1_sm1_10001(4).wav", sr=fs, mono=True)
print(wav)

In [None]:
# Load the original speaker-A utterance at para.fs and print its length in samples.
para = hparams()
fs = para.fs
wav, _ = librosa.load("../input/voiceconvert/VCC2SF1/10001.wav", sr=fs, mono=True)
print(len(wav))