In [26]:
from espnet2.train.dataset import ESPnetDataset
from espnet2.tts.feats_extract.dio import Dio
import torch
import numpy as np
import warnings

In [27]:
from espnet2.tts.feats_extract.dio import Dio

class DioPreprocessor(object):
    def __init__(self, 
            fs=16000,
            n_fft=1024,
            hop_length=128,
            use_log_f0=True,
            num_sub=4,):
        self.dio = Dio(fs=fs, n_fft=n_fft, hop_length=hop_length, use_log_f0=use_log_f0,
                       use_token_averaged_f0=False, use_continuous_f0=False)
        self.num_sub = num_sub

    def __call__(self, text, data):
        wav = data["wav"]
        wav_tensor = torch.tensor(wav, dtype=torch.float32)
        wav_lengths = torch.tensor([len(wav)], dtype=torch.long)
        f0, f0_lengths = self.dio(wav_tensor.unsqueeze(0), wav_lengths)

        f0 = f0.squeeze(0, 2).cpu().numpy()
        print(f0.shape)
        
        num_sub = self.num_sub

        # 長さがnum_subの倍数になるようにパディング
        if f0.shape[0] % num_sub != 0:
            pad = num_sub - f0.shape[0] % num_sub
            f0 = np.concatenate([f0, np.zeros(pad)], axis=0)

        # サブサンプルの準備
        f0s = f0.reshape(-1, num_sub)

        # 代表値（f0p）の計算
        if num_sub % 2 == 0:
            # num_subが偶数の場合は，中間の２つの値の平均を取る
            # ただし，0は平均の計算に含めず，両者0の場合は0を返す
            f0p = f0s[:, num_sub//2-1:num_sub//2+1].copy()
            f0p[f0p == 0] = np.nan
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                f0p = np.nanmean(f0p, axis=1)
            f0p[np.isnan(f0p)] = 0
        else:
            # num_subが奇数の場合は，単純に中間の値を取る
            f0p = f0s[:, num_sub//2].copy()

        # 1次近似の計算
        f0dy = f0s.copy()
        f0dx = np.arange(num_sub) - (num_sub - 1) / 2
        f0ds = []
        for i in range(f0dy.shape[0]):
            nonzero_count = np.sum(f0dy[i] > 0)
            if nonzero_count > num_sub / 2:
                dx = f0dx[f0dy[i] > 0]
                
                dy = f0dy[i][f0dy[i] > 0]
                f0ds.append(np.polyfit(dx, dy, 1))
            else:
                f0ds.append([0, 0])
        f0ds = np.stack(f0ds)

        result = np.concatenate([f0p.reshape(-1, 1), f0ds], axis=1)

        data["f0data"] = result

        return data       


In [28]:
dio_processor = DioPreprocessor()

dataset = ESPnetDataset(
    [("./dump/raw/train_nodup/wav.scp", "wav", "sound"),],
    preprocess=dio_processor,
)

In [36]:
dataset[7]

(178,)


('A01M0007_0245593_0247015',
 {'wav': array([-0.00073242, -0.00064087, -0.00064087, ...,  0.00076294,
          0.00057983,  0.00064087], dtype=float32),
  'f0data': array([[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
         [ 5.2459526e+00, -2.7206898e-02,  5.2524261e+00],
         [ 5.2265215e+00, -1.8979072e-03,  5.2249317e+00],
         [ 5.2067995e+00,  2.4906634e-03,  5.2055068e+00],
         [ 5.1920223e+00, -7.7142718e-04,  5.1943512e+00],
         [ 5.1944857e+00,  1.2005329e-03,  5.1954870e+00],
         [ 5.1556168e+00, -3.4804963e-02,  5.1511145e+00],
         [ 5.1340590e+00,  3.7063598e-03,  5.1334987e+00],
         [ 5.1399641e+00,  1.7378330e-03,  5.1391382e+00],
         [ 5.1121502e+00, -6.9402219e-03,  5.1127477e+00],
         [ 5.1001673e+00, -4.3285848e-03,  5.0989990e+00],
         [ 5.0692706e+00, -9.6879955e-03,  5.0687618e+00],
         [ 4.9456758e+00, -3.4686230e-02,  4.9596663e+00],
         [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
         [